In [0]:
# Location of the Delta table that holds the AWS credentials read further down.
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"
# Load that Delta table into a Spark DataFrame.
aws_keys_df = spark.read.load(delta_table_path, format="delta")

In [0]:
# URL processing (urllib.parse must be imported explicitly: `import urllib`
# alone does not guarantee the `parse` submodule is loaded)
import urllib
import urllib.parse

# pyspark functions
from pyspark.sql.functions import *

# Get the AWS access key and secret key from the Spark DataFrame.
# Both columns are fetched from the same row in a single query, so the key
# pair can never be assembled from two different rows and the table is only
# read once.
_credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if _credentials_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message
    # instead of an opaque indexing error.
    raise ValueError("aws_keys_df is empty: no AWS credentials row found")

ACCESS_KEY = _credentials_row['Access key ID']
SECRET_KEY = _credentials_row['Secret access key']
# URL-encode the secret key; safe="" means "/" is percent-encoded as well.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
%sql
-- Disable format checks during the reading of Delta tables.
-- SET returns the key/value pair that was applied (shown as this cell's output).
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


In [0]:
from pyspark.sql import *
from pyspark.sql.types import *


# ---------------------------------------------------------------------------
# Pin stream: Kinesis -> JSON parsing -> cleaning -> Delta table
# ---------------------------------------------------------------------------

# Load the Kinesis data from the stream into Databricks.
df_pin = (
    spark.readStream
    .format('kinesis')
    .option('streamName', 'streaming-12ffc5aba733-pin')
    .option('initialPosition', 'latest')
    .option('region', 'us-east-1')
    .option('awsAccessKey', ACCESS_KEY)
    .option('awsSecretKey', SECRET_KEY)
    .load()
)
# Keep only the record payload, as a string column named "data".
df_pin = df_pin.selectExpr("CAST(data as STRING)")

# Schema used to parse the JSON payload. Every field is read as a string
# except "downloaded", which is read as a long.
schema = StructType([
    StructField("index", StringType(), True),
    StructField("unique_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("poster_name", StringType(), True),
    StructField("follower_count", StringType(), True),
    StructField("tag_list", StringType(), True),
    StructField("is_image_or_video", StringType(), True),
    StructField("image_src", StringType(), True),
    StructField("downloaded", LongType(), True),
    StructField("save_location", StringType(), True),
    StructField("category", StringType(), True)
])

# Parse the JSON string into a struct column using the schema.
df_pin = df_pin.withColumn("jsonData", from_json(col("data").cast("string"), schema))

# Flatten the struct into top-level columns.
df_pin_kinesis = df_pin.select(
    col("jsonData.index").alias("index"),
    col("jsonData.unique_id").alias("unique_id"),
    col("jsonData.title").alias("title"),
    col("jsonData.description").alias("description"),
    col("jsonData.poster_name").alias("poster_name"),
    col("jsonData.follower_count").alias("follower_count"),
    col("jsonData.tag_list").alias("tag_list"),
    col("jsonData.is_image_or_video").alias("is_image_or_video"),
    col("jsonData.image_src").alias("image_src"),
    col("jsonData.downloaded").alias("downloaded"),
    col("jsonData.save_location").alias("save_location"),
    col("jsonData.category").alias("category")
)

# Replace empty entries and known placeholder strings with None.
df_pin_cleaned = df_pin_kinesis.replace(
    [
        "",
        "N/A",
        "User Info Error",
        "No description available Story format",
        "Image src error",
        "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
        "No Title Data Available",
    ],
    None,
)

# Normalise follower_count to a whole number: strip spaces, then expand the
# "k" (x1,000) and "m"/"M" (x1,000,000) suffixes before casting to int.
df_pin_cleaned = df_pin_cleaned.withColumn(
    "follower_count", regexp_replace(col("follower_count"), " ", "")
)
df_pin_cleaned = df_pin_cleaned.withColumn(
    "follower_count",
    when(col("follower_count").contains("k"),
         regexp_replace(col("follower_count"), "k", "").cast("float") * 1000)
    .when(col("follower_count").contains("m"),
          regexp_replace(col("follower_count"), "m", "").cast("float") * 1000000)
    .when(col("follower_count").contains("M"),
          regexp_replace(col("follower_count"), "M", "").cast("float") * 1000000)
    .otherwise(col("follower_count").cast("float"))
)
df_pin_cleaned = df_pin_cleaned.withColumn("follower_count", col("follower_count").cast("int"))

# Cast the remaining numeric columns. Note that "downloaded" is not part of
# the final column selection below.
df_pin_cleaned = df_pin_cleaned.withColumn("downloaded", col("downloaded").cast(IntegerType()))
df_pin_cleaned = df_pin_cleaned.withColumn("index", col("index").cast(IntegerType()))

# Keep only the 4th space-separated token of save_location
# (presumably the path part of a "Local save in <path>" string - TODO confirm).
df_pin_cleaned = df_pin_cleaned.withColumn("save_location", split(col("save_location"), " ").getItem(3))

df_pin_cleaned = df_pin_cleaned.withColumnRenamed("index", "ind")

# Final column order for the target table.
df_pin_cleaned = df_pin_cleaned.select(
    "ind", "unique_id", "title", "description",
    "follower_count", "poster_name", "tag_list",
    "is_image_or_video", "image_src", "save_location", "category",
)
display(df_pin_cleaned)

# Write to the Delta table.
# A checkpoint directory tracks the progress of exactly one streaming query,
# so this query gets its own sub-directory instead of sharing one with the
# other streams in this notebook. It stays under /tmp/kinesis/_checkpoints/
# so the cleanup cell at the end of the notebook still removes it.
(
    df_pin_cleaned.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/_checkpoints/pin/")
    .table("12ffc5aba733_pin_table")
)


ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
10663,d5208c16-82ae-4fd1-87f0-36e5ef789412,21+ Classic Car Picture of the 1950s - Vintagetopia,There are a lot of options of places to stay so you probably only have to book the initial one so you know the best place to land from the airport,171000,vintagetopia,"Old Classic Cars,Classic Trucks,Old Vintage Cars,Antique Cars,Vintage Trucks,Antique Trucks,Cadillac,Convertible,Mustang",image,https://i.pinimg.com/originals/e9/01/4d/e9014d537dd2b81c606e8466cef3904c.jpg,/data/vehicles,vehicles
5468,91ee49f6-0070-4270-ba19-fde88ba51c91,"Budgets, Spouses & Making It Work! - Tshanina Peterson",Don't fight about budgets and money with your spouse! Here are a few tips that we've found that make it work in our marriage!,313,Annamarie Hilton,"Ways To Save Money,Money Tips,Money Saving Tips,How To Make Money,Managing Money,Saving Time,College Fund,Scholarships For College,Education College",image,https://i.pinimg.com/originals/37/d5/82/37d58296bcef3471a633c76b89d8625e.jpg,/data/finance,finance
205,f4afd543-b9fe-44c3-8064-f2198faa1372,DIY Abstract Heart Painting and a Fun Paint Party,Use our easy step by step tutorial to create a DIY abstract heart painting on any canvas size. Lots of inspiration to help you choose colors.,410000,The Creativity Exchange,"Painting & Drawing,Watercolor Paintings,Original Paintings,Owl Watercolor,Acrylic Paintings,Owl Art,Bird Art,Beginner Painting,Animal Paintings",image,https://i.pinimg.com/originals/33/85/f8/3385f82ee5e8ecdcccdf798bdcbee3d2.jpg,/data/art,art
7790,9cd89fb9-d1af-4e66-85ca-c9d656c918b4,50 Heart Touching Sad Quotes That Will Make You Cry,Are you looking for some heart touching sad quotes and sayings; Here we have collected for you 50 best heart touching sad quotes..,112,AnnaLee Kick,"Crush Quotes For Girls,Sad Crush Quotes,Hurt Quotes,Love Me Quotes,Mood Quotes,Quotes To Live By,Life Quotes,Quotes Quotes,Sad Quotes That Make You Cry",image,https://i.pinimg.com/originals/0c/0f/40/0c0f408d96458080e0c82adb50be8e8f.jpg,/data/quotes,quotes
8606,71757040-bfc8-4631-9cac-22801e61d9e3,30+ Eagle Tattoos Ideas for Women,summcoco gives you inspiration for the women fashion trends you want. Thinking about a new look or lifestyle? This is your ultimate resource to get the hottest trends. 30+ Eagle…,306000,"Sumcoco | Decor Ideas, Hairstyles, Nails Fashion Advice","White Bird Tattoos,Rose Tattoos,Flower Tattoos,Tattoo Black,Bird And Flower Tattoo,Tattoo Floral,Mini Tattoos,Flower Art,Girls With Sleeve Tattoos",image,https://i.pinimg.com/originals/8d/c6/ed/8dc6ed0b212393187a14c7d07a88060f.png,/data/tattoos,tattoos
9759,d105eb6e-0f9f-46e7-8d02-d24b62f6ae90,8 Best Greek Islands You Have To Visit - TheFab20s,"Although you'd think Greek islands are fairly similar, you'd be completely wrong! Each island has it's own personality and appeal. Here are the 8 Best Greek Islands to visit, es…",42000,TheFab20s | Travel+Food+DIY+Listicles,"Greek Islands To Visit,Best Greek Islands,Greece Islands,Cool Places To Visit,Places To Go,Best Places In Portugal,Copenhagen Travel,Paros Island,Santorini Island",image,https://i.pinimg.com/originals/06/1d/ce/061dce38929dec8e74844442116bea4a.jpg,/data/travel,travel
719,d7c53e34-9540-4f48-a31b-89b6ed1852bb,10 Art Sub Lessons that only need a Pencil,10 art sub lessons that only need a pencil. Cover lessons for art teachers. Make the perfect art sub lessson folder with this amazing resources.,25000,The Arty Teacher,"Art Lessons For Kids,Art Lessons Elementary,Art For Kids,Art Sub Plans,Art Lesson Plans,Art Substitute Plans,High School Art,Middle School Art,Primary School Art",image,https://i.pinimg.com/originals/ee/a8/78/eea878911033897d981a69d9f6b2fb7c.png,/data/art,art
1704,5fbf9863-fb79-477c-a5b6-540c3020a55f,Christmas Trees From Pallet Wood | Holiday DIY,Christmas Trees From Pallet Wood | Holiday DIY: Deck the yard with some fun outdoor Christmas Trees! We made these merry and bright decorations from two old pallets we had lying…,3000000,Instructables,"Pallet Wood Christmas Tree,Wooden Christmas Crafts,Diy Christmas Tree,Christmas Projects,Holiday Crafts,Wooden Xmas Trees,Different Christmas Trees,Pallet Tree,Christmas Kitchen",image,https://i.pinimg.com/originals/64/7b/ca/647bca35169b7c144604116c64bcba8a.png,/data/christmas,christmas
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",46000,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,/data/christmas,christmas
8930,a2999c28-f7b2-4577-af87-49d21d6d8a18,135 Sunflower Tattoo Ideas - [Best Rated Designs in 2021],We have put together the Ultimative Sunflower Tattoo Collection in 2020. Check out our highest rated handpicked Sunflower designs here!,800000,Next Luxury,"Sunflower Tattoo Sleeve,Sunflower Tattoo Shoulder,Sunflower Tattoo Small,Sunflower Tattoos,Sunflower Tattoo Design,Shoulder Tattoo,Sunflower Mandala Tattoo,Sunflower Tattoo Meaning,Form Tattoo",image,https://i.pinimg.com/originals/18/de/6c/18de6c81a8637e224c7d63dce1414ceb.jpg,/data/tattoos,tattoos


In [0]:
from pyspark.sql import *
from pyspark.sql.types import *


# ---------------------------------------------------------------------------
# User stream: Kinesis -> JSON parsing -> cleaning -> Delta table
# ---------------------------------------------------------------------------

# Empty the target table before this run appends to it.
spark.sql("TRUNCATE TABLE default.12ffc5aba733_user_table")

# Load the Kinesis data from the stream into Databricks.
df_user = (
    spark.readStream
    .format('kinesis')
    .option('streamName', 'streaming-12ffc5aba733-user')
    .option('initialPosition', 'latest')
    .option('region', 'us-east-1')
    .option('awsAccessKey', ACCESS_KEY)
    .option('awsSecretKey', SECRET_KEY)
    .load()
)
# Keep only the record payload, as a string column named "data".
df_user = df_user.selectExpr("CAST(data as STRING)")

# Schema used to parse the JSON payload; every field is read as a string.
schema = StructType([
    StructField("ind", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("age", StringType(), True),
    StructField("date_joined", StringType(), True),
])

# Parse the JSON string into a struct column using the schema.
df_user = df_user.withColumn("jsonData", from_json(col("data").cast("string"), schema))

# Flatten the struct into top-level columns.
df_user_kinesis = df_user.select(
    col("jsonData.ind").alias("ind"),
    col("jsonData.first_name").alias("first_name"),
    col("jsonData.last_name").alias("last_name"),
    col("jsonData.age").alias("age"),
    col("jsonData.date_joined").alias("date_joined"),
)

# Combine first and last name into a single space-separated user_name column.
df_user_cleaned = df_user_kinesis.withColumn("user_name", concat_ws(" ", col("first_name"), col("last_name")))

# Drop the now-redundant name columns.
df_user_cleaned = df_user_cleaned.drop("first_name").drop("last_name")

# Convert date_joined from a string to a timestamp.
df_user_cleaned = df_user_cleaned.withColumn("date_joined", col("date_joined").cast(TimestampType()))

# Final column order for the target table.
df_user_cleaned = df_user_cleaned.select("ind", "user_name", "age", "date_joined")

display(df_user_cleaned)

# Write to the Delta table.
# A checkpoint directory tracks the progress of exactly one streaming query,
# so this query gets its own sub-directory instead of sharing one with the
# other streams in this notebook. It stays under /tmp/kinesis/_checkpoints/
# so the cleanup cell at the end of the notebook still removes it.
(
    df_user_cleaned.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/_checkpoints/user/")
    .table("12ffc5aba733_user_table")
)


ind,user_name,age,date_joined
5162,James Jacobson,28,2017-04-30T16:41:26.000+0000
5151,Mary Jordan,26,2017-06-12T16:02:59.000+0000
2498,Jeremy Chase,50,2017-07-31T16:56:52.000+0000
7268,Alejandra Acevedo,20,2015-11-24T21:01:23.000+0000
3351,Eric Taylor,39,2017-04-04T06:05:03.000+0000
7738,Alexis George,34,2016-02-26T00:38:01.000+0000
8886,Abigail Bates,20,2015-11-07T20:59:32.000+0000
5742,Barbara Edwards,24,2016-01-09T19:37:29.000+0000
1362,Nancy Case,23,2016-02-18T08:44:33.000+0000
995,Jason Horne,27,2016-02-06T15:31:57.000+0000


In [0]:
from pyspark.sql import *
from pyspark.sql.types import *

# ---------------------------------------------------------------------------
# Geo stream: Kinesis -> JSON parsing -> cleaning -> Delta table
# ---------------------------------------------------------------------------

# Empty the target table before this run appends to it.
spark.sql("TRUNCATE TABLE default.12ffc5aba733_geo_table")

# Load the Kinesis data from the stream into Databricks.
df_geo = (
    spark.readStream
    .format('kinesis')
    .option('streamName', 'streaming-12ffc5aba733-geo')
    .option('initialPosition', 'latest')
    .option('region', 'us-east-1')
    .option('awsAccessKey', ACCESS_KEY)
    .option('awsSecretKey', SECRET_KEY)
    .load()
)
# Keep only the record payload, as a string column named "data".
df_geo = df_geo.selectExpr("CAST(data as STRING)")

# Schema used to parse the JSON payload; every field is read as a string.
schema_geo = StructType([
    StructField("ind", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("latitude", StringType(), True),
    StructField("longitude", StringType(), True),
    StructField("country", StringType(), True),
])

# Parse the JSON string into a struct column using the schema.
df_geo = df_geo.withColumn("jsonData", from_json(col("data").cast("string"), schema_geo))

# Flatten the struct into top-level columns.
df_geo_kinesis = df_geo.select(
    col("jsonData.ind").alias("ind"),
    col("jsonData.timestamp").alias("timestamp"),
    col("jsonData.latitude").alias("latitude"),
    col("jsonData.longitude").alias("longitude"),
    col("jsonData.country").alias("country"),
)

# Combine latitude and longitude into a single [latitude, longitude] array column.
df_geo_cleaned = df_geo_kinesis.withColumn("coordinates", array(col("latitude"), col("longitude")))
# Drop the now-redundant coordinate columns.
df_geo_cleaned = df_geo_cleaned.drop("latitude").drop("longitude")

# Convert timestamp from a string to a timestamp data type.
df_geo_cleaned = df_geo_cleaned.withColumn("timestamp", col("timestamp").cast(TimestampType()))

# Final column order for the target table.
df_geo_cleaned = df_geo_cleaned.select("ind", "country", "coordinates", "timestamp")

display(df_geo_cleaned)

# Write the streaming data to the Delta table.
# A checkpoint directory tracks the progress of exactly one streaming query,
# so this query gets its own sub-directory instead of sharing one with the
# other streams in this notebook. It stays under /tmp/kinesis/_checkpoints/
# so the cleanup cell at the end of the notebook still removes it.
(
    df_geo_cleaned.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/_checkpoints/geo/")
    .table("12ffc5aba733_geo_table")
)


ind,country,coordinates,timestamp
995,Moldova,"List(-88.0923, -179.319)",2022-09-22T12:17:57.000+0000
6250,Austria,"List(-73.2098, -150.856)",2020-11-22T18:28:02.000+0000
3184,Cape Verde,"List(-41.293, -171.585)",2018-09-25T23:17:46.000+0000
2955,Antigua and Barbuda,"List(23.7768, -23.4838)",2022-10-11T17:21:06.000+0000
4357,Bahamas,"List(-75.4909, -179.908)",2020-03-06T09:56:43.000+0000
3563,Saint Barthelemy,"List(-77.3153, -69.505)",2018-09-05T07:40:01.000+0000
6558,Guadeloupe,"List(-26.4944, -174.009)",2019-09-08T03:27:31.000+0000
4256,Afghanistan,"List(-88.5478, -174.971)",2018-10-10T17:57:16.000+0000
450,Antigua and Barbuda,"List(-81.0108, -165.206)",2018-09-28T05:43:18.000+0000
3945,Barbados,"List(-66.2816, -142.673)",2022-04-17T02:01:53.000+0000


In [0]:
# Delete the streaming checkpoint directory and everything beneath it
# (the second argument enables recursive removal).
# NOTE(review): this also removes the checkpoints of any streaming query that
# is still running - presumably meant to be run only once the streams above
# have been stopped; confirm.
checkpoint_dir = "/tmp/kinesis/_checkpoints/"
dbutils.fs.rm(checkpoint_dir, True)