In [0]:
import urllib
import urllib.parse

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Load the AWS credentials used by the Kinesis readers from a CSV file stored
# on DBFS and expose them as module-level constants.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

KINESIS_REGION = "us-east-1"

# Fetch both key columns with a single lookup instead of filtering and
# collecting the same row twice, and fail with an explicit message (rather
# than an IndexError) when the expected user row is missing.
_credentials_row = (
    aws_keys_df
    .where(col('User name') == 'databricks-user')
    .select('Access key ID', 'Secret access key')
    .first()
)
if _credentials_row is None:
    raise ValueError(
        "No row with 'User name' == 'databricks-user' found in "
        "/FileStore/tables/authentication_credentials.csv"
    )

ACCESS_KEY = _credentials_row['Access key ID']
SECRET_KEY = _credentials_row['Secret access key']
# Percent-encoded copy of the secret; safe="" means "/" is encoded as well.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
# df_pin
# Read the "pin" Kinesis stream and unpack its JSON payload into one column
# per field.
KINESIS_STREAM_NAME = "streaming-12f7a43505b1-pin"
kinesis_pin = (
    spark.readStream
    .format("kinesis")
    .option("streamName", KINESIS_STREAM_NAME)
    .option("region", KINESIS_REGION)
    .option("initialPosition", '{"at_timestamp": "05/23/2023 00:00:00 GMT", "format": "MM/dd/yyyy HH:mm:ss ZZZ"}')
    .option("awsAccessKey", ACCESS_KEY)
    # Pass the raw secret, matching the user stream reader: a percent-encoded
    # copy differs from the real key whenever it contains "/", "+" or "=".
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Keep the record payload as a string, plus the arrival timestamp and its date.
df_pin_stream = (
    kinesis_pin
    .selectExpr("cast (data as STRING) jsonData", "approximateArrivalTimestamp")
    .withColumn("approximateArrivalDate", to_date(col("approximateArrivalTimestamp")))
)

# JSON keys to extract; each output column is named after its key.
_pin_fields = [
    "index", "unique_id", "title", "description", "follower_count",
    "poster_name", "tag_list", "is_image_or_video", "image_src",
    "save_location", "category", "downloaded",
]
df_pin = df_pin_stream.select(json_tuple('jsonData', *_pin_fields).alias(*_pin_fields))

#---------------------------------------------------------------------------------------------------------------
# df_geo
# Read the "geo" Kinesis stream and unpack its JSON payload into one column
# per field.
KINESIS_STREAM_NAME = "streaming-12f7a43505b1-geo"
kinesis_geo = (
    spark.readStream
    .format("kinesis")
    .option("streamName", KINESIS_STREAM_NAME)
    .option("region", KINESIS_REGION)
    .option("initialPosition", '{"at_timestamp": "05/23/2023 00:00:00 GMT", "format": "MM/dd/yyyy HH:mm:ss ZZZ"}')
    .option("awsAccessKey", ACCESS_KEY)
    # Pass the raw secret, matching the user stream reader: a percent-encoded
    # copy differs from the real key whenever it contains "/", "+" or "=".
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Keep the record payload as a string, plus the arrival timestamp and its date.
df_geo_stream = (
    kinesis_geo
    .selectExpr("cast (data as STRING) jsonData", "approximateArrivalTimestamp")
    .withColumn("approximateArrivalDate", to_date(col("approximateArrivalTimestamp")))
)

# JSON keys to extract; each output column is named after its key.
_geo_fields = ['ind', 'country', 'latitude', 'longitude', 'timestamp']
df_geo = df_geo_stream.select(json_tuple('jsonData', *_geo_fields).alias(*_geo_fields))

#----------------------------------------------------------------------------------------------------------------
# df_user
# Read the "user" Kinesis stream and unpack its JSON payload into one column
# per field.
KINESIS_STREAM_NAME = "streaming-12f7a43505b1-user"
kinesis_user = (
    spark.readStream
    .format("kinesis")
    .option("streamName", KINESIS_STREAM_NAME)
    .option("region", KINESIS_REGION)
    .option("initialPosition", '{"at_timestamp": "05/23/2023 00:00:00 GMT", "format": "MM/dd/yyyy HH:mm:ss ZZZ"}')
    .option("awsAccessKey", ACCESS_KEY)
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Keep the record payload as a string, plus the arrival timestamp and its date.
df_user_stream = (
    kinesis_user
    .selectExpr("cast (data as STRING) jsonData", "approximateArrivalTimestamp")
    .withColumn("approximateArrivalDate", to_date(col("approximateArrivalTimestamp")))
)

# JSON keys to extract; each output column is named after its key.
_user_fields = ['ind', "first_name", "last_name", 'age', 'date_joined']
df_user = df_user_stream.select(json_tuple('jsonData', *_user_fields).alias(*_user_fields))

In [0]:
# DATA CLEANING
# df_pin
# De-duplicate the pin records, rename `index` to `ind`, convert the
# abbreviated follower counts to plain integers, cast the numeric columns,
# strip the "Local save in" prefix from `save_location` and drop rows whose
# columns are all null.
df_pin = (
    df_pin
    .dropDuplicates()
    .withColumnRenamed('index', 'ind')
    .select(
        "ind", "unique_id", "title", "description", "follower_count", "poster_name",
        "tag_list", "is_image_or_video", "image_src", "save_location", "category", "downloaded",
    )
    # Anchor on the trailing unit suffix. The previous patterns ('[%k]',
    # '[%M]') were character classes that also rewrote any '%' in the value.
    .withColumn('follower_count', regexp_replace('follower_count', 'k$', '000'))
    .withColumn('follower_count', regexp_replace('follower_count', 'M$', '000000'))
    # Match the placeholder text literally; the previous character class
    # removed each of its letters individually wherever they appeared.
    .withColumn('follower_count', regexp_replace('follower_count', 'User Info Error', ''))
    .withColumn('ind', col('ind').cast(IntegerType()))
    .withColumn('downloaded', col('downloaded').cast(IntegerType()))
    .withColumn('follower_count', col('follower_count').cast(IntegerType()))
    .withColumn('save_location', regexp_replace('save_location', 'Local save in *', ''))
    # na.drop returns a new DataFrame, so it has to be part of the assignment;
    # as a bare statement its result was discarded.
    .na.drop(how="all")
)

#------------------------------------------------------------------------------------------------------------
# df_geo
# De-duplicate the geo records, cast the id, timestamp and coordinates to
# typed columns, and merge latitude/longitude into one `coordinates` array.
df_geo = (
    df_geo
    .dropDuplicates()
    .withColumn('ind', col('ind').cast(IntegerType()))
    .withColumn("timestamp", col("timestamp").cast(TimestampType()))
    .withColumn('longitude', col('longitude').cast(DoubleType()))
    .withColumn('latitude', col('latitude').cast(DoubleType()))
    # Array order is [latitude, longitude].
    .withColumn("coordinates", array(col("latitude"), col("longitude")))
    .drop('latitude', 'longitude')
    .select("ind", "country", "coordinates", "timestamp")
)

#----------------------------------------------------------------------------------------------------------------
# df_user
# De-duplicate the user records, merge first/last name into `user_name`, and
# cast `date_joined` and `age` to typed columns.
df_user = (
    df_user
    .dropDuplicates()
    .withColumn("user_name", concat(col("first_name"), lit(" "), col("last_name")))
    .drop("first_name", "last_name")
    .withColumn('date_joined', col('date_joined').cast(TimestampType()))
    # `age` was previously cast to TimestampType; cast it to an integer instead.
    .withColumn('age', col('age').cast(IntegerType()))
    .select("ind", "user_name", "age", "date_joined")
)

In [0]:
# CREATE DELTA TABLE
# Start one streaming write per cleaned DataFrame, each into its own Delta
# table with its own checkpoint location.
# NOTE(review): every checkpoint path has the same directory name as its
# target table under dbfs:/user/hive/warehouse, presumably the table's own
# storage directory; confirm that keeping checkpoint files there is intended.

# df_pin
(
    df_pin.writeStream
    .format("delta")
    .option("checkpointLocation", "dbfs:/user/hive/warehouse/12f7a43505b1_pin_table")
    .table('12f7a43505b1_pin_table')
)

# df_geo
(
    df_geo.writeStream
    .format("delta")
    .option("checkpointLocation", "dbfs:/user/hive/warehouse/12f7a43505b1_geo_table")
    .table('12f7a43505b1_geo_table')
)

# df_user
(
    df_user.writeStream
    .format("delta")
    .option("checkpointLocation", "dbfs:/user/hive/warehouse/12f7a43505b1_user_table")
    .table('12f7a43505b1_user_table')
)