In [None]:
import urllib
import urllib.parse

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [None]:
# Format settings for the credentials file: a comma-delimited CSV whose first
# row is the header.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Read the credentials CSV into a Spark DataFrame.
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

# Fetch both keys for the 'databricks-user' row in a single Spark job instead
# of running the same filter twice.
credential_rows = (
    aws_keys_df
    .where(col("User name") == "databricks-user")
    .select("Access key ID", "Secret access key")
    .take(1)
)
if not credential_rows:
    # Same exception type as indexing an empty collect() result, but with a
    # message that says what is actually missing.
    raise IndexError(
        "No row with User name 'databricks-user' found in "
        "/FileStore/tables/authentication_credentials.csv"
    )

ACCESS_KEY = credential_rows[0]["Access key ID"]
SECRET_KEY = credential_rows[0]["Secret access key"]

# URL-encode the secret key (safe="" also escapes '/').
# urllib.parse must be imported explicitly; a bare `import urllib` does not
# guarantee the submodule is loaded.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")


In [None]:
def read_kinesis_stream(stream_name, region='us-east-1', initial_position='earliest'):
    """Return a streaming DataFrame that reads from one Kinesis stream.

    Args:
        stream_name: Name of the Kinesis stream to subscribe to.
        region: AWS region passed to the Kinesis source.
        initial_position: Value for the source's 'initialPosition' option.

    Returns:
        The DataFrame produced by ``spark.readStream...load()``.

    Authenticates with the notebook-level ACCESS_KEY / SECRET_KEY values.
    """
    return (
        spark.readStream
        .format('kinesis')
        .option('streamName', stream_name)
        .option('initialPosition', initial_position)
        .option('region', region)
        .option('awsAccessKey', ACCESS_KEY)
        .option('awsSecretKey', SECRET_KEY)
        .load()
    )


# One streaming reader per source stream; all three share the same options.
df_pin = read_kinesis_stream('streaming-12853887c065-pin')    # pin posts
df_geo = read_kinesis_stream('streaming-12853887c065-geo')    # geolocation
df_user = read_kinesis_stream('streaming-12853887c065-user')  # users


In [None]:
# Preview the pin stream as read from Kinesis, before its payload is parsed.
df_pin.display(10)

In [None]:
# Keep only the `data` column of each stream and cast it to a string so that
# from_json can parse it.
df_pin = df_pin.selectExpr("CAST(data as STRING)")
df_geo = df_geo.selectExpr("CAST(data as STRING)")
df_user = df_user.selectExpr("CAST(data as STRING)")

# --- Schemas for the JSON payloads (every field is nullable) ---

# Pin schema. follower_count is read as a string and converted in a later step.
df_pin_schema = (
    StructType()
    .add("index", IntegerType(), True)
    .add("unique_id", StringType(), True)
    .add("title", StringType(), True)
    .add("follower_count", StringType(), True)
    .add("poster_name", StringType(), True)
    .add("tag_list", StringType(), True)
    .add("is_image_or_video", StringType(), True)
    .add("image_src", StringType(), True)
    .add("save_location", StringType(), True)
    .add("category", StringType(), True)
    .add("downloaded", IntegerType(), True)
    .add("description", StringType(), True)
)

# Geolocation schema.
df_geo_schema = (
    StructType()
    .add("ind", IntegerType(), True)
    .add("country", StringType(), True)
    .add("latitude", StringType(), True)
    .add("longitude", StringType(), True)
    .add("timestamp", StringType(), True)
)

# User schema.
df_user_schema = (
    StructType()
    .add("ind", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("age", StringType(), True)
    .add("date_joined", StringType(), True)
)

# Parse each JSON string into a struct with the matching schema, then expand
# the struct's fields into top-level columns.
df_pin = df_pin.withColumn("data", from_json(col("data"), df_pin_schema)).select("data.*")
df_geo = df_geo.withColumn('data', from_json(col('data'), df_geo_schema)).select('data.*')
df_user = df_user.withColumn('data', from_json(col('data'), df_user_schema)).select('data.*')

In [None]:
# Convert follower_count (a string, possibly with a 'k' or 'M' suffix) to an
# integer. The numeric part is parsed as an exact decimal *before* scaling so
# fractional values keep their precision ("1.5k" -> 1500); casting the stripped
# string straight to int would truncate it to 1 first. Values that cannot be
# parsed become null.
follower_raw = col("follower_count")
df_pin = df_pin.withColumn(
    "follower_count",
    when(
        follower_raw.contains("k"),
        regexp_replace(follower_raw, "k", "").cast("decimal(18,3)") * 1000,
    )
    .when(
        follower_raw.contains("M"),
        regexp_replace(follower_raw, "M", "").cast("decimal(18,3)") * 1000000,
    )
    .otherwise(follower_raw.cast("decimal(18,3)"))
    .cast("int"),
)

# `index` and `downloaded` are already declared as integers in the pin schema,
# so no further cast is required for them.

# Replace nulls with 0 (fillna with a numeric value only affects numeric columns).
df_pin = df_pin.fillna(0)

# Keep only the path from save_location: the 4th space-separated token,
# e.g. '/data/mens-fashion'.
df_pin = df_pin.withColumn("save_location", split(col("save_location"), " ").getItem(3))

# Rename the index column to ind.
df_pin = df_pin.withColumnRenamed("index", "ind")

# Final column order; columns not listed here (e.g. downloaded) are dropped.
df_pin_header = [
    'ind',
    'unique_id',
    'title',
    'description',
    'follower_count',
    'poster_name',
    'tag_list',
    'is_image_or_video',
    'image_src',
    'save_location',
    'category',
]

df_pin = df_pin.select(df_pin_header)

df_pin.select('ind', 'follower_count', 'save_location').display(10)

ind,follower_count,save_location
7528,0,/data/mens-fashion
7528,0,/data/mens-fashion
2863,124000,/data/diy-and-crafts
5730,0,/data/finance
8304,51000,/data/quotes
8731,211000,/data/tattoos
1313,43000,/data/beauty
4315,25000,/data/education
10794,437,/data/vehicles
5494,26000,/data/finance


In [None]:
# Target column order for the geolocation data. latitude and longitude are
# left out because they are folded into `coordinates`.
df_geo_header = ['ind', 'country', 'coordinates', 'timestamp']

df_geo = (
    df_geo
    # Combine latitude and longitude into one array column.
    .withColumn('coordinates', array('latitude', 'longitude'))
    # Convert the timestamp column from a string to a timestamp data type.
    .withColumn('timestamp', col('timestamp').cast('timestamp'))
    # Reorder the columns; this also drops latitude and longitude.
    .select(df_geo_header)
)

df_geo.display(10)

ind,country,coordinates,timestamp
7528,Albania,"List(-89.9787, -173.293)",2020-08-28T03:52:47.000+0000
2863,Armenia,"List(-5.34445, -177.924)",2020-04-27T13:34:16.000+0000
5730,Colombia,"List(-77.015, -101.437)",2021-04-19T17:37:03.000+0000
8304,French Guiana,"List(-28.8852, -164.87)",2019-09-13T04:50:29.000+0000
8731,Aruba,"List(-83.104, -171.302)",2020-07-17T04:39:09.000+0000
1313,Maldives,"List(77.0447, 61.9119)",2018-06-26T02:39:25.000+0000
4315,Cote d'Ivoire,"List(-45.8508, 66.1003)",2019-12-15T03:51:28.000+0000
10794,Cocos (Keeling) Islands,"List(-89.5236, -154.567)",2022-01-01T02:26:50.000+0000
5494,Bulgaria,"List(-82.6768, -129.202)",2021-07-21T02:02:35.000+0000
5069,Azerbaijan,"List(-63.0063, -157.474)",2021-03-20T09:32:44.000+0000


In [None]:
# Target column order for the user data. first_name and last_name are left out
# because they are merged into `user_name`.
df_user_header = ['ind', 'user_name', 'age', 'date_joined']

df_user = (
    df_user
    # Build user_name from first_name followed by last_name.
    # NOTE(review): concat() inserts no separator (e.g. "AbigailAli") - confirm
    # that this is the intended format.
    .withColumn('user_name', concat('first_name', 'last_name'))
    # Convert the date_joined column from a string to a timestamp data type.
    .withColumn('date_joined', col('date_joined').cast('timestamp'))
    # Reorder the columns; this also drops first_name and last_name.
    .select(df_user_header)
)

df_user.display(10)

ind,user_name,age,date_joined
7528,AbigailAli,20,2015-10-24T11:23:51.000+0000
2863,DylanHolmes,32,2016-10-23T14:06:51.000+0000
5730,RachelDavis,36,2015-12-08T20:02:43.000+0000
8304,CharlesBerry,25,2015-12-28T04:21:39.000+0000
8731,AndreaAlexander,21,2015-11-10T09:27:42.000+0000
1313,BrittanyJones,32,2016-04-02T03:51:23.000+0000
4315,MichellePrince,36,2015-12-20T16:38:13.000+0000
10794,ThomasTurner,34,2016-12-22T00:02:02.000+0000
5494,AnneAllen,27,2015-12-16T15:20:05.000+0000
5069,AmandaBall,25,2016-01-13T17:36:30.000+0000


In [None]:
# Find the most popular Pinterest category people post to based on their country.

# Join pins with geolocation on the shared key. Joining on the column *name*
# keeps a single `ind` column in the result; an expression join
# (df_pin.ind == df_geo.ind) would keep both and make `ind` ambiguous.
df_pin_geo = df_pin.join(df_geo, on='ind', how='inner')

# Count posts per (country, category), most frequent first. The result is bound
# to a name so that it can be displayed or written out later.
df_category_by_country = (
    df_pin_geo
    .groupBy('country', 'category')
    .agg(count('category').alias('category_count'))
    .sort('category_count', ascending=False)
)

df_category_by_country

In [None]:
# Root folder holding the checkpoints of all three streaming writes.
CHECKPOINT_ROOT = "/tmp/kinesis/_checkpoints/"

# Remove the checkpoint folder first (recursive delete), so the queries start
# without any previously saved progress.
dbutils.fs.rm(CHECKPOINT_ROOT, True)

# Target Delta table -> cleaned streaming DataFrame.
stream_targets = {
    "12853887c065_pin_table": df_pin,
    "12853887c065_geo_table": df_geo,
    "12853887c065_user_table": df_user,
}

for table_name, stream_df in stream_targets.items():
    # Every streaming query needs its OWN checkpoint directory; sharing one
    # location between queries mixes up their progress metadata. Each table
    # therefore gets a dedicated subfolder under the common root.
    (
        stream_df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", f"{CHECKPOINT_ROOT}{table_name}/")
        .table(table_name)
    )