## MOUNTING S3 ON DATABRICKS FILE SYSTEM

In [None]:
# pyspark functions
from pyspark.sql.functions import *
from pyspark.sql.types import *
# URL processing
import urllib

In [None]:
# Location of the Delta table that holds the AWS credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the Delta table at that location into a Spark DataFrame
access_key_df = spark.read.load(delta_table_path, format="delta")

In [None]:
# Retrieve the access key ID and secret access key from the credentials DataFrame.
# `import urllib` alone does not guarantee that the `parse` submodule is loaded,
# so import it explicitly before calling urllib.parse.quote.
import urllib.parse

# Fetch both values with a single Spark action instead of one collect() per column.
credentials_row = access_key_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    raise ValueError(f"No credentials found in Delta table at {delta_table_path}")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# Percent-encode every reserved character (safe="") so the secret can be embedded in a URI.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")
aws_bucket_name = "user-0e4753f224a7-bucket"
mount_name = "pinterest_data"

In [None]:
# Mount the S3 bucket on DBFS and list its contents.
mount_point = f"/mnt/{mount_name}"

# dbutils.fs.mount fails when the mount point is already in use, so only mount
# when it is not present; this keeps the cell safe to re-run.
if not any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(f"s3a://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{aws_bucket_name}", mount_point)

display(dbutils.fs.ls(mount_point))

## Cleaning the pin data

In [None]:
def cleaning_of_pin_data(data):
    """Clean the raw pin DataFrame and return it in the prescribed column order.

    The cleaning renames 'index' to 'ind', converts 'follower_count' to an
    integer (expanding a trailing 'k' or 'M' suffix to thousands or millions),
    replaces non-alphanumeric characters in 'poster_name' with spaces, casts
    'ind' to an integer, drops rows whose 'poster_name' is 'User Info Error'
    or whose 'follower_count' could not be parsed, and shortens 'save_location'.

    Args:
        data: Spark DataFrame containing the columns 'index', 'unique_id',
            'title', 'description', 'follower_count', 'poster_name',
            'tag_list', 'is_image_or_video', 'image_src', 'save_location'
            and 'category'.

    Returns:
        Spark DataFrame with the columns 'ind', 'unique_id', 'title',
        'description', 'follower_count', 'poster_name', 'tag_list',
        'is_image_or_video', 'image_src', 'save_location', 'category'.
    """
    pin_columns = [
        'ind', 'unique_id', 'title', 'description', 'follower_count',
        'poster_name', 'tag_list', 'is_image_or_video', 'image_src',
        'save_location', 'category',
    ]

    # Expand abbreviated counts: "10k" -> 10000, "1.5M" -> 1500000. Simply
    # stripping the suffix would silently shrink the value by that factor.
    follower_text = trim(col('follower_count').cast('string'))
    suffix_scale = (
        when(follower_text.endswith('k'), 1000)
        .when(follower_text.endswith('M'), 1000000)
        .otherwise(1)
    )
    # Decimal arithmetic keeps fractional values such as "2.3k" exact; values
    # that are not numeric become null and are filtered out below.
    follower_number = regexp_replace(follower_text, '[kM]$', '').cast('decimal(20,3)')

    df_pin_batch = (
        data.withColumnRenamed('index', 'ind')
        .select(pin_columns)
        .withColumn('follower_count', (follower_number * suffix_scale).cast(IntegerType()))
        # Replace every non-alphanumeric character in 'poster_name' with a space
        .withColumn('poster_name', regexp_replace(col('poster_name'), '[^a-zA-Z0-9]', " "))
        .withColumn('ind', col('ind').cast(IntegerType()))
        # Keep only rows with a valid poster and a parseable follower count
        .filter((col('poster_name') != 'User Info Error') & (col('follower_count').isNotNull()))
        # Keep at most 100 characters starting at 1-based position 14.
        # NOTE(review): presumably this strips a fixed prefix from the path;
        # confirm the offset against the actual 'save_location' values.
        .withColumn('save_location', col('save_location').substr(14, 100))
    )

    # Replacing columns in place preserves the order set by the select above,
    # so no second reordering select is needed.
    return df_pin_batch

## Cleaning the geo data

In [None]:
def cleaning_of_geo_data(data):
    """Clean the raw geolocation DataFrame.

    Combines 'latitude' and 'longitude' into a single 'coordinates' array
    column, parses 'timestamp' into a timestamp type, renames 'index' to 'ind'
    and casts it to an integer.

    Args:
        data: Spark DataFrame containing the columns 'index', 'country',
            'latitude', 'longitude' and 'timestamp'.

    Returns:
        Spark DataFrame with the columns 'ind', 'country', 'coordinates',
        'timestamp'.
    """
    # Create 'coordinates' column as an array of 'latitude' and 'longitude'
    df_geo1 = data.withColumn('coordinates', array(col('latitude'), col('longitude')))

    # Drop 'latitude' and 'longitude' columns now that they live in 'coordinates'
    df_geo1 = df_geo1.drop('latitude', 'longitude')

    # Convert 'timestamp' column to timestamp type
    df_geo1 = df_geo1.withColumn("timestamp", to_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss"))

    # Rename 'index' column to 'ind'
    df_geo1 = df_geo1.withColumnRenamed("index", 'ind')

    # Cast 'ind' column to IntegerType
    df_geo1 = df_geo1.withColumn('ind', col('ind').cast(IntegerType()))

    # Select the output columns in the prescribed order
    df_geo_batch = df_geo1.select('ind', 'country', 'coordinates', 'timestamp')
    return df_geo_batch


## Cleaning the user data

In [None]:
def cleaning_of_user_data(data):
    """Clean the raw user DataFrame.

    Builds 'user_name' from 'first_name' and 'last_name' (separated by a
    space), converts 'date_joined' to a date, renames 'index' to 'ind', and
    casts 'ind' and 'age' to integers.

    Args:
        data: Spark DataFrame containing the columns 'index', 'first_name',
            'last_name', 'age' and 'date_joined'.

    Returns:
        Spark DataFrame with the columns 'ind', 'user_name', 'age',
        'date_joined'.
    """
    int_type = IntegerType()
    full_name = concat(data['first_name'], lit(' '), data['last_name'])

    cleaned_users = (
        data
        # Merge the two name columns into one and discard the originals
        .withColumn('user_name', full_name)
        .drop('first_name', 'last_name')
        # Format as "yyyy-MM-dd" text, then convert that text to a date
        .withColumn('date_joined', to_date(date_format(col('date_joined'), "yyyy-MM-dd")))
        # Rename the index column and make the numeric columns integers
        .withColumnRenamed('index', 'ind')
        .withColumn('ind', col('ind').cast(int_type))
        .withColumn('age', col('age').cast(int_type))
    )

    # Keep only the output columns, in the prescribed order
    return cleaned_users.select('ind', 'user_name', 'age', 'date_joined')

## Reading data (pin, geo, and user) from DBFS and cleaning

In [None]:
# Root of the topic folders inside the mounted bucket. All three reads use the
# same absolute DBFS location so none of them depends on how a relative path
# would be resolved.
topics_root = 'dbfs:/mnt/pinterest_data/topics'

# Read the pin JSON files from DBFS
df_pin = spark.read.json(f'{topics_root}/0e4753f224a7.pin/partition=0/*.json', multiLine=True)

# Read the geo JSON files from DBFS
df_geo = spark.read.json(f'{topics_root}/0e4753f224a7.geo/partition=0/*.json', multiLine=True)

# Read the user JSON files from DBFS
df_user = spark.read.json(f'{topics_root}/0e4753f224a7.user/partition=0/*.json', multiLine=True)

# Clean each of the three datasets with its dedicated function
df_pin_batch = cleaning_of_pin_data(df_pin)
df_geo_batch = cleaning_of_geo_data(df_geo)
df_user_batch = cleaning_of_user_data(df_user)