## MOUNTING S3 BUCKET ON DATABRICKS FILE SYSTEM

In [None]:
# pyspark functions
from pyspark.sql.functions import *
# URL processing
import urllib

In [None]:
# Location of the Delta table that holds the authentication credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the Delta table at that location into a Spark DataFrame
access_key_df = spark.read.load(delta_table_path, format="delta")

In [None]:
# `import urllib` on its own does not guarantee that the `urllib.parse`
# submodule is loaded, so import it explicitly before calling quote().
import urllib.parse

# Fetch both credential columns with a single Spark job instead of one
# collect() per column.
_credential_rows = access_key_df.select('Access key ID', 'Secret access key').collect()
if not _credential_rows:
    # Fail with a clear message rather than an IndexError on an empty table.
    raise ValueError("The authentication credentials table contains no rows")

ACCESS_KEY = _credential_rows[0]['Access key ID']  # Access Key ID from the first row.
SECRET_KEY = _credential_rows[0]['Secret access key']  # Secret Access Key from the first row.
# URL-encode the secret with no "safe" characters so that every reserved
# character (including "/") is percent-escaped before it is embedded in a URI.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
aws_bucket_name = "user-0e4753f224a7-bucket"  # AWS bucket name.
mounted_name = "pinterest_data_new"  # Name of the mounted directory.
mount_point = f"/mnt/s3_data/{mounted_name}"  # DBFS path the bucket is mounted at.
# NOTE(review): later cells in this notebook read from and write to
# /mnt/data/pinterest_data_new, not /mnt/s3_data/pinterest_data_new —
# confirm which mount point is intended.

# dbutils.fs.mount raises when the mount point is already in use, so only
# mount when it is not listed yet; this keeps the cell safe to re-run.
if not any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    # Mount the bucket using the access key and the URL-encoded secret key.
    dbutils.fs.mount(f"s3a://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{aws_bucket_name}", mount_point)

display(dbutils.fs.ls(mount_point))  # Show the contents of the mounted directory.




## CLEANING OF PIN DATASET

In [None]:
#Cleaning pin data
def Cleaning_of_pin_data(data):
    """Clean a raw pin DataFrame and return the cleaned result.

    The function renames ``Index`` to ``ind``, keeps a fixed set of columns,
    converts ``follower_count`` to an integer, replaces every
    non-alphanumeric character in ``poster_name`` and ``title`` with a
    space, drops rows whose poster is ``'User Info Error'`` or whose
    follower count cannot be parsed, and trims the start of
    ``save_location``.

    Unlike a plain removal of the ``k`` suffix, abbreviated counts are
    scaled: ``"10k"`` becomes ``10000`` and ``"3M"`` becomes ``3000000``.
    Rows with an ``M`` suffix are therefore kept instead of being dropped
    as unparseable.

    Args:
        data: Spark DataFrame holding the raw pin records.

    Returns:
        A new Spark DataFrame with the columns ``ind``, ``unique_id``,
        ``title``, ``description``, ``follower_count``, ``poster_name``,
        ``tag_list``, ``is_image_or_video``, ``image_src``,
        ``save_location`` and ``category``.
    """
    # Imported locally so the function does not rely on a wildcard import or
    # on a later notebook cell having imported IntegerType.
    from pyspark.sql.functions import col, regexp_replace, trim, when
    from pyspark.sql.functions import round as spark_round
    from pyspark.sql.types import IntegerType

    kept_columns = [
        'ind', 'unique_id', 'title', 'description', 'follower_count',
        'poster_name', 'tag_list', 'is_image_or_video', 'image_src',
        'save_location', 'category',
    ]
    pins = data.withColumnRenamed('Index', 'ind').select(kept_columns)

    # Scale abbreviated follower counts instead of just deleting the suffix.
    # Assumes a trailing 'k' means thousands and a trailing 'M' means
    # millions — TODO confirm against the source data.
    raw_count = trim(col('follower_count'))
    multiplier = (
        when(raw_count.endswith('k'), 1000)
        .when(raw_count.endswith('M'), 1000000)
        .otherwise(1)
    )
    numeric_part = regexp_replace(raw_count, '[kM]$', '').cast('double')
    # Round before the integer cast so floating-point noise in values such
    # as "1.1k" cannot truncate the result one below the intended number.
    pins = pins.withColumn(
        'follower_count',
        spark_round(numeric_part * multiplier).cast(IntegerType()),
    )

    # Replace anything that is not a letter or digit with a space.
    for text_column in ('poster_name', 'title'):
        pins = pins.withColumn(
            text_column, regexp_replace(col(text_column), '[^a-zA-Z0-9]', ' ')
        )

    pins = pins.withColumn('ind', col('ind').cast(IntegerType()))

    # Drop placeholder posters and rows whose follower count did not parse
    # (non-numeric text casts to null above).
    pins = pins.filter(
        (col('poster_name') != 'User Info Error') & (col('follower_count').isNotNull())
    )

    # Keep up to 100 characters of save_location starting at 1-based
    # position 14.
    # NOTE(review): presumably this strips a fixed prefix; if that prefix is
    # 14 characters long the result keeps its last character — confirm.
    df_pin_batch = pins.withColumn('save_location', col('save_location').substr(14, 100))
    return df_pin_batch


## CLEANING OF GEO DATASET

In [None]:
from pyspark.sql.functions import array, col, to_timestamp
from pyspark.sql.types import IntegerType

def cleaning_of_geo_data(data2):
    """Return a cleaned copy of the geo DataFrame.

    Combines ``latitude`` and ``longitude`` into a ``coordinates`` array,
    parses ``timestamp`` into a timestamp column, renames ``index`` to
    ``ind`` and casts it to an integer.

    Args:
        data2: Spark DataFrame holding the raw geo records.

    Returns:
        A Spark DataFrame with the columns ``ind``, ``country``,
        ``coordinates`` and ``timestamp``.
    """
    # Fold latitude and longitude into one array column, then discard them
    with_coordinates = data2.withColumn(
        'coordinates', array(col('latitude'), col('longitude'))
    ).drop('latitude', 'longitude')

    # Parse the timestamp text using the pattern yyyy-MM-dd'T'HH:mm:ss
    with_timestamp = with_coordinates.withColumn(
        "timestamp", to_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss")
    )

    # 'index' becomes an integer column named 'ind'
    indexed = with_timestamp.withColumnRenamed("index", 'ind')
    indexed = indexed.withColumn('ind', col('ind').cast(IntegerType()))

    # Fix the output column set and order
    return indexed.select('ind', 'country', 'coordinates', 'timestamp')

## CLEANING USER DATASET

In [None]:
from pyspark.sql.functions import concat, lit, date_format, to_date
from pyspark.sql.types import IntegerType

def cleaning_of_user_data(data3):
    """Return a cleaned copy of the user DataFrame.

    Joins ``first_name`` and ``last_name`` into ``user_name``, converts
    ``date_joined`` to a date, renames ``index`` to ``ind`` and casts
    ``ind`` and ``age`` to integers.

    Args:
        data3: Spark DataFrame holding the raw user records.

    Returns:
        A Spark DataFrame with the columns ``ind``, ``user_name``, ``age``
        and ``date_joined``.
    """
    # Build "first last" and remove the two source columns
    full_name = concat(data3['first_name'], lit(' '), data3['last_name'])
    users = data3.withColumn('user_name', full_name).drop('first_name', 'last_name')

    # Format date_joined as "yyyy-MM-dd" text, then parse that text to a date
    users = users.withColumn(
        'date_joined',
        to_date(date_format(users['date_joined'], "yyyy-MM-dd")),
    )

    # The rename must happen before 'ind' can be cast
    users = users.withColumnRenamed('index', 'ind')
    for int_column in ('ind', 'age'):
        users = users.withColumn(int_column, users[int_column].cast(IntegerType()))

    # Fix the output column set and order
    return users.select('ind', 'user_name', 'age', 'date_joined')


## Reading the data from DBFS and calling the functions to clean the data

In [None]:
# Read the geo, user and pin JSON files (in that order) with multiLine
# parsing enabled.
# NOTE(review): the mount cell in this notebook mounts the bucket under
# /mnt/s3_data/pinterest_data_new, while these paths use
# /mnt/data/pinterest_data_new — confirm the paths point at the mounted data.
df_geo_batch, df_user_batch, df_pin_batch = (
    spark.read.json(json_glob, multiLine=True)
    for json_glob in (
        'dbfs:/mnt/data/pinterest_data_new/topics/0e4753f224a7.geo/partition=0/*.json',
        'dbfs:/mnt/data/pinterest_data_new/topics/0e4753f224a7.user/partition=0/*.json',
        'dbfs:/mnt/data/pinterest_data_new/topics/0e4753f224a7.pin/partition=0/*.json',
    )
)

# Run each raw DataFrame through its cleaning function
df_pin_batch_clean = Cleaning_of_pin_data(data=df_pin_batch)
df_geo_batch_clean = cleaning_of_geo_data(data2=df_geo_batch)
df_user_batch_clean = cleaning_of_user_data(data3=df_user_batch)

## Writing the cleaned data to parquet table

In [None]:
# Persist each cleaned DataFrame as parquet in its own directory.
# No save mode is set, so Spark's default mode applies to existing targets.
for cleaned_df, parquet_dir in (
    (df_pin_batch_clean, '/mnt/data/pinterest_data_new/batch_datapin1/'),
    (df_geo_batch_clean, '/mnt/data/pinterest_data_new/batch_datageo1/'),
    (df_user_batch_clean, '/mnt/data/pinterest_data_new/batch_datauser1/'),
):
    cleaned_df.write.format('parquet').save(parquet_dir)
