# Clean the Pin Data 
1. Initialize and drop duplicates
2. Replace entries with no relevant data in each column with nulls
3. Transform the follower_count column so that every entry is a number and the column's data type is integer
4. Clean the data in the save_location column to include only the save location path
5. Rename the index column to ind
6. Reorder the dataframe columns
7. Copy the cleaned data to a global temporary view

## Initialize notebook and drop duplicates

In [0]:
%run "./Cleaning Utils"

In [0]:
# Initialize the working dataframe by calling init_cleaner with the "pin" key.
# NOTE(review): init_cleaner is not defined in this notebook — presumably it is
# provided by the "./Cleaning Utils" notebook run above and is where duplicates
# get dropped (per the section heading); confirm there.
df_pin_clean = init_cleaner("pin")

## Replace irrelevant column data with nulls

In [0]:
# Placeholder strings that stand in for "no data" in the raw pin records.
list_values_to_replace = [
    "No description available Story format",
    "User Info Error",
    "Image src error.",
    "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    "No Title Data Available",
]

# Columns in which those placeholders should be nulled out.
list_of_columns = [
    "description",
    "follower_count",
    "image_src",
    "poster_name",
    "tag_list",
    "title",
]

# Swap every exact placeholder match in the listed columns for null; values in
# any other column are left untouched.
df_pin_clean = df_pin_clean.replace(
    to_replace=list_values_to_replace,
    value=None,
    subset=list_of_columns,
)

## Convert follower_count to integer

In [0]:
# regexp_replace is no longer needed in this cell, but the import is kept
# because the save_location cell further down relies on it.
from pyspark.sql.functions import regexp_replace
from pyspark.sql import functions as F

# Magnitude suffix -> the zeros it stands for ("k" = thousand, "M" = million).
dict_values_to_replace = {"k": "000", "M": "000000"}

# Accept only "<digits>[.<digits>][suffix]" as a whole-string match. The
# previous approach substituted the suffix letters anywhere in the string and
# turned fractional entries such as "1.5k" into "1.5000", which then cast to 1
# instead of 1500.
suffix_class = "".join(dict_values_to_replace)
follower_pattern = rf"^([0-9]+(?:\.[0-9]+)?)([{suffix_class}]?)$"
follower_text = F.trim(F.col("follower_count"))
# regexp_extract yields "" when the pattern does not match and null for null input.
follower_number = F.regexp_extract(follower_text, follower_pattern, 1)
follower_suffix = F.regexp_extract(follower_text, follower_pattern, 2)

# Numeric multiplier for each row's suffix; rows without a suffix keep 1.
multiplier = F.lit(1)
for value, replace in dict_values_to_replace.items():
    # "1" + "000" -> 1000, "1" + "000000" -> 1000000
    multiplier = F.when(follower_suffix == value, F.lit(int("1" + replace))).otherwise(multiplier)

# Multiply as an exact decimal (no binary floating-point rounding such as
# 2.3 * 1000 -> 2299.99...) and then convert to integer. Entries that are null
# or do not match the expected format become null rather than being cast.
df_pin_clean = df_pin_clean.withColumn(
    "follower_count",
    F.when(
        follower_number != "",
        (follower_number.cast("decimal(20,4)") * multiplier).cast("int"),
    ),
)

## Clean data in save_location

In [0]:
# Remove the text "Local save in " wherever it occurs in save_location,
# leaving the remainder of the value (the path) in place.
save_location_noise = "Local save in "
df_pin_clean = df_pin_clean.withColumn(
    "save_location",
    regexp_replace("save_location", save_location_noise, ""),
)

## Rename index to ind

In [0]:
# Rename the "index" column to "ind"; all other columns are unaffected.
df_pin_clean = df_pin_clean.withColumnRenamed(
    existing="index",
    new="ind",
)

## Reorder columns

In [0]:
# Final column order for the cleaned pin dataframe; columns not listed here
# are dropped by the select.
pin_column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
]
df_pin_clean = df_pin_clean.select(*pin_column_order)

# Preview the first five rows to eyeball the result.
df_pin_clean.limit(5).display()

## Copy cleaned dataframe to global temporary view

In [0]:
# Pass the cleaned dataframe to copy_to_gtv under the name "pin".
# NOTE(review): copy_to_gtv is not defined in this notebook — presumably it comes
# from "./Cleaning Utils" and registers a global temporary view; confirm there.
copy_to_gtv(df_pin_clean, "pin")