In [0]:
from pyspark.sql.functions import *
import urllib

In [0]:
def read_S3(loc, file_type="json", infer_schema="true"):
    """Load the data stored at ``loc`` into a Spark DataFrame.

    Parameters
    ----------
    loc : str
        Path handed to the reader's ``load`` call.
    file_type : str, optional
        Data source format passed to ``spark.read.format``. Defaults to
        ``"json"``.
    infer_schema : str, optional
        Value set for the reader's ``inferSchema`` option. Defaults to
        ``"true"``.

    Returns
    -------
    The DataFrame produced by the reader.

    Notes
    -----
    Relies on a ``spark`` session object that is not defined in this
    function -- presumably injected by the notebook runtime; confirm before
    reusing this helper outside a notebook.
    """
    return (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .load(loc)
    )

In [0]:
def clean_pin_data(df):
    """Clean the raw pin DataFrame and return it with a fixed column order.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Replace known placeholder strings with nulls.
      3. Convert ``follower_count`` (strings with an optional trailing
         ``k`` / ``M`` suffix) to an integer count.
      4. Cast ``downloaded`` and ``index`` to int.
      5. Strip the ``"Local save in "`` prefix from ``save_location``.
      6. Rename ``index`` to ``ind`` and project the output columns.

    Parameters
    ----------
    df : DataFrame
        Must contain every column referenced below (``follower_count``,
        ``poster_name``, ``description``, ``tag_list``, ``title``,
        ``image_src``, ``downloaded``, ``index``, ``save_location``,
        ``unique_id``, ``is_image_or_video``, ``category``).

    Returns
    -------
    A new DataFrame with the columns ``ind, unique_id, title, description,
    follower_count, poster_name, tag_list, is_image_or_video, image_src,
    save_location, category``.
    """
    ## Drop duplicates
    cleaned_df = df.dropDuplicates()

    ## Replace empty entries
    placeholder_rules = [
        ("User Info Error", ["follower_count", "poster_name"]),
        ("No description available Story format", ["description"]),
        ("N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e", ["tag_list"]),
        ("No Title Data Available", ["title"]),
        ("Image src error.", ["image_src"]),
    ]
    for placeholder, columns in placeholder_rules:
        cleaned_df = cleaned_df.replace({placeholder: None}, subset=columns)

    ## Convert follower_count to an integer
    # Scale by the suffix arithmetically instead of substituting zeros into
    # the text: a textual substitution turns "1.5k" into "1.5000", which
    # casts to 1 rather than 1500.
    raw_count = trim(col("follower_count").cast("string"))
    multiplier = (
        when(raw_count.endswith("k"), lit(1000))
        .when(raw_count.endswith("M"), lit(1000000))
        .otherwise(lit(1))
    )
    # Decimal keeps the multiplication exact (no binary floating-point error
    # before the truncating int cast).
    mantissa = regexp_replace(raw_count, "[kM]$", "").cast("decimal(20,3)")
    cleaned_df = cleaned_df.withColumn("follower_count", (mantissa * multiplier).cast("int"))

    ## Convert integer columns to int
    # NOTE: "downloaded" is cast here but is not part of the final projection.
    cleaned_df = cleaned_df.withColumn("downloaded", col("downloaded").cast("int"))
    cleaned_df = cleaned_df.withColumn("index", col("index").cast("int"))

    ## Clean save_location path
    cleaned_df = cleaned_df.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))

    ## Rename index column
    cleaned_df = cleaned_df.withColumnRenamed("index", "ind")

    ## Reorder columns
    cleaned_df = cleaned_df.select("ind",
                                    "unique_id",
                                    "title",
                                    "description",
                                    "follower_count",
                                    "poster_name",
                                    "tag_list",
                                    "is_image_or_video",
                                    "image_src",
                                    "save_location",
                                    "category")

    return cleaned_df

In [0]:
def clean_geo_data(df):
    """Clean the raw geolocation DataFrame.

    Drops duplicate rows, combines ``latitude`` and ``longitude`` into a
    single ``coordinates`` array column (latitude first), removes the two
    source columns, converts ``timestamp`` with ``to_timestamp``, and
    returns the columns ``ind, country, coordinates, timestamp`` in that
    order.
    """
    output_columns = ["ind", "country", "coordinates", "timestamp"]

    return (
        df.dropDuplicates()
        ## Merge the two coordinate columns, then discard the originals
        .withColumn("coordinates", array("latitude", "longitude"))
        .drop("latitude", "longitude")
        ## String -> timestamp
        .withColumn("timestamp", to_timestamp("timestamp"))
        ## Final column order
        .select(*output_columns)
    )

In [0]:
def clean_user_data(df):
    """Clean the raw user DataFrame.

    Drops duplicate rows, builds ``user_name`` by concatenating
    ``first_name`` and ``last_name`` (no separator is inserted), removes the
    two source columns, converts ``date_joined`` with ``to_timestamp``, and
    returns the columns ``ind, user_name, age, date_joined`` in that order.
    """
    output_columns = ["ind", "user_name", "age", "date_joined"]

    return (
        df.dropDuplicates()
        ## Build user_name, then discard the separate name columns
        .withColumn("user_name", concat("first_name", "last_name"))
        .drop("first_name", "last_name")
        ## String -> timestamp
        .withColumn("date_joined", to_timestamp("date_joined"))
        ## Final column order
        .select(*output_columns)
    )