## WHAT WE FOUND WHILE EXPLORING THE DATA:

- We need to apply a proper schema
- The date column needs fixing
- We need to extract/remove usernames
- We need to extract hashtags and replace them with their word equivalent
- We need to remove URLs, as our algorithm won't need or use them
- The same goes for email addresses
- Symbols stored in HTML notation do not appear properly unescaped (example: &lt;3 and s&amp;^t)
- Unwanted characters are present; perhaps the encoding is ‘broken’? (example: �����ߧ�ǿ�����ж�؜���)


In [56]:
import html
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Create (or reuse) the Spark session used by this cleaning job.
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()

# Pandas display settings for inspecting samples in the notebook:
# show every column, up to 250 rows, and up to 150 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_rows = 250
pd.options.display.max_colwidth = 150

# Explicit DDL schema applied when reading the raw files, instead of
# relying on schema inference.
schema = "polarity FLOAT, id LONG, date_time TIMESTAMP, query STRING, user STRING, text STRING"
# Pattern used to parse the raw date column into a TIMESTAMP.
# NOTE(review): Spark 3.x documents 'E' as a formatting-only symbol for the
# default parser; confirm the Spark version or the legacy time parser policy.
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"

# Input directory with the raw data and output directory for the cleaned data.
IN_PATH = "/home/jovyan/data-sets/sentiment-140-training-data/RAW"
OUT_PATH = "/home/jovyan/data-sets/sentiment-140-training-data/CLEAN"

# Reader pre-configured with the explicit schema above.
spark_reader = spark.read.schema(schema)


@f.udf
def html_unescape(s: str):
    """Decode HTML character references in ``s`` (e.g. ``&lt;`` -> ``<``).

    Values that are not ``str`` (such as ``None`` for a null cell) are
    returned unchanged.
    """
    return html.unescape(s) if isinstance(s, str) else s


def clean_data(df):
    """Return ``df`` with a cleaned ``text`` column.

    The untouched text is kept in a new ``original_text`` column. The
    ``text`` column then has URLs, email addresses and user mentions removed,
    HTML character references decoded, and ``#`` replaced by a space. Rows
    whose resulting text is the empty string are dropped.

    Relies on ``url_regex``, ``email_regex`` and ``user_regex`` being defined
    at module level (they are not defined in this block).

    Args:
        df: Spark DataFrame with a string column named ``text``.

    Returns:
        The transformed Spark DataFrame.
    """
    df = (
        df
        .withColumn("original_text", f.col("text"))
        .withColumn("text", f.regexp_replace(f.col("text"), url_regex, ""))
        .withColumn("text", f.regexp_replace(f.col("text"), email_regex, ""))
        .withColumn("text", f.regexp_replace(f.col("text"), user_regex, ""))
        # Decode HTML entities BEFORE stripping '#': numeric character
        # references such as "&#39;" contain a '#', and replacing it first
        # would turn them into "& 39;", which can no longer be unescaped.
        .withColumn("text", html_unescape(f.col("text")))
        # Turn hashtags into plain words by replacing the '#' marker.
        .withColumn("text", f.regexp_replace(f.col("text"), "#", " "))
        .filter("text != ''")
    )
    return df

# Read the raw CSV files with the explicit schema, parsing the date column
# with the custom timestamp pattern.
df_raw = spark_reader.csv(IN_PATH, timestampFormat=timestampformat) 
# Apply the text-cleaning steps.
df_clean = clean_data(df_raw)

# Write the cleaned data as Parquet, partitioned by polarity, replacing any
# existing output at OUT_PATH.
df_clean.write.partitionBy("polarity").parquet(OUT_PATH, mode="overwrite")
