In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

import pandas as pd
from IPython.core.display import display
import seaborn as sns


# One Spark session for the whole notebook (getOrCreate reuses a running one).
spark = SparkSession.builder.getOrCreate()

# General settings for display purposes: show every column and row, wide text cells
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 144
sns.set(color_codes=True)

# Source sentiment140: http://help.sentiment140.com/for-students/
# The files are read with header=False, so column names and types are declared here.
schema = "polarity FLOAT, id LONG, date_time STRING, query STRING, user STRING, text STRING"
spark_reader = spark.read.schema(schema)

# Options shared by both reads, kept in one place so the two loads cannot drift.
# `inferSchema` is deliberately not set: an explicit schema is supplied above,
# so Spark never runs schema inference for these reads.
# NOTE(review): `schema` has no `corrupt_data` column, so per the Spark CSV
# documentation malformed rows are not captured anywhere. Add
# `corrupt_data STRING` to the schema if they should be kept - confirm intent.
CSV_READ_OPTIONS = {
    "quote": '"',
    "header": False,
    "columnNameOfCorruptRecord": "corrupt_data",
}

# Directory that holds both input files and receives the raw output
DATA_DIR = "/home/jovyan/data-sets/sentiment-140-training-data"

# file 1: testdata.manual.2009.06.14.csv
TESTDATA_PATH = f"{DATA_DIR}/testdata.manual.2009.06.14.csv"
raw_test_data = spark_reader.csv(TESTDATA_PATH, **CSV_READ_OPTIONS).cache()

# file 2: training.1600000.processed.noemoticon.csv
TRAININGDATA_PATH = f"{DATA_DIR}/training.1600000.processed.noemoticon.csv"
raw_training_data = spark_reader.csv(TRAININGDATA_PATH, **CSV_READ_OPTIONS).cache()

# path that we will write our raw data to
OUTPUT_PATH = f"{DATA_DIR}/RAW"

# First look at the test data

In [None]:
# How many rows did we load from the test file?
test_row_count = raw_test_data.count()
print(f"Overall data count: {test_row_count}")

# Summary statistics, rendered as a pandas frame for nicer notebook output
test_summary = raw_test_data.summary()
display(test_summary.toPandas())

# Column names and types as Spark sees them
print("Data schema")
raw_test_data.printSchema()

# Peek at 50 rows of data
test_sample = raw_test_data.limit(50)
display(test_sample.toPandas())


# First look at the training data

In [None]:
# How many rows did we load from the training file?
training_row_count = raw_training_data.count()
print(f"Overall data count: {training_row_count}")

# Summary statistics, rendered as a pandas frame for nicer notebook output
training_summary = raw_training_data.summary()
display(training_summary.toPandas())

# Column names and types as Spark sees them
print("Data schema")
raw_training_data.printSchema()

# Peek at 50 rows of data
training_sample = raw_training_data.limit(50)
display(training_sample.toPandas())

Test data:
- 498 rows of test_data

Training data:
- 1600000 rows of training_data


### Initial Findings:
- We need to apply a proper schema
- The date column needs fixing
- We need to extract twitter user names/handles (we'll extract it and call the output column `users_mentioned`)
- We need to extract hashtags and replace them with the words from the hashtag (we'll extract it and call the output column `hashtags`)
- We need to detect URLs; our algorithm won't need or use them, so we'll simply remove them from the data
- The same goes for email addresses
- HTML entities do not appear to be unescaped, so we're going to have to fix that (example: `&lt;3` and `s&amp;^t`)
- Encoding seems to be 'broken' (example: `�����ߧ�ǿ�����ж�؜��� &lt;&lt;----I DID NOT KNOW I CUD or HOW TO DO ALL DAT ON MY PHONE TIL NOW. WOW..MY LIFE IS NOW COMPLETE. JK.`)


# Detailed statistics

## Polarity
According to Sentiment140 documentation, we would expect the `polarity` column to have one of three values representing user sentiment:
- 0 = negative
- 2 = neutral
- 4 = positive

Once we train our own model, we don't want data-skew to introduce bias. So let's see how polarity is distributed in the data that we have.

#### Polarity column (test data)
Let's first look at the test data.

In [None]:
# Keep only the polarity column and drop rows where it is null.
df = raw_test_data.select("polarity").na.drop()
print(f"No of rows with Polarity: {df.count()}/{raw_test_data.count()}")

# `sns.distplot` is deprecated and scheduled for removal; `histplot` with a
# density-normalised histogram plus KDE is the documented replacement
# (requires seaborn >= 0.11). `cut=3` matches distplot's old KDE default.
sns.histplot(
    df.toPandas()["polarity"],
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)

#### Polarity column (training data)

Now let's look at the training data.

In [None]:
# Keep only the polarity column and drop rows where it is null.
df = raw_training_data.select("polarity").na.drop()
print(f"No of rows with Polarity: {df.count()} / {raw_training_data.count()}")

# `sns.distplot` is deprecated and scheduled for removal; `histplot` with a
# density-normalised histogram plus KDE is the documented replacement
# (requires seaborn >= 0.11). `cut=3` matches distplot's old KDE default.
# Note: toPandas() collects the whole polarity column onto the driver.
sns.histplot(
    df.toPandas()["polarity"],
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)

#### Results:
We can clearly see that the training data only has polarity data centered around 0 (Negative) and 4 (Positive).

Let's confirm this:

In [None]:
# Exact number of training rows per polarity value.
# No extra .cache() here: raw_training_data is already cached when it is
# loaded, and this projection is used only once, so caching it again would
# just hold a second copy in memory that is never unpersisted.
polarity_df = raw_training_data.select("polarity")

# Sort by polarity so the table is shown in a stable order
# (groupBy alone gives no ordering guarantee).
polarity_df.groupBy("polarity").count().orderBy("polarity").toPandas()

Very nice! We have an even 50/50 split between negative and positive polarity.

### Conclusions:
- As 498 rows is far too few for us to train a model on, we're going to disregard the test dataset and focus on the Training Data.
- We've determined the steps that need to be taken to clean the data

# Store our raw data

Now it's time for us to write the raw data we intend to use to disk.  
We're going to:
- keep the format CSV
- partition the data by polarity; this will create 2 subfolders inside our output folder
- repartition the data into 20 partitions: this gives us up to 20 smaller CSV files per polarity subfolder

In [None]:
# Shuffle the training data into 20 partitions before writing.
raw_writer = raw_training_data.repartition(20).write

# Replace any previous output, one subfolder per polarity value, CSV format.
raw_writer.mode("overwrite").partitionBy("polarity").csv(OUTPUT_PATH)