In [1]:
import os

from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF, CountVectorizer, Normalizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructField

Note: to run this notebook you first need to download the dataset yourself using:

wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

and then unzip the file into the ./sentiment140 subfolder.

The dataset is too large to push to the GitHub repository.

In [2]:
# Location of the Sentiment140 training CSV. Set the SENTIMENT140_CSV
# environment variable to point at your own copy; when it is unset the
# original hard-coded absolute path is used unchanged.
data_file = os.environ.get(
    "SENTIMENT140_CSV",
    r"file:///home/jovyan/repos/distributed-sentiment-analysis-on-twitter-data/sentiment140/training.1600000.processed.noemoticon.csv",
)

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("TrainModel")
    .getOrCreate()
)

In [7]:
# Schema of the Twitter CSV file: six columns, listed in file order, each
# declared as a nullable string.
training_data_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("polarity", "id", "date", "query", "user", "text")
])

In [8]:
# Load the raw CSV with the explicit schema defined above.
df_raw = spark.read.schema(training_data_schema).csv(data_file)

In [9]:
# Preview the first 20 rows without truncating long column values.
df_raw.show(n=20, truncate=False)

+--------+----------+----------------------------+--------+---------------+---------------------------------------------------------------------------------------------------------------------+
|polarity|id        |date                        |query   |user           |text                                                                                                                 |
+--------+----------+----------------------------+--------+---------------+---------------------------------------------------------------------------------------------------------------------+
|0       |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|0       |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|0       |1467810917|Mon Apr 0

In [30]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# Module-level tokenizer instance; tweet_cleaner calls tok.tokenize() on the
# letters-only text to split it into words.
tok = WordPunctTokenizer()

# Patterns used by tweet_cleaner, in the order it applies them.
at_user_pat = r'@[A-Za-z0-9_]+'  # r'@[\w]+'
url_pat = r'https?://[^ ]+'  # r'https?:\/\/[^\s]+'
# The dot is escaped so that only a literal "www." prefix is treated as a URL.
# Unescaped, "www." also matched elongated words such as "awwwww", which were
# then rewritten to "aURL" before the repeated letters could be squeezed.
www_pat = r'www\.[^ ]+'
# A letter followed by one or more copies of itself (e.g. the "ooo" in "sooo").
repeating_chars_pat = r'([A-Za-z])\1+'
# Lowercase contractions and their expanded forms.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# Whole-word alternation over every contraction; keys are escaped so that a
# future key containing regex metacharacters cannot corrupt the pattern.
neg_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, negations_dic)) + r')\b')

def tweet_cleaner(text):
    """Normalise one raw tweet into a lowercase, letters-only string.

    Steps, in order: strip HTML markup with BeautifulSoup, replace @mentions
    with ``USERNAME`` and links with ``URL``, squeeze any run of a repeated
    letter down to two, lowercase, expand the contractions listed in
    ``negations_dic``, replace every non-letter character with a space, and
    re-join the tokens that are longer than one character.

    Args:
        text: Raw tweet text, or None for a null row.

    Returns:
        The cleaned tweet as a single space-separated string, or None when
        ``text`` is None.
    """
    if text is None:
        # The schema declares the column nullable; pass nulls through instead
        # of handing None to BeautifulSoup.
        return None

    souped = BeautifulSoup(text, 'lxml').get_text()
    # The previous code called ``souped.decode("utf-8-sig")`` inside a bare
    # ``except``. A Python 3 str has no ``decode``, so that always raised and
    # was silently swallowed. The replacement-character substitution is now
    # applied directly; a stray BOM needs no special handling because every
    # non-letter is blanked out further down.
    bom_removed = souped.replace(u"\ufffd", "?")

    stripped = re.sub(at_user_pat, 'USERNAME', bom_removed)
    stripped = re.sub(url_pat, 'URL', stripped)
    stripped = re.sub(www_pat, 'URL', stripped)
    stripped = re.sub(repeating_chars_pat, r'\1\1', stripped)

    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda match: negations_dic[match.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Blanking out the non-letters above leaves unnecessary whitespace behind;
    # tokenizing and re-joining collapses it, and single-character tokens are
    # dropped along the way.
    words = [token for token in tok.tokenize(letters_only) if len(token) > 1]
    return " ".join(words).strip()

# Wrap the cleaner as a Spark UDF that returns a string column.
udf_tweet_cleaner = udf(tweet_cleaner, StringType())

In [31]:
# Add a "text_preprocessed" column holding the cleaned version of each tweet.
cleaned_text_column = udf_tweet_cleaner(col("text"))
text_preprocessed = df_raw.withColumn("text_preprocessed", cleaned_text_column)

In [32]:
# Preview the label next to the original, uncleaned tweet text.
text_preprocessed.select("polarity", "text").show(n=20, truncate=False)

+--------+---------------------------------------------------------------------------------------------------------------------+
|polarity|text                                                                                                                 |
+--------+---------------------------------------------------------------------------------------------------------------------+
|0       |@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|0       |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|0       |@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |
|0       |my whole body feels itchy and like its on fire                                                                       |
|0       |@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't 

In [33]:
# Preview the label next to the cleaned tweet text produced by the UDF.
text_preprocessed.select("polarity", "text_preprocessed").show(n=20, truncate=False)

+--------+---------------------------------------------------------------------------------------------------------+
|polarity|text_preprocessed                                                                                        |
+--------+---------------------------------------------------------------------------------------------------------+
|0       |username url aww that bummer you shoulda got david carr of third day to do it                            |
|0       |is upset that he can not update his facebook by texting it and might cry as result school today also blah|
|0       |username dived many times for the ball managed to save the rest go out of bounds                         |
|0       |my whole body feels itchy and like its on fire                                                           |
|0       |username no it not behaving at all mad why am here because can not see you all over there                |
|0       |username not the whole crew                           