In [1]:
# Import Dependencies
import pandas as pd
import re
import spacy

## Import Dataset

In [3]:
# Load the labelled tweets. A few rows in the CSV have more fields than the
# header declares; those rows are skipped (with a warning) rather than
# aborting the whole load.
csv_path = "Resources/sent_analysis_dataset.csv"
try:
    # pandas >= 1.3: `on_bad_lines` replaces the deprecated `error_bad_lines`
    # (which was removed entirely in pandas 2.0).
    twitter_df = pd.read_csv(csv_path, on_bad_lines="warn")
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy flag.
    twitter_df = pd.read_csv(csv_path, error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [4]:
# For right now, we will only use part of the set
# twitter_df = twitter_df[:-1000000]
# len(twitter_df.index)

In [5]:
# Preview the first five rows of the freshly loaded data
twitter_df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


## Clean Tweets

In [3]:
# Keep only the label and the tweet text; the ID and source columns are dropped
twitter_df = twitter_df.drop(columns=["ItemID", "SentimentSource"])

In [34]:
# Preview the frame again to confirm ItemID and SentimentSource were dropped
twitter_df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [20]:
# Count missing values per column (every count should be zero)
twitter_df.isna().sum()

Sentiment        0
SentimentText    0
dtype: int64

In [21]:
# Verify each column has the expected dtype
# (integer labels in Sentiment, object/string text in SentimentText)
twitter_df.dtypes

Sentiment         int64
SentimentText    object
dtype: object

In [22]:
# Verify the sentiment column holds only the expected label values
# (the recorded output shows 0 and 1; presumably 0 = negative, 1 = positive - confirm against the dataset docs)
twitter_df["Sentiment"].unique()

array([0, 1])

In [17]:
# Create the working column "X" from the raw tweet text. All cleaning steps
# below should read from and write to "X" so that "SentimentText" stays
# untouched for comparison.
twitter_df["X"] = twitter_df["SentimentText"]
# Show the raw and working columns side by side
twitter_df.head(50)

Unnamed: 0,Sentiment,SentimentText,X
0,0,is so sad for my APL frie...,is so sad for my APL frie...
1,0,I missed the New Moon trail...,I missed the New Moon trail...
2,1,omg its already 7:30 :O,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me!!! ...
5,0,or i just worry too much?,or i just worry too much?
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,0,Sunny Again Work Tomorrow :-| ...,Sunny Again Work Tomorrow :-| ...
8,1,handed in my uniform today . i miss you ...,handed in my uniform today . i miss you ...
9,1,hmmmm.... i wonder how she my number @-),hmmmm.... i wonder how she my number @-)


In [None]:
# Decided not to replace text abbreviations with full words b/c the abbreviations carry their own meaning
# Decided to keep hashtags other than the # b/c they may carry their own meaning

In [None]:
# Stemming? Lemma?

In [None]:
# Convert all to lower case

In [None]:
# Replace "n't" with " not" so the negation survives the punctuation
# stripping done later (e.g. "didn't" -> "did not").
# Known limitation: "can't" / "won't" become "ca not" / "wo not"; the
# negation is still captured, which is what matters for sentiment.
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"n't", " not", x))

In [None]:
# Replacing everything with a space that I will remove later

In [24]:
# Remove links and html special entities.
# Both steps read from and write to "X" so earlier cleaning is not discarded.
# Matches are replaced with a space (collapsed by the whitespace step later)
# so neighbouring words are never glued together.

# A link is "http://", "https://" or "www." followed by everything up to the
# next whitespace character.
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"(?:https?://|www\.)\S+", " ", x))
# Named (&amp;) and numeric (&#39;) HTML entities.
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"&#?\w+;", " ", x))

In [25]:
# Remove handles (@username) anywhere in the tweet.
# The look-behind requires that "@" is not preceded by a word character, so
# e-mail-like tokens ("name@example") are left alone; a bare "@" with no
# name after it (e.g. inside an emoticon) is also left alone.
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"(?<!\w)@\w+", " ", x))

In [22]:
# Emoticon lexicons used to replace happy and sad emoticons with the words
# "happy" and "sad".
# Source: https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf
# Each lexicon is written as a whitespace-separated raw string (no emoticon
# contains whitespace) and split into a set.
emoticons_happy = set(r"""
    :-) :) ;) :o) :] :3 :c) :> =] 8) =) :}
    :^) :-D :D 8-D 8D x-D xD X-D XD =-D =D
    =-3 =3 :-)) :'-) :') :* :^* >:P :-P :P X-P
    x-p xp XP :-p :p =p :-b :b >:) >;) >:-)
    <3
""".split())

emoticons_sad = set(r"""
    :L :-/ >:/ :S >:[ :@ :-( :[ :-|| =L :<
    :-[ :-< =\ =/ >:( :( >.< :'-( :'( :\ :-c
    :c :{ >:\ ;(
""".split())


In [None]:
# Replace happy and sad emoticons with the words "happy" and "sad".
def _emoticon_regex(emoticons):
    """Compile a regex that matches any of `emoticons` as a standalone token.

    Each emoticon is escaped with `re.escape` because most of them are made
    of regex metacharacters. Longer emoticons are listed first so that
    ":-))" is preferred over ":-)". A match must be delimited by whitespace
    (or the start/end of the text) on both sides; without that, ":3" would
    match inside "7:30" and "xp" inside ordinary words.
    """
    ordered = sorted(emoticons, key=lambda e: (-len(e), e))
    alternatives = "|".join(re.escape(e) for e in ordered)
    return re.compile(r"(?<!\S)(?:" + alternatives + r")(?!\S)")


happy_regex = _emoticon_regex(emoticons_happy)
sad_regex = _emoticon_regex(emoticons_sad)

# Read from and write to "X" so the earlier cleaning steps are preserved.
twitter_df["X"] = twitter_df["X"].map(lambda x: happy_regex.sub("happy", x))
twitter_df["X"] = twitter_df["X"].map(lambda x: sad_regex.sub("sad", x))

In [8]:
# Remove special characters.
# Reads from "X" (not "SentimentText") so the previous cleaning steps are kept.
# The hyphen is escaped so it is a literal rather than an accidental ",-."
# range, and raw strings avoid invalid-escape warnings.
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"[!#$%&'\\()*+,\-./:;<=>?@^_`{|}~]", "", x))
# Square brackets become a space (collapsed by the whitespace step later).
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"[\[\]]", " ", x))

In [73]:
# Sanity check of the special-character clean-up on a string containing the
# punctuation it targets: listed punctuation is deleted and each square
# bracket becomes a space, leaving "123  45".
# Raw strings are used so the backslash in the sample is not an invalid escape.
x = r"123!#$%&'()*+,-./:;<=>?@[\]^_`{|}~45"
y = re.sub(r"[!#$%&'\\()*+,\-./:;<=>?@^_`{|}~]", "", x)
y = re.sub(r"[\[\]]", " ", y)

print(y)

123  45


In [11]:
# Collapse all extra whitespace: every run of whitespace (spaces, tabs,
# newlines) becomes a single space, and leading/trailing whitespace is
# trimmed. Runs are replaced with one space, never with an empty string, so
# neighbouring words are not glued together.
twitter_df["X"] = twitter_df["X"].map(lambda x: re.sub(r"\s+", " ", x).strip())

## Text Vectorization

## Split into testing and training datasets

### Cross-validation?
Cross-validation splits the training data into a certain number of training folds 
(each with 75% of the training data) and the same number of testing folds (each with the remaining 25%). 
The training folds are used to train the classifier, which is then tested against the testing folds to obtain 
performance metrics (see below). The process is repeated multiple times, and an average of each metric is calculated.

If your testing set is always the same, you might be overfitting to that testing set, which means you might 
be adjusting your analysis to a given set of data so much that you might fail to analyze a different set. 
Cross-validation helps prevent that.
The more data you have, the more folds you will be able to use.

## Apply algorithm (1 to begin)