# Cleaning the data
## Using spaCy and pandas

The dataset was cleaned and tokenized using spaCy, a natural language processing package for Python.  Each tweet is tokenized into individual words, tagged with parts of speech, and dependency parsed.  The final features used, in all instances, are the lemmatized tokens.

In [None]:
import pandas as pd
import spacy

# Prerequisite: the English model has to be downloaded once beforehand.
# From a terminal, run:  python -m spacy download en

# The 'ner' component is switched off when loading; the cells below only read
# token text, lemmas, part-of-speech tags and dependency labels.
disabled_components = ['ner']
nlp = spacy.load('en', disable=disabled_components)

In [None]:
# Read the raw tweets and run every text through the spaCy pipeline, recording
# the per-token text, lemma, part-of-speech tag and dependency label.
df = pd.read_csv("tweets.csv")
tokens, lemma, pos, dep = [], [], [], []

texts = df['text'].astype('unicode').values
for doc in nlp.pipe(texts, batch_size=205000, n_threads=4):
    # A document that was not parsed still contributes a None placeholder to
    # each list, so the results stay aligned row-for-row with the DataFrame.
    parsed = doc.is_parsed
    tokens.append([tok.text for tok in doc] if parsed else None)
    lemma.append([tok.lemma_ for tok in doc] if parsed else None)
    pos.append([tok.pos_ for tok in doc] if parsed else None)
    dep.append([tok.dep_ for tok in doc] if parsed else None)

# Attach the parse results as new columns, then save the enriched table.
for column, values in (('text_tokens', tokens), ('text_lemma', lemma),
                       ('text_pos', pos), ('text_dep', dep)):
    df[column] = values

df.to_csv('tweets_parsed.csv')

In [None]:
# Read the labelled training set (latin-1 encoded, header on the first row)
# and run the 'SentimentText' column through the spaCy pipeline.
df = pd.read_csv("train.csv", index_col=False, encoding='latin-1', header=0)
tokens, lemma, pos, dep = [], [], [], []

sentiment_texts = df['SentimentText'].astype('unicode').values
for doc in nlp.pipe(sentiment_texts, batch_size=500, n_threads=4):
    if not doc.is_parsed:
        # Keep one entry per DataFrame row even when the parse is missing,
        # so the lists can be assigned back as columns.
        for results in (tokens, lemma, pos, dep):
            results.append(None)
        continue

    tokens.append([tok.text for tok in doc])
    lemma.append([tok.lemma_ for tok in doc])
    pos.append([tok.pos_ for tok in doc])
    dep.append([tok.dep_ for tok in doc])

# Store the parse results next to the original columns and write them out.
parsed_columns = (
    ('text_tokens', tokens),
    ('text_lemma', lemma),
    ('text_pos', pos),
    ('text_dep', dep),
)
for column, values in parsed_columns:
    df[column] = values

df.to_csv('training_parsed.csv')