In [None]:
from __future__ import unicode_literals  # must be the first statement of the cell

%matplotlib inline
import ast  # only used to evaluate the stringified token/lemma lists
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load spaCy's default English model, which is used to tokenize the text.
# The model must already be installed:
#     python -m spacy download en
# The 'ner' pipeline component is disabled.

nlp = spacy.load('en', disable=['ner'])

# Read the raw tweets; the bare `df.index` is shown as the cell's output.
df = pd.read_csv("tweets.csv")
df.index

In [None]:
# Slow cell: every tweet text is run through the spaCy pipeline.  The result
# is saved to 'tweets_parsed.csv' so later cells can start from that file
# instead of re-running this one.

df = pd.read_csv("tweets.csv")

# One entry per row of df.  pos = part-of-speech tags, dep = dependency labels.
tokens, lemma, pos, dep = [], [], [], []

tweet_texts = df['text'].astype('unicode').values
for doc in nlp.pipe(tweet_texts, batch_size=205000, n_threads=4):
    if not doc.is_parsed:
        # Store a blank in every list so they stay the same length as df.
        for annotation_list in (tokens, lemma, pos, dep):
            annotation_list.append(None)
        continue
    tokens.append([tok.text for tok in doc])
    lemma.append([tok.lemma_ for tok in doc])
    pos.append([tok.pos_ for tok in doc])
    dep.append([tok.dep_ for tok in doc])

for col_name, col_values in zip(
        ('text_tokens', 'text_lemma', 'text_pos', 'text_dep'),
        (tokens, lemma, pos, dep)):
    df[col_name] = col_values

df.to_csv('tweets_parsed.csv')

In [None]:
# Reload the parsed tweets: the first CSV column becomes the index and every
# column is read with dtype 'object'.
tweets = pd.read_csv(
    "tweets_parsed.csv",
    index_col=0,
    dtype='object',
    low_memory=False,
)


In [None]:
# The parsed columns were saved to CSV as the string form of Python lists, so
# convert each one back into a real list of strings.
# A parse that was stored as a blank (None) comes back from read_csv as NaN;
# na_action='ignore' passes those through untouched instead of handing them to
# ast.literal_eval, which raises on a non-string value.

tweets['tokens'] = tweets['text_tokens'].map(ast.literal_eval, na_action='ignore')
tweets['lemma'] = tweets['text_lemma'].map(ast.literal_eval, na_action='ignore')
tweets['pos'] = tweets['text_pos'].map(ast.literal_eval, na_action='ignore')
tweets['dep'] = tweets['text_dep'].map(ast.literal_eval, na_action='ignore')

# Spot-check one row: raw text next to its tokens and lemmas.
print(tweets.text[25])
print(tweets.tokens[25])
print(tweets.lemma[25])

In [None]:

# Run the labelled training texts through the same spaCy pipeline and save
# the annotated frame.
# NOTE(review): this writes 'training_parsed2.csv', while the loading cell
# reads 'training_parsed.csv' -- confirm which file is intended.
df = pd.read_csv("train.csv", index_col=False, encoding='latin-1', header=0)

# One entry per row of df.  pos = part-of-speech tags, dep = dependency labels.
tokens, lemma, pos, dep = [], [], [], []

training_texts = df['text'].astype('unicode').values
for doc in nlp.pipe(training_texts, batch_size=500, n_threads=4):
    if not doc.is_parsed:
        # Store a blank in every list so they stay the same length as df.
        for annotation_list in (tokens, lemma, pos, dep):
            annotation_list.append(None)
        continue
    tokens.append([tok.text for tok in doc])
    lemma.append([tok.lemma_ for tok in doc])
    pos.append([tok.pos_ for tok in doc])
    dep.append([tok.dep_ for tok in doc])

for col_name, col_values in zip(
        ('text_tokens', 'text_lemma', 'text_pos', 'text_dep'),
        (tokens, lemma, pos, dep)):
    df[col_name] = col_values

df.to_csv('training_parsed2.csv')

In [None]:
# Load the parsed training set with every column read as dtype 'object'.
# NOTE(review): the parsing cell saves 'training_parsed2.csv', but this cell
# loads 'training_parsed.csv' -- confirm which file is intended.
ts = pd.read_csv("training_parsed.csv", low_memory=False, index_col=0, dtype='object')

# Convert the stringified lists back into real lists.  Blank (failed-parse)
# entries are read back as NaN; na_action='ignore' leaves them as NaN rather
# than passing them to ast.literal_eval, which raises on a non-string value.
ts['tokens'] = ts['text_tokens'].map(ast.literal_eval, na_action='ignore')
ts['lemma'] = ts['text_lemma'].map(ast.literal_eval, na_action='ignore')
ts['pos'] = ts['text_pos'].map(ast.literal_eval, na_action='ignore')
ts['dep'] = ts['text_dep'].map(ast.literal_eval, na_action='ignore')

In [None]:

# The first 80000 rows are the training portion; every row whose ItemID is
# not among them is held out for evaluation.
ts_new = ts[:80000]
ts_rest = ts[~ts.ItemID.isin(ts_new.ItemID)]

In [None]:
# TF-IDF does the term weighting and normalizing.
vectorizer = TfidfVectorizer()

# fit_transform learns the transform from the training rows and applies it;
# transform then applies that same fit to the held-out rows.
train_docs = ts_new['lemma'].astype('unicode')
rest_docs = ts_rest['lemma'].astype('unicode')
features = vectorizer.fit_transform(train_docs)
smatrix = vectorizer.transform(rest_docs)

X = features
y = ts_new['Sentiment']

dt = DecisionTreeClassifier(random_state=99, min_samples_split=20)

# Build the tree, timing the fit.
t0 = time.time()
dtf = dt.fit(X, y)
t1 = time.time()

# Run the fitted tree on the held-out rows, timing the prediction.
predict_dt = dt.predict(smatrix)
t2 = time.time()

print(f'time to train {t1 - t0}')
print(f'time to predict {t2 - t1}')

print(classification_report(ts_rest['Sentiment'], predict_dt))

In [None]:
# Apply the already-fitted TF-IDF transform to the troll tweets, then run
# them through the decision tree for class probabilities and labels.
tweet_matrix = vectorizer.transform(tweets['text_lemma'])

tweet_prob = dt.predict_proba(tweet_matrix)
predict_tweets = dt.predict(tweet_matrix)

In [None]:
# Plot the fitted decision tree's feature_importances_ array, one marker per
# entry, against the entry's position in the array.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(dtf.feature_importances_, 'o')
ax.set_xlabel('Feature index')
ax.set_ylabel('Feature importance')
ax.set_title('Decision tree feature importances')
plt.show()

In [None]:
# Tabulate the importances: 'x' is the position in feature_importances_,
# 'y' is the importance value at that position.
b = dtf.feature_importances_.tolist()
out = pd.Series(b).rename_axis('x').reset_index(name='y')

In [None]:
# Keep only the rows whose importance is at least 0.01.  The rows keep their
# index from `out`; later cells read out_cut.index, so it is deliberately not
# reset here.
out_cut = out[out.y >= .01]
print(out_cut)

In [None]:
# Invert vocabulary_ (key -> value becomes value -> key) so that an index
# can be used to look up its term.
maps = dict(zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys()))

In [None]:
# pulls out the pertinent terms

def get(key, d=None, default=None):
    """Look up ``key`` in ``d`` and return ``default`` when it is absent.

    Parameters
    ----------
    key : hashable
        The key to look up.
    d : container, optional
        Container to search.  When omitted, the notebook-level ``maps`` is
        used.  It is resolved at call time rather than at definition time, so
        rebuilding ``maps`` in another cell is picked up without having to
        re-run this definition.
    default : object, optional
        Value returned when ``key`` is not in ``d``.

    Returns
    -------
    object
        ``d[key]`` if ``key in d``, otherwise ``default``.
    """
    if d is None:
        d = maps
    return d[key] if key in d else default


# Translate each selected 'x' value into its term via get (None when absent).
abc = out_cut['x'].map(get)

In [None]:
# Summary table of the selected features: key (row label), vocab (the looked-up
# term) and weight (the importance).  It is built directly from out_cut rather
# than from the current value of `abc`, so re-running this cell yields the same
# table instead of failing on an already-converted DataFrame.
abc = pd.DataFrame(
    {
        'key': out_cut.index,
        'vocab': out_cut['x'].apply(get).values,
        'weight': out_cut.y.values,
    },
    index=out_cut.index,
)

In [None]:
# Print the table of selected features (columns: key, vocab, weight).
print(abc)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Same procedure as for the decision tree, but with a random forest:
# fit on the same X / y and report how long the fit took.

t0 = time.time()
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=123456,
)
rf.fit(X, y)
t1 = time.time()

print(t1 - t0)

In [None]:

# Time the forest's predictions on the held-out matrix.
t0 = time.time()
rf_predicta = rf.predict(smatrix)
t1 = time.time()
print(t1 - t0)

accuracy = accuracy_score(ts_rest['Sentiment'], rf_predicta)

# Decision-tree report first, then the forest's, for side-by-side comparison.
print(classification_report(ts_rest['Sentiment'], predict_dt))
print(classification_report(ts_rest['Sentiment'], rf_predicta))
print('Out-of-bag score estimate: {:.3}'.format(rf.oob_score_))
print('Mean accuracy score: {:.3}'.format(accuracy))

In [None]:
# Predict on the tweet matrix with the random forest and report the elapsed
# wall-clock time.
t0 = time.time()
rf_predict = rf.predict(tweet_matrix)
t1 = time.time()

print(t1 - t0)

In [None]:
# Number of rows in the tweets frame.
print(tweets.shape[0])

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the random forest's predictions on the held-out rows.
# The label column is 'Sentiment' (capital S), as used everywhere else in the
# notebook; attribute access on a DataFrame is case-sensitive.
cm = pd.DataFrame(confusion_matrix(ts_rest.Sentiment, rf_predicta))

# fmt='d' prints the cell counts as plain integers.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');