In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



In [2]:
df = pd.read_csv('/Users/ds/Documents/datascience/final_project/final_train_data/FINAL-TRAIN.csv', encoding='utf-8')

In [3]:
df.head()

Unnamed: 0,tweet,scores,sentiment
0,Families caught in the grip of the opioid cris...,"{u'neg': 0.376, u'neu': 0.624, u'pos': 0.0, u'...",1
1,Wisconsin needs action from Washington to comb...,"{u'neg': 0.356, u'neu': 0.644, u'pos': 0.0, u'...",1
2,I'm heartbroken to see WV families &amp; towns...,"{u'neg': 0.323, u'neu': 0.562, u'pos': 0.115, ...",1
3,#SmoggySkiesAct allows corporate polluters to ...,"{u'neg': 0.453, u'neu': 0.547, u'pos': 0.0, u'...",1
4,#Fakenews that was true:\n\nFlynn lied to VP\n...,"{u'neg': 0.341, u'neu': 0.574, u'pos': 0.085, ...",2


In [4]:
X = df.tweet
y = df.sentiment

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [6]:
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB(alpha=0.8)
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [7]:
vect = CountVectorizer(strip_accents='unicode', ngram_range=(1,2), max_df=0.35)

In [8]:
tokenize_test(vect)

('Features: ', 55270)
('Accuracy: ', 0.66333333333333333)


In [9]:
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB(alpha=0.8)
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

In [76]:
df_best_worst = df[(df.sentiment==3) | (df.sentiment==1)]

In [77]:
X = df_best_worst.tweet
y = df_best_worst.sentiment

In [78]:
vect = CountVectorizer()

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [80]:
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB(alpha=1)
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [81]:
tokenize_test(vect)

('Features: ', 7630)
('Accuracy: ', 0.86101694915254234)


In [82]:
y_test_binary = np.where(y_test==3, 1, 0)
print(y_test_binary.mean())
print(1 - y_test_binary.mean())

0.522033898305
0.477966101695


In [10]:
print metrics.confusion_matrix(y_test, y_pred_class)

[[ 53  72   0]
 [ 24 262  29]
 [  3  74  83]]


In [11]:
def clean_tweets(col):
    tweets = []
    for x in col:
        x = x.replace("#", "")
        split = x.split(' ')
        new_list = []
        for word in split:
            if word[0:4] == 'http':
                new_list.append("url")
            else:
                new_list.append(word)
        split = ' '.join(new_list)
        tweets.append(split)
    df['clean_tweets'] = tweets
            

In [12]:
clean_tweets(df.tweet)

In [13]:
df.head()

Unnamed: 0,tweet,scores,sentiment,clean_tweets
0,Families caught in the grip of the opioid cris...,"{u'neg': 0.376, u'neu': 0.624, u'pos': 0.0, u'...",1,Families caught in the grip of the opioid cris...
1,Wisconsin needs action from Washington to comb...,"{u'neg': 0.356, u'neu': 0.644, u'pos': 0.0, u'...",1,Wisconsin needs action from Washington to comb...
2,I'm heartbroken to see WV families &amp; towns...,"{u'neg': 0.323, u'neu': 0.562, u'pos': 0.115, ...",1,I'm heartbroken to see WV families &amp; towns...
3,#SmoggySkiesAct allows corporate polluters to ...,"{u'neg': 0.453, u'neu': 0.547, u'pos': 0.0, u'...",1,SmoggySkiesAct allows corporate polluters to r...
4,#Fakenews that was true:\n\nFlynn lied to VP\n...,"{u'neg': 0.341, u'neu': 0.574, u'pos': 0.085, ...",2,Fakenews that was true:\n\nFlynn lied to VP\n\...


In [14]:
X = df.clean_tweets
y = df.sentiment

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [16]:
tokenize_test(vect)

('Features: ', 43904)
('Accuracy: ', 0.65833333333333333)


In [17]:
estimators = [('cv', CountVectorizer()), ('nb', MultinomialNB())]
pip = Pipeline(estimators)

In [18]:
stop = ['english', None]
accents = ['unicode', None]
ngram = [(1,1),(1,2)]
case = [True, False]
mx = [0.4, 0.3, 0.35]
a = [0.775, 0.8, 0.825]

params = {'cv__stop_words': stop,
         'cv__ngram_range': ngram,
         'cv__strip_accents': accents,
         'cv__lowercase': case,
         'cv__max_df': mx,
         'nb__alpha': a}

In [19]:
gs = GridSearchCV(estimator = pip, param_grid = params,
                  n_jobs=-1, verbose = True, cv=5)

In [20]:
X = df.clean_tweets
y = df.sentiment

In [90]:
gs.fit(X, y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  2.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('cv', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cv__lowercase': [True, False], 'nb__alpha': [0.775, 0.8, 0.825], 'cv__stop_words': ['english', None], 'cv__ngram_range': [(1, 1), (1, 2)], 'cv__strip_accents': ['unicode', None], 'cv__max_df': [0.4, 0.3, 0.35]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [21]:
print gs.best_params_

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [22]:
vect = CountVectorizer(strip_accents='unicode', ngram_range=(1,2), max_df=0.4)

In [23]:
tokenize_test(vect)

('Features: ', 43904)
('Accuracy: ', 0.65833333333333333)


In [24]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [51]:
df.head()

Unnamed: 0,tweet,scores,sentiment,clean_tweets,vader_scores
0,Families caught in the grip of the opioid cris...,"{u'neg': 0.376, u'neu': 0.624, u'pos': 0.0, u'...",1,Families caught in the grip of the opioid cris...,1
1,Wisconsin needs action from Washington to comb...,"{u'neg': 0.356, u'neu': 0.644, u'pos': 0.0, u'...",1,Wisconsin needs action from Washington to comb...,1
2,I'm heartbroken to see WV families &amp; towns...,"{u'neg': 0.323, u'neu': 0.562, u'pos': 0.115, ...",1,I'm heartbroken to see WV families &amp; towns...,1
3,#SmoggySkiesAct allows corporate polluters to ...,"{u'neg': 0.453, u'neu': 0.547, u'pos': 0.0, u'...",1,SmoggySkiesAct allows corporate polluters to r...,1
4,#Fakenews that was true:\n\nFlynn lied to VP\n...,"{u'neg': 0.341, u'neu': 0.574, u'pos': 0.085, ...",2,Fakenews that was true:\n\nFlynn lied to VP\n\...,1


In [27]:
vader_test_set = []
for x in df.tweet:
    vader_test_set.append(x.encode('utf-8'))

In [52]:
len(vader_test_set)

3000

In [29]:
analyzer = SentimentIntensityAnalyzer()

In [56]:
list_of_scores = []

for tweet in vader_test_set:
    vs = analyzer.polarity_scores(tweet)
    list_of_scores.append([tweet, vs])

In [57]:
df1 = pd.DataFrame(list_of_scores, columns = ['tweet', 'scores'])

In [65]:
df1.loc[2500,'scores']

{'compound': 0.627, 'neg': 0.0, 'neu': 0.795, 'pos': 0.205}

In [66]:
scores = []

for x in df1.scores:
    if x['compound'] > 0.5:
        scores.append(3)
    elif x['compound'] < -0.5:
        scores.append(1)
    else:
        scores.append(2)

In [67]:
scores

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [68]:
df1['sentiment'] = scores

In [69]:
df['vader_scores'] = scores

In [70]:
df.head()

Unnamed: 0,tweet,scores,sentiment,clean_tweets,vader_scores
0,Families caught in the grip of the opioid cris...,"{u'neg': 0.376, u'neu': 0.624, u'pos': 0.0, u'...",1,Families caught in the grip of the opioid cris...,1
1,Wisconsin needs action from Washington to comb...,"{u'neg': 0.356, u'neu': 0.644, u'pos': 0.0, u'...",1,Wisconsin needs action from Washington to comb...,1
2,I'm heartbroken to see WV families &amp; towns...,"{u'neg': 0.323, u'neu': 0.562, u'pos': 0.115, ...",1,I'm heartbroken to see WV families &amp; towns...,1
3,#SmoggySkiesAct allows corporate polluters to ...,"{u'neg': 0.453, u'neu': 0.547, u'pos': 0.0, u'...",1,SmoggySkiesAct allows corporate polluters to r...,1
4,#Fakenews that was true:\n\nFlynn lied to VP\n...,"{u'neg': 0.341, u'neu': 0.574, u'pos': 0.085, ...",2,Fakenews that was true:\n\nFlynn lied to VP\n\...,1


In [71]:
Vader_accuracy = metrics.accuracy_score(df.vader_scores, df.sentiment)

In [72]:
Vader_accuracy

0.63566666666666671

In [156]:
stop = [None]
accents = ['unicode']
ngram = [(1,2)]
case = [True]
mx = [0.4]
a = [0.775]
feat = [29750, 30000, 31250]

params = {'cv__stop_words': stop,
         'cv__ngram_range': ngram,
         'cv__strip_accents': accents,
         'cv__lowercase': case,
         'cv__max_df': mx,
         'nb__alpha': a,
         'cv__max_features': feat}

In [157]:
gs = GridSearchCV(estimator = pip, param_grid = params,
                  n_jobs=-1, verbose = True, cv=5)

In [158]:
X = df.clean_tweets
y = df.sentiment

In [159]:
gs.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('cv', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cv__max_features': [29750, 30000, 31250], 'cv__lowercase': [True], 'nb__alpha': [0.775], 'cv__stop_words': [None], 'cv__ngram_range': [(1, 2)], 'cv__strip_accents': ['unicode'], 'cv__max_df': [0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [160]:
gs.best_params_

{'cv__lowercase': True,
 'cv__max_df': 0.4,
 'cv__max_features': 29750,
 'cv__ngram_range': (1, 2),
 'cv__stop_words': None,
 'cv__strip_accents': 'unicode',
 'nb__alpha': 0.775}

In [169]:
gs.fit

<bound method GridSearchCV.fit of GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('cv', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cv__max_features': [29750, 30000, 31250], 'cv__lowercase': [True], 'nb__alpha': [0.775], 'cv__stop_words': [None], 'cv__ngram_range': [(1, 2)], 'cv__strip_accents': ['unicode'], 'cv__max_df': [0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)>

In [73]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [75]:
df.sentiment.value_counts()

2    1528
3     811
1     661
Name: sentiment, dtype: int64