In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# train_test_split lives in sklearn.model_selection (the module this notebook
# already imports GridSearchCV from). sklearn.cross_validation is the legacy
# location and is kept only as a fallback for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Prefer the standalone joblib package; fall back to the copy that older
# scikit-learn releases bundled under sklearn.externals.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [2]:
# Load the labelled tweets (columns used below: `tweet`, `sentiment`).
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# saved index; passing index_col=0 would presumably drop it - confirm.
df = pd.read_csv('../creating_sample_and_labelled_data/final.csv', encoding='utf-8')

In [3]:
# Quick look at the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,tweet,sentiment
0,Families caught in the grip of the opioid cris...,1
1,Wisconsin needs action from Washington to comb...,1
2,I'm heartbroken to see WV families &amp; towns...,1
3,#SmoggySkiesAct allows corporate polluters to ...,1
4,#Fakenews that was true:\n\nFlynn lied to VP\n...,2


In [4]:
# Class balance check: number of tweets per sentiment label.
df.sentiment.value_counts()

2    1528
3     811
1     661
Name: sentiment, dtype: int64

In [5]:
# Features are the raw tweet text; the target is the sentiment label.
# X and y are notebook-level names that later cells rebind to other subsets.
X = df.tweet
y = df.sentiment

In [6]:
# Hold out 20% of the tweets for testing; random_state=1 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
#Creating function to test the accuracy of various parameters
def tokenize_test(vect, alpha=1.0):
    """Fit ``vect`` plus a Multinomial Naive Bayes model and print test accuracy.

    The function reads the notebook-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test``, i.e. whatever the most recently executed split cell assigned.

    Parameters
    ----------
    vect : vectorizer
        Object exposing ``fit_transform`` and ``transform`` (e.g. CountVectorizer).
    alpha : float, default 1.0
        Smoothing parameter forwarded to ``MultinomialNB``. The default matches
        MultinomialNB's own default, so ``tokenize_test(vect)`` behaves exactly
        as it did before this parameter existed.

    Returns
    -------
    None
        The feature count and the accuracy are printed, not returned.
    """
    # The vocabulary is learned from the training split only; the test split
    # is merely transformed with that vocabulary.
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB(alpha=alpha)
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [8]:
# Unigrams through trigrams, ignoring terms that occur in fewer than 2 training tweets.
vect = CountVectorizer(ngram_range=(1, 3), min_df=2)
tokenize_test(vect)

Features:  12525
Accuracy:  0.618333333333


In [9]:
# Baseline: CountVectorizer with every setting left at its default.
vect = CountVectorizer()
tokenize_test(vect)

Features:  12457
Accuracy:  0.645


In [10]:
# Defaults plus scikit-learn's built-in English stop-word list.
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  12205
Accuracy:  0.651666666667


In [11]:
# Unigrams and bigrams with the vocabulary capped at 100,000 terms
# (the recorded run produced fewer features than the cap, so it did not bind).
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)

Features:  55282
Accuracy:  0.653333333333


In [12]:
# Unigrams and bigrams, ignoring terms that occur in fewer than 2 training tweets.
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)

Features:  9766
Accuracy:  0.621666666667


Create a DataFrame that excludes the neutral tweets (sentiment 2) and keeps only the positive and negative tweets (sentiment 1 and 3). How accurate are the models on this two-class problem?

In [13]:
# Keep only the rows labelled 1 or 3; rows labelled 2 are left out.
pos_neg = df[df.sentiment.isin([1, 3])]

In [14]:
# Rebind X / y to the two-class subset. From here on, every cell that reads
# X or y sees this subset until they are reassigned again.
X = pos_neg.tweet
y = pos_neg.sentiment

In [15]:
# Same 80/20 split and seed as before, now applied to the two-class subset.
# This overwrites the train/test globals that tokenize_test reads.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [16]:
# Two-class rerun: unigrams through trigrams, ignoring terms in fewer than 2 training tweets.
vect = CountVectorizer(ngram_range=(1, 3), min_df=2)
tokenize_test(vect)

Features:  6079
Accuracy:  0.827118644068


In [17]:
# Two-class rerun: CountVectorizer with every setting left at its default.
vect = CountVectorizer()
tokenize_test(vect)

Features:  7630
Accuracy:  0.861016949153


In [18]:
# Two-class rerun: defaults plus scikit-learn's built-in English stop-word list.
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  7395
Accuracy:  0.833898305085


In [19]:
# Two-class rerun: unigrams and bigrams with the vocabulary capped at 100,000 terms.
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)

Features:  30678
Accuracy:  0.840677966102


In [20]:
# Two-class rerun: unigrams and bigrams, ignoring terms in fewer than 2 training tweets.
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)

Features:  5009
Accuracy:  0.833898305085


In [21]:
#Calculating the null accuracy for the binary model to be tested against
# Null accuracy is the score obtained by always predicting the most frequent
# class. The share of label 3 only equals that while label 3 is the majority,
# so take the larger of the two class shares instead of assuming it.
y_test_binary = np.where(y_test==3, 1, 0)
share_of_label_3 = y_test_binary.mean()
print(max(share_of_label_3, 1 - share_of_label_3))

0.522033898305


# Using grid search to find the best parameters

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [23]:
# Chain vectorizer and classifier so the grid search can tune both together.
# The step names 'cv' and 'nb' are the prefixes used in the parameter grids
# (e.g. 'cv__max_df', 'nb__alpha').
estimators = [('cv', CountVectorizer()), ('nb', MultinomialNB())]
pip = Pipeline(estimators)

In [24]:
# Candidate values for the first, coarse grid search.
stop = ['english', None]
accents = ['unicode', None]
ngram = [(1, 1), (1, 2), (1, 3), (2, 3)]
case = [True, False]
# max_df must be given as floats: a float is read as a proportion of documents,
# whereas an int is read as an absolute document count. The integer 1 would
# therefore discard every term appearing in more than one tweet instead of
# meaning "no upper limit", so the last entry is written as 1.0.
mx = [0.4, 0.6, 0.8, 1.0]
a = [0.6, 0.8, 1]

# Keys follow the '<pipeline step>__<parameter>' convention of the pipeline.
params = {
    'cv__stop_words': stop,
    'cv__ngram_range': ngram,
    'cv__strip_accents': accents,
    'cv__lowercase': case,
    'cv__max_df': mx,
    'nb__alpha': a,
}

In [25]:
# Exhaustive search over `params` with 5-fold cross-validation on all CPU cores.
# No `scoring` is given, so candidates are ranked by the pipeline's own score.
gs = GridSearchCV(estimator = pip, param_grid = params,
                  n_jobs=-1, verbose = True, cv=5)

In [7]:
# Point X / y back at the full three-class data (they were last bound to the
# positive/negative subset) and redo the 80/20 split with the same seed.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so it was presumably re-run separately; on a fresh
# Restart & Run All it executes here, in file order.
X = df.tweet
y = df.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [27]:
# Run the coarse search.
# NOTE(review): this fits on X / y, i.e. all tweets, including the rows that
# train_test_split set aside as X_test. If X_test is meant to stay unseen for
# the final evaluation, fitting on X_train / y_train would avoid leakage - confirm intent.
gs.fit(X, y)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:  7.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'cv__stop_words': ['english', None], 'cv__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)], 'cv__strip_accents': ['unicode', None], 'cv__lowercase': [True, False], 'cv__max_df': [0.4, 0.6, 0.8, 1], 'nb__alpha': [0.6, 0.8, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [28]:
# Show the winning parameter combination from the coarse search.
print ("Best Params:")
print (gs.best_params_)

Best Params:
{'cv__lowercase': True, 'cv__max_df': 0.4, 'cv__ngram_range': (1, 2), 'cv__stop_words': None, 'cv__strip_accents': 'unicode', 'nb__alpha': 0.8}


In [29]:
# Second pass: zoom in around the coarse search's winners (max_df 0.4,
# alpha 0.8) with ngram_range and strip_accents pinned to the winning values.
# stop_words and lowercase are no longer listed, so the pipeline's defaults apply.
a = [0.7, 0.8, 0.9]
mx = [0.3, 0.4, 0.5]
ngram = [(1, 2)]
accents = ['unicode']

params = dict(
    cv__ngram_range=ngram,
    cv__strip_accents=accents,
    cv__max_df=mx,
    nb__alpha=a,
)

In [30]:
# Fresh search object for the narrowed grid (replaces the previous `gs`).
gs = GridSearchCV(estimator = pip, param_grid = params,
                  n_jobs=-1, verbose = True, cv=5)

In [31]:
# Run the narrowed search on the same X / y as the coarse search.
gs.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   11.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'cv__ngram_range': [(1, 2)], 'cv__strip_accents': ['unicode'], 'cv__max_df': [0.3, 0.4, 0.5], 'nb__alpha': [0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [32]:
# Winning combination from the second, narrowed search.
print (gs.best_params_)

{'cv__max_df': 0.3, 'cv__ngram_range': (1, 2), 'cv__strip_accents': 'unicode', 'nb__alpha': 0.8}


In [33]:
# Third pass: finer steps around the second search's winners
# (max_df 0.3, alpha 0.8); ngram_range and strip_accents stay pinned.
a = [0.75, 0.8, 0.85]
mx = [0.25, 0.3, 0.35]
ngram = [(1, 2)]
accents = ['unicode']

params = dict(
    cv__ngram_range=ngram,
    cv__strip_accents=accents,
    cv__max_df=mx,
    nb__alpha=a,
)

In [34]:
# Fresh search object for the third, finest grid (replaces the previous `gs`).
gs = GridSearchCV(estimator = pip, param_grid = params,
                  n_jobs=-1, verbose = True, cv=5)

In [35]:
# Run the finest search on the same X / y as the earlier searches.
gs.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   10.8s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'cv__ngram_range': [(1, 2)], 'cv__strip_accents': ['unicode'], 'cv__max_df': [0.25, 0.3, 0.35], 'nb__alpha': [0.75, 0.8, 0.85]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [36]:
# Winning combination from the third search; these values feed the final model below.
print (gs.best_params_)

{'cv__max_df': 0.35, 'cv__ngram_range': (1, 2), 'cv__strip_accents': 'unicode', 'nb__alpha': 0.8}


In [37]:
def tokenize_test(vect, alpha=0.8):
    """Fit ``vect`` plus a Multinomial Naive Bayes model and print test accuracy.

    This redefinition replaces the earlier ``tokenize_test`` in the kernel; it
    differs only in the smoothing value handed to ``MultinomialNB``.

    The function reads the notebook-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test``, i.e. whatever the most recently executed split cell assigned.

    Parameters
    ----------
    vect : vectorizer
        Object exposing ``fit_transform`` and ``transform`` (e.g. CountVectorizer).
    alpha : float, default 0.8
        Smoothing parameter forwarded to ``MultinomialNB``. The default is the
        value the grid search selected, so ``tokenize_test(vect)`` behaves
        exactly as it did when 0.8 was hard-coded.

    Returns
    -------
    None
        The feature count and the accuracy are printed, not returned.
    """
    # The vocabulary is learned from the training split only; the test split
    # is merely transformed with that vocabulary.
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB(alpha=alpha)
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [15]:
# Final vectorizer, configured with the values the last grid search reported as best.
vect = CountVectorizer(strip_accents='unicode', ngram_range=(1,2), max_df=0.35)

In [16]:
# NOTE(review): the recorded output is a NameError - presumably the kernel was
# restarted and the cell defining tokenize_test was not re-run before this one.
# Executing the notebook top to bottom defines the function first.
tokenize_test(vect)

NameError: name 'tokenize_test' is not defined

In [17]:
# Learn the vocabulary from the training split and build its document-term matrix.
X_train_dtm = vect.fit_transform(X_train)

In [18]:
# Encode the test split with the vocabulary learned above (no refitting).
X_test_dtm = vect.transform(X_test)

In [19]:
# Final classifier, using the alpha value the grid search reported as best.
nb = MultinomialNB(alpha=0.8)

In [20]:
# Train the final classifier on the training document-term matrix.
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=0.8, class_prior=None, fit_prior=True)

In [21]:
# Persist the fitted Naive Bayes classifier (a model object, not a function) for later use.
joblib.dump(nb, 'nb.pkl')

['nb.pkl']

In [22]:
# Persist the fitted vectorizer as well: new text must be transformed with the
# same vocabulary before it can be passed to the saved classifier.
joblib.dump(vect, 'nb_vect.pkl')

['nb_vect.pkl']