In [17]:
import db
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import re
import pickle

# Getting info from SQLite tweets

In [2]:
tweets=list(db.orm.select(t for t in db.Tweet))

In [3]:
text_L=list(i.text for i in tweets)
sentiment_L =list(i.sentiment for i in tweets)

In [4]:
df = pd.DataFrame()

In [5]:
df['Text']=text_L
df['Sentiment']=sentiment_L

In [6]:
np.random.seed(1337)
msk = np.random.rand(len(df)) < 0.8
print(len(df[msk]),
len(df[~msk]))

40460 10309


In [7]:
df_train = df[msk]
df_test = df[~msk]

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv_ngram = CountVectorizer(ngram_range=(1,2),max_df=0.5)
cv_ngram.fit(df_train['Text'])
X = cv_ngram.transform(df_train['Text'])
X_test = cv_ngram.transform(df_test['Text'])

# Splitting in train and test

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X, df_train['Sentiment'], train_size = 0.75
)



In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
grid={"C":np.linspace(0.0000000000001,1,20)}
lr=LogisticRegression()
lr_cv=GridSearchCV(lr,grid,cv=10)
lr_cv.fit(X_train,y_train)

print("tuned hyperparameters :(best parameters) ",lr_cv.best_params_)
print("accuracy :",lr_cv.best_score_)
print()
print("Best parameters set found on development set:")
print()
print(lr_cv.best_params_)
print()
print("Grid scores on development set:")
print()
means = lr_cv.cv_results_['mean_test_score']
stds = lr_cv.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, lr_cv.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()
y_true, y_pred = y_val, lr_cv.predict(X_val)
print(classification_report(y_true, y_pred))
print()




tuned hyperparameters :(best parameters)  {'C': 0.4736842105263684}
accuracy : 0.7696490360850222

Best parameters set found on development set:

{'C': 0.4736842105263684}

Grid scores on development set:

0.585 (+/-0.008) for {'C': 1e-13}
0.756 (+/-0.011) for {'C': 0.05263157894746316}
0.762 (+/-0.010) for {'C': 0.10526315789482632}
0.766 (+/-0.010) for {'C': 0.1578947368421895}
0.768 (+/-0.013) for {'C': 0.21052631578955264}
0.768 (+/-0.013) for {'C': 0.26315789473691575}
0.769 (+/-0.013) for {'C': 0.31578947368427895}
0.769 (+/-0.011) for {'C': 0.3684210526316421}
0.769 (+/-0.011) for {'C': 0.42105263157900524}
0.770 (+/-0.010) for {'C': 0.4736842105263684}
0.769 (+/-0.011) for {'C': 0.5263157894737316}
0.769 (+/-0.010) for {'C': 0.5789473684210947}
0.769 (+/-0.010) for {'C': 0.631578947368458}
0.769 (+/-0.011) for {'C': 0.6842105263158211}
0.769 (+/-0.011) for {'C': 0.7368421052631843}
0.769 (+/-0.011) for {'C': 0.7894736842105474}
0.769 (+/-0.011) for {'C': 0.8421052631579106}
0.7

# GridSearch + pipeline

In [11]:
from pprint import pprint
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import nltk.corpus

stopwords_PT = nltk.corpus.stopwords.words('portuguese')

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('lr',LogisticRegression()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
#    'vect__min_df': (0.1, 0.25, 0.5),
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'vect__stop_words': (['portuguese']),
    'lr__C': np.linspace(0.0000000000001,1,20),
}

grid_search=GridSearchCV(pipeline,parameters,cv=10,scoring="f1_macro")
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(df_train['Text'],df_train['Sentiment'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'lr']
parameters:
{'lr__C': array([1.00000000e-13, 5.26315789e-02, 1.05263158e-01, 1.57894737e-01,
       2.10526316e-01, 2.63157895e-01, 3.15789474e-01, 3.68421053e-01,
       4.21052632e-01, 4.73684211e-01, 5.26315789e-01, 5.78947368e-01,
       6.31578947e-01, 6.84210526e-01, 7.36842105e-01, 7.89473684e-01,
       8.42105263e-01, 8.94736842e-01, 9.47368421e-01, 1.00000000e+00]),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}




done in 3326.052s

Best score: 0.773
Best parameters set:
	lr__C: 0.631578947368458
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


In [12]:
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)
print()
print("Grid scores on development set:")
print()
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

Best parameters set found on development set:

{'lr__C': 0.631578947368458, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}

Grid scores on development set:

0.496 (+/-0.015) for {'lr__C': 1e-13, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)}
0.501 (+/-0.015) for {'lr__C': 1e-13, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}
0.496 (+/-0.015) for {'lr__C': 1e-13, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
0.501 (+/-0.015) for {'lr__C': 1e-13, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}
0.496 (+/-0.015) for {'lr__C': 1e-13, 'vect__max_df': 1.0, 'vect__ngram_range': (1, 1)}
0.501 (+/-0.015) for {'lr__C': 1e-13, 'vect__max_df': 1.0, 'vect__ngram_range': (1, 2)}
0.750 (+/-0.018) for {'lr__C': 0.05263157894746316, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)}
0.758 (+/-0.016) for {'lr__C': 0.05263157894746316, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}
0.750 (+/-0.018) for {'lr__C': 0.05263157894746316, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
0.7

In [24]:
C1=[]
score_v=[]
print("\t"*3+"VALIDATION:\n")
# for c in (10**exp for exp in range(-10, 10)):
for c in np.linspace(0.0000000000001,1,20):
    lr=LogisticRegression(C=c)
    lr.fit(X_train,y_train)
    print ("C={:.5f}:\t\t{:.5f}".format
           (c,accuracy_score(y_val, lr.predict(X_val))))
    C1.append(c)
    score_v.append(accuracy_score(y_val, lr.predict(X_val)))

			VALIDATION:

C=0.00000:		0.58843
C=0.05263:		0.76075
C=0.10526:		0.76718
C=0.15789:		0.77143
C=0.21053:		0.77252
C=0.26316:		0.77301
C=0.31579:		0.77341
C=0.36842:		0.77380
C=0.42105:		0.77380
C=0.47368:		0.77400
C=0.52632:		0.77459
C=0.57895:		0.77558
C=0.63158:		0.77608
C=0.68421:		0.77588
C=0.73684:		0.77568
C=0.78947:		0.77519
C=0.84211:		0.77548
C=0.89474:		0.77548
C=0.94737:		0.77519
C=1.00000:		0.77558


# Final Test - Accuracy & F1_score

In [13]:
final_model = LogisticRegression(C=0.7368421052631843)
final_model.fit(X_test, df_test['Sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['Sentiment'], final_model.predict(X_test)))
print("Error: %s" % (1- accuracy_score(df_test['Sentiment'], final_model.predict(X_test))))

Final Accuracy: 0.9553787952274712
Error: 0.044621204772528844


In [23]:
final_model = LogisticRegression(C=0.7368421052631843)
final_model.fit(X_test, df_test['Sentiment'])
print ("Final F1_Score: %s" 
       % f1_score(df_test['Sentiment'], final_model.predict(X_test)))
print("Error: %s" % (1- f1_score(df_test['Sentiment'], final_model.predict(X_test))))

Final F1_Score: 0.9581894201054354
Error: 0.0418105798945646


In [14]:
lr_cv_best_parameters=lr_cv.best_estimator_.get_params()['C']
final_model1 = LogisticRegression(C=lr_cv_best_parameters)
final_model1.fit(X_test, df_test['Sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['Sentiment'], final_model1.predict(X_test)))
print("Error: %s" % (1- accuracy_score(df_test['Sentiment'], final_model1.predict(X_test))))

Final Accuracy: 0.9368512949849646
Error: 0.06314870501503544


In [22]:
lr_cv_best_parameters=lr_cv.best_estimator_.get_params()['C']
final_model1 = LogisticRegression(C=lr_cv_best_parameters)
final_model1.fit(X_test, df_test['Sentiment'])
print ("Final F1_Score: %s" 
       % f1_score(df_test['Sentiment'], final_model1.predict(X_test)))
print("Error: %s" % (1- f1_score(df_test['Sentiment'], final_model1.predict(X_test))))

Final F1_Score: 0.9411019632678911
Error: 0.058898036732108916


In [15]:
final_model2 = LogisticRegression(C=best_parameters['lr__C'])
final_model2.fit(X_test, df_test['Sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['Sentiment'], final_model2.predict(X_test)))
print("Error: %s" % (1- accuracy_score(df_test['Sentiment'], final_model2.predict(X_test))))

Final Accuracy: 0.9488796197497332
Error: 0.05112038025026677


In [19]:
final_model2 = LogisticRegression(C=best_parameters['lr__C'])
final_model2.fit(X_test, df_test['Sentiment'])
print ("Final F1_Score: %s" 
       % f1_score(df_test['Sentiment'], final_model2.predict(X_test)))
print("Error: %s" % (1- f1_score(df_test['Sentiment'], final_model2.predict(X_test))))

Final F1_Score: 0.952190873627869
Error: 0.04780912637213097


# Dumping model into pickles

In [24]:
pickle.dump(final_model2, open("./our_model.pkl", "wb"))
pickle.dump(cv_ngram, open("./our_cv.pkl", "wb"))