In [2]:
# Import Dependencies
import pandas as pd
import numpy as np
import re
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
from clean import replace_emoticons, clean_text, clean_tweet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from joblib import dump

In [None]:
# Import spacy nlp library
nlp = spacy.load('en_core_web_sm',parser=False, entity=False) 

## Import Dataset

In [3]:
twitter_df = pd.read_csv("Resources/sent_analysis_dataset.csv", error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [4]:
# Randomly select 10% tweets to train and test model
twitter_df = twitter_df.sample(frac = 0.001) 

In [5]:
twitter_df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
1418817,1418833,1,Sentiment140,"Haha, one can't xD but i finished it none the ..."
640552,640568,1,Sentiment140,done eating lunch..
482902,482915,0,Sentiment140,@secretfanofu i just read that sux cuz people...
739488,739504,0,Sentiment140,Have to miss the area meeting tonight due to t...
875109,875125,0,Sentiment140,I want to come home from work. Im sick. -_- My...


## Clean Tweets

In [6]:
# Drop and rename columns
twitter_df = twitter_df.drop(["ItemID", "SentimentSource"], axis = 1)
twitter_df = twitter_df.rename(columns = {"SentimentText": "Text"})

In [7]:
# Check columns for missing data
twitter_df.isnull().sum()

Sentiment    0
Text         0
dtype: int64

In [8]:
# Verify data are of correct type
twitter_df.dtypes

Sentiment     int64
Text         object
dtype: object

In [9]:
# Verify sentiment column has appropriate data
twitter_df["Sentiment"].unique()

array([1, 0])

In [10]:
# Clean text of tweets using previously defined clean_tweet function
 
twitter_df["Text"] = twitter_df["Text"].map(lambda x: clean_tweet(x, nlp))

## Text Vectorization

PARAMETERS IN TFIDVECTORIZER

decode_error : {‘strict’, ‘ignore’, ‘replace’} (default=’strict’)
Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given encoding. By default, it is ‘strict’, meaning that a UnicodeDecodeError will be raised. Other values are ‘ignore’ and ‘replace’.

IF TOO MANY FEATURES, ADJUST HERE
max_df : float in range [0.0, 1.0] or int (default=1.0)
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

min_df : float in range [0.0, 1.0] or int (default=1)
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

DON'T CHANGE, BUT INCLUDE IN README THAT WE ARE DOING THE DEFAULTS AND WHY

norm : ‘l1’, ‘l2’ or None, optional (default=’l2’)
Each output row will have unit norm, either: * ‘l2’: Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied. * ‘l1’: Sum of absolute values of vector elements is 1. See preprocessing.normalize

use_idf : boolean (default=True)
Enable inverse-document-frequency reweighting.

smooth_idf : boolean (default=True)
Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.


## Split into testing and training datasets

### Cross-validation?
What cross-validation does is splitting the training data into a certain number of training folds 
(with 75% of the training data) and a the same number of testing folds (with 25% of the training data), 
use the training folds to train the classifier, and test it against the testing folds to obtain performance 
metrics (see below). The process is repeated multiple times and an average for each of the metrics is calculated.

If your testing set is always the same, you might be overfitting to that testing set, which means you might 
be adjusting your analysis to a given set of data so much that you might fail to analyze a different set. 
Cross-validation helps prevent that.
The more data you have, the more folds you will be able to use.

kept test size at default, which is .25
just using part of the dataset now; use rest later

In [11]:
# Split into testing and training datasets
X_train, X_test, y_train, y_test = train_test_split(twitter_df["Text"], twitter_df["Sentiment"])


In [17]:
type(X_train)

pandas.core.series.Series

## Try Naive Bayes Classifier

We are going to use gridsearch to do cross-validation
Use complement MB b/c outperforms MNB (look at documentation)

In [12]:
# Create transformation pipeline
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer()),
    ("classify", ComplementNB()),
])

# Set parameters for Gridsearch
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__alpha": (.8, 1)
             }
nb = GridSearchCV(pipeline, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 1)

In [13]:
nb.fit(X_train,y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    7.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...abulary=None)), ('classify', ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vectorize__use_idf': (True, False), 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_df': (1, 0.8), 'vectorize__norm': ('l1', 'l2'), 'classify__alpha': (0.8, 1)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [14]:
# Get best results
print(f"Best Score: {nb.best_score_} with parameters: {nb.best_params_}")
optimized_nb = nb.best_estimator_

Best Score: 0.6875 with parameters: {'classify__alpha': 0.8, 'vectorize__max_df': 0.8, 'vectorize__ngram_range': (1, 2), 'vectorize__norm': 'l2', 'vectorize__use_idf': False}


In [15]:
# Evaluate model using test data
predictions = optimized_nb.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, predictions)}")
print(f"Confusion matrix: {confusion_matrix(y_test, predictions)}")
print(f"Classification report: \n{classification_report(y_test, predictions)}")

Accuracy score: 0.6987341772151898
Confusion matrix: [[169  46]
 [ 73 107]]
Classification report: 
              precision    recall  f1-score   support

           0       0.70      0.79      0.74       215
           1       0.70      0.59      0.64       180

   micro avg       0.70      0.70      0.70       395
   macro avg       0.70      0.69      0.69       395
weighted avg       0.70      0.70      0.70       395



## Try another algorithm

In [None]:
# pipeline = Pipeline([
#     ("vectorize", TfidfVectorizer()),
#     ("classify", LinearRegression()),
# ])

# # Set parameters for Gridsearch
# parameters = {"vectorize__use_idf": (True, False),
#               "vectorize__ngram_range": [(1,1), (1,2)],
#               "vectorize__max_df" : (1, .8),
#               "vectorize__norm": ("l1", "l2"),
#               "classify__
#              }
# nb = GridSearchCV(pipeline, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 1)

## Save Best Model

In [16]:
# Save best model
dump(optimized_nb, 'twitter_model.joblib')

['twitter_model.joblib']