In [1]:
# Import Dependencies
import pandas as pd
import re
import numpy as np
#from clean import clean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from joblib import dump

## Import Dataset

In [2]:
# Load the raw tweets. Malformed rows (wrong number of fields) are skipped with a
# warning instead of aborting the load. `on_bad_lines="warn"` replaces the
# `error_bad_lines=False` flag, which was deprecated in pandas 1.3 and removed in 2.0.
twitter_df = pd.read_csv("Resources/sent_analysis_dataset.csv", on_bad_lines="warn")

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
# Randomly select 25% of the tweets to train and test the model.
# A fixed random_state makes the subset (and every downstream result) reproducible.
# Note: this overwrites twitter_df, so re-running the cell samples the already-sampled frame.
twitter_df = twitter_df.sample(frac = 0.25, random_state = 42)

In [4]:
# Preview the first five sampled rows
twitter_df.head(n=5)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
803098,803114,1,Sentiment140,has just watched sweeney todd and is now watch...
980292,980308,0,Sentiment140,@Leslie_G stack is injured are you a rowdy fa...
1392305,1392321,0,Sentiment140,Defs shouldnt have wore heels last night. Now ...
1039592,1039608,1,Sentiment140,new yorrrrk
787928,787944,1,Sentiment140,"@farabovetheclou Next Gen? That's Ok, I forgiv..."


## Clean Tweets

In [5]:
# Keep only the label and the tweet text; expose the text under the shorter name "Text"
twitter_df = (
    twitter_df
    .drop(columns=["ItemID", "SentimentSource"])
    .rename(columns={"SentimentText": "Text"})
)

In [6]:
# Count missing values per column
twitter_df.isnull().sum(axis=0)

Sentiment    0
Text         0
dtype: int64

In [7]:
# Verify each column has the expected dtype (displays the dtype of every column)
twitter_df.dtypes

Sentiment     int64
Text         object
dtype: object

In [8]:
# List the distinct labels present in the sentiment column
sentiment_column = twitter_df["Sentiment"]
sentiment_column.unique()

array([1, 0])

In [9]:
# Clean data using clean function
# NOTE(review): disabled because the `from clean import clean` import at the top of the notebook is commented out.
# NOTE(review): the line below targets the "Sentiment" column; presumably the tweet text was intended -- confirm before re-enabling.

# twitter_df["Sentiment"] = twitter_df["Sentiment"].map(lambda x: clean(x))

In [15]:
# GET RID OF THIS ONCE WE HAVE CLEAN FUNCTION
# Temporary clean-up: delete punctuation and turn square brackets into spaces.
# Patterns are raw strings so the backslash escapes reach the regex engine without
# triggering Python's invalid-escape-sequence warning (the old "\[" / "\]" literals did).
# Inside the character class, `,-.` is a range that covers ',', '-' and '.'.
# NOTE(review): the double-quote character is not in the class and is left in the text -- confirm this is intended.
PUNCTUATION_PATTERN = r"[!#$%&'\\()*+,-./:;<=>?@\^_`{|}~]"
SQUARE_BRACKET_PATTERN = r"[\[\]]"

twitter_df["Text"] = (
    twitter_df["Text"]
    .str.replace(PUNCTUATION_PATTERN, "", regex=True)
    .str.replace(SQUARE_BRACKET_PATTERN, " ", regex=True)
)

In [16]:
# Inspect the first 20 rows after cleaning
twitter_df.head(n=20)

Unnamed: 0,Sentiment,Text
803098,1,has just watched sweeney todd and is now watch...
980292,0,LeslieG stack is injured are you a rowdy fan ...
1392305,0,Defs shouldnt have wore heels last night Now t...
1039592,1,new yorrrrk
787928,1,farabovetheclou Next Gen Thats Ok I forgive yo...
438528,1,slottedpig Nowhere friend visiting but we have...
679788,1,Game time Wooo come on LA Id love to see Kobe ...
1202348,1,so hungry i ate my 2 slices of pizza while dri...
324047,0,laceyee I want to drive na And yeah I wont le...
329176,0,lauraveagle haha I knowwhy are we not hanging ...


## Text Vectorization

PARAMETERS IN TFIDFVECTORIZER

decode_error : {‘strict’, ‘ignore’, ‘replace’} (default=’strict’)
Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given encoding. By default, it is ‘strict’, meaning that a UnicodeDecodeError will be raised. Other values are ‘ignore’ and ‘replace’.

IF TOO MANY FEATURES, ADJUST HERE
max_df : float in range [0.0, 1.0] or int (default=1.0)
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

min_df : float in range [0.0, 1.0] or int (default=1)
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

DON'T CHANGE, BUT INCLUDE IN README THAT WE ARE DOING THE DEFAULTS AND WHY

norm : ‘l1’, ‘l2’ or None, optional (default=’l2’)
Each output row will have unit norm, either: * ‘l2’: Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied. * ‘l1’: Sum of absolute values of vector elements is 1. See preprocessing.normalize

use_idf : boolean (default=True)
Enable inverse-document-frequency reweighting.

smooth_idf : boolean (default=True)
Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.


## Split into testing and training datasets

### Cross-validation?
Cross-validation splits the training data into a number of folds. In each round, one fold is held out 
for testing and the classifier is trained on the remaining folds (for example, with four folds each round 
trains on 75% of the training data and tests on the other 25%; the grid search below uses five folds, i.e. 80%/20%). 
The classifier is tested against the held-out fold to obtain performance metrics (see below). 
The process is repeated once per fold and an average for each of the metrics is calculated.

If your testing set is always the same, you might be overfitting to that testing set, which means you might 
be adjusting your analysis to a given set of data so much that you might fail to analyze a different set. 
Cross-validation helps prevent that.
The more data you have, the more folds you will be able to use.

The test size is 0.25, which is the `train_test_split` default.
For now we use only part of the dataset; the rest will be used later.

In [17]:
# Split into testing and training datasets.
# test_size is stated explicitly (0.25 is also the library default) and random_state is
# fixed so the same rows land in each split on every run, keeping reported metrics comparable.
X_train, X_test, y_train, y_test = train_test_split(
    twitter_df["Text"],
    twitter_df["Sentiment"],
    test_size = 0.25,
    random_state = 42,
)


## Try Naive Bayes Classifier

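We use grid search (`GridSearchCV`) to tune the hyperparameters with cross-validation.
We use Complement Naive Bayes (CNB) because it regularly outperforms Multinomial Naive Bayes (MNB) on text classification (see the scikit-learn documentation).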

In [18]:
# Create transformation pipeline: TF-IDF features feeding a Complement Naive Bayes classifier
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer()),
    ("classify", ComplementNB()),
])

# Set parameters for Gridsearch (two values per parameter -> 32 candidates).
# max_df values must be floats: an int is interpreted as an absolute document count,
# so max_df=1 would discard every term that occurs in more than one document.
# 1.0 means "no upper cut-off"; 0.8 drops terms present in more than 80% of tweets.
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1.0, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__alpha": (.8, 1.0)
             }
# 5-fold cross-validated search over the grid, using all available cores
nb = GridSearchCV(pipeline, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 1)

In [19]:
# Run the cross-validated grid search on the training split
nb.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Keep the refit best pipeline, then report its cross-validation score and settings
optimized_nb = nb.best_estimator_
print(f"Best Score: {nb.best_score_} with parameters: {nb.best_params_}")

In [None]:
# Score the tuned pipeline on the held-out test split
predictions = optimized_nb.predict(X_test)

# Compute each metric once, then report it
test_accuracy = accuracy_score(y_test, predictions)
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)

print(f"Accuracy score: {test_accuracy}")
print(f"Confusion matrix: {test_confusion}")
print(f"Classification report: \n{test_report}")

## Try another algorithm

## Save Best Model

In [None]:
# Persist the tuned pipeline (vectorizer + classifier) to disk with joblib
model_path = 'twitter_model.joblib'
dump(optimized_nb, model_path)