In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import re
import spacy
from spacy.lang.en import English
from spacy.attrs import ORTH, LEMMA, NORM, TAG
from clean import replace_emoticons, clean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from joblib import dump

## Import Dataset

In [2]:
twitter_full_df = pd.read_csv("Resources/sent_analysis_dataset.csv", error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
twitter_df = twitter_full_df

In [None]:
# Select slice to test
twitter_df = twitter_full_df.loc[:1000]

In [15]:
twitter_df = twitter_full_df.sample(frac = .0025)

## Clean Tweets

In [16]:
# Drop and rename columns
twitter_df = twitter_df.drop(["ItemID", "SentimentSource"], axis = 1)
twitter_df = twitter_df.rename(columns = {"SentimentText": "Text"})

In [6]:
# Check columns for missing data
twitter_df.isnull().sum()

Sentiment    0
Text         0
dtype: int64

In [7]:
# Verify data are of correct type
twitter_df.dtypes

Sentiment     int64
Text         object
dtype: object

In [8]:
# Verify sentiment column has appropriate data
twitter_df["Sentiment"].unique()

array([0, 1])

In [17]:
# Clean text of tweets using previously defined clean_tweet function
twitter_df["Text"] = twitter_df["Text"].map(lambda x: clean(x))

In [22]:
twitter_df.head()
# Backup cleaned csv
twitter_df.to_csv("Resources/clean_dataset.csv")

## Define Tokenizer

In [23]:
# Import spacy nlp library
nlp = spacy.load('en_core_web_sm', entity=False)

# Add customized stop words
nlp.Defaults.stop_words |= {"-PRON-","joe", "biden", "bernie","sanders", "elizabeth", \
                            "warren", "kamala", "harris", "s", "ve", "twitter", "tweet",\
                            "come", "year", "know"}

# Creating tokenizer function
def spacy_tokenizer(tweet):
    
    # Parse tweets into tokens
    tokens = nlp(tweet)

    # Lemmatize tokens
    tokens = [word.lemma_ for word in tokens if word.lemma_ not in nlp.Defaults.stop_words]

    # Return list of tokens
    return tokens

## Split into testing and training datasets

In [24]:
# Split into testing and training datasets
X_train, X_test, y_train, y_test = train_test_split(twitter_df["Text"], \
                                                    twitter_df["Sentiment"])

## Test different models

### Multinomial Naive Bayes Classifier

In [25]:
# Create transformation pipeline
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer(tokenizer = spacy_tokenizer, max_features = 1000)),
    ("classify", MultinomialNB()),
])

# Set parameters for Gridsearch
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__alpha": (.8, 1)
             }
mnb = GridSearchCV(pipeline, param_grid = parameters, cv = 3, verbose = 1)

In [26]:
mnb.fit(X_train,y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 42.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vectorize',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1000,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                     

In [27]:
# Get best results
print(f"Best Score: {mnb.best_score_} with parameters: {mnb.best_params_}")
optimized_mnb = mnb.best_estimator_

Best Score: 0.6773648648648649 with parameters: {'classify__alpha': 0.8, 'vectorize__max_df': 0.8, 'vectorize__ngram_range': (1, 1), 'vectorize__norm': 'l1', 'vectorize__use_idf': False}


In [28]:
# Evaluate model using test data
predictions = optimized_mnb.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, predictions)}")
print(f"Confusion matrix: {confusion_matrix(y_test, predictions)}")
print(f"Classification report: \n{classification_report(y_test, predictions)}")

Accuracy score: 0.6818642350557245
Confusion matrix: [[374 123]
 [191 299]]
Classification report: 
              precision    recall  f1-score   support

           0       0.66      0.75      0.70       497
           1       0.71      0.61      0.66       490

    accuracy                           0.68       987
   macro avg       0.69      0.68      0.68       987
weighted avg       0.69      0.68      0.68       987



### Complement Naive Bayes Classifier

In [29]:
# Create transformation pipeline
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer(tokenizer = spacy_tokenizer)),
    ("classify", ComplementNB()),
])

# Set parameters for Gridsearch
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__alpha": (.8, 1)
             }
cnb = GridSearchCV(pipeline, param_grid = parameters, cv = 3, verbose = 1)

In [30]:
cnb.fit(X_train,y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 168.6min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vectorize',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                     

In [31]:
# Get best results
print(f"Best Score: {cnb.best_score_} with parameters: {cnb.best_params_}")
optimized_cnb = cnb.best_estimator_

Best Score: 0.6905405405405406 with parameters: {'classify__alpha': 1, 'vectorize__max_df': 0.8, 'vectorize__ngram_range': (1, 1), 'vectorize__norm': 'l2', 'vectorize__use_idf': False}


In [32]:
# Evaluate model using test data
predictions = optimized_cnb.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, predictions)}")
print(f"Confusion matrix: {confusion_matrix(y_test, predictions)}")
print(f"Classification report: \n{classification_report(y_test, predictions)}")

Accuracy score: 0.6838905775075987
Confusion matrix: [[373 124]
 [188 302]]
Classification report: 
              precision    recall  f1-score   support

           0       0.66      0.75      0.71       497
           1       0.71      0.62      0.66       490

    accuracy                           0.68       987
   macro avg       0.69      0.68      0.68       987
weighted avg       0.69      0.68      0.68       987



## Save Best Model

In [33]:
# Save best model
dump(optimized_mnb, 'twitter_model.joblib')

['twitter_model.joblib']