In [23]:
# Import Dependencies
import pandas as pd
import numpy as np
import re
import spacy
from spacy.lang.en import English
from spacy.attrs import ORTH, LEMMA, NORM, TAG
from clean import replace_emoticons, clean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from joblib import dump

## Import Dataset

In [24]:
# Load the raw tweet dataset; rows with the wrong number of fields are skipped
# instead of raising (the cell output reports which lines were dropped).
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines="skip"` — confirm against the pinned pandas version.
twitter_full_df = pd.read_csv("Resources/sent_analysis_dataset.csv", error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
# Work on the full dataset (this binds the same object, it does not copy it)
twitter_df = twitter_full_df

In [39]:
# Select slice to test
# NOTE: .loc slicing is label-based and includes the end label, so this keeps
# the rows labelled up to and including 10000.
twitter_df = twitter_full_df.loc[:10000]

In [46]:
# Draw a random 0.5% sample of the full dataset to keep model fitting tractable.
# A fixed random_state makes the sample (and every result derived from it)
# reproducible across kernel restarts.
twitter_df = twitter_full_df.sample(frac = .005, random_state = 42)

## Clean Tweets

In [47]:
# Discard the ItemID / SentimentSource columns and expose the tweet text as "Text"
twitter_df = (
    twitter_df
    .drop(columns = ["ItemID", "SentimentSource"])
    .rename(columns = {"SentimentText": "Text"})
)

In [None]:
# Count missing values in each column
twitter_df.isnull().sum()

In [None]:
# Inspect the dtype of each column to verify data are of the expected type
twitter_df.dtypes

In [None]:
# List the distinct labels to verify the Sentiment column holds only expected values
twitter_df["Sentiment"].unique()

In [48]:
# Clean the text of every tweet with the `clean` helper imported from the `clean` module
twitter_df["Text"] = twitter_df["Text"].map(clean)

In [None]:
# Preview the first rows after cleaning
twitter_df.head()
# Backup cleaned csv
# twitter_df.to_csv("Resources/clean_dataset.csv")

## Define Tokenizer

In [49]:
# Load the small English spaCy model. The entity recognizer is switched off
# through `disable`, since only token lemmas are used by the tokenizer; the
# former `entity=False` keyword is not a documented `spacy.load` option.
nlp = spacy.load('en_core_web_sm', disable=["ner"])

# Add customized stop words. They are matched against token lemmas in
# spacy_tokenizer, which is why lemma-style entries such as "-PRON-" appear.
nlp.Defaults.stop_words |= {
    "-PRON-", "joe", "biden", "bernie", "sanders", "elizabeth",
    "warren", "kamala", "harris", "s", "ve", "twitter", "tweet",
    "come", "year", "know",
}

# Creating tokenizer function
def spacy_tokenizer(tweet):
    """Tokenize a tweet with spaCy and return its non-stop-word lemmas.

    Parameters
    ----------
    tweet : str
        Text of a single tweet.

    Returns
    -------
    list of str
        The lemma of every token whose lemma is not in
        ``nlp.Defaults.stop_words``, in document order.
    """
    lemmas = []
    for token in nlp(tweet):
        lemma = token.lemma_
        # Keep the lemma only when it is not a (default or custom) stop word
        if lemma not in nlp.Defaults.stop_words:
            lemmas.append(lemma)
    return lemmas

## Split into testing and training datasets

In [50]:
# Split into testing and training datasets.
# random_state is fixed so the same rows land in each split on every run,
# which keeps the reported scores comparable between models and reruns.
X_train, X_test, y_train, y_test = train_test_split(
    twitter_df["Text"],
    twitter_df["Sentiment"],
    random_state = 42,
)

## Test different models

### Multinomial Naive Bayes Classifier

In [51]:
# Create transformation pipeline: TF-IDF features built from spaCy lemmas
# (vocabulary capped by max_features) feeding a Multinomial Naive Bayes classifier
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer(tokenizer = spacy_tokenizer, max_features = 1000)),
    ("classify", MultinomialNB()),
])

# Set parameters for Gridsearch
# max_df must be the float 1.0 (a proportion: keep terms found in up to 100% of
# documents). The integer 1 is read as an absolute count and would discard
# every term that occurs in more than one document.
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1.0, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__alpha": (.8, 1)
             }
# 5-fold cross-validated search over all combinations, using every available core
mnb = GridSearchCV(pipeline, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 1)

In [52]:
# Run the cross-validated grid search on the training split
mnb.fit(X_train,y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 18.9min


KeyboardInterrupt: 

In [None]:
# Keep the best estimator found by the search and report its CV score and parameters
optimized_mnb = mnb.best_estimator_
print(f"Best Score: {mnb.best_score_} with parameters: {mnb.best_params_}")

In [None]:
# Score the tuned Multinomial NB pipeline on the held-out test split
predictions = optimized_mnb.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)
print(f"Accuracy score: {test_accuracy}")
print(f"Confusion matrix: {test_confusion}")
print(f"Classification report: \n{test_report}")

### Complement Naive Bayes Classifier

In [None]:
# Create transformation pipeline: TF-IDF features built from spaCy lemmas
# feeding a Complement Naive Bayes classifier
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer(tokenizer = spacy_tokenizer)),
    ("classify", ComplementNB()),
])

# Set parameters for Gridsearch
# max_df must be the float 1.0 (a proportion: keep terms found in up to 100% of
# documents). The integer 1 is read as an absolute count and would discard
# every term that occurs in more than one document.
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1.0, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__alpha": (.8, 1)
             }
# 5-fold cross-validated search over all combinations, using every available core
cnb = GridSearchCV(pipeline, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 1)

In [None]:
# Run the cross-validated grid search on the training split
cnb.fit(X_train,y_train)

In [None]:
# Get best results from the Complement NB grid search
# (read from `cnb`, the search object fitted for this model)
print(f"Best Score: {cnb.best_score_} with parameters: {cnb.best_params_}")
optimized_cnb = cnb.best_estimator_

In [None]:
# Score the tuned Complement NB pipeline on the held-out test split
predictions = optimized_cnb.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)
print(f"Accuracy score: {test_accuracy}")
print(f"Confusion matrix: {test_confusion}")
print(f"Classification report: \n{test_report}")

### SVC Classifier

In [None]:
# Create transformation pipeline: TF-IDF features feeding a support vector classifier
# NOTE(review): unlike the Naive Bayes pipelines, this vectorizer uses the
# default tokenizer rather than spacy_tokenizer — confirm that is intentional.
pipeline = Pipeline([
    ("vectorize", TfidfVectorizer()),
    ("classify", SVC()),
])

# Set parameters for Gridsearch
# max_df must be the float 1.0 (a proportion: keep terms found in up to 100% of
# documents). The integer 1 is read as an absolute count and would discard
# every term that occurs in more than one document.
# The kernel names must be plain ASCII strings; a stray typographic quote
# inside "rbf" made that candidate an invalid kernel.
parameters = {"vectorize__use_idf": (True, False),
              "vectorize__ngram_range": [(1,1), (1,2)],
              "vectorize__max_df" : (1.0, .8),
              "vectorize__norm": ("l1", "l2"),
              "classify__kernel": ("linear", "rbf")
             }
# 5-fold cross-validated search over all combinations, using every available core
svc = GridSearchCV(pipeline, param_grid = parameters, n_jobs = -1, cv = 5, verbose = 1)

In [None]:
# Run the cross-validated grid search on the training split
svc.fit(X_train,y_train)

In [None]:
# Keep the best estimator found by the search and report its CV score and parameters
optimized_svc = svc.best_estimator_
print(f"Best Score: {svc.best_score_} with parameters: {svc.best_params_}")

In [None]:
# Score the tuned SVC pipeline on the held-out test split
predictions = optimized_svc.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)
print(f"Accuracy score: {test_accuracy}")
print(f"Confusion matrix: {test_confusion}")
print(f"Classification report: \n{test_report}")

### Deep Learning Model

## Save Best Model

In [None]:
# Save best model
# NOTE(review): this always persists the Multinomial NB pipeline (optimized_mnb);
# confirm it is actually the best of the models compared in this notebook.
dump(optimized_mnb, 'twitter_model.joblib')