In [None]:
# Python 3+

# 3rd party imports (not present in the standard python library)
# To install, pip install numpy pandas

import numpy as np
import pandas as pd

# Standard python library imports

import glob

In [None]:
# A large dataset of 1.6 million tweets is used to train the model.
# Due to its size, the file is not included in this repository.
# The dataset can be downloaded from https://www.kaggle.com/kazanova/sentiment140

# List the CSV files in the current working directory to confirm the dataset is present
glob.glob('*.csv')

In [None]:
# Load the raw tweets, supplying the six column names explicitly via `names`

df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding = 'ISO-8859-1',
    names = ["Score", "Id", "Date", "Flag", "User", "Tweet"],
)

In [None]:
# First 5 records of the freshly loaded frame (all six named columns are still present)

df.head()

In [None]:
# Last 5 records of the freshly loaded frame

df.tail()

In [None]:
# To train the model, the only data points needed are the tweet and the score associated with it
# Score here is the sentiment where 0 = negative, 4 = positive
# Columns that are not required are removed and the score is normalized to be in the 0 - 1 range

# Reassign rather than mutate with inplace = True so the step is explicit.
# Re-running this cell still fails fast with a KeyError on the already-dropped
# columns, before the score could be divided a second time.
df = df.drop(columns = ["Id", "Date", "Flag", "User"])

# Vectorized division: same values as applying `i / 4` to each element,
# without a Python-level call per row
df['Score'] = df['Score'] / 4

In [None]:
# Check the frame after dropping the unused columns and rescaling Score
df.head()

In [None]:
# Show the full text of the raw tweet stored under row label 0
df.loc[0, 'Tweet']

In [None]:
## Tweet cleanup (this process takes a significant amount of time)
# An already cleaned dataframe can be loaded from a pickle instead
# (a later cell reads 'Pickled data/df-cleaned-final.pickle')
# Cleanup removes stop words, @ mentions, web links and the leading '#' of hashtags, then stems each word

from nltk.corpus import stopwords # requires nltk.download('stopwords') to have been run beforehand
from nltk.stem import PorterStemmer
# Single shared stemmer instance used by clean()
stemmer = PorterStemmer()

def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once.

    The word list is read from `stopwords.words('english')` on the first call
    and cached on the function object; later calls reuse the cached set, which
    also gives constant-time membership tests.
    """
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached

def clean(tweet):
    """Normalize a single tweet for vectorization.

    Steps, in order:
      1. lower-case the text, split on whitespace and drop English stop words
      2. strip the leading '#' from hashtags (the word itself is kept)
      3. drop @ mentions and links (words starting with '@', 'http' or 'www')
         and stem the remaining words with the shared `stemmer`

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving stemmed words joined by single spaces
        (an empty string when nothing survives).
    """
    stop_words = _english_stopwords()  # loaded once, not once per word
    stage1 = [word for word in tweet.lower().split() if word not in stop_words] # stopword removal
    stage2 = [word[1:] if word.startswith('#') else word for word in stage1] # Hashtag symbol removal
    # str.startswith accepts a tuple of prefixes, covering all three cases in one call
    stage3 = [stemmer.stem(word) for word in stage2 if not word.startswith(('@', 'http', 'www'))] # @ mentions and websites removal and stemming
    return ' '.join(stage3)

In [None]:
%%time

# Run clean() on every raw tweet; the result goes into a new column so the original text is kept
df['TweetStripped'] = df['Tweet'].apply(clean)

In [None]:
# Compare the raw Tweet column with the new TweetStripped column
df.head()

In [None]:
# Import dataframe from pickle
# This rebinds `df`, replacing the frame built in the cells above with the stored one
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only load files from a trusted source

import pickle

with open('Pickled data/df-cleaned-final.pickle', 'rb') as f:
    df = pickle.load(f)

In [None]:
# Check the frame loaded from the pickle; the cells below read its TweetStripped, Tweet and Score columns
df.head()

In [None]:
## Analysis

from sklearn.feature_extraction.text import TfidfVectorizer # Performs the TF-IDF
from sklearn.model_selection import train_test_split # Used to split the data into training and testing

# Data is split in the ratio of 0.9 (train) : 0.1 (test)
# The cleaned tweets, the raw tweets and the scores are split in ONE call with a fixed seed, so that
#   - the split is reproducible across kernel restarts, and
#   - the raw-tweet comparison set (train_x2 / test_x2) holds exactly the same rows as the cleaned set
#     (two independent unseeded calls would partition the rows differently)
train_x, test_x, train_x2, test_x2, train_y, test_y = train_test_split(
    df['TweetStripped'], df['Tweet'], df['Score'],
    test_size = 0.1, shuffle = True, random_state = 9
)

# The raw-tweet split covers the same rows as the cleaned split, hence the same labels
train_y2, test_y2 = train_y, test_y


In [None]:
# Initialize the TfidfVectorizer and fit it on the training tweets only
# The vocabulary is capped at 10000 features built from unigrams and bigrams
vector = TfidfVectorizer(max_features = 10000, ngram_range = (1,2), stop_words='english')
%time vector.fit(train_x)

In [None]:
# Transform the training tweets with the fitted vectorizer so they can be passed to the classifiers
train_x_transformed = vector.transform(train_x)

In [None]:
# The data will be trained on several models to find the one with the highest accuracy

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Candidate classifiers as (short name, unfitted estimator) pairs
models = [
    ('LR', LogisticRegression()),
    ('NB', MultinomialNB()),
]

### Models below for this dataset take significantly longer
#models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('KNN', KNeighborsClassifier()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('SVM', SVC()))

In [None]:
%%time

# Train the models

results = dict()   # model short name -> array of per-fold accuracy scores
scoring = 'accuracy'

# One splitter shared by every model so they are all scored on identical folds.
# shuffle = True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set while
# shuffle is left at False.
kfold = model_selection.KFold(n_splits = 10, shuffle = True, random_state = 9)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, train_x_transformed, train_y, cv = kfold, scoring = scoring, n_jobs = -1, verbose = 1)
    results[name] = cv_results
    print('{}: Average: {}, std: {}'.format(name, cv_results.mean(), cv_results.std()))

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Unpacking previously trained models: neural network (NN), logistic regression (LR), naive Bayes (NB)
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only load files from a trusted source
# NOTE(review): the training cells further down assign NN, LR and NB again, replacing these objects when run

with open('Pickled data/nn.pickle', 'rb') as f3:
    NN = pickle.load(f3)

with open('Pickled data/LR.pickle', 'rb') as f3:
    LR = pickle.load(f3)

with open('Pickled data/naive-bayes.pickle', 'rb') as f3:
    NB = pickle.load(f3)

In [None]:
### Models to train
# Neural Network (Single layer with 100 units)
# Logistic Regression
# Multinomial Naive-Bayes

In [None]:
## Neural Network
# (Note - training is suspended after seeing diminishing gain at around the 43rd iteration)
# verbose = 2 makes the classifier print its training progress while fitting
# NOTE(review): no layer sizes are passed here; presumably the library default provides the
# single 100-unit layer mentioned in the notes above - confirm

NN = MLPClassifier(verbose=2)
NN.fit(train_x_transformed, train_y)

In [None]:
# Logistic Regression with default settings, fitted on the TF-IDF training features

LR = LogisticRegression()
LR.fit(train_x_transformed, train_y)

In [None]:
# Multinomial Naive-Bayes with default settings, fitted on the TF-IDF training features

NB = MultinomialNB()
NB.fit(train_x_transformed, train_y)

In [None]:
### Predictions from various models

# Vectorize the held-out tweets once and reuse the matrix for every model
# instead of re-running the same transform per model
test_x_transformed = vector.transform(test_x)

predNN = NN.predict(test_x_transformed)
predLR = LR.predict(test_x_transformed)
predNB = NB.predict(test_x_transformed)

In [None]:
# Calculate accuracy and confusion matrix

from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Report accuracy and confusion matrix on the held-out set, one model at a time
for model, prediction in (
    ('Neural Network', predNN),
    ('Logistic Regression', predLR),
    ('Naive Bayes', predNB),
):
    print('Model: {}'.format(model))
    print('Accuracy - {}'.format(accuracy_score(test_y, prediction)))
    print('Confusion matrix - {}\n'.format(confusion_matrix(test_y, prediction)))

In [None]:
## Function to test a tweet, defaults to LR due to its higher accuracy

def predict(tweet, model = None):
    """Predict the sentiment of a single raw tweet.

    The tweet is cleaned with clean(), vectorized with the fitted `vector`
    and passed to the model's predict method.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    model : fitted classifier, optional
        Any object exposing `predict`. When omitted, the notebook-level `LR`
        is looked up at call time, so a re-trained LR is picked up without
        having to re-run this cell.

    Returns
    -------
    Whatever `model.predict` returns for the one-element input.
    """
    if model is None:
        model = LR  # resolved on each call, not frozen when the function is defined
    return model.predict(vector.transform([clean(tweet)]))

In [None]:
# Sanity check: run the same sample sentence through each of the three models
# 1: positive, 0: negative

print('NN: {}'.format(predict('I love math!', model = NN)))
print('LR: {}'.format(predict('I love math!', model = LR)))
print('NB: {}'.format(predict('I love math!', model = NB)))

In [None]:
## Shelve the chosen model (LR) and the fitted vectorizer
# Only these two objects are stored; the predict() function itself is not written to the shelf

import shelve

# Flag 'c' opens the shelf for reading and writing, creating it if it does not exist
with shelve.open('shelve.model', 'c') as shelf:
    shelf['model'] = LR
    shelf['vector'] = vector