In [8]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Global Parameters
stop_words = set(stopwords.words('english'))

In [9]:
def load_dataset(filename, cols):
    dataset = pd.read_csv(filename, encoding='latin-1')
    dataset.columns = cols
    return dataset

In [10]:
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

In [11]:
def preprocess_tweet_text(tweet):
    tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    
    return " ".join(filtered_words)

In [12]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [13]:
def int_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"

In [14]:
# Load dataset
dataset = load_dataset("1600000 Tweets.csv", ['target', 't_id', 'created_at', 'query', 'user', 'text'])
# Remove unwanted columns from dataset
n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])
#Preprocess data
dataset.text = dataset['text'].apply(preprocess_tweet_text)
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)

In [16]:
import pickle
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

In [25]:
# Training Naive Bayes model #Without Lem Stem: accuracy = 76.81 #With Lem Stem: accuracy = 76.89

BNB_model = BernoulliNB(alpha = 0.12)
BNB_model.fit(X_train, y_train)
saved_model = open('Saved_Model_LemStem_Bernoulli_NB.sav', 'wb')
pickle.dump(BNB_model, saved_model)
saved_model.close()

In [26]:
y_predict_bnb = BNB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_bnb))

0.76895625


In [27]:
# Training Logistics Regression model #Without Lem Stem: accuracy = 78.77 #With Lem Stem: accuracy = 78.8225

LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
saved_model = open('Saved_Model_LemStem_Logistic_Regression.sav', 'wb')
pickle.dump(LR_model, saved_model)
saved_model.close()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.788225


In [33]:
# Training Ridge classifier model #Without Lem Stem: accuracy = 77.99 #With Lem Stem: accuracy = 78.01

RC_model = RidgeClassifier(tol=1e-2, solver="auto")
RC_model.fit(X_train, y_train)
saved_model = open('Saved_Model_LemStem_Ridge_Classifier.sav', 'wb')
pickle.dump(RC_model, saved_model)
saved_model.close()

In [34]:
y_predict_rc = RC_model.predict(X_test)
print(accuracy_score(y_test, y_predict_rc))

0.780175


In [41]:
# Training SVM #Lemstem accuracy = 76.47

SVM = SGDClassifier(loss='hinge', penalty='l2')
SVM.fit(X_train, y_train)
saved_model = open('Saved_Model_LemStem_SVM_Classifier.sav', 'wb')
pickle.dump(SVM, saved_model)
saved_model.close()

In [42]:
y_predict_svm = SVM.predict(X_test)
print(accuracy_score(y_test, y_predict_svm))

0.7643125


In [43]:
# Training Passive-Aggresive classifier #LemStem Accuracy = 74.36

PAC_model = PassiveAggressiveClassifier(max_iter=50)
PAC_model.fit(X_train, y_train)
saved_model = open('Saved_Model_LemStem_Passive_Aggresive_Classifier.sav', 'wb')
pickle.dump(PAC_model, saved_model)
saved_model.close()

In [44]:
y_predict_pac = PAC_model.predict(X_test)
print(accuracy_score(y_test, y_predict_pac))

0.74363125


In [None]:
# Training Random Forest Classifier

RFC_model = RandomForestClassifier()
RFC_model.fit(X_train, y_train)
saved_model = open('Saved_Model_LemStem_Random_Forest_Classifier.sav', 'wb')
pickle.dump(RFC_model, saved_model)
saved_model.close()

In [None]:
y_predict_rfc = RFC_model.predict(X_test)
print(accuracy_score(y_test, y_predict_rfc))