# SVM and Naive Bayes model for classification of tweet sentiment

---

# 1. Installs and imports

## 1.1. Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

## 1.2. Import required libraries

In [2]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import numpy as np
from sklearn.metrics import accuracy_score

# 2. Load cleaned tweets dataset

In [3]:
df = pd.read_csv('./cleaned_tweets.csv')

In [4]:
np.random.seed(450)

# 3. Drop text

In [5]:
df = df[['sentiment', 'Snowball_Stem']]

In [6]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,aww bummer shoulda got david carr third day
1,0,upset can not updat facebook text might cri re...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad can not see


# 4. Drop rows with NaN

In [7]:
df.isna().sum()

sentiment           0
Snowball_Stem    8046
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

In [10]:
X= df['Snowball_Stem']

In [11]:
y = df['sentiment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1193965,), (397989,), (1193965,), (397989,))

# 5. Applying TFIDF Unigram

In [14]:
# v1 = TfidfVectorizer()
# v1.fit(X)

In [15]:
# X1_train = v1.transform(X_train)
# X1_test = v1.transform(X_test)

# 6. Applying TFIDF Bigram

In [16]:
# v2 = TfidfVectorizer(ngram_range = (2, 2))
# v2.fit(X)

In [17]:
# X2_train = v2.transform(X_train)
# X2_test = v2.transform(X_test)

# 7. TFIDF Unigram + Bigram

In [18]:
v3 = TfidfVectorizer(ngram_range = (1, 2))
v3.fit(X)

TfidfVectorizer(ngram_range=(1, 2))

In [19]:
X3_train = v3.transform(X_train)
X3_test = v3.transform(X_test)

# Encoding labels

In [20]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

# Running naive bayes

In [21]:
# Naive1 = naive_bayes.MultinomialNB()
# Naive1.fit(X1_train,y_train)

In [22]:
# Naive2 = naive_bayes.MultinomialNB()
# Naive2.fit(X2_train,y_train)

In [23]:
# NB1 = Naive1.predict(X1_test)
# NB2 = Naive2.predict(X2_test)

In [24]:
# print("Naive Bayes Unigram Accuracy Score -> ",accuracy_score(NB1, y_test)*100)

In [25]:
# print("Naive Bayes Bigram Accuracy Score -> ",accuracy_score(NB2, y_test)*100)

In [26]:
Naive3= naive_bayes.MultinomialNB()
Naive3.fit(X3_train,y_train)

MultinomialNB()

In [27]:
NB3 = Naive3.predict(X3_test)

In [28]:
print("Naive Bayes Bigram Accuracy Score -> ",accuracy_score(NB3, y_test)*100)

Naive Bayes Bigram Accuracy Score ->  77.87652422554392


# 5. Reduce dataframe size

In [29]:
df[df.sentiment != 0].shape

(795860, 2)

In [30]:
df[df.sentiment == 0].shape

(796094, 2)

In [31]:
df[df.sentiment != 0][:200000].shape

(200000, 2)

In [32]:
df[df.sentiment == 0][:200000].shape

(200000, 2)

In [33]:
reduced_df = pd.concat([df[df.sentiment != 0][:50000], df[df.sentiment == 0][:50000]])

In [34]:
reduced_df.shape

(100000, 2)

In [35]:
df = reduced_df

In [36]:
X= df['Snowball_Stem']

In [37]:
y = df['sentiment']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [63]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((75000,), (25000,), (75000,), (25000,))

# TFIDF on reduced data

In [40]:
# X1_train = v1.transform(X_train)
# X1_test = v1.transform(X_test)

In [41]:
# X2_train = v2.transform(X_train)
# X2_test = v2.transform(X_test)

In [62]:
X3_train = v3.transform(X_train)
X3_test = v3.transform(X_test)

# Running SVM

In [43]:
#SVM = svm.SVC(C=1.0, kernel='linear', degree=1, gamma='auto')

In [44]:
# SVM.fit(X1_train,y_train)

In [45]:
# predict the labels on validation dataset
# SVM1_predictions = SVM.predict(X1_test)

In [46]:
# Use accuracy_score function to get the accuracy
# print("SVM Accuracy Score -> ",accuracy_score(SVM1_predictions, y_test)*100)

In [47]:
# SVM2 = svm.SVC(C=1.0, kernel='linear', degree=1, gamma='auto')

In [48]:
# SVM2.fit(X2_train,y_train)

In [49]:
# SVM2_predictions = SVM2.predict(X2_test)

In [50]:
# print("SVM Accuracy Score -> ",accuracy_score(SVM2_predictions, y_test)*100)

In [64]:
SVM3 = svm.SVC(kernel='linear')

In [65]:
SVM3.fit(X3_train,y_train)

SVC(kernel='linear')

In [66]:
SVM3_pred = SVM3.predict(X3_test)
print("SVM Accuracy Score -> ",accuracy_score(SVM3_pred, y_test)*100)

SVM Accuracy Score ->  77.24


---

# Saving the models

In [67]:
 import pickle

In [70]:
SVM_model_path = "./SVM_UnigramBigram_77.pickle"
NB_model_path = "./NB_UnigramBigram_78.pickle"
vectorizer_path ="./UnigramBigram_vectorizer.pickle"

In [71]:
# pickle.dump(Naive3, open(NB_model_path, 'wb'))
pickle.dump(SVM3, open(SVM_model_path,'wb'))
# pickle.dump(v3, open(vectorizer_path, 'wb'))