# SVM and Naive Bayes model for classification of tweet sentiment

---

# 1. Installs and imports

## 1.1. Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

## 1.2. Import required libraries

In [2]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import numpy as np
from sklearn.metrics import accuracy_score

# 2. Load cleaned tweets dataset

In [3]:
df = pd.read_csv('./cleaned_tweets.csv')

In [4]:
np.random.seed(450)

# 3. Drop text

In [5]:
df = df[['sentiment', 'Snowball_Stem']]

In [6]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,awww that bummer shoulda got david carr third day
1,0,upset cant updat facebook text might cri resul...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad cant see


# 4. Drop rows with NaN

In [7]:
df.isna().sum()

sentiment           0
Snowball_Stem    7661
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

In [10]:
X= df['Snowball_Stem']

In [11]:
y = df['sentiment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1194254,), (398085,), (1194254,), (398085,))

# 5. Applying TFIDF Unigram

In [14]:
v1 = TfidfVectorizer()
v1.fit(X)

In [14]:
X1_train = v1.transform(X_train)
X1_test = v1.transform(X_test)

# 6. Applying TFIDF Bigram

In [15]:
v2 = TfidfVectorizer(ngram_range = (2, 2))
v2.fit(X)

In [15]:
X2_train = v2.transform(X_train)
X2_test = v2.transform(X_test)

# 7. TFIDF Unigram + Bigram

In [17]:
v3 = TfidfVectorizer(ngram_range = (1, 2))
v3.fit(X)

TfidfVectorizer(ngram_range=(1, 2))

In [18]:
X3_train = v3.transform(X_train)
X3_test = v3.transform(X_test)

# Encoding labels

In [19]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

# Running naive bayes

In [20]:
Naive1 = naive_bayes.MultinomialNB()
Naive1.fit(X1_train,y_train)

In [20]:
Naive2 = naive_bayes.MultinomialNB()
Naive2.fit(X2_train,y_train)

MultinomialNB()

In [21]:
NB1 = Naive1.predict(X1_test)
NB2 = Naive2.predict(X2_test)

In [22]:
print("Naive Bayes Unigram Accuracy Score -> ",accuracy_score(NB1, y_test)*100)

In [22]:
print("Naive Bayes Bigram Accuracy Score -> ",accuracy_score(NB2, y_test)*100)

Naive Bayes Unigram Accuracy Score ->  76.1231897710288
Naive Bayes Bigram Accuracy Score ->  74.31302360048734


In [23]:
Naive3= naive_bayes.MultinomialNB()
Naive3.fit(X3_train,y_train)

MultinomialNB()

In [24]:
NB3 = Naive3.predict(X3_test)

In [25]:
print("Naive Bayes Bigram Accuracy Score -> ",accuracy_score(NB3, y_test)*100)

Naive Bayes Bigram Accuracy Score ->  78.26871145609606


# 5. Reduce dataframe size

In [26]:
df[df.sentiment != 0].shape

(796018, 2)

In [27]:
df[df.sentiment == 0].shape

(796321, 2)

In [28]:
df[df.sentiment != 0][:200000].shape

(200000, 2)

In [29]:
df[df.sentiment == 0][:200000].shape

(200000, 2)

In [30]:
reduced_df = pd.concat([df[df.sentiment != 0][:10000], df[df.sentiment == 0][:10000]])

In [31]:
reduced_df.shape

(20000, 2)

In [32]:
df = reduced_df

In [33]:
X= df['Snowball_Stem']

In [34]:
y = df['sentiment']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [36]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15000,), (5000,), (15000,), (5000,))

# TFIDF on reduced data

In [37]:
X1_train = v1.transform(X_train)
X1_test = v1.transform(X_test)

In [38]:
X2_train = v2.transform(X_train)
X2_test = v2.transform(X_test)

In [39]:
X3_train = v3.transform(X_train)
X3_test = v3.transform(X_test)

# Running SVM

In [40]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=1, gamma='auto')

In [41]:
SVM.fit(X1_train,y_train)

SVC(degree=1, gamma='auto', kernel='linear')

In [42]:
# predict the labels on validation dataset
SVM1_predictions = SVM.predict(X1_test)

In [44]:
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(SVM1_predictions, y_test)*100)

SVM Accuracy Score ->  73.86


In [45]:
SVM2 = svm.SVC(C=1.0, kernel='linear', degree=1, gamma='auto')

In [46]:
SVM2.fit(X2_train,y_train)

SVC(degree=1, gamma='auto', kernel='linear')

In [47]:
SVM2_predictions = SVM2.predict(X2_test)

In [48]:
print("SVM Accuracy Score -> ",accuracy_score(SVM2_predictions, y_test)*100)

SVM Accuracy Score ->  64.1


In [49]:
SVM3 = svm.SVC()

In [50]:
SVM3.fit(X3_train,y_train)

SVC()

In [51]:
SVM3_pred = SVM3.predict(X3_test)
print("SVM Accuracy Score -> ",accuracy_score(SVM3_pred, y_test)*100)

SVM Accuracy Score ->  75.14


---

# Saving the models

In [52]:
import pickle

In [53]:
SVM_model_path = "./SVM_UnigramBigram_75.pickle"
NB_model_path = "./NB_UnigramBigram_78.pickle"
vectorizer_path ="./UnigramBigram_vectorizer.pickle"

In [54]:
pickle.dump(Naive3, open(NB_model_path, 'wb'))
pickle.dump(SVM3, open(SVM_model_path,'wb'))
pickle.dump(v3, open(vectorizer_path, 'wb'))