In [1]:
import pandas as pd

# Neither of these two settings is passed to the read_csv call below.
dataset_columns = ["polarity", "text"]
dataset_encoding = "ISO-8859-1"

# Read the CSV and discard its "Unnamed: 0" column in a single chained
# expression, then preview the first rows.
dataset = (
    pd.read_csv("sentiment140CleanSW.csv")
    .drop(columns=["Unnamed: 0"])
)
dataset.head()

Unnamed: 0,polarity,text
0,0,awww that s a bummer you shoulda got david car...
1,0,is upset that he can t update his facebook by ...
2,0,i dived many time for the ball managed to save...
3,0,my whole body feel itchy and like it on fire
4,0,no it s not behaving at all i m mad why am i h...


In [2]:
# Number of non-null values in each column; a difference between columns
# means some rows have missing values.
dataset.count()

polarity    1600000
text        1596237
dtype: int64

The `polarity` column has more non-null entries (1,600,000) than the `text` column (1,596,237). This is most likely a side effect of the earlier cleaning step. For example, some tweets may have contained only a link, and because I used a regular expression to remove links, those tweets were left with no text at all and now show up as missing values.

In [3]:
# Discard every row that has a missing value, then renumber the index from 0
# so the dropped rows leave no gaps in it.
dataset = dataset.dropna().reset_index(drop=True)

In [21]:
# Share of each polarity class (0 and 4) in the whole dataset, as a
# percentage rounded to two decimals.
total_entries = len(dataset.index)
for caption, polarity_value in (("Negative entries", 0), ("Positive entries", 4)):
    class_entries = len(dataset[dataset.polarity == polarity_value].index)
    print(caption, round(class_entries / total_entries * 100, 2), "%")

Negative entries 50.01 %
Positive entries 49.99 %


In [22]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from every tweet in the dataset.
cvec = CountVectorizer()
cvec.fit(dataset.text)

# One document-term count matrix per polarity class (0 -> negative, 4 -> positive).
neg_matrix = cvec.transform(dataset[dataset.polarity == 0].text)
pos_matrix = cvec.transform(dataset[dataset.polarity == 4].text)

# Summing over the rows (axis=0) gives the number of occurrences of each term
# within a class; the result is then flattened to a 1-D array.
neg_wf = np.sum(neg_matrix, axis=0)
pos_wf = np.sum(pos_matrix, axis=0)
neg = np.squeeze(np.asarray(neg_wf))
pos = np.squeeze(np.asarray(pos_wf))

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Prefer its replacement and fall back only on releases that predate it.
if hasattr(cvec, "get_feature_names_out"):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per term, with its per-class and overall counts, most frequent first.
word_frequency = pd.DataFrame({"negative": neg, "positive": pos}, index=feature_names)
word_frequency["total"] = word_frequency["negative"] + word_frequency["positive"]
word_frequency = word_frequency.sort_values(by="total", ascending=False, axis=0)
word_frequency.head(15)

Unnamed: 0,negative,positive,total
to,313162,252564,565726
the,257830,265993,523823
it,182955,167019,349974
my,190766,125951,316717
and,153948,149632,303580
you,103806,198199,302005
is,128021,108922,236943
in,115612,101222,216834
for,98998,117368,216366
of,92739,91098,183837


In [27]:
from sklearn.model_selection import train_test_split
from decimal import Decimal

x = dataset.text
y = dataset.polarity
seed = 123

# Hold out 10% of the rows as the test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=seed)
# NOTE: a further validation split was sketched here previously but is disabled;
# only the train and test sets are produced.


def class_share(labels, polarity_value):
    """Return the percentage of `labels` equal to `polarity_value`, rounded to 2 decimals.

    The mask is built from the labels of the split being measured, so it
    always has the same index as that split and needs no implicit
    realignment of a full-length mask.
    """
    return round(int((labels == polarity_value).sum()) / len(labels) * 100, 2)


total_train = len(x_train)
neg_train = class_share(y_train, 0)
pos_train = class_share(y_train, 4)
print("TRAINING SET entries", total_train)
print("Negative", neg_train, "%")
print("Positive", pos_train, "%")

total_test = len(x_test)
neg_test = class_share(y_test, 0)
pos_test = class_share(y_test, 4)
print("\nTEST SET entries", total_test)
print("Negative", neg_test, "%")
print("Positive", pos_test, "%")

TRAINING SET entries 1436613
Negative 50.01 %
Positive 49.99 %

TEST SET entries 159624
Negative 50.04 %
Positive 49.96 %


In [28]:
from sklearn.naive_bayes import MultinomialNB

# Refit the vectorizer on the training texts only, then encode the test texts
# with that same fitted vocabulary.
x_train_cvec = cvec.fit_transform(x_train)
x_test_cvec = cvec.transform(x_test)

# Train a multinomial Naive Bayes model on the term counts and predict the
# polarity of every test tweet.
classifier = MultinomialNB()
classifier.fit(x_train_cvec, y_train)
predictions = classifier.predict(x_test_cvec)

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Test-set scores as percentages rounded to two decimals; precision and
# recall are computed with polarity 4 as the positive label.
metric_values = (
    ("Accuracy:", accuracy_score(y_test, predictions)),
    ("Precision:", precision_score(y_test, predictions, pos_label=4)),
    ("Recall:", recall_score(y_test, predictions, pos_label=4)),
)
for caption, value in metric_values:
    print(caption, round(value * 100, 2))

Accuracy: 78.0
Precision: 79.0
Recall: 76.22


In [31]:
# Smoke test: encode one hand-written sentence with the fitted vectorizer and
# print the polarity the classifier predicts for it.
test_string = ["your lamp is very useful"]
test_cvec = cvec.transform(test_string)
test_prediction = classifier.predict(test_cvec)
print(test_prediction)

[4]


In [10]:
import pickle
# Persist the trained classifier. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): only the classifier is written here; the fitted vectorizer is
# not saved by this cell. Confirm it is persisted elsewhere if the model has to
# be reused in another session.
with open("multinomialNBNOSW.pickle", "wb") as file:
    pickle.dump(classifier, file)

In [11]:
# Reload the classifier from disk. The context manager closes the file even if
# pickle.load raises.
# SECURITY: pickle.load can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open("multinomialNBNOSW.pickle", "rb") as file:
    naiveClassifier = pickle.load(file)

In [32]:
# predict() returns one label per encoded sample. Iterating over the labels
# keeps this cell working when more than one sample is predicted, where
# `if label == 4` on the whole array would raise an ambiguous-truth-value error.
label = naiveClassifier.predict(test_cvec)
for predicted_polarity in label:
    print("positive" if predicted_polarity == 4 else "negative")

positive
