In [14]:
import pandas as pd

# Column names and encoding settings. Note that neither is passed to
# read_csv below; the CSV is read with pandas' defaults.
dataset_columns = ["polarity", "text"]
dataset_encoding = "ISO-8859-1"

# Load the cleaned tweets and discard the "Unnamed: 0" column
# (presumably an index column written by an earlier to_csv call -- confirm).
dataset = pd.read_csv("sentiment140CleanNOSW.csv").drop(columns=["Unnamed: 0"])
dataset.head()

Unnamed: 0,polarity,text
0,0,awww bummer shoulda got david carr third day
1,0,upset update facebook texting might cry result...
2,0,dived many time ball managed save rest go bound
3,0,whole body feel itchy like fire
4,0,behaving mad see


In [15]:
# Count the non-null entries in each column; a mismatch between columns
# reveals rows with missing values.
dataset.count()

polarity    1600000
text        1591391
dtype: int64

The number of entries in `polarity` does not match the number of entries in `text`. This is most likely a side effect of the earlier cleaning step: for example, some tweets may have contained only a link, and because I used a regular expression to remove links, those tweets would be left with no text.

In [16]:
# Discard rows that contain missing values, then renumber the index from 0
# so it is contiguous again (the old index is not kept as a column).
dataset = dataset.dropna().reset_index(drop=True)

In [17]:
# Share of each polarity label among all remaining rows
# (0 is reported as negative, 4 as positive).
total_entries = len(dataset.index)
negative_entries = len(dataset[dataset.polarity == 0].index)
positive_entries = len(dataset[dataset.polarity == 4].index)

print("Negative entries", negative_entries / total_entries * 100, "%")
print("Positive entries", positive_entries / total_entries * 100, "%")

Negative entries 50.01209633584707 %
Positive entries 49.98790366415293 %


In [18]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Build a per-term frequency table split by polarity label.
# The vocabulary is learned from every tweet so that the negative and
# positive count matrices share the same columns.
cvec = CountVectorizer()
cvec.fit(dataset.text)

neg_matrix = cvec.transform(dataset[dataset.polarity == 0].text)
pos_matrix = cvec.transform(dataset[dataset.polarity == 4].text)

# Summing over axis 0 yields one total per vocabulary term as a
# 1 x n_terms matrix; ravel() flattens it to 1-D (and, unlike squeeze(),
# stays 1-D even for a single-term vocabulary).
neg_wf = np.sum(neg_matrix, axis=0)
pos_wf = np.sum(pos_matrix, axis=0)
neg = np.asarray(neg_wf).ravel()
pos = np.asarray(pos_wf).ravel()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, "get_feature_names_out"):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per term. Building the frame in this orientation avoids creating
# a 2 x n_terms frame and transposing it.
word_frequency = pd.DataFrame(
    {"negative": neg, "positive": pos},
    index=feature_names,
)
word_frequency["total"] = word_frequency["negative"] + word_frequency["positive"]
word_frequency = word_frequency.sort_values(by="total", ascending=False, axis=0)
word_frequency.head()

Unnamed: 0,negative,positive,total
day,52151,56920,109071
good,29250,62181,91431
get,48049,38411,86460
like,41534,38365,79899
go,48128,30408,78536


In [19]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible. No separate validation split is created.
x = dataset.text
y = dataset.polarity
seed = 123

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=seed)

# Class balance of each split. Each split is filtered with its OWN labels
# (y_train / y_test); indexing with the full-length `y` mask only worked
# because pandas reindexes a mismatched boolean key to the data's index.
total_train = len(x_train)
neg_train = len(x_train[y_train == 0]) / total_train * 100
pos_train = len(x_train[y_train == 4]) / total_train * 100
print("TRAINING SET entries", total_train)
print("Negative", neg_train, "%")
print("Positive", pos_train, "%")

total_test = len(x_test)
neg_test = len(x_test[y_test == 0]) / total_test * 100
pos_test = len(x_test[y_test == 4]) / total_test * 100
print("\nTEST SET entries", total_test)
print("Negative", neg_test, "%")
print("Positive", pos_test, "%")

TRAINING SET entries 1432251
Negative 50.023843586075344 %
Positive 49.976156413924656 %

TEST SET entries 159140
Negative 49.90637174814629 %
Positive 50.093628251853715 %


In [20]:
from sklearn.naive_bayes import MultinomialNB

# Re-learn the vocabulary from the training split only, then encode the
# test split with that same vocabulary.
x_train_cvec = cvec.fit_transform(x_train)
x_test_cvec = cvec.transform(x_test)

# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(x_train_cvec, y_train)
predictions = classifier.predict(x_test_cvec)

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the test-set predictions; label 4 is treated as the positive class
# for precision and recall.
metric_rows = (
    ("Accuracy:", accuracy_score(y_test, predictions)),
    ("Precision:", precision_score(y_test, predictions, pos_label=4)),
    ("Recall:", recall_score(y_test, predictions, pos_label=4)),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

Accuracy: 0.7684931506849315
Precision: 0.7737820062575825
Recall: 0.7600572009182253


In [22]:
# Smoke-test the model on one hand-written sentence: encode it with the
# fitted vectorizer, then print the predicted label array.
test_string = ["it was very good"]
test_cvec = cvec.transform(test_string)
sample_prediction = classifier.predict(test_cvec)
print(sample_prediction)

[4]


In [23]:
import pickle
# Persist the trained classifier. The `with` block guarantees the file is
# closed even if pickling raises.
# NOTE: only the classifier is saved. The fitted CountVectorizer (`cvec`)
# that turns text into features is not, so this file alone cannot classify
# raw text.
with open("multinomialNBNOSW.pickle", "wb") as file:
    pickle.dump(classifier, file)

In [24]:
# Reload the classifier from disk. The `with` block guarantees the file is
# closed even if unpickling raises.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files
# from a trusted source (here, the file written by the previous cell).
with open("multinomialNBNOSW.pickle", "rb") as file:
    naiveClassifier = pickle.load(file)

In [25]:
# predict() returns an array with one label per input row. Iterate over it
# instead of using the array's truth value in `if`, which only works for
# exactly one prediction and raises ValueError for several.
label = naiveClassifier.predict(test_cvec)
for predicted_polarity in label:
    if predicted_polarity == 4:
        print("positive")
    else:
        print("negative")

positive
