In [90]:
#Data source: https://www.kaggle.com/kazanova/sentiment140
#
#From the Data Description.
#
#This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the twitter api . 
#The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment.
#
#It contains the following 6 fields:
# target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
# ids: The id of the tweet ( 2087)
# date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
# flag: The query (lyx). If there is no query, then this value is NO_QUERY.
# user: the user that tweeted (robotickilldozr)
# text: the text of the tweet (Lyx is cool)

In [91]:
import pandas as pd
df = pd.read_csv("twitter_sentiment_140.csv", engine='python')
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

In [92]:
print(df.target.unique())
print(df.isna().sum())
#Shows that there is no neutral sentiment
#Also that there is no nan values

[0 4]
target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64


In [93]:
df_labels = df['target'].copy()
df_data_list = df['text'].copy().tolist()
#Creates labels and data_list for later processing

In [94]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_labels, test_labels = train_test_split(df_data_list, df_labels, test_size = 0.2, random_state = 1)
print(len(train_data))
print(len(test_data))

#Imports train_test_split and creates training/testing datasets accordingly

1279999
320000


In [95]:
from sklearn.feature_extraction.text import CountVectorizer

counter = CountVectorizer()
counter.fit(train_data)
train_counts = counter.transform(train_data)
test_counts = counter.transform(test_data)

print(train_data[2])
print(train_counts[2])
#Creates vector counts of each tweet for test and training sets. Prints example

@DavidArchie Haha...!!! Yeah!  T- shirts r awesome!! &amp; it's SUMMER!  lol!
  (0, 55897)	1
  (0, 72415)	1
  (0, 150274)	1
  (0, 226652)	1
  (0, 257356)	1
  (0, 322809)	1
  (0, 473458)	1
  (0, 501521)	1
  (0, 579261)	1


In [96]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
#Creates Mulitnomial Naive Bayes classifier
classifier.fit(train_counts, train_labels)
#Trains classifier on data set
predictions = classifier.predict(test_counts)
#Uses test data to see test accuracy

In [97]:
from sklearn.metrics import accuracy_score

print(accuracy_score(test_labels, predictions))
#Prints accuracy score as percentage

0.78178125


In [98]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(test_labels, predictions))
#Shows confusion matrix for predictions.

[[130972  29159]
 [ 40671 119198]]


In [119]:
#Tests custom text against classifier, showcasing a fault in the classifier
test_text = "I am very happy, not sad in any way"
custom_counts = counter.transform([test_text])
print(classifier.predict(custom_counts))

[0]
