In [31]:
import ast
import re

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score a set of predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'``;
    accuracy is computed over all samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order.
    """
    # Shared keyword arguments for the three averaged scorers.
    weighted = dict(pos_label=None, average='weighted')

    # precision: TP / (TP + FP)
    # recall:    TP / (TP + FN)
    # f1:        harmonic mean of precision and recall
    precision, recall, f1 = (
        scorer(y_test, y_predicted, **weighted)
        for scorer in (precision_score, recall_score, f1_score)
    )

    # accuracy: (TP + TN) / total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [33]:
# New column of numeric classification labels:
# positive -> 1, negative -> 0, neutral -> -1
def scoreCol(df, text_field):
    """Encode the sentiment labels in ``df[text_field]`` as integers.

    A ``'scores'`` column is added to (or overwritten on) ``df`` in place and
    the same DataFrame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column.
    text_field : str
        Name of the column holding 'positive', 'negative' or 'neutral'.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, now carrying the ``'scores'`` column.

    Raises
    ------
    ValueError
        If the column holds anything other than the three known labels
        (missing values included). Such rows used to be skipped silently,
        which left the score list shorter than the frame and surfaced only
        as an opaque length-mismatch error on assignment.
    """
    label_to_score = {'positive': 1, 'negative': 0, 'neutral': -1}
    labels = df[text_field].tolist()

    # isinstance guard first so unhashable or missing values never reach the dict lookup.
    unknown = [x for x in labels
               if not isinstance(x, str) or x not in label_to_score]
    if unknown:
        # Report a handful of distinct offenders, in order of first appearance.
        examples = list(dict.fromkeys(repr(x) for x in unknown))[:5]
        raise ValueError(
            "Unexpected value(s) in column %r: %s; expected one of %s"
            % (text_field, ', '.join(examples), sorted(label_to_score))
        )

    df['scores'] = [label_to_score[x] for x in labels]
    return df

In [34]:
# Load the raw dataset from rawData.csv (path relative to the working directory)
# into a DataFrame. Later cells read its 'sentiment' and 'text' columns.
data = pd.read_csv(r'rawData.csv')

In [35]:
# Derive the numeric 'scores' column from the 'sentiment' labels.
# scoreCol writes the column onto the frame it is given and returns that same
# frame, so `data` still refers to the one DataFrame after this call.
data = scoreCol(data,'sentiment')

In [37]:
def parse_token_list(raw):
    """Turn one stringified token list (e.g. "['my', 'boss']") back into a list of str.

    The string is parsed as a Python literal, so tokens that themselves contain
    quote characters survive intact and an empty list ("[]") yields ``[]``
    rather than ``['']``. Input that is not a valid list/tuple literal falls
    back to the earlier quote-normalising split so odd rows still produce tokens.

    Raises
    ------
    TypeError
        If ``raw`` is not a string (for example a missing value).
    """
    if not isinstance(raw, str):
        raise TypeError("expected a stringified token list, got %r" % (raw,))

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(token) for token in parsed]

    # Fallback: map both quote styles to '"', strip the leading '["' and the
    # trailing '"]', then split on the '", "' separators; drop empty leftovers.
    normalised = re.sub("['\"]", '"', raw)
    return [token for token in normalised[2:-2].split('", "') if token]


# One token list per row of the 'text' column.
listOfTokens = [parse_token_list(raw) for raw in data["text"].tolist()]
print(listOfTokens[:5])

[['i', '`', 'd', 'have', 'responded', 'if', 'i', 'were', 'going'], ['sooo', 'sad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'diego'], ['my', 'boss', 'is', 'bullying', 'me'], ['what', 'interview', 'leave', 'me', 'alone'], ['sons', 'of', 'why', 'couldn', '`', 't', 'they', 'put', 'them', 'on', 'the', 'releases', 'we', 'already', 'bought']]


In [38]:
# Corpus statistics: flatten every token, record the length of each document,
# and collect the sorted set of distinct tokens.
all_words = [word for sentence in listOfTokens for word in sentence]
sentence_lengths = list(map(len, listOfTokens))
Vocabulary = sorted(set(all_words))
print("%s tokens total, with a vocabulary size of %s" % (len(all_words), len(Vocabulary)))

378839 tokens total, with a vocabulary size of 26323


In [39]:
# Documents fed to the vectorizer: the 'text' column as-is (a pandas Series).
list_corpus = data['text']

# Matching class labels: the 'scores' column coerced to plain Python ints.
list_labels = [int(score) for score in data['scores'].tolist()]

In [40]:
# Sanity check: peek at the first five documents and their labels.
print(list_corpus.head(5))
print(list_labels[:5])

0    ['i', '`', 'd', 'have', 'responded', 'if', 'i'...
1    ['sooo', 'sad', 'i', 'will', 'miss', 'you', 'h...
2               ['my', 'boss', 'is', 'bullying', 'me']
3        ['what', 'interview', 'leave', 'me', 'alone']
4    ['sons', 'of', 'why', 'couldn', '`', 't', 'the...
Name: text, dtype: object
[-1, 0, 0, 0, 0]


In [41]:
# Convert all documents to a sparse matrix of token counts
# (one row per document, one column per vocabulary term learned by the vectorizer).
# NOTE(review): list_corpus is the raw 'text' column (stringified token lists), so
# CountVectorizer applies its own default tokenization to those strings,
# independent of listOfTokens — confirm this is intended.
vectorizer = CountVectorizer() 
termDocumentMatrix = vectorizer.fit_transform(list_corpus)

In [42]:
# Hold out 20% of the documents for evaluation.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the same
# split (and therefore the same metrics); stratify keeps the sentiment classes in
# the same proportions in the training and test portions.
x_train, x_test, y_train, y_test = train_test_split(
    termDocumentMatrix,
    list_labels,
    test_size=0.2,
    random_state=42,
    stratify=list_labels,
)

In [43]:
# Create a multinomial Naive Bayes classifier, constructed with no arguments
# (i.e. the library defaults).
mnb = MultinomialNB()

In [44]:
# Train the model on the training split.
# fit() is the last expression in the cell, so the estimator's repr is shown as the cell output.
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# Predict labels for the held-out test split using the count-based features.
y_predicted_counts = mnb.predict(x_test)

In [46]:
# Score the test-set predictions: accuracy plus weighted precision, recall and F1.
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)

In [47]:
# Print the headline metrics, then leave the confusion matrix as the cell's
# displayed value (rows: true labels, columns: predicted labels).
print(
    "data = cleaned Dataset: vectorizer = CountVectorizer, accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
    % (accuracy, precision, recall, f1)
)
metrics.confusion_matrix(y_test, y_predicted_counts)

data = cleaned Dataset: vectorizer = CountVectorizer, accuracy = 0.643, precision = 0.647, recall = 0.643, f1 = 0.643


array([[1528,  356,  392],
       [ 532,  904,   86],
       [ 507,   91, 1101]], dtype=int64)