In [65]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import pandas as pd
import numpy as np
import re

### Define Metrics

In [66]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

### This method creates classification integer
Positive tweets : 1
Neutral tweets : 0
Negative tweets : -1

In [67]:
#New column for classification labels, either 1 or 0
def scoreCol(df, text_field):
    scores = []
    for x in df[text_field]:
        if x == 'positive':
            x = 1
            scores.append(x)
        elif x == 'negative':
            x = 0
            scores.append(x)
        elif x == 'neutral':
            x = -1
            scores.append(x)
    df['scores'] = scores
    return df

### Read in dataset
The first dataset is going to be the cleaned dataset

In [68]:
#Read data into dataframe
data = pd.read_csv(r'cleanedData.csv')

In [69]:
#Call classification label method
data = scoreCol(data,'sentiment')
data.head(5)

Unnamed: 0,textID,text,sentiment,scores
0,cb774db0d1,"['would', 'respond', 'go']",neutral,-1
1,549e992a42,"['sooo', 'sad', 'miss', 'san', 'diego']",negative,0
2,088c60f138,"['boss', 'bully']",negative,0
3,9642c003ef,"['interview', 'leave', 'alone']",negative,0
4,358bd9e861,"['son', 'could', 'not', 'put', 'release', 'alr...",negative,0


Get list of Tokens for each tweet

In [70]:
listOfTokens = []
for tokensUnparsed in data['text'].tolist():
    tokensUnparsed = re.sub("(\\'|'|\\\")", '"', tokensUnparsed)
    listOfTokens.append(tokensUnparsed[2:-2].split('", "'))
listOfTokens[:5]

[['would', 'respond', 'go'],
 ['sooo', 'sad', 'miss', 'san', 'diego'],
 ['boss', 'bully'],
 ['interview', 'leave', 'alone'],
 ['son', 'could', 'not', 'put', 'release', 'already', 'buy']]

Do a check so see the corpus size and the size of the vocabulary:

In [71]:
all_words = [token for tokens in listOfTokens for token in tokens]
sentence_lengths = [len(tokens) for tokens in listOfTokens]
Vocabulary = sorted(list(set(all_words)))
print("%s tokens total, with a vocabulary size of %s" % (len(all_words), len(Vocabulary)))
all_words[:20]

198400 tokens total, with a vocabulary size of 23263


['would',
 'respond',
 'go',
 'sooo',
 'sad',
 'miss',
 'san',
 'diego',
 'boss',
 'bully',
 'interview',
 'leave',
 'alone',
 'son',
 'could',
 'not',
 'put',
 'release',
 'already',
 'buy']

Create the list_corpus and the labels:

In [81]:
# Create list_corpus
list_corpus = data['text']
#from nltk.tokenize.treebank import TreebankWordDetokenizer
#TreebankWordDetokenizer().detokenize(['the', 'quick', 'brown'])    

# Get labels
list_labels = []
for l in data['scores'].tolist():
    label = int(l)
    list_labels.append(label)

In [82]:
#Do a check...
print(list_corpus[:5])
print(list_labels[:5])

0                           ['would', 'respond', 'go']
1              ['sooo', 'sad', 'miss', 'san', 'diego']
2                                    ['boss', 'bully']
3                      ['interview', 'leave', 'alone']
4    ['son', 'could', 'not', 'put', 'release', 'alr...
Name: text, dtype: object
[-1, 0, 0, 0, 0]


### Create count (Bag-of-Words) vectorizer

In [83]:
# Convert all review documents to a sparse matrix of token counts
vectorizer = CountVectorizer() 
termDocumentMatrix = vectorizer.fit_transform(list_corpus)

In [84]:
#Split the training dataset into two sections:
x_train, x_test, y_train, y_test = train_test_split(termDocumentMatrix, list_labels, test_size = 0.2)

In [85]:
#Create a Multinominal Classifier
mnb = MultinomialNB()

In [86]:
#Train the model using the training sets
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [87]:
#Predict the response for test dataset
y_predicted_counts = mnb.predict(x_test)

In [88]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)

In [89]:
print("data = cleaned Dataset: vectorizer = CountVectorizer, accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
metrics.confusion_matrix(y_test,y_predicted_counts)

data = cleaned Dataset: vectorizer = CountVectorizer, accuracy = 0.652, precision = 0.656, recall = 0.652, f1 = 0.653


array([[1513,  359,  386],
       [ 540,  887,   84],
       [ 464,   78, 1186]], dtype=int64)