In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import pandas as pd
import numpy as np
import re

### Define Metrics

In [18]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

### This method creates classification integer
Positive tweets : 1
Neutral tweets : 0
Negative tweets : -1

In [19]:
#New column for classification labels, either 1 or 0
def scoreCol(df, text_field):
    scores = []
    for x in df[text_field]:
        if x == 'positive':
            x = 1
            scores.append(x)
        elif x == 'negative':
            x = 0
            scores.append(x)
        elif x == 'neutral':
            x = -1
            scores.append(x)
    df['scores'] = scores
    return df

### Read in dataset
The first dataset is going to be the cleaned dataset

In [20]:
#Read data into dataframe
data = pd.read_csv(r'cleanedData.csv')

In [21]:
#Call classification label method
data = scoreCol(data,'sentiment')

Get list of Tokens for each tweet

In [23]:
listOfTokens = []
for tokensUnparsed in data["text"].tolist():
    tokensUnparsed = re.sub("(\\'|'|\\\")", '"', tokensUnparsed)
    listOfTokens.append(tokensUnparsed[2:-2].split('", "'))
print(listOfTokens[:5])

[['would', 'respond', 'go'], ['sooo', 'sad', 'miss', 'san', 'diego'], ['boss', 'bully'], ['interview', 'leave', 'alone'], ['son', 'could', 'not', 'put', 'release', 'already', 'buy']]


Do a check so see the corpus size and the size of the vocabulary:

In [24]:
all_words = [token for tokens in listOfTokens for token in tokens]
sentence_lengths = [len(tokens) for tokens in listOfTokens]
Vocabulary = sorted(list(set(all_words)))
print("%s tokens total, with a vocabulary size of %s" % (len(all_words), len(Vocabulary)))

198400 tokens total, with a vocabulary size of 23263


Create the list_corpus and the labels:

In [26]:
# Create list_corpus
list_corpus = data['text']

# Get labels
list_labels = []
for l in data['scores'].tolist():
    label = int(l)
    list_labels.append(label)

In [27]:
#Do a check...
print(list_corpus[:5])
print(list_labels[:5])

0                           ['would', 'respond', 'go']
1              ['sooo', 'sad', 'miss', 'san', 'diego']
2                                    ['boss', 'bully']
3                      ['interview', 'leave', 'alone']
4    ['son', 'could', 'not', 'put', 'release', 'alr...
Name: text, dtype: object
[-1, 0, 0, 0, 0]


### Create TF-IDF Vectorizer 

In [28]:
# Convert all review documents to a sparse matrix of token counts
vectorizer = TfidfVectorizer() 
termDocumentMatrix = vectorizer.fit_transform(list_corpus)

In [29]:
#Split the training dataset into two sections:
x_train, x_test, y_train, y_test = train_test_split(termDocumentMatrix, list_labels, test_size = 0.2)

In [30]:
#Create a Multinominal Classifier
mnb = MultinomialNB()

In [31]:
#Train the model using the training sets
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
#Predict the response for test dataset
y_predicted_counts = mnb.predict(x_test)

In [33]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)

In [34]:
print("data = cleaned Dataset: vectorizer = TfidfVectorizer, accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
metrics.confusion_matrix(y_test,y_predicted_counts)

data = cleaned Dataset: vectorizer = TfidfVectorizer, accuracy = 0.621, precision = 0.663, recall = 0.621, f1 = 0.613


array([[1849,  167,  252],
       [ 831,  616,   64],
       [ 721,   46,  951]], dtype=int64)