# Challenge: sentiment analysis with Naive Bayes

In [36]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [23]:
# Load the labelled sentences. Each line is "<sentence>  \t<score>", so the
# separator is a multi-character pattern and the python parser engine is used.
column_labels = ['Sentence', 'Score']
df = pd.read_csv(
    'imdb_labelled.txt',
    sep='  \t',
    header=None,
    engine='python',
)
#df = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)
df.columns = column_labels

In [24]:
# Build the classifier target column: True for rows scored 0, False for rows scored 1.
for score_value, flag in ((0, True), (1, False)):
    df.loc[df['Score'] == score_value, 'Bool'] = flag

Create a list of words that are in the negative and positive comments.

In [25]:
# Build the per-class vocabularies and a cleaned copy of every sentence.
#
# Words and sentences are collected in plain Python lists and converted to
# DataFrames once at the end: DataFrame.append was removed in pandas 2.0, and
# appending inside the loop re-copied the whole frame on every iteration.

# Character class matching each ASCII punctuation mark. re.escape keeps the
# brackets and the backslash literal inside the class (unescaped, "\]" is read
# as an escaped bracket and the backslash itself is never stripped).
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))

# Punctuation is replaced by a space so words glued together by it still split apart.
cleaned_sentences = [punct_re.sub(' ', sentence).lower() for sentence in df['Sentence']]

negative_tokens = []
positive_tokens = []
for text, score in zip(cleaned_sentences, df['Score']):
    if score == 0:
        negative_tokens.extend(text.split())
    elif score == 1:
        positive_tokens.extend(text.split())

# One cleaned sentence per row of df, in the same order, indexed 0..n-1.
lower_no_punc = pd.DataFrame({'Sentence': [text.strip() for text in cleaned_sentences]})

# Column 0 holds the distinct words of each class in order of first appearance
# (dict.fromkeys de-duplicates while preserving insertion order).
negative_words = pd.DataFrame({0: list(dict.fromkeys(negative_tokens))})
positive_words = pd.DataFrame({0: list(dict.fromkeys(positive_tokens))})

Dropping out the words that are in both positive and negative lists.

In [26]:
# Remove the words that occur in both vocabularies, then the very short ones.
# Both membership masks are computed before anything is removed, so each list
# is compared against the other's complete vocabulary.
neg_match = negative_words[0].isin(positive_words[0])
pos_match = positive_words[0].isin(negative_words[0])

# Boolean masks replace the row-by-row drop(..., inplace=True) loops. Those
# loops looked up index labels with .iloc, which only works while the index is
# a gap-free 0..n-1 range and therefore broke when the cell was run again.
negative_words = negative_words[~neg_match].reset_index(drop=True)
positive_words = positive_words[~pos_match].reset_index(drop=True)

# Drop words shorter than three characters to reduce match errors.
negative_words = negative_words[~(negative_words[0].str.len() < 3)]
positive_words = positive_words[~(positive_words[0].str.len() < 3)]

Create one boolean feature column per word in the negative word list, marking whether each sentence contains that word, so the features can be checked against the score.

In [27]:
# Add one boolean column per negative-list word: True where the cleaned sentence
# contains the word. str.contains performs a substring (regex) search, so a word
# also matches inside longer words.
cleaned_text = lower_no_punc['Sentence']
for neg in negative_words[0]:
    column_name = str(neg)
    df[column_name] = cleaned_text.str.contains(column_name, case=False)

In [28]:
# Duplicate of the import in the first cell; harmless, Python re-uses the loaded module.
from sklearn.naive_bayes import BernoulliNB

#Instantiate the model and store in a variable.
# The estimator is created with its default settings and is fitted in a later cell.
bnb = BernoulliNB()

## Naive Bayes

In [29]:
# Feature matrix: the boolean word columns of df, selected by word.
feature_columns = negative_words[0]
data = df[feature_columns]

# Target: the 'Bool' column coerced to a real boolean dtype.
target = df['Bool'].astype('bool')

In [30]:

# Fit on the full data set and predict the same rows (in-sample evaluation).
bnb.fit(data, target)
y_pred = bnb.predict(data)

# Count the rows whose prediction differs from the target.
total_points = data.shape[0]
mislabeled_points = (target != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(total_points, mislabeled_points))

Number of mislabeled points out of a total 1000 points : 268


### Results

I engineered the features by separating the words from the negative and positive comments and then removing the words that showed up in both lists.  This helped to create a good list that is representative of the negative comments.

This method could be improved: around 27% of the comments are mislabeled, leaving us with roughly 73% accuracy.

# Challenge: evaluate your sentiment classifier

### Using negative word list

In [31]:
# Confusion matrix of the in-sample predictions (rows: actual, columns: predicted).
con_matrix = pd.crosstab(
    target,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(con_matrix)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)

# Accuracy when fitting on 80% of the rows and scoring the held-out 20%.
holdout_accuracy = bnb.fit(X_train, y_train).score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_accuracy))

# Accuracy when fitting and scoring on the complete data set.
in_sample_accuracy = bnb.fit(data, target).score(data, target)
print('Testing on Sample: ' + str(in_sample_accuracy))

Predicted  False  True 
Actual                 
False        500      0
True         268    232
With 20% Holdout: 0.595
Testing on Sample: 0.732


Here we can see that all 268 errors are sentences predicted False whose actual label is True, i.e. negative comments that the model missed.

### Using positive words list

In [21]:
# Experiment: positive word list, with words shared by both classes removed.
# NOTE: the vocabulary is built from every row, including the rows that are
# later held out, so the holdout score printed below is optimistic.

# Reload the raw data so this experiment starts from a frame without the
# feature columns added by earlier cells.
df = pd.read_csv('imdb_labelled.txt', sep='  \t', header=None, engine='python')
df.columns = ['Sentence', 'Score']
# Target column: True for rows scored 0 (negative), False for rows scored 1.
df.loc[df['Score'] == 0, 'Bool'] = True
df.loc[df['Score'] == 1, 'Bool'] = False

# Replace punctuation with spaces and lower-case every sentence once.
# re.escape keeps the brackets and the backslash literal inside the class.
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
cleaned_sentences = [punct_re.sub(' ', sentence).lower() for sentence in df['Sentence']]
lower_no_punc = pd.DataFrame({'Sentence': [text.strip() for text in cleaned_sentences]})

# Collect the words of each class in lists; DataFrame.append was removed in
# pandas 2.0 and re-copied the frame on every iteration.
negative_tokens = []
positive_tokens = []
for text, score in zip(cleaned_sentences, df['Score']):
    if score == 0:
        negative_tokens.extend(text.split())
    elif score == 1:
        positive_tokens.extend(text.split())

# Column 0 holds the distinct words of each class in order of first appearance.
negative_words = pd.DataFrame({0: list(dict.fromkeys(negative_tokens))})
positive_words = pd.DataFrame({0: list(dict.fromkeys(positive_tokens))})

# Remove the words that appear in both lists. Both masks are computed before
# anything is removed so each list is compared with the other's full vocabulary.
pos_match = positive_words[0].isin(negative_words[0])
neg_match = negative_words[0].isin(positive_words[0])
positive_words = positive_words[~pos_match].reset_index(drop=True)
negative_words = negative_words[~neg_match].reset_index(drop=True)

# Drop words shorter than three characters to reduce match errors.
positive_words = positive_words[~(positive_words[0].str.len() < 3)]
negative_words = negative_words[~(negative_words[0].str.len() < 3)]

# One boolean column per positive-list word: True where the cleaned sentence
# contains the word as a substring. All columns are built first and attached
# in a single concat instead of being inserted one at a time.
word_features = pd.DataFrame(
    {word: lower_no_punc['Sentence'].str.contains(word, case=False) for word in positive_words[0]},
    index=df.index,
)
df = pd.concat([df, word_features], axis=1)

#Instantiate the model and store in a variable.
bnb = BernoulliNB()

data = df[positive_words[0]]
target = df['Bool'].astype('bool')
bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
con_matrix = pd.crosstab(target, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(con_matrix)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

Number of mislabeled points out of a total 1000 points : 279
Predicted  False  True 
Actual                 
False        224    276
True           3    497
With 20% Holdout: 0.62
Testing on Sample: 0.721


Using the positive word list, the True class (negative comments, score 0) is now predicted almost perfectly (497 of 500 correct), but accuracy on the False class (positive comments) has dropped drastically (only 224 of 500 correct).

### Word list non-comparison positive wordlist

Here we will just compile the word lists without comparing them to each other, i.e. without removing the words that also appear in the other list.

In [32]:
# Experiment: positive word list, keeping words that also occur in the
# negative list (no cross-list filtering).
# NOTE: the vocabulary is built from every row, including the rows that are
# later held out, so the holdout score printed below is optimistic.

# Reload the raw data so this experiment starts from a frame without the
# feature columns added by earlier cells.
df = pd.read_csv('imdb_labelled.txt', sep='  \t', header=None, engine='python')
df.columns = ['Sentence', 'Score']
# Target column: True for rows scored 0 (negative), False for rows scored 1.
df.loc[df['Score'] == 0, 'Bool'] = True
df.loc[df['Score'] == 1, 'Bool'] = False

# Replace punctuation with spaces and lower-case every sentence once.
# re.escape keeps the brackets and the backslash literal inside the class.
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
cleaned_sentences = [punct_re.sub(' ', sentence).lower() for sentence in df['Sentence']]
lower_no_punc = pd.DataFrame({'Sentence': [text.strip() for text in cleaned_sentences]})

# Collect the words of each class in lists; DataFrame.append was removed in
# pandas 2.0 and re-copied the frame on every iteration.
negative_tokens = []
positive_tokens = []
for text, score in zip(cleaned_sentences, df['Score']):
    if score == 0:
        negative_tokens.extend(text.split())
    elif score == 1:
        positive_tokens.extend(text.split())

# Column 0 holds the distinct words of each class in order of first appearance.
negative_words = pd.DataFrame({0: list(dict.fromkeys(negative_tokens))})
positive_words = pd.DataFrame({0: list(dict.fromkeys(positive_tokens))})

# Drop words shorter than three characters to reduce match errors.
positive_words = positive_words[~(positive_words[0].str.len() < 3)]
negative_words = negative_words[~(negative_words[0].str.len() < 3)]

# One boolean column per positive-list word: True where the cleaned sentence
# contains the word as a substring. All columns are built first and attached
# in a single concat instead of being inserted one at a time.
word_features = pd.DataFrame(
    {word: lower_no_punc['Sentence'].str.contains(word, case=False) for word in positive_words[0]},
    index=df.index,
)
df = pd.concat([df, word_features], axis=1)

#Instantiate the model and store in a variable.
bnb = BernoulliNB()

data = df[positive_words[0]]
target = df['Bool'].astype('bool')
bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
con_matrix = pd.crosstab(target, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(con_matrix)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

Number of mislabeled points out of a total 1000 points : 210
Predicted  False  True 
Actual                 
False        294    206
True           4    496
With 20% Holdout: 0.68
Testing on Sample: 0.79


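Skipping the cross-list filtering gives a good improvement when just using the positive words list: the False class (positive comments) goes from 224 to 294 correct, while the True class (negative comments) stays almost perfect at 496 of 500.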

### Word list non-comparison negative wordlist

In [33]:
# Experiment: negative word list, keeping words that also occur in the
# positive list (no cross-list filtering).
# NOTE: the vocabulary is built from every row, including the rows that are
# later held out, so the holdout score printed below is optimistic.

# Reload the raw data so this experiment starts from a frame without the
# feature columns added by earlier cells.
df = pd.read_csv('imdb_labelled.txt', sep='  \t', header=None, engine='python')
df.columns = ['Sentence', 'Score']
# Target column: True for rows scored 0 (negative), False for rows scored 1.
df.loc[df['Score'] == 0, 'Bool'] = True
df.loc[df['Score'] == 1, 'Bool'] = False

# Replace punctuation with spaces and lower-case every sentence once.
# re.escape keeps the brackets and the backslash literal inside the class.
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
cleaned_sentences = [punct_re.sub(' ', sentence).lower() for sentence in df['Sentence']]
lower_no_punc = pd.DataFrame({'Sentence': [text.strip() for text in cleaned_sentences]})

# Collect the words of each class in lists; DataFrame.append was removed in
# pandas 2.0 and re-copied the frame on every iteration.
negative_tokens = []
positive_tokens = []
for text, score in zip(cleaned_sentences, df['Score']):
    if score == 0:
        negative_tokens.extend(text.split())
    elif score == 1:
        positive_tokens.extend(text.split())

# Column 0 holds the distinct words of each class in order of first appearance.
negative_words = pd.DataFrame({0: list(dict.fromkeys(negative_tokens))})
positive_words = pd.DataFrame({0: list(dict.fromkeys(positive_tokens))})

# Drop words shorter than three characters to reduce match errors.
positive_words = positive_words[~(positive_words[0].str.len() < 3)]
negative_words = negative_words[~(negative_words[0].str.len() < 3)]

# One boolean column per negative-list word: True where the cleaned sentence
# contains the word as a substring. All columns are built first and attached
# in a single concat instead of being inserted one at a time.
word_features = pd.DataFrame(
    {word: lower_no_punc['Sentence'].str.contains(word, case=False) for word in negative_words[0]},
    index=df.index,
)
df = pd.concat([df, word_features], axis=1)

#Instantiate the model and store in a variable.
bnb = BernoulliNB()

data = df[negative_words[0]]
target = df['Bool'].astype('bool')
bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
con_matrix = pd.crosstab(target, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(con_matrix)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

Number of mislabeled points out of a total 1000 points : 133
Predicted  False  True 
Actual                 
False        489     11
True         122    378
With 20% Holdout: 0.705
Testing on Sample: 0.867


The total number of mislabeled points has decreased by 77 (from 210 to 133), our lowest yet, when using the negative words list, which has increased our accuracy.  It also seems to have balanced the errors between the negative and positive comments, bringing the mislabel counts for each class closer to each other.

###  Negative no compare without deleting small words

In [41]:
# Experiment: negative word list with no cross-list filtering and no minimum
# word length (short words are kept).
# NOTE: the vocabulary is built from every row, including the rows that are
# later held out, so the holdout and cross-validation scores are optimistic.

# Reload the raw data so this experiment starts from a frame without the
# feature columns added by earlier cells.
df = pd.read_csv('imdb_labelled.txt', sep='  \t', header=None, engine='python')
df.columns = ['Sentence', 'Score']
# Target column: True for rows scored 0 (negative), False for rows scored 1.
df.loc[df['Score'] == 0, 'Bool'] = True
df.loc[df['Score'] == 1, 'Bool'] = False

# Replace punctuation with spaces and lower-case every sentence once.
# re.escape keeps the brackets and the backslash literal inside the class.
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
cleaned_sentences = [punct_re.sub(' ', sentence).lower() for sentence in df['Sentence']]
lower_no_punc = pd.DataFrame({'Sentence': [text.strip() for text in cleaned_sentences]})

# Collect the words of each class in lists; DataFrame.append was removed in
# pandas 2.0 and re-copied the frame on every iteration.
negative_tokens = []
positive_tokens = []
for text, score in zip(cleaned_sentences, df['Score']):
    if score == 0:
        negative_tokens.extend(text.split())
    elif score == 1:
        positive_tokens.extend(text.split())

# Column 0 holds the distinct words of each class in order of first appearance.
negative_words = pd.DataFrame({0: list(dict.fromkeys(negative_tokens))})
positive_words = pd.DataFrame({0: list(dict.fromkeys(positive_tokens))})

# One boolean column per negative-list word: True where the cleaned sentence
# contains the word as a substring. All columns are built first and attached
# in a single concat instead of being inserted one at a time.
word_features = pd.DataFrame(
    {word: lower_no_punc['Sentence'].str.contains(word, case=False) for word in negative_words[0]},
    index=df.index,
)
df = pd.concat([df, word_features], axis=1)

#Instantiate the model and store in a variable.
bnb = BernoulliNB()

data = df[negative_words[0]]
target = df['Bool'].astype('bool')
bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
con_matrix = pd.crosstab(target, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(con_matrix)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))
# 10-fold cross-validation; left as the last expression so the scores are displayed.
cross_val_score(bnb, data, target, cv=10)

Number of mislabeled points out of a total 1000 points : 109
Predicted  False  True 
Actual                 
False        481     19
True          90    410
With 20% Holdout: 0.755
Testing on Sample: 0.891


array([0.65, 0.75, 0.68, 0.68, 0.72, 0.7 , 0.74, 0.65, 0.69, 0.73])

It looks like decreasing the complexity and constraints on my initial classifier is increasing the accuracy of the model.  There is still much improvement to be made to the classifier, though: the cross-validation scores (0.65 to 0.75) are well below the 0.891 in-sample accuracy, which suggests the model is overfitting.

###  Positive no compare without deleting small words

In [35]:
# Experiment: positive word list with no cross-list filtering and no minimum
# word length (short words are kept).
# train_test_split is already imported in the first cell, so it is not
# re-imported here.
# NOTE: the vocabulary is built from every row, including the rows that are
# later held out, so the holdout score printed below is optimistic.

# Reload the raw data so this experiment starts from a frame without the
# feature columns added by earlier cells.
df = pd.read_csv('imdb_labelled.txt', sep='  \t', header=None, engine='python')
df.columns = ['Sentence', 'Score']
# Target column: True for rows scored 0 (negative), False for rows scored 1.
df.loc[df['Score'] == 0, 'Bool'] = True
df.loc[df['Score'] == 1, 'Bool'] = False

# Replace punctuation with spaces and lower-case every sentence once.
# re.escape keeps the brackets and the backslash literal inside the class.
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
cleaned_sentences = [punct_re.sub(' ', sentence).lower() for sentence in df['Sentence']]
lower_no_punc = pd.DataFrame({'Sentence': [text.strip() for text in cleaned_sentences]})

# Collect the words of each class in lists; DataFrame.append was removed in
# pandas 2.0 and re-copied the frame on every iteration.
negative_tokens = []
positive_tokens = []
for text, score in zip(cleaned_sentences, df['Score']):
    if score == 0:
        negative_tokens.extend(text.split())
    elif score == 1:
        positive_tokens.extend(text.split())

# Column 0 holds the distinct words of each class in order of first appearance.
negative_words = pd.DataFrame({0: list(dict.fromkeys(negative_tokens))})
positive_words = pd.DataFrame({0: list(dict.fromkeys(positive_tokens))})

# One boolean column per positive-list word: True where the cleaned sentence
# contains the word as a substring. All columns are built first and attached
# in a single concat instead of being inserted one at a time.
word_features = pd.DataFrame(
    {word: lower_no_punc['Sentence'].str.contains(word, case=False) for word in positive_words[0]},
    index=df.index,
)
df = pd.concat([df, word_features], axis=1)

#Instantiate the model and store in a variable.
bnb = BernoulliNB()

data = df[positive_words[0]]
target = df['Bool'].astype('bool')
bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
con_matrix = pd.crosstab(target, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(con_matrix)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

Number of mislabeled points out of a total 1000 points : 209
Predicted  False  True 
Actual                 
False        295    205
True           4    496
With 20% Holdout: 0.665
Testing on Sample: 0.791


The same does not seem true for the positive words list: reducing the complexity and constraints has barely changed this classifier (209 mislabeled points versus 210), and there is still an imbalance that is creating errors with this word list.

In [39]:
# 15-fold cross-validation using the `bnb`, `data` and `target` currently bound in the kernel
# (presumably those of the positive-word-list cell above; confirm the execution order).
cross_val_score(bnb, data, target, cv=15)

array([0.58208955, 0.65671642, 0.64179104, 0.59701493, 0.59701493,
       0.64179104, 0.68656716, 0.67164179, 0.76119403, 0.65671642,
       0.63636364, 0.65151515, 0.6969697 , 0.66666667, 0.71212121])

A cross validation confirms that this is true. 