# Predicting sentiment from product reviews

In [1]:
#Libraries Import
import json
import string
import numpy as np
import pandas as pd
pd.set_option("Chained_Assignment",None)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#read dataframe
dataframe=pd.read_csv("amazon_baby.csv")

In [3]:
dataframe.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
dataframe.info()
#contains null values for name, reviews

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
name      183213 non-null object
review    182702 non-null object
rating    183531 non-null int64
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


In [5]:
#replace null values with empty string
dataframe = dataframe.fillna({'review':''})

In [6]:
#remove punctuations
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator) 

dataframe["review_without_punctuation"] = dataframe['review'].apply(lambda x : remove_punctuation(x))
dataframe=dataframe[["name","review_without_punctuation","rating"]]

In [7]:
#ignore all reviews with rating = 3, since they tend to have a neutral sentiment
dataframe=dataframe[dataframe["rating"]!=3].reset_index(drop=True)

In [8]:
# reviews with a rating of 4 or higher to be positive reviews, while the ones with rating of 2 
#or lower are negative. For the sentiment column, we use +1 for the positive class label and -1 
#for the negative class label
dataframe['sentiment'] = dataframe['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [9]:
#test-train data
with open('module-2-assignment-test-idx.json') as test_data_file:    
    test_data_idx = json.load(test_data_file)
with open('module-2-assignment-train-idx.json') as train_data_file:    
    train_data_idx = json.load(train_data_file)

train_data = dataframe.iloc[train_data_idx]
test_data = dataframe.iloc[test_data_idx]


In [10]:
#Build the word count vector for each review_without_punctuations
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_without_punctuation'])
test_matrix = vectorizer.transform(test_data['review_without_punctuation'])

In [11]:
#Logistic model fit
sentiment_model = LogisticRegression(solver='liblinear',n_jobs=1)
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# QUIZ Predicting sentiment from product reviews

#### Question 1
How many weights are greater than or equal to 0?

__Ans__: 

In [12]:
np.sum(sentiment_model.coef_ >= 0)

87151

#### Question 2
Of the three data points in sample_test_data, which one has the lowest probability of being classified as a positive review?

In [13]:
sample_test_data = test_data.iloc[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data['review_without_punctuation'])
print(sentiment_model.classes_)
print(sentiment_model.predict_proba(sample_test_matrix))

[-1  1]
[[3.67713366e-03 9.96322866e-01]
 [9.59664165e-01 4.03358355e-02]
 [9.99970284e-01 2.97164132e-05]]


__Ans__: Third

#### Question 3
Which of the following products are represented in the 20 most positive reviews?

__Ans__: Third

In [14]:
test_data["postive_review_probability"]=[x[1] for x in np.asarray(sentiment_model.predict_proba(test_matrix))]
top_20=list(test_data.sort_values("postive_review_probability",ascending=False)[:20]["name"])
options_list=["Snuza Portable Baby Movement Monitor","MamaDoo Kids Foldable Play Yard Mattress Topper, Blue","Britax Decathlon Convertible Car Seat, Tiffany","Safety 1st Exchangeable Tip 3 in 1 Thermometer"]
[x for x in options_list if x in top_20]

['Britax Decathlon Convertible Car Seat, Tiffany']

#### Question 4
Which of the following products are represented in the 20 most negative reviews?

__Ans__:   

In [15]:
test_data["postive_review_probability"]=[x[0] for x in np.asarray(sentiment_model.predict_proba(test_matrix))]
top_20=list(test_data.sort_values("postive_review_probability",ascending=False)[:20]["name"])
options_list=["The First Years True Choice P400 Premium Digital Monitor, 2 Parent Unit","JP Lizzy Chocolate Ice Classic Tote Set","Peg-Perego Tatamia High Chair, White Latte","Safety 1st High-Def Digital Monitor"]
[x for x in options_list if x in top_20]

['The First Years True Choice P400 Premium Digital Monitor, 2 Parent Unit',
 'Peg-Perego Tatamia High Chair, White Latte',
 'Safety 1st High-Def Digital Monitor']

#### Question 5
What is the accuracy of the sentiment_model on the test_data? Round your answer to 2 decimal places (e.g. 0.76)

__Ans__:

In [16]:
def get_classification_accuracy(model, data, true_labels):
    pred_y=model.predict(data)
    correct=np.sum(pred_y==true_labels)
    accuracy=round(correct/len(true_labels),2)
    return accuracy

get_classification_accuracy(sentiment_model,test_matrix,test_data["sentiment"])

0.93

#### Question 6
Does a higher accuracy value on the training_data always imply that the classifier is better?

__Ans__: No, higher accuracy on training data does not necessarily imply that the classifier is better.

#### Question 7
Consider the coefficients of simple_model. There should be 21 of them, an intercept term + one for each word in significant_words.How many of the 20 coefficients (corresponding to the 20 significant_words and excluding the intercept term) are positive for the simple_model?

__Ans__:

In [17]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']


vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 significant words
train_matrix_sub = vectorizer_word_subset.fit_transform(train_data['review_without_punctuation'])
test_matrix_sub = vectorizer_word_subset.transform(test_data['review_without_punctuation'])
#Logistic model fit
simple_model = LogisticRegression(solver='liblinear',n_jobs=1)
simple_model.fit(train_matrix_sub, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
simple_model_coefficient = pd.DataFrame({'word':significant_words,'simple_model_coefficient':simple_model.coef_.flatten()}).sort_values(['simple_model_coefficient'], ascending=False).reset_index(drop=True)
len(simple_model_coefficient[simple_model_coefficient["simple_model_coefficient"]>0])

10

#### Question 8
Are the positive words in the simple_model also positive words in the sentiment_model?

__Ans__: No

In [19]:
simple_model_coefficient=simple_model_coefficient.set_index("word",drop=True)

sentiment_model_coefficient = pd.DataFrame({'word':list(vectorizer.vocabulary_),'sentimental_model_coefficient':sentiment_model.coef_.flatten()}).sort_values(['sentimental_model_coefficient'], ascending=False).reset_index(drop=True)
sentiment_model_coefficient=sentiment_model_coefficient[sentiment_model_coefficient["word"].isin(significant_words)].set_index("word",drop=True)

simple_model_coefficient.join(sentiment_model_coefficient,on="word",how="left")


Unnamed: 0_level_0,simple_model_coefficient,sentimental_model_coefficient
word,Unnamed: 1_level_1,Unnamed: 2_level_1
loves,1.673074,0.01043761
perfect,1.509812,-0.6860067
love,1.36369,0.2670837
easy,1.192538,-0.0054688
great,0.944,0.06483661
little,0.520186,-0.3146887
well,0.50376,6.160998e-07
able,0.190909,0.2129163
old,0.085513,0.008179701
car,0.058855,0.05519557


#### Question 9
Which model (sentiment_model or simple_model) has higher accuracy on the TRAINING set?

__Ans__:    Sentiment Model

In [20]:
print("Sentiment Model: ",get_classification_accuracy(sentiment_model,train_matrix,train_data["sentiment"]))
print("Simple Model: ",get_classification_accuracy(simple_model,train_matrix_sub,train_data["sentiment"]))


Sentiment Model:  0.97
Simple Model:  0.87


#### Question 10
Which model (sentiment_model or simple_model) has higher accuracy on the TEST set?

__Ans__: Sentiment Model

In [21]:
print("Sentiment Model: ",get_classification_accuracy(sentiment_model,test_matrix,test_data["sentiment"]))
print("Simple Model: ",get_classification_accuracy(simple_model,test_matrix_sub,test_data["sentiment"]))


Sentiment Model:  0.93
Simple Model:  0.87


#### Question 11
Enter the accuracy of the majority class classifier model on the test_data. Round your answer to two decimal places (e.g. 0.76).

__Ans__:

In [22]:
#Find Majority Class
freq=pd.crosstab(test_data["sentiment"],columns=["count"]).reset_index()
freq

col_0,sentiment,count
0,-1,5241
1,1,28095


In [23]:
#Majority class=1
baseline_model=round(freq[freq["sentiment"]==1]["count"].values[0]/freq["count"].sum(),2)
print("Baseline Model: ", baseline_model)

Baseline Model:  0.84


#### Question 12
Is the sentiment_model definitely better than the majority class classifier (the baseline)?

__Ans__: Yes