In [4]:
import numpy as np
import pandas as pd
import string
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [5]:
# Load the reviews table from a CSV located in the current working directory.
# Later cells read its 'name', 'review' and 'rating' columns.
products = pd.read_csv('./amazon_baby.csv')

In [6]:
# Replace a missing value with an empty string
def cleanNaN(value):
    """Return ``""`` when ``value`` is missing, otherwise ``value`` unchanged.

    "Missing" means whatever ``pd.isnull`` reports as null for a scalar
    (for example ``None`` or a float NaN). Non-missing values are passed
    through as-is and are not converted to ``str`` here.
    """
    return "" if pd.isnull(value) else value
# Blank out missing reviews, then cast every entry to str in one pass,
# so the 'review' column holds only string values from here on.
products['review'] = products['review'].apply(cleanNaN).astype(str)

# Remove every ASCII punctuation character from a piece of text
def punct_strip(word):
    """Return ``word`` with all characters from ``string.punctuation`` deleted.

    Characters are removed outright rather than replaced by a space, so the
    text on either side of a punctuation mark ends up joined together
    (e.g. ``"junk.specifically"`` becomes ``"junkspecifically"``).
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return word.translate(deletion_table)

# Normalise one review: drop punctuation first, then lower-case what is left
def review_convertion( review ):
    """Return ``review`` with punctuation removed and all letters lower-cased."""
    without_punctuation = punct_strip(review)
    return without_punctuation.lower()

# Overwrite each review with its punctuation-free, lower-cased form.
products['review'] = products['review'].map(review_convertion)

In [7]:
def count_words(text, word):
    """Count case-insensitive, whole-token occurrences of ``word`` in ``text``.

    ``text`` is lower-cased and split on whitespace; a token is counted only
    when it equals ``word`` exactly, so ``"fantastically"`` does not count as
    ``"fantastic"``. Punctuation is not stripped here, so ``"fantastic!"`` is
    a different token from ``"fantastic"``.

    Parameters
    ----------
    text : str
        The text to search.
    word : str
        The word to look for. It is lower-cased before comparison so that it
        is normalised the same way as ``text``; previously a search word
        containing any uppercase letter could never match and silently
        produced 0.

    Returns
    -------
    int
        Number of matching tokens (0 for empty text).
    """
    return text.lower().split().count(word.lower())

# Add one count column per keyword; the keyword is handed to count_words
# as its second positional argument through ``args``.
products['fantastic'] = products['review'].apply(count_words, args=('fantastic',))
products['terrible'] = products['review'].apply(count_words, args=('terrible',))


In [8]:
# Preview the first rows to check the cleaned review text and the two new count columns.
products.head()

Unnamed: 0,name,review,rating,fantastic,terrible
0,Planetwise Flannel Wipes,these flannel wipes are ok but in my opinion n...,3,0,0
1,Planetwise Wipe Pouch,it came early and was not disappointed i love ...,5,0,0
2,Annas Dream Full Quilt with 2 Shams,very soft and comfortable and warmer than it l...,5,0,0
3,Stop Pacifier Sucking without tears with Thumb...,this is a product well worth the purchase i h...,5,0,0
4,Stop Pacifier Sucking without tears with Thumb...,all of my kids have cried nonstop when i tried...,5,0,0


In [9]:
# Drop reviews whose rating is exactly 3, then label the rest:
# sentiment is True when rating >= 4 and False otherwise.
# The explicit .copy() makes `products` an independent frame. Without it, the
# column assignment below writes into a boolean-mask slice of the previous
# frame, which triggers pandas' SettingWithCopyWarning.
products = products[products['rating'] != 3].copy()
products['sentiment'] = products['rating'] >= 4

# Features are the two keyword-count columns; the target is the sentiment label.
X = products.loc[:, ['fantastic', 'terrible']]
y = products.loc[:, 'sentiment']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier on the training portion only.
model = LogisticRegression(solver='lbfgs', max_iter=100)
model.fit(X_train, y_train)

# Learned weights, in the same column order as X.
print(f"fantastic = {model.coef_[0, 0]}")
print(f"terrible = {model.coef_[0, 1]}")

fantastic = 0.9872734937065066
terrible = -2.2123911857342513


In [10]:
# Hard class labels for the held-out rows.
predicted = model.predict(X_test)
# Keep only column 1 of the probability matrix for the ROC AUC computation.
# NOTE(review): columns follow model.classes_, so with a boolean target this
# should be the probability of True; confirm against model.classes_.
predicted_probability = model.predict_proba(X_test)[:, 1]

# Accuracy of the hard labels, then ROC AUC of the probability scores.
print(metrics.accuracy_score(y_true=y_test, y_pred=predicted))
print(metrics.roc_auc_score(y_true=y_test, y_score=predicted_probability))

0.8436028904680519
0.5148720879707679


0.9335851998440826

0.9563326306365713

In [11]:
# All rows of `products` belonging to a single product, as an independent copy.
My_reviews = products.loc[products['name'].eq('Medela Freestyle Breast Pump')].copy()

# Keep only the reviews that contain at least one of the two keywords.
My_reviews_filtered = My_reviews.loc[
    My_reviews['fantastic'].gt(0) | My_reviews['terrible'].gt(0)
].copy()

# Feature columns in the same order the model was fitted on.
My_vect_dtm = My_reviews_filtered.loc[:, ['fantastic', 'terrible']]

# Attach the predicted label and the column-1 probability to each review.
My_reviews_filtered['predicted_sentiment'] = model.predict(My_vect_dtm)
My_reviews_filtered['predicted_sentiment_proba'] = model.predict_proba(My_vect_dtm)[:, 1]

# Highest-rated reviews first.
My_reviews_filtered = My_reviews_filtered.sort_values(by='rating', ascending=False)

In [12]:
# First two rows of the rating-sorted frame (sorted descending, so the highest ratings).
My_reviews_filtered.loc[
    :,
    ['review', 'rating', 'fantastic', 'terrible', 'predicted_sentiment', 'predicted_sentiment_proba'],
].head(2)

Unnamed: 0,review,rating,fantastic,terrible,predicted_sentiment,predicted_sentiment_proba
56900,i thought long and hard about purchasing this ...,4,1,0,True,0.935123
56891,i owned a pis when i was expressing for my dau...,2,0,1,False,0.370172


In [13]:
# Last two rows of the rating-sorted frame (sorted descending, so the lowest ratings).
My_reviews_filtered.loc[
    :,
    ['review', 'rating', 'fantastic', 'terrible', 'predicted_sentiment', 'predicted_sentiment_proba'],
].tail(2)

Unnamed: 0,review,rating,fantastic,terrible,predicted_sentiment,predicted_sentiment_proba
56907,i just had my second baby for my first i used ...,1,0,1,False,0.370172
56923,save your money this is a piece of junkspecifi...,1,0,1,False,0.370172
