In [41]:
# Import Required Data Science Packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_roc_curve

import warnings
#warnings.filterwarnings("ignore")

In [2]:
# Upload the dataset
products = pd.read_csv('amazon_baby.csv').fillna('')

In [3]:
# View Head of the dataset
products.head()

Unnamed: 0,name,review,rating
0,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,This is a safe product used to clean the airwa...,4
1,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,I purchased this item after reading rave revie...,4
2,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,"Honestly, this product is not worth the $7.45 ...",1
3,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,My wife has a nightly ritual of cleaning my da...,4
4,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,Very useful tool. Highly recommand it! I am us...,5


In [4]:
# Calculate number or reviews for each product
# The table is also .sort() for most reviewed product
products.groupby('name').count()

Unnamed: 0_level_0,review,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1
,36,36
&quot;The Custodian.&quot; 2.5&quot; Wireless Monitor with 1 Tilt and Pan Camera From Safe Baby. Includes Free Power Pack Valued At $24.95,2,2
&quot;The Sentinel&quot; 3.5&quot; Portable Two Camera Monitor System From Safe Baby,1,1
(1) Cresci Products Window Wedge (2 Per Pack) WHITE color,25,25
"100% Lana Organic Red/Saffron Goose, Large - 12&quot;",1,1
...,...,...
eeBoo Fairies of the Field Lacing Cards,1,1
"i Play Soft And Snuggly Blanket, Pink",1,1
iPlay Tan Jungle Chichi Blankie,1,1
medibag 117 Piece Kid Friendly First Aid Kit for the Whole Family,14,14


In [5]:
## Explore a specific Product
# Filter data for the most review product
giraffe_reviews = products[products['name']=='Vulli Sophie the Giraffe Teether']

In [6]:
# Print our the head()
giraffe_reviews.head()

Unnamed: 0,name,review,rating
6475,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5
6476,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,5
6477,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,1
6478,Vulli Sophie the Giraffe Teether,All the moms in my moms' group got Sophie for ...,5
6479,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,5


In [7]:
# Get length (number) of the filtered dataset
len(giraffe_reviews)

774

In [8]:
# summarize the variable 'rating'
giraffe_reviews['rating'].describe()

count    774.000000
mean       4.294574
std        1.222280
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

In [9]:
# create filter for products with missing review
products = products[(products['review'] != '') | (products['rating'].isna() == False)]
# create filter and ignore all 3*  reviews
products = products[products['rating']!= 3]

In [10]:
products.shape

(13423, 3)

In [11]:
## Building a sentiment classifier and Build word count vectors

# Creates vectorized count for the words in the 'review' column
Vectorizer = CountVectorizer(analyzer='word')

# Create a list of the reviews as a list of character strings
records = list()
for i in products['review']:
    records.append(i)

# Apply the transform records to vectorized word count & create an array
records_vector = Vectorizer.fit_transform(records)
records_data = records_vector.toarray()

In [12]:
# Create a data from of word features for each record
records_data = pd.DataFrame(records_data, columns = Vectorizer.get_feature_names())

In [13]:
## Define what is positive and negative sentiment (Essential in sentiment analysis)
# look at ratings for all products
products['rating'].describe()

count    13423.000000
mean         4.234076
std          1.292855
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: rating, dtype: float64

In [14]:
#positive sentiment = 4-star or 5-star reviews
# Create sentiment column with 1 and 0 for criteria specificed
products['sentiment'] = products['rating'] >= 4

In [15]:
# print products dataset
products

Unnamed: 0,name,review,rating,sentiment
0,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,This is a safe product used to clean the airwa...,4,True
1,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,I purchased this item after reading rave revie...,4,True
2,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,"Honestly, this product is not worth the $7.45 ...",1,False
3,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,My wife has a nightly ritual of cleaning my da...,4,True
4,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,Very useful tool. Highly recommand it! I am us...,5,True
...,...,...,...,...
14866,Spongebob Squarepants Cuddle Pillow,My son said give it 5 stars so thats all that ...,5,True
14867,Spongebob Squarepants Cuddle Pillow,This is fairly big and the price is awesome co...,5,True
14868,Spongebob Squarepants Cuddle Pillow,This was a great purchase for my 2 year old ni...,5,True
14869,Spongebob Squarepants Cuddle Pillow,Most of the reviews I read before buying the p...,5,True


In [16]:
# See breakdown of postive and negative sentiments
products['sentiment'].isna().sum()

0

In [17]:
# Check to see if the shape of the datasets are similar before concatinating them
print(records_data.shape)
print(products.shape)

# Reset the index so that concatination can happen row-wise
products = products.reset_index()
records_data = records_data.reset_index()

(13423, 18830)
(13423, 4)


In [18]:
# concatinate the products sentiment series with the records DataFrame
products['item_name'] = products['name']
products_ = pd.concat([products['item_name'], products['sentiment'], records_data], axis = 1)

In [19]:
## Train our sentiment classifier
# Create first sentiment claddifier, and split data into test and training set
train_data, test_data = train_test_split(products_, test_size=0.8, random_state=0)

In [20]:
# create model matracies
x_train = train_data.drop(columns = ['sentiment', 'item_name'])
y_train = np.array(train_data['sentiment'])

x_test = test_data.drop(columns = ['sentiment', 'item_name'])
y_test = np.array(test_data['sentiment'])

In [21]:
# Run sentiment classification model
logit_model = LogisticRegression(penalty = 'none', max_iter = 200)
sentiment_model = logit_model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
# Predict the sentiment from the training data
train_data['predicted_sentiment'] = sentiment_model.predict(x_train)

# Predict the sentiment from the test data
test_data['predicted_sentiment'] = sentiment_model.predict(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [23]:
# See how the model does agianst in the training set
print(classification_report(train_data['sentiment'], train_data['predicted_sentiment']))

              precision    recall  f1-score   support

       False       0.88      0.73      0.80       429
        True       0.95      0.98      0.97      2255

    accuracy                           0.94      2684
   macro avg       0.91      0.86      0.88      2684
weighted avg       0.94      0.94      0.94      2684



In [24]:
# See how the model does agianst in the test set
print(classification_report(test_data['sentiment'], test_data['predicted_sentiment']))

              precision    recall  f1-score   support

       False       0.68      0.50      0.58      1706
        True       0.91      0.96      0.93      9033

    accuracy                           0.88     10739
   macro avg       0.79      0.73      0.75     10739
weighted avg       0.87      0.88      0.88     10739



In [25]:
# Predict sentiment as a probability 
full_prediction_set = products_.drop(columns = ['item_name', 'sentiment'])
products_['predicted_sentiment_Pos'] = sentiment_model.predict_proba(full_prediction_set)[:,1]

In [26]:
# Combine the review back into the dataset
products['review_of_item'] = products['review']
products_ = pd.concat([products_, products['review_of_item']], axis = 1)

In [27]:
# filter for desired product
giraffe_reviews = products_[products_['item_name']== 'Vulli Sophie the Giraffe Teether']

In [28]:
# Print dataset of desired product
giraffe_reviews

Unnamed: 0,item_name,sentiment,level_0,00,000,01,02,03,04,05,...,zoom,zooming,zooms,zooper,zt,zumba,zurich,zzz,predicted_sentiment_Pos,review_of_item
5853,Vulli Sophie the Giraffe Teether,True,5853,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.990888,He likes chewing on all the parts especially t...
5854,Vulli Sophie the Giraffe Teether,True,5854,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.999640,My son loves this toy and fits great in the di...
5855,Vulli Sophie the Giraffe Teether,False,5855,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.996109,There really should be a large warning on the ...
5856,Vulli Sophie the Giraffe Teether,True,5856,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.956139,All the moms in my moms' group got Sophie for ...
5857,Vulli Sophie the Giraffe Teether,True,5857,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.129746,I was a little skeptical on whether Sophie was...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6560,Vulli Sophie the Giraffe Teether,True,6560,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.881362,My baby looooves this teether! I think she li...
6561,Vulli Sophie the Giraffe Teether,True,6561,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.998665,My 5 month old baby loves it. Easy to hold and...
6562,Vulli Sophie the Giraffe Teether,True,6562,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.999057,My daughter has had this giraffe since she was...
6563,Vulli Sophie the Giraffe Teether,True,6563,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.997308,My Grandson loves his Sophie. Would recommend...


In [29]:
#Sort the Giraffe reviews according to predicted sentiment
giraffe_reviews = giraffe_reviews.sort_values(['predicted_sentiment_Pos'], ascending=False)

In [38]:
# print sorted sentiments
first_review = np.array(giraffe_reviews['review_of_item'])[0]
print('Predicted Positive Sentiment', np.array(giraffe_reviews['predicted_sentiment_Pos'])[0])
print(first_review)

Predicted Positive Sentiment 0.9999999999947886
Sophie, oh Sophie, your time has come. My granddaughter, Violet is 5 months old and starting to teeth. What joy little Sophie brings to Violet. Sophie is made of a very pliable rubber that is sturdy but not tough. It is quite easy for Violet to twist Sophie into unheard of positions to get Sophie into her mouth. The little nose and hooves fit perfectly into small mouths, and the drooling has purpose. The paint on Sophie is food quality.Sophie was born in 1961 in France. The maker had wondered why there was nothing available for babies and made Sophie from the finest rubber, phthalate-free on St Sophie's Day, thus the name was born. Since that time millions of Sophie's populate the world. She is soft and for babies little hands easy to grasp. Violet especially loves the bumpy head and horns of Sophie. Sophie has a long neck that easy to grasp and twist. She has lovely, sizable spots that attract Violet's attention. Sophie has happy little 

In [37]:
# get lowest sentiment probabilities
print('Predicted Positive Sentiment', np.array(giraffe_reviews['predicted_sentiment_Pos'].tail())[-1])
print(np.array(giraffe_reviews['review_of_item'].tail())[-1])


Predicted Positive Sentiment 0.000480259336881877
After reading endless reviews on this thing, I finally decided to dish out the money for this toy. Once I got it in the mail, I was already a little disappointed - it looks just like a rubber dog toy you buy from petsmart, squeaks like one too. I still wanted to try it out because I was desperate to find something to help alleviate my son's teething pain, but the dog was more interested than he was. It didn't seem to help him. I just wanted to share this for the moms on here who are willing to buy anything to help with their baby's teething pain. This isn't the toy. My son likes a cold pacifier, the silicone raspberry teether (costs only $4) or even the tip of his baby spoons better than this thing. It certainly shouldn't be amazon's #1 baby item.


In [39]:
# Show the 2nd most positive reviews
second_review = np.array(giraffe_reviews['review_of_item'])[1]
print('Predicted Positive Sentiment, Second', np.array(giraffe_reviews['predicted_sentiment_Pos'])[1])
print(second_review)

Predicted Positive Sentiment, Second 0.9999999996430238
As every mom knows, you always want to give your children the best you can.  Though I thought it was a bit expensive, I had read many good reviews about the Vulli Sophie Giraffe Teether and wanted to get one for my baby girl.  And because Sophie is made from all natural rubber and decorated with food-grade coloring, I felt it would be worth the extra money to have the peace of mind knowing that there are no toxic paints, plastics, or dangerous small parts for my daughter to ingest.  She has had an easy time teething (she has 2 lower center teeth with 4 coming in on top and 2 more on the bottom), but she still likes to chew on her little Sophie Giraffe.  Sophie's long skinny neck and legs are easy for my daughter's small hands to hold.  My little girl loves to nibble on Sophie's ears, horns, and feet.  Plus, I think this little giraffe is just so cute.  Also, just as an anecdote: When I first offered Sophie to my daughter, she was 

In [36]:
# get ROC curve for sentiment analysis results
sentiment_model.evaluate(test_data, metric="roc_curve")

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+--------------------+--------------------+-------+------+
 | threshold |        fpr         |        tpr         |   p   |  n   |
 +-----------+--------------------+--------------------+-------+------+
 |    0.0    |        1.0         |        1.0         | 27976 | 5328 |
 |   1e-05   | 0.847972972972973  | 0.9975693451529882 | 27976 | 5328 |
 |   2e-05   | 0.829954954954955  | 0.9971761509865599 | 27976 | 5328 |
 |   3e-05   | 0.818506006006006  | 0.9969616814412353 | 27976 | 5328 |
 |   4e-05   | 0.8109984984984985 | 0.9967472118959108 | 27976 | 5328 |
 |   5e-05   | 0.8057432432432432 | 0.9966042321990277 | 27976 | 5328 |
 |   6e-05   | 0.7991741741741741 | 0.9962825278810409 | 27976 | 5328 |
 |   7e-05   | 0.7952327327327328 | 0.9961752931083786 | 27976 | 5328 |
 |   8e-05   | 0.7920420420420421 | 0.9961038032599371 | 27976 | 5328 |
 |   9e-05   | 0.7882882882882