In [147]:
# Import Required Data Science Packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [121]:
# Upload the dataset
products = pd.read_csv('amazon_baby.csv').fillna('')

In [122]:
# View Head of the dataset
products.head()

Unnamed: 0,name,review,rating
0,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,This is a safe product used to clean the airwa...,4
1,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,I purchased this item after reading rave revie...,4
2,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,"Honestly, this product is not worth the $7.45 ...",1
3,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,My wife has a nightly ritual of cleaning my da...,4
4,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,Very useful tool. Highly recommand it! I am us...,5


In [123]:
# Calculate number or reviews for each product
# The table is also .sort() for most reviewed product
products.groupby('name').count()

Unnamed: 0_level_0,review,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1
,36,36
&quot;The Custodian.&quot; 2.5&quot; Wireless Monitor with 1 Tilt and Pan Camera From Safe Baby. Includes Free Power Pack Valued At $24.95,2,2
&quot;The Sentinel&quot; 3.5&quot; Portable Two Camera Monitor System From Safe Baby,1,1
(1) Cresci Products Window Wedge (2 Per Pack) WHITE color,25,25
"100% Lana Organic Red/Saffron Goose, Large - 12&quot;",1,1
...,...,...
eeBoo Fairies of the Field Lacing Cards,1,1
"i Play Soft And Snuggly Blanket, Pink",1,1
iPlay Tan Jungle Chichi Blankie,1,1
medibag 117 Piece Kid Friendly First Aid Kit for the Whole Family,14,14


In [124]:
## Explore a specific Product
# Filter data for the most review product
giraffe_reviews = products[products['name']=='Vulli Sophie the Giraffe Teether']

In [125]:
# Print our the head()
giraffe_reviews.head()

Unnamed: 0,name,review,rating
6475,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5
6476,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,5
6477,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,1
6478,Vulli Sophie the Giraffe Teether,All the moms in my moms' group got Sophie for ...,5
6479,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,5


In [126]:
# Get length (number) of the filtered dataset
len(giraffe_reviews)

774

In [127]:
# summarize the variable 'rating'
giraffe_reviews['rating'].describe()

count    774.000000
mean       4.294574
std        1.222280
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

In [128]:
# create filter for products with missing review
products = products[(products['review'] != '') | (products['rating'].isna() == False)]
# create filter and ignore all 3*  reviews
products = products[products['rating']!= 3]

In [129]:
products.shape

(13423, 3)

In [130]:
## Building a sentiment classifier and Build word count vectors

# Creates vectorized count for the words in the 'review' column
Vectorizer = CountVectorizer(analyzer='word')

# Create a list of the reviews as a list of character strings
records = list()
for i in products['review']:
    records.append(i)

# Apply the transform records to vectorized word count & create an array
records_vector = Vectorizer.fit_transform(records)
records_data = records_vector.toarray()

In [131]:
# Create a data from of word features for each record
records_data = pd.DataFrame(records_data, columns = Vectorizer.get_feature_names())

In [132]:
## Define what is positive and negative sentiment (Essential in sentiment analysis)
# look at ratings for all products
products['rating'].describe()

count    13423.000000
mean         4.234076
std          1.292855
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: rating, dtype: float64

In [133]:
#positive sentiment = 4-star or 5-star reviews
# Create sentiment column with 1 and 0 for criteria specificed
products['sentiment'] = products['rating'] >= 4

In [134]:
# print products dataset
products

Unnamed: 0,name,review,rating,sentiment
0,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,This is a safe product used to clean the airwa...,4,True
1,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,I purchased this item after reading rave revie...,4,True
2,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,"Honestly, this product is not worth the $7.45 ...",1,False
3,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,My wife has a nightly ritual of cleaning my da...,4,True
4,Pigeon Baby Nose Cleaning Tweezers Pigeon (Mad...,Very useful tool. Highly recommand it! I am us...,5,True
...,...,...,...,...
14866,Spongebob Squarepants Cuddle Pillow,My son said give it 5 stars so thats all that ...,5,True
14867,Spongebob Squarepants Cuddle Pillow,This is fairly big and the price is awesome co...,5,True
14868,Spongebob Squarepants Cuddle Pillow,This was a great purchase for my 2 year old ni...,5,True
14869,Spongebob Squarepants Cuddle Pillow,Most of the reviews I read before buying the p...,5,True


In [135]:
# See breakdown of postive and negative sentiments
products['sentiment'].isna().sum()

0

In [136]:
# Check to see if the shape of the datasets are similar before concatinating them
print(records_data.shape)
print(products.shape)

# Reset the index so that concatination can happen row-wise
products = products.reset_index()
records_data = records_data.reset_index()

(13423, 18830)
(13423, 4)


In [137]:
# concatinate the products sentiment series with the records DataFrame
products_ = pd.concat([products['name'], products['sentiment'], records_data], axis = 1)

In [138]:
## Train our sentiment classifier
# Create first sentiment claddifier, and split data into test and training set
train_data, test_data = train_test_split(products_, test_size=0.8, random_state=0)

In [139]:
# create model matracies
x_train = train_data.drop(columns = ['sentiment'])
y_train = np.array(train_data['sentiment'])

x_test = test_data.drop(columns = ['sentiment'])
y_test = np.array(test_data['sentiment'])

In [140]:
# Run sentiment classification model
logit_model = LogisticRegression(penalty = 'none', max_iter = 200)
sentiment_model = logit_model.fit(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [156]:
# Predict the sentiment from the training data
train_data['predicted_sentiment'] = sentiment_model.predict(x_train)

# Predict the sentiment from the test data
test_data['predicted_sentiment'] = sentiment_model.predict(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [157]:
test_data

Unnamed: 0,sentiment,level_0,00,000,01,02,03,04,05,06,...,zoom,zooming,zooms,zooper,zt,zumba,zurich,zzz,predicted_probability,predicted_sentiment
489,True,489,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000145,True
12238,True,12238,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.032929,True
3824,False,3824,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.392024,True
4462,True,4462,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.032311,True
3873,True,3873,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.009587,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5414,False,5414,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.391985,True
6657,True,6657,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.004107,True
3550,True,3550,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000085,True
5154,True,5154,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000780,True


In [148]:
# See how the model does agianst in the training set
print(classification_report(train_data['sentiment'], train_data['predicted_sentiment']))

              precision    recall  f1-score   support

       False       0.86      0.70      0.77       429
        True       0.94      0.98      0.96      2255

    accuracy                           0.93      2684
   macro avg       0.90      0.84      0.87      2684
weighted avg       0.93      0.93      0.93      2684



In [158]:
# See how the model does agianst in the test set
print(classification_report(test_data['sentiment'], test_data['predicted_sentiment']))

              precision    recall  f1-score   support

       False       0.79      0.59      0.68      1706
        True       0.93      0.97      0.95      9033

    accuracy                           0.91     10739
   macro avg       0.86      0.78      0.81     10739
weighted avg       0.91      0.91      0.91     10739



In [154]:
# Apply new model to reviews of 'Giraffe' product
# create new sentiment probability variable with predicted values uing .predict()
test_data['predicted_probability'] = sentiment_model.predict_proba(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [161]:
# Predict sentiment accross the entire dataset
full_prediction_set = products_.drop(columns = ['sentiment'])
products_['predicted_sentiment'] = sentiment_model.predict(full_prediction_set)


In [163]:
# See how the model does accross the entire dataset
print(classification_report(products_['sentiment'], products_['predicted_sentiment']))

              precision    recall  f1-score   support

       False       0.81      0.61      0.70      2135
        True       0.93      0.97      0.95     11288

    accuracy                           0.92     13423
   macro avg       0.87      0.79      0.82     13423
weighted avg       0.91      0.92      0.91     13423



In [None]:
# Left off here

In [24]:
# filter for desired product
giraffe_reviews = products[products['name']== 'Vulli Sophie the Giraffe Teether']

In [25]:
# Print dataset of desired product
giraffe_reviews

name,review,rating,word_count,sentiment
Vulli Sophie the Giraffe Teether ...,He likes chewing on all the parts especially the ...,5,"{'purchase': 1.0, 'teething': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,My son loves this toy and fits great in the diaper ...,5,"{'a': 1.0, 'is': 1.0, 'when': 1.0, 'him': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,There really should be a large warning on the ...,1,"{'made': 1.0, 'of': 1.0, 'packaging': 1.0, 'no': ...",0
Vulli Sophie the Giraffe Teether ...,All the moms in my moms' group got Sophie for ...,5,"{'another': 1.0, 'out': 1.0, 'run': 1.0, 'lost': ...",1
Vulli Sophie the Giraffe Teether ...,I was a little skeptical on whether Sophie was ...,5,"{'disappointed': 1.0, 'will': 1.0, 'take': ...",1
Vulli Sophie the Giraffe Teether ...,I have been reading about Sophie and was going ...,5,"{'late': 1.0, 'perfect': 1.0, 'pack': 1.0, 'on ...",1
Vulli Sophie the Giraffe Teether ...,My neice loves her sophie and has spent hours ...,5,"{'delight': 1.0, 'in': 1.0, 'other': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,What a friendly face! And those mesmerizing ...,5,"{'inside': 1.0, 'water': 1.0, 'don': 1.0, 'up': ...",1
Vulli Sophie the Giraffe Teether ...,We got this just for my son to chew on instea ...,5,"{'its': 1.0, 'fine': 1.0, 'is': 1.0, 'which': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,This product is without a doubt the best on the ...,5,"{'belongs': 1.0, 'dog': 1.0, 'if': 1.0, 'bewa ...",1

predicted_sentiment
0.999365536568231
0.999863379168963
0.2545268197811786
0.9165688083915216
0.6855768205886252
0.99999994452112
0.9979351181093526
0.9999745004834384
0.9460144428357088
0.9999999721665868


In [26]:
#Sort the Giraffe reviews according to predicted sentiment
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)

In [28]:
# print sorted sentiments
giraffe_reviews

name,review,rating,word_count,sentiment
Vulli Sophie the Giraffe Teether ...,"Sophie, oh Sophie, your time has come. My ...",5,"{'11': 1.0, 'prisrob': 1.0, '12': 1.0, 'who': ...",1
Vulli Sophie the Giraffe Teether ...,I'll be honest...I bought this toy because all the ...,4,"{'around': 1.0, 'explore': 1.0, 'they': ...",1
Vulli Sophie the Giraffe Teether ...,As a mother of 16month old twins; I bought ...,5,"{'will': 1.0, '15months': 1.0, 'would': 2.0, ...",1
Vulli Sophie the Giraffe Teether ...,We got this little giraffe as a gift from a ...,5,"{'out': 1.0, 'would': 1.0, 've': 1.0, 'enou ...",1
Vulli Sophie the Giraffe Teether ...,"As every mom knows, you always want to give your ...",5,"{'whether': 1.0, 'neutral': 1.0, 'gend ...",1
Vulli Sophie the Giraffe Teether ...,My Mom-in-Law bought Sophie for my son whe ...,5,"{'penny': 1.0, 'little': 1.0, 'perfect': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,"My 4 month old son is teething, and I've tried ...",4,"{'worth': 1.0, 'works': 1.0, 'teether': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,Let me just start off by addressing the choking ...,5,"{'question': 1.0, 'must': 1.0, 'overall': 1.0, ...",1
Vulli Sophie the Giraffe Teether ...,I'm not sure why Sophie is such a hit with the ...,4,"{'makers': 1.0, 'or': 1.0, 'take': 1.0, 'can': ...",1
Vulli Sophie the Giraffe Teether ...,"I admit, I didn't get Sophie the Giraffe at ...",4,"{'dye': 1.0, 'of': 1.0, 'cause': 1.0, 'fade': ...",1

predicted_sentiment
1.0
1.0
1.0
0.9999999999998376
0.9999999999998284
0.9999999999997958
0.9999999999994914
0.9999999999941254
0.999999999987423
0.9999999999829476


In [29]:
# get lowest sentiment probabilities
giraffe_reviews.tail()

name,review,rating,word_count,sentiment
Vulli Sophie the Giraffe Teether ...,My 7 1/2 month old grandbaby was sittng up ...,1,"{'sopie': 1.0, 'with': 1.0, 'be': 1.0, 'shou ...",0
Vulli Sophie the Giraffe Teether ...,I was so looking forward to getting this for my ...,1,"{'amazon': 1.0, 'back': 1.0, 'by': 1.0, 'more': ...",0
Vulli Sophie the Giraffe Teether ...,Received the product and smells like cheap rub ...,1,"{'was': 1.0, 'as': 1.0, 'still': 1.0, 'but': ...",0
Vulli Sophie the Giraffe Teether ...,I got one of these as a showe gift that my baby ...,1,"{'typical': 1.0, 'paint': 1.0, 'came': 1.0, 'mi ...",0
Vulli Sophie the Giraffe Teether ...,When we first got it I thought did my ...,2,"{'mom': 1.0, 'teethers': 1.0, 'btw': 1.0, 'oth ...",0
Vulli Sophie the Giraffe Teether ...,When I received this the paint was peeling off in ...,1,"{'risk': 1.0, 'but': 1.0, 'help': 1.0, 'terrible': ...",0
Vulli Sophie the Giraffe Teether ...,"""Sophie the Giraffe"" has tested positive for ...",1,"{'ingest': 1.0, 'they': 1.0, 'naturally': 1.0, ...",0
Vulli Sophie the Giraffe Teether ...,"My son (now 2.5) LOVED his Sophie, and I bought ...",1,"{'exaggerated': 1.0, 'are': 1.0, 'a': 3.0, ...",0
Vulli Sophie the Giraffe Teether ...,I wanted to love this product and was excited ...,1,"{'risk': 1.0, 'ready': 1.0, 'bummed': 1.0, ...",0
Vulli Sophie the Giraffe Teether ...,This children's toy is nostalgic and very cute. ...,1,"{'amazon': 1.0, 'from': 1.0, 'one': 1.0, 'by': ...",0

predicted_sentiment
1.911074019567032e-06
6.547801686062032e-07
3.8642363974430533e-07
2.2378225159403466e-07
1.93539552818285e-07
1.6900753980991787e-07
1.1748826795728057e-07
5.3649334270240895e-11
3.109035270466724e-13
4.119269639402301e-15


In [30]:
# Show the most positive reviews
giraffe_reviews[0]['review']

"Sophie, oh Sophie, your time has come. My granddaughter, Violet is 5 months old and starting to teeth. What joy little Sophie brings to Violet. Sophie is made of a very pliable rubber that is sturdy but not tough. It is quite easy for Violet to twist Sophie into unheard of positions to get Sophie into her mouth. The little nose and hooves fit perfectly into small mouths, and the drooling has purpose. The paint on Sophie is food quality.Sophie was born in 1961 in France. The maker had wondered why there was nothing available for babies and made Sophie from the finest rubber, phthalate-free on St Sophie's Day, thus the name was born. Since that time millions of Sophie's populate the world. She is soft and for babies little hands easy to grasp. Violet especially loves the bumpy head and horns of Sophie. Sophie has a long neck that easy to grasp and twist. She has lovely, sizable spots that attract Violet's attention. Sophie has happy little squeaks that bring squeals of delight from Viol

In [31]:
# Show the 2nd most positive reviews
giraffe_reviews[1]['review']

'I\'ll be honest...I bought this toy because all the hip parents seem to have one too and I wanted to be a part of the "hip parent" crowd. The price-tag was somewhat of a deterent but I prevailed and purchased this teether for my daughter.At first, Lily didn\'t know what to make of of Sophie and showed little interest in the polka-dotted creature. I continued to introduce Lily to Sophie and kept the toy in the carrier so that it was on-hand during transitions. Eventually, Lily discovered what a wonderful experience it was to gnaw on the hooves and ears and these two have never been far apart since.Lily really enjoys gumming all the different parts of Sophie like no other teether we have. The size of the toy is great as it is somewhat substantial and so easy for a little one to grasp and hold onto. Lily really enjoys hearing Sophie squeak and will smile whenever Sophie makes a noise or pops her head up from Mommy\'s lap to say hello.People have stopped and commented on Sophie and to the

In [32]:
# Most negative reivews
giraffe_reviews[-1]['review']

"This children's toy is nostalgic and very cute. However, there is a distinct rubber smell and a very odd taste, yes I tried it, that my baby did not enjoy. Also, if it is soiled it is extremely difficult to clean as the rubber is a kind of porus material and does not clean well. The final thing is the squeaking device inside which stopped working after the first couple of days. I returned this item feeling I had overpaid for a toy that was defective and did not meet my expectations. Please do not be swayed by the cute packaging and hype surounding it as I was. One more thing, I was given a full refund from Amazon without any problem."

In [33]:
# 2nd Most negative reivews
giraffe_reviews[-2]['review']

'I wanted to love this product and was excited to buy it when I became pregnant but am now hesitant to let my baby use it after reading about the recall in Europe. Apparently, as I understand it, their toxin standards of measurement are lower than ours so they have not been recalled here (apparently we are OK with low levels of nitrates in the toys our children put in their mouths, but Europeans are not...hmmm)...Be that as it may, toxins registering even CLOSE to a dangerous level made me nervous about using. After digging around online I did discover that the company claims to have changed the product after a certain date and lists manufacturing codes so you can check yours (those listed were made after a certain date and are said to be safer). Sadly mine was not made after the &#34;improved&#34; date but I could not return it because there was no formal recall in our country. I considered returning it and hunting for one with an approved manufacturing date but man that was just too 

In [34]:
# Evaluate the sentiment_model for accuracy
sentiment_model.evaluate(test_data)

{'accuracy': 0.9176975738650012,
 'auc': 0.9342357833151299,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  1397 |
 |      1       |        0        |  1344 |
 |      0       |        0        |  3931 |
 |      1       |        1        | 26632 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.951057941255245,
 'log_loss': 0.33047871872320717,
 'precision': 0.9501587641371436,
 'recall': 0.9519588218472976,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+--------------------+--------------------+-------+------+
 | threshold |        fpr         |        tpr         |   p   |  n   |
 +-----------+--------------------+--------------------+-------+------+
 |    0.

In [36]:
# get ROC curve for sentiment analysis results
sentiment_model.evaluate(test_data, metric="roc_curve")

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+--------------------+--------------------+-------+------+
 | threshold |        fpr         |        tpr         |   p   |  n   |
 +-----------+--------------------+--------------------+-------+------+
 |    0.0    |        1.0         |        1.0         | 27976 | 5328 |
 |   1e-05   | 0.847972972972973  | 0.9975693451529882 | 27976 | 5328 |
 |   2e-05   | 0.829954954954955  | 0.9971761509865599 | 27976 | 5328 |
 |   3e-05   | 0.818506006006006  | 0.9969616814412353 | 27976 | 5328 |
 |   4e-05   | 0.8109984984984985 | 0.9967472118959108 | 27976 | 5328 |
 |   5e-05   | 0.8057432432432432 | 0.9966042321990277 | 27976 | 5328 |
 |   6e-05   | 0.7991741741741741 | 0.9962825278810409 | 27976 | 5328 |
 |   7e-05   | 0.7952327327327328 | 0.9961752931083786 | 27976 | 5328 |
 |   8e-05   | 0.7920420420420421 | 0.9961038032599371 | 27976 | 5328 |
 |   9e-05   | 0.7882882882882

In [37]:
## Next Model
# The list of selected words to check sentiment
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [38]:
# loop through each word
## define function for each word
### write condition and return the answer
#### attach new variable to data
for item in selected_words:
    def select_word_count(dic):
            if item in dic:
                return dic[item]
            else: 
                return 0
    products[item] = products["word_count"].apply(select_word_count)

In [39]:
# print to check if new variables made
selected_words_data = products
selected_words_data

name,review,rating,word_count,sentiment
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5,"{'recommend': 1.0, 'highly': 1.0, ...",1
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5,"{'quilt': 1.0, 'of': 1.0, 'the': 1.0, 'than': 1.0, ...",1
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5,"{'tool': 1.0, 'clever': 1.0, 'approach': 2.0, ...",1
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5,"{'rock': 1.0, 'many': 1.0, 'headaches': 1.0, ...",1
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5,"{'thumb': 1.0, 'or': 1.0, 'break': 1.0, 'trying': ...",1
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4,"{'for': 1.0, 'barnes': 1.0, 'at': 1.0, 'is': ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5,"{'right': 1.0, 'because': 1.0, 'questions': 1.0, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5,"{'like': 1.0, 'and': 1.0, 'changes': 1.0, 'the': ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4,"{'in': 1.0, 'pages': 1.0, 'out': 1.0, 'run': 1.0, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4,"{'tracker': 1.0, 'now': 1.0, 'its': 1.0, 'sti ...",1

predicted_sentiment,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0.9997307390047092,0.0,0.0,0.0,0.0,1.0,0,0,0.0,0,0,0
0.9985083368316608,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.999748904249988,0.0,0.0,0.0,0.0,2.0,0,0,0.0,0,0,0
0.9999916625399972,0.0,1.0,0.0,0.0,1.0,0,0,0.0,0,0,0
0.9999999514462168,0.0,1.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.9999146735569904,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.9999916615904652,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.9999938843594008,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0,0,0
0.9961247617006423,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.9999999920460632,0.0,0.0,0.0,0.0,2.0,0,0,0.0,0,0,0


In [40]:
# For loop to compare the words that show up the most
for item in selected_words:
    print(item, selected_words_data[item].sum())

awesome 3892.0
great 55791.0
fantastic 1664.0
amazing 2628.0
love 41994.0
horrible 1110
bad 4183
terrible 1146.0
awful 687
wow 425
hate 1107


In [41]:
# Split dataset to test and training for next model
train_data,test_data = products.random_split(.8, seed=0)

In [42]:
selected_words_model = turicreate.logistic_classifier.create(train_data,target='sentiment', features=selected_words, validation_set=test_data)

In [43]:
# get selected words model coefficients
selected_words_model.coefficients

name,index,class,value,stderr
(intercept),,1,1.3365913848877726,0.0089299697876559
awesome,,1,1.133534666034134,0.0839964398318756
great,,1,0.8630655001196439,0.0189550524443766
fantastic,,1,0.88580475688142,0.1116759129339965
amazing,,1,1.100093311366018,0.0995477626046598
love,,1,1.359268866922504,0.0280683001520995
horrible,,1,-2.251335236759102,0.0802024938878842
bad,,1,-0.9914778800650664,0.0384842866469906
terrible,,1,-2.2236614360851346,0.0773173620378574
awful,,1,-2.05290820403136,0.1009973543525924


In [44]:
# evaluate model to get selected word accuracy
selected_words_model.evaluate(test_data)

{'accuracy': 0.8463848186404036,
 'auc': 0.6936022046674926,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |  159  |
 |      0       |        0        |  371  |
 |      0       |        1        |  4957 |
 |      1       |        1        | 27817 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9157860082304526,
 'log_loss': 0.39622654670876317,
 'precision': 0.8487520595594068,
 'recall': 0.9943165570488991,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+--------------------+-----+-------+------+
 | threshold |        fpr         | tpr |   p   |  n   |
 +-----------+--------------------+-----+-------+------+
 |    0.0    |        1.0         | 1.0 | 27976 | 53