In [1]:
# !pip install pandas
# !pip install numpy
# !pip install nltk
# !pip install bs4

import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sanavesa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Ensure reproducibility
Use a fixed seed such that all steps and results can be reproduced.

In [3]:
# seed handpicked to ensure all of the cleaning/pre-processing steps were visually shown
np.random.seed(544)

# credits: google, used to hide bs4 warnings (MarkupResemblesLocatorWarning)
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

## Read Data
Load the data locally while skipping on lines that contains errors.

In [4]:
# Load the data locally, can be done using the URL as well
fname = 'amazon_reviews_us_Kitchen_v1_00.tsv.gz'
original_data = pd.read_csv(fname, sep='\t', compression='gzip', on_bad_lines='skip')

In [5]:
# Make the notebook's display fn use the entire screen width
pd.set_option('display.max_colwidth', None)

## Keep Reviews and Ratings
Strip the dataset to use only the two columns we're interested in, while dropping all rows that have missing values. Finally, we display statistics about the dataset we'll be working with.

In [6]:
# Keep only the 2 columns we need and remove rows that have missing values
data = original_data[['star_rating', 'review_body']].copy()
data.dropna(inplace=True)

# Display statistics about the dataset
print(f'There are {len(data.index)} reviews in the dataset.')
print(f"Review rating's average is {data.star_rating.mean():.3f} with a standard deviation of {data.star_rating.std():.3f}.")
print()

for i in range(1, 6):
    print(f'There are {len(data[ data.star_rating == i ])} reviews with a rating of {i}.')
print()

print('Three sample reviews:')
display(data.sample(n=3, ignore_index=True))

There are 4874644 reviews in the dataset.
Review rating's average is 4.207 with a standard deviation of 1.287.

There are 426870 reviews with a rating of 1.
There are 241939 reviews with a rating of 2.
There are 349539 reviews with a rating of 3.
There are 731701 reviews with a rating of 4.
There are 3124595 reviews with a rating of 5.

Three sample reviews:


Unnamed: 0,star_rating,review_body
0,5.0,Excelente!!
1,1.0,Coffee does not taste good from this percolator. I am very disappointed in this product.
2,5.0,"My husband and I love this set! It is beautiful! Even though it only has one plate, it is a fairly large plate. This makes it perfect for sharing a nice, romantic sushi dinner for two."


# Labelling Reviews:
## The reviews with rating 4,5 are labelled to be 1 and 1,2 are labelled as 0. Discard the reviews with rating 3'

In [7]:
# Gather statistics for the three class (negative/positive/neutral sentiment)
count_negative = len(data[ data['star_rating'].isin([1, 2]) ])
count_neutral = len(data[ data['star_rating'].isin([3]) ])
count_positive = len(data[ data['star_rating'].isin([4, 5]) ])

print(f'There are {count_positive} reviews with positive sentiment.')
print(f'There are {count_negative} reviews with negative sentiment.')
print(f'There are {count_neutral} reviews with neutral sentiment (discarded).')

# Discard reviews with rating of 3
filter_index = data[ data['star_rating'] == 3 ].index
data.drop(filter_index, inplace=True)

# Map ratings to sentiment
rating_mapping = {1: 0, 2: 0, 4: 1, 5: 1}
data['star_rating'].replace(rating_mapping, inplace=True)
data['star_rating'] = data['star_rating'].astype('int8')

There are 3856296 reviews with positive sentiment.
There are 668809 reviews with negative sentiment.
There are 349539 reviews with neutral sentiment (discarded).


## We select 200000 reviews randomly with 100,000 positive and 100,000 negative reviews.
Ensure the dataset we'll be working with contains an equal amount of cases for each class to ensure proper and fair training.


In [8]:
# Get 100k samples for each class, and then concatenate them making it the data we'll use
positive_reviews = data[ data['star_rating'] == 1 ].sample(100000)
negative_reviews = data[ data['star_rating'] == 0 ].sample(100000)
data = pd.concat([positive_reviews, negative_reviews])

# Data Cleaning

## Convert the all reviews into the lower case.
By converting all reviews to lowercase, we reduce the computational power needed to train the model.

In [9]:
# Create a new column for cleaned reviews, which are lowercase
data['cleaned_reviews'] = data['review_body'].str.lower()

## Remove the HTML and URLs from the reviews
Use BeautifulSoup and regular expressions to remove all HTML tags aswell as HTTP URLs from the reviews as they provide little to no benefit for our purposes of sentiment analysis.

In [10]:
# Function to remove HTML tags and URLs from a string
def sanitize_review(text):
    # remove HTML tags
    text = BeautifulSoup(str(text), 'html.parser').get_text()   
    # remove URLS
    text = re.sub(r'http\S+', '', str(text))
    return text

data['cleaned_reviews'] = data['cleaned_reviews'].apply(sanitize_review)

## Perform contractions on the reviews
We also standarize all contractions to their expanded forms to reduce permutations of each word for a faster/streamlined model. This is achieved by utilizing the contractions library as it contains a plethora of pre-defined mappings.

In [11]:
# !pip install contractions
import contractions

def fix_contractions(text):
    return contractions.fix(text)

# Use a library for the contractions as it includes a plethora of pre-defined contractions
data['cleaned_reviews'] = data['cleaned_reviews'].apply(fix_contractions)

## Remove non-alphabetical characters
Remove numbers, signs, and non-English characters as they serve little-to-no purpose in our sentiment analysis, so they're not worth the overhead. This is done using a regular expression.

In [12]:
# Remove all characters but keep english characters and space
data['cleaned_reviews'] = data['cleaned_reviews'].str.replace('[^a-zA-Z\s]', ' ')

  


## Remove the extra spaces between the words
Similarly, extra spaces is wasteful and provides no benefit other than using more resources. As such, it is removed so that the reviews are as efficiently compact as possible.

In [13]:
# Remove all unnecessary spaces
def remove_extra_spaces(text):
    return ' '.join(str(text).split())

data['cleaned_reviews'] = data['cleaned_reviews'].apply(remove_extra_spaces)

## Print cleaning results
Show the effect of the cleaning step on the reviews by calculating the mean character length.

In [14]:
# Calculate mean review character length of before/after cleaning
avg_before_clean = data['review_body'].apply(lambda x: len(str(x))).mean()
avg_after_clean = data['cleaned_reviews'].apply(lambda x: len(str(x))).mean()
print(f'Before cleaning: {avg_before_clean:.0f} characters.')
print(f'After cleaning: {avg_after_clean:.0f} characters.')

Before cleaning: 323 characters.
After cleaning: 309 characters.


# Pre-processing

## Remove the stop words
Using the NLTK stopwords, we remove those stopwords to further reduce the size of the reviews to the bare minimum since stopwords are usually filler words that have little impact on the final result.

In [15]:
from nltk.corpus import stopwords

nltk.download('stopwords')
stop = stopwords.words('english')

# Split each review into a list of words, then eliminate those words that are in the stopwords set as provided by nltk
def remove_stop_words(text):
    return ' '.join([word for word in str(text).split() if word not in (stop)])

data['processed_reviews'] = data['cleaned_reviews'].apply(remove_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sanavesa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Perform lemmatization
Moreover, to reduce the review size, words are lemmatized to reduce inflections and word permutations to the bare minimum so that the next step, the features extracted can be as small as possible with no redundancies.

In [16]:
from nltk.stem import WordNetLemmatizer

tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in tokenizer.tokenize(text)])

# Use NLTK lemmatizer with verb as its part of speech to reduce inflections
data['processed_reviews'] = data['processed_reviews'].apply(lemmatize)

## Print pre-processing results
Show the effect of the pre-processing step on the reviews by calculating the mean character length.

In [17]:
# Calculate mean review character length of before/after pre-processing
avg_before_preprocessing = data['cleaned_reviews'].apply(lambda x: len(str(x))).mean()
avg_after_preprocessing = data['processed_reviews'].apply(lambda x: len(str(x))).mean()
print(f'Before pre-processing: {avg_before_preprocessing:.0f} characters.')
print(f'After pre-processing: {avg_after_preprocessing:.0f} characters.')

# Display three sample reviews for before/after pre-processing
display(data.sample(n=3, ignore_index=True))

Before pre-processing: 309 characters.
After pre-processing: 183 characters.


Unnamed: 0,star_rating,review_body,cleaned_reviews,processed_reviews
0,1,"We love our Popper! It pops almost every kernel to a perfect crunch and I love that we are eating healthy popcorn, without preservatives,for a fraction of the price than bagged microwave popcorn that's loaded with unhealthy additives.",we love our popper it pops almost every kernel to a perfect crunch and i love that we are eating healthy popcorn without preservatives for a fraction of the price than bagged microwave popcorn that is loaded with unhealthy additives,love popper pop almost every kernel perfect crunch love eat healthy popcorn without preservatives fraction price bag microwave popcorn load unhealthy additives
1,1,"In response to the above review... that touches on the quality of steel, the workmanship of the handle, etc.... but ultimately says the knife looks like it will perform as advertised (and I'm guessing DID... since there is no follow up...). I'd like to say a few things. First off... this cutter is made from American Metalcraft.... and yes, I know the name is a misnomer since their stuff is made in China (all of it as far as I know)... but, you should be doing your research prior to purchasing anyways (never judge a book by its cover, unless you don't care.), so this should not be surprising. Secondly, American Metalcraft is WIDELY regarded as an &#34;industry standard&#34; company, nobody claims that it is the Cadillac of the restaurant industry. Basically, they are the Honda of the culinary world. You don't buy a Honda, then bring it home, go online and point out that your Honda isn't performing like a Mercedes Benz. This cutter is EXACTLY where it should be.... it is priced as a mid-grade cutter, and that is exactly what it is. This product is designed to be a work horse, and not a showroom piece. I just ordered mine today (I have seen them in person in the past however), so I have not yet received it.... but I'm BEYOND sure that it will be up to the standard of all the previous AM products I've purchased. NOT the Mercedes Standard, but the workhorse reliable standard that I have personally experienced from this company over the last few years. So strong is that belief that I was motivated to say something here before mine even shipped. It takes a good company and a great product (again, for the price) to gain that kind of loyalty and credit from a discerning customer, which I consider myself to be. No offense meant to those who have had different experiences. These are just my personal thoughts. But, I think it's VERY unfair to give others the impression that they are looking at a shoddy product, and therefore encouraging them to spend more, when the product shown is a very solid piece, that will probably outperform some other cutters in a higher price bracket. I have worked at three pizza shops, and intend to open my own one day. I don't know a lot but I know the pizza industry, and American Metalcraft is one of the companies I have LEARNED to rely heavily on. In fairness, my experience is limited to their: Pans, trays, cutting wheels, perforated discs, screens, and soon: This beauty.",in response to the above review that touches on the quality of steel the workmanship of the handle etc but ultimately says the knife looks like it will perform as advertised and I am guessing did since there is no follow up I would like to say a few things first off this cutter is made from american metalcraft and yes i know the name is a misnomer since their stuff is made in china all of it as far as i know but you should be doing your research prior to purchasing anyways never judge a book by its cover unless you do not care so this should not be surprising secondly american metalcraft is widely regarded as an industry standard company nobody claims that it is the cadillac of the restaurant industry basically they are the honda of the culinary world you do not buy a honda then bring it home go online and point out that your honda is not performing like a mercedes benz this cutter is exactly where it should be it is priced as a mid grade cutter and that is exactly what it is this product is designed to be a work horse and not a showroom piece i just ordered mine today i have seen them in person in the past however so i have not yet received it but I am beyond sure that it will be up to the standard of all the previous am products I have purchased not the mercedes standard but the workhorse reliable standard that i have personally experienced from this company over the last few years so strong is that belief that i was motivated to say something here before mine even shipped it takes a good company and a great product again for the price to gain that kind of loyalty and credit from a discerning customer which i consider myself to be no offense meant to those who have had different experiences these are just my personal thoughts but i think it is very unfair to give others the impression that they are looking at a shoddy product and therefore encouraging them to spend more when the product shown is a very solid piece that will probably outperform some other cutters in a higher price bracket i have worked at three pizza shops and intend to open my own one day i do not know a lot but i know the pizza industry and american metalcraft is one of the companies i have learned to rely heavily on in fairness my experience is limited to their pans trays cutting wheels perforated discs screens and soon this beauty,response review touch quality steel workmanship handle etc ultimately say knife look like perform advertise I guess since follow I would like say things first cutter make american metalcraft yes know name misnomer since stuff make china far know research prior purchase anyways never judge book cover unless care surprise secondly american metalcraft widely regard industry standard company nobody claim cadillac restaurant industry basically honda culinary world buy honda bring home go online point honda perform like mercedes benz cutter exactly price mid grade cutter exactly product design work horse showroom piece order mine today see person past however yet receive I beyond sure standard previous products I purchase mercedes standard workhorse reliable standard personally experience company last years strong belief motivate say something mine even ship take good company great product price gain kind loyalty credit discern customer consider offense mean different experience personal thoughts think unfair give others impression look shoddy product therefore encourage spend product show solid piece probably outperform cutters higher price bracket work three pizza shop intend open one day know lot know pizza industry american metalcraft one company learn rely heavily fairness experience limit pan trays cut wheel perforate discs screen soon beauty
2,1,My daughter has always wanted an ice cream maker. I bought her the ice cream maker attachment for her kitchen aid mixer for her birthday. She was surprised and VERY HAPPY. She makes ice cream at least once a week. It's great!,my daughter has always wanted an ice cream maker i bought her the ice cream maker attachment for her kitchen aid mixer for her birthday she was surprised and very happy she makes ice cream at least once a week it is great,daughter always want ice cream maker buy ice cream maker attachment kitchen aid mixer birthday surprise happy make ice cream least week great


# Training and Testing data split
Split the data into two distinct parts (80% training, 20% testing) so that there is no overlap. This is done to ensure no data leakage nor bias influences the training and we can have a better view of the training process (if it overfitted for example).

In [18]:
from sklearn.model_selection import train_test_split

# Perform an 80-20 split for training and testing data (using the cleaned+pre-processed reviews)
review_train, review_test, y_train, y_test = train_test_split(data['processed_reviews'], data['star_rating'], test_size=0.2)

# TF-IDF Feature Extraction
Compute tf-idf to reflect how important a word is to a document/corpus which is used to train the models below.
Note that the TF-IDF extraction is done after splitting the dataset into a training and testing dataset. This is to ensure no data leakage occurs from the testing to the training through TF-IDF.

In [19]:
# Fit TF-IDF on training data only to prevent data leakage; apply transformation to test data
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(review_train)
X_test = vectorizer.transform(review_test)

## Helper function to report results for each model

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def report_results(text, y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred)
    
    print(f'{text}: accuracy is {accuracy:.3f}.')
    print(f'{text}: precision is {precision:.3f}.')
    print(f'{text}: recall is {recall:.3f}.')
    print(f'{text}: f1-score is {f1:.3f}.')
    print()

# Perceptron
Use a perceptron classifier to train on the supplied dataset.

In [29]:
print(data['processed_reviews'].shape)
print(review_train.shape)

(200000,)
(160000,)


In [21]:
from sklearn.linear_model import Perceptron

model = Perceptron()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

report_results('Perceptron Training', y_train, y_train_pred)
report_results('Perceptron Testing', y_test, y_test_pred)

Perceptron Training: accuracy is 0.886.
Perceptron Training: precision is 0.921.
Perceptron Training: recall is 0.886.
Perceptron Training: f1-score is 0.881.

Perceptron Testing: accuracy is 0.847.
Perceptron Testing: precision is 0.881.
Perceptron Testing: recall is 0.847.
Perceptron Testing: f1-score is 0.839.



# SVM
Use an SVM classifier to train on the supplied dataset.

In [22]:
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

report_results('SVM Training', y_train, y_train_pred)
report_results('SVM Testing', y_test, y_test_pred)

SVM Training: accuracy is 0.929.
SVM Training: precision is 0.931.
SVM Training: recall is 0.929.
SVM Training: f1-score is 0.929.

SVM Testing: accuracy is 0.894.
SVM Testing: precision is 0.895.
SVM Testing: recall is 0.894.
SVM Testing: f1-score is 0.894.



# Logistic Regression
Use a Logistic Regression classifier to train on the supplied dataset with its max iterations parameter increased to 1500 to ensure that the model converges.

In [23]:
from sklearn.linear_model import LogisticRegression

# Increase the max iterations from 100 to 1500 to ensure the model converges
model = LogisticRegression(max_iter=1500)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

report_results('Logistic Regression Training', y_train, y_train_pred)
report_results('Logistic Regression Testing', y_test, y_test_pred)

Logistic Regression Training: accuracy is 0.910.
Logistic Regression Training: precision is 0.914.
Logistic Regression Training: recall is 0.910.
Logistic Regression Training: f1-score is 0.910.

Logistic Regression Testing: accuracy is 0.897.
Logistic Regression Testing: precision is 0.899.
Logistic Regression Testing: recall is 0.897.
Logistic Regression Testing: f1-score is 0.896.



# Naive Bayes
Use a Multinomial Naive Bayes classifier to train on the supplied dataset.

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

report_results('Naive Bayes Training', y_train, y_train_pred)
report_results('Naive Bayes Testing', y_test, y_test_pred)

Naive Bayes Training: accuracy is 0.882.
Naive Bayes Training: precision is 0.889.
Naive Bayes Training: recall is 0.882.
Naive Bayes Training: f1-score is 0.880.

Naive Bayes Testing: accuracy is 0.867.
Naive Bayes Testing: precision is 0.874.
Naive Bayes Testing: recall is 0.867.
Naive Bayes Testing: f1-score is 0.866.

