In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import nltk          # Natural Language Toolkit for text processing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download WordNet data
nltk.download('stopwords')   # Download StopWords data
import re            # Regular expressions for text processing
from bs4 import BeautifulSoup  # For HTML parsing
import warnings      # To handle warnings
warnings.filterwarnings("ignore")  # Ignore warnings for the remainder of the code
warnings.filterwarnings("default")  # Set warnings back to default behavior
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sakethanne/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sakethanne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz



## Read Data

In [3]:
# Read the data from the tsv file as a Pandas frame
full_data = pd.read_csv("amazon_reviews_us_Office_Products_v1_00.tsv", delimiter='\t', encoding='utf-8', error_bad_lines=False)



  full_data = pd.read_csv("amazon_reviews_us_Office_Products_v1_00.tsv", delimiter='\t', encoding='utf-8', error_bad_lines=False)
Skipping line 20773: expected 15 fields, saw 22
Skipping line 39834: expected 15 fields, saw 22
Skipping line 52957: expected 15 fields, saw 22
Skipping line 54540: expected 15 fields, saw 22

Skipping line 80276: expected 15 fields, saw 22
Skipping line 96168: expected 15 fields, saw 22
Skipping line 96866: expected 15 fields, saw 22
Skipping line 98175: expected 15 fields, saw 22
Skipping line 112539: expected 15 fields, saw 22
Skipping line 119377: expected 15 fields, saw 22
Skipping line 120065: expected 15 fields, saw 22
Skipping line 124703: expected 15 fields, saw 22

Skipping line 134024: expected 15 fields, saw 22
Skipping line 153938: expected 15 fields, saw 22
Skipping line 156225: expected 15 fields, saw 22
Skipping line 168603: expected 15 fields, saw 22
Skipping line 187002: expected 15 fields, saw 22

Skipping line 200397: expected 15 fields

In [4]:
# Print the data frame that contains the entire dataset from the tsv file
full_data

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31
3,US,52782374,R1PR37BR7G3M6A,B00D7H8XB6,868449945,AmazonBasics 12-Sheet High-Security Micro-Cut ...,Office Products,1,2.0,3.0,N,Y,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...,2015-08-31
4,US,24045652,R3BDDDZMZBZDPU,B001XCWP34,33521401,"Derwent Colored Pencils, Inktense Ink Pencils,...",Office Products,4,0.0,0.0,N,Y,Four Stars,Gorgeous colors and easy to use,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2640249,US,53005790,RLI7EI10S7SN0,B00000DM9M,223408988,PalmOne III Leather Belt Clip Case,Office Products,4,26.0,26.0,N,N,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...,1998-12-07
2640250,US,52188548,R1F3SRK9MHE6A3,B00000DM9M,223408988,PalmOne III Leather Belt Clip Case,Office Products,4,18.0,18.0,N,N,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...,1998-11-30
2640251,US,52090046,R23V0C4NRJL8EM,0807865001,307284585,Gods and Heroes of Ancient Greece,Office Products,4,9.0,16.0,N,N,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...,1998-10-15
2640252,US,52503173,R13ZAE1ATEUC1T,1572313188,870359649,Microsoft EXCEL 97/ Visual Basic Step-by-Step ...,Office Products,5,0.0,0.0,N,N,class text,I am teaching a course in Excel and am using t...,1998-08-22


## Keep Reviews and Ratings

In [5]:
# Keep only the Reviews and Ratings fields
data = full_data[['review_body', 'star_rating', 'review_headline']]

# Convert 'star_rating' to numeric values
data['star_rating'] = pd.to_numeric(data['star_rating'], errors='coerce')

# Display three sample reviews along with ratings
sample_reviews = data.sample(3)
print("====================================================")
print("Sample Reviews:")
print(sample_reviews)

# Report statistics of the ratings
ratings_statistics = data['star_rating'].value_counts().sort_index()
print("====================================================")
print("\nRatings Statistics:")
print(ratings_statistics)

Sample Reviews:
                                               review_body  star_rating  \
1293158  So cute, so simple. Great for a quick thank yo...          5.0   
2109397  Looks like a real good stapler, small sturdy, ...          2.0   
2202452  These folders are making a huge positive diffe...          5.0   

                            review_headline  
1293158  Great for a quick thank you letter  
2109397                         ACCO Brands  
2202452   So much nicer than simple folders  

Ratings Statistics:
1.0     306979
2.0     138384
3.0     193691
4.0     418371
5.0    1582812
Name: star_rating, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['star_rating'] = pd.to_numeric(data['star_rating'], errors='coerce')


 ##  Form three classes and select 20000 reviews randomly from each class.



In [6]:
# Create binary labels for sentiment analysis
data['sentiment'] = data['star_rating'].apply(lambda x: 1 if x > 3 else 0 if x <= 2 else None)

# Discard neutral reviews (rating 3)
data = data.dropna(subset=['sentiment'])

# Select 100,000 positive and 100,000 negative reviews
positive_reviews = data[data['sentiment'] == 1].sample(20000, random_state=42)
negative_reviews = data[data['sentiment'] == 0].sample(20000, random_state=42)

# Concatenate positive and negative reviews
selected_reviews = pd.concat([positive_reviews, negative_reviews])

# Print the reviews that have been selected for further processing randomly
selected_reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['star_rating'].apply(lambda x: 1 if x > 3 else 0 if x <= 2 else None)


Unnamed: 0,review_body,star_rating,review_headline,sentiment
980067,"Yes they're thin, but they're sturdy and do th...",5.0,Thin but Sturdy,1.0
655902,"order came in just fine, would order again",5.0,Name Badge magnets,1.0
1249308,exactly as it appears great for work,4.0,Four Stars,1.0
2190006,I just set this up in my classroom. It is actu...,5.0,Great product!,1.0
935188,good product,5.0,Five Stars,1.0
...,...,...,...,...
553480,"Product does work, but requires multiple appli...",1.0,Product is better for wood furniture that is l...,0.0
1378271,This is not functioning i did all the seller ...,1.0,Not working,0.0
1842986,This calendar would have been great if it hadn...,1.0,Bad product,0.0
657727,This is a fake! Copy of a native.,1.0,One Star,0.0


 ## Split the dataset into training and testing dataset



In [7]:
# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(selected_reviews['review_body'],
                                                    selected_reviews['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42)

In [8]:
X_train

2554960    I bought this printer from Staples. Staples ma...
806282     good value and quick delivery.  No problems wi...
442493                     Great price and original product.
514835     It works, it's cute, not too much else to say....
690043        ink was over one year past the use before date
                                 ...                        
1880369    I enjoy very much. help me in daily shredding ...
1905026    This keeps my magic wand safe and gives me som...
2547856    If you are using Windows 2000 DO NOT buy this ...
1646091    We homeschool and my children have a daily lea...
1519083    Love the wireless printing. Only problem is th...
Name: review_body, Length: 32000, dtype: object

In [9]:
X_test

1209501    I am at my wit's end and at this point I just ...
1990644    This pen draws a solid, even, and confident li...
1084939    I was so excited about this product when I fir...
1590006    The HP printer was received in excellent condi...
2558567    I have used zest for longer than I can remembe...
                                 ...                        
1932246    This printer is a great value for the price. T...
1317813    Have not been able to hear a recording from th...
868565                                Good reliable product.
745594                                              Love it!
418299     I put the batteries on the charge and they nev...
Name: review_body, Length: 8000, dtype: object

In [10]:
y_train

2554960    1.0
806282     1.0
442493     1.0
514835     1.0
690043     0.0
          ... 
1880369    1.0
1905026    1.0
2547856    0.0
1646091    1.0
1519083    1.0
Name: sentiment, Length: 32000, dtype: float64

In [11]:
y_test

1209501    0.0
1990644    1.0
1084939    0.0
1590006    1.0
2558567    0.0
          ... 
1932246    1.0
1317813    0.0
868565     1.0
745594     1.0
418299     0.0
Name: sentiment, Length: 8000, dtype: float64

# Data Cleaning



In [12]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Define a simple contraction map
CONTRACTION_MAP = {
    "won't": "will not",
    "can't": "cannot",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "that's": "that is",
    "we're": "we are",
    "they're": "they are",
    "isn't": "is not",
    "aren't": "are not",
    "haven't": "have not",
    "hasn't": "has not",
}

# Function to expand contractions
def expand_contractions(text):
    for contraction, expansion in CONTRACTION_MAP.items():
        text = re.sub(contraction, expansion, text)
    return text

# Preprocess the reviews
def preprocess_reviews(reviews):
    # Convert to lowercase and handle NaN values
    reviews = reviews.apply(lambda x: str(x).lower() if pd.notna(x) else '')
    
    # Remove HTML and URLs
    reviews = reviews.apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
    reviews = reviews.apply(lambda x: re.sub(r'http\S+', '', x))

    # Remove non-alphabetical characters
    reviews = reviews.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

    # Remove extra spaces
    reviews = reviews.apply(lambda x: re.sub(' +', ' ', x))

    # Perform contractions
    reviews = reviews.apply(expand_contractions)

    return reviews

# Preprocess the training set
X_train_preprocessed = preprocess_reviews(X_train)

# Print average length of reviews before and after cleaning
avg_length_before = X_train.apply(lambda x: len(str(x))).mean()
avg_length_after = X_train_preprocessed.apply(len).mean()
print(f"\nAverage Length of Reviews (Before Cleaning): {avg_length_before} characters")
print(f"Average Length of Reviews (After Cleaning): {avg_length_after} characters")

  reviews = reviews.apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())



Average Length of Reviews (Before Cleaning): 315.87378125 characters
Average Length of Reviews (After Cleaning): 298.18421875 characters


# Pre-processing

### -- remove the stop words
### -- perform lemmatization

In [13]:
# Initialize NLTK's stopwords and WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to remove stop words and perform lemmatization
def preprocess_nltk(review):
    if pd.notna(review):
        words = nltk.word_tokenize(str(review).lower())  # Convert to lowercase
        words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
        return ' '.join(words)
    else:
        return ''

# Preprocess the training set using NLTK
X_train_nltk_preprocessed = X_train_preprocessed.apply(preprocess_nltk)

# Print three sample reviews before and after NLTK preprocessing
sample_reviews_indices = X_train_preprocessed.sample(3).index

for index in sample_reviews_indices:
    print(f"\nSample Review {index} Before Preprocessing:")
    print(X_train_preprocessed.loc[index])

    print(f"\nSample Review {index} After NLTK Preprocessing:")
    print(X_train_nltk_preprocessed.loc[index])

# Print average length of reviews before and after NLTK processing
avg_length_before_nltk = X_train_preprocessed.apply(len).mean()
avg_length_after_nltk = X_train_nltk_preprocessed.apply(len).mean()
print(f"\nAverage Length of Reviews (Before NLTK Processing): {avg_length_before_nltk} characters")
print(f"Average Length of Reviews (After NLTK Processing): {avg_length_after_nltk} characters")


Sample Review 1567770 Before Preprocessing:
we had this exact machine for years it finally gave out when we tried to order a new one through our bank it they only had the newer models for we ordered this model new and it is great for of the cost very pleased

Sample Review 1567770 After NLTK Preprocessing:
exact machine year finally gave tried order new one bank newer model ordered model new great cost pleased

Sample Review 1037094 Before Preprocessing:
pen refill writes smooth at first but then starts to have problems with flow both refills worked like that so it was a problem with the design then there was the problem with the pen refill leaking inside the pen on the second refill alcohol hand sanitizer will loosen the in to clean your pen i am sticking with to original refills from now on

Sample Review 1037094 After NLTK Preprocessing:
pen refill writes smooth first start problem flow refill worked like problem design problem pen refill leaking inside pen second refill alcohol ha

# TF-IDF Feature Extraction

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust the max_features parameter

# Fit and transform the training set
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_nltk_preprocessed)

# Transform the test set
X_test_tfidf = tfidf_vectorizer.transform(X_test.apply(preprocess_nltk))

# Print the shape of the TF-IDF matrices
print(f"\nShape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")


Shape of X_train_tfidf: (32000, 10000)
Shape of X_test_tfidf: (8000, 10000)


# Perceptron

In [15]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the Perceptron model
perceptron_model = Perceptron(random_state=42)

# Train the Perceptron model on the TF-IDF features
perceptron_model.fit(X_train_tfidf, y_train)

# Predictions on the training set
y_train_pred = perceptron_model.predict(X_train_tfidf)

# Predictions on the test set
y_test_pred = perceptron_model.predict(X_test_tfidf)

# Calculate metrics for the training set
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Calculate metrics for the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print the results
print(f"\nTraining Set Metrics:")
print(f"Accuracy: {accuracy_train}")
print(f"Precision: {precision_train}")
print(f"Recall: {recall_train}")
print(f"F1-score: {f1_train}")

print(f"\nTesting Set Metrics:")
print(f"Accuracy: {accuracy_test}")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1-score: {f1_test}")


Training Set Metrics:
Accuracy: 0.9260625
Precision: 0.9460309143306261
Recall: 0.9035405980232704
F1-score: 0.9242976898956934

Testing Set Metrics:
Accuracy: 0.8325
Precision: 0.8388241256969082
Recall: 0.8246138515196811
F1-score: 0.8316582914572863


# SVM

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the SVM model
svm_model = SVC(random_state=42)

# Train the SVM model on the TF-IDF features
svm_model.fit(X_train_tfidf, y_train)

# Predictions on the training set
y_train_pred_svm = svm_model.predict(X_train_tfidf)

# Predictions on the test set
y_test_pred_svm = svm_model.predict(X_test_tfidf)

# Calculate metrics for the training set
accuracy_train_svm = accuracy_score(y_train, y_train_pred_svm)
precision_train_svm = precision_score(y_train, y_train_pred_svm)
recall_train_svm = recall_score(y_train, y_train_pred_svm)
f1_train_svm = f1_score(y_train, y_train_pred_svm)

# Calculate metrics for the test set
accuracy_test_svm = accuracy_score(y_test, y_test_pred_svm)
precision_test_svm = precision_score(y_test, y_test_pred_svm)
recall_test_svm = recall_score(y_test, y_test_pred_svm)
f1_test_svm = f1_score(y_test, y_test_pred_svm)

# Print the results
print(f"\nTraining Set Metrics (SVM):")
print(f"Accuracy: {accuracy_train_svm}")
print(f"Precision: {precision_train_svm}")
print(f"Recall: {recall_train_svm}")
print(f"F1-score: {f1_train_svm}")

print(f"\nTesting Set Metrics (SVM):")
print(f"Accuracy: {accuracy_test_svm}")
print(f"Precision: {precision_test_svm}")
print(f"Recall: {recall_test_svm}")
print(f"F1-score: {f1_test_svm}")



Training Set Metrics (SVM):
Accuracy: 0.9784375
Precision: 0.9786581549630742
Recall: 0.9781683973476792
F1-score: 0.978413214866725

Testing Set Metrics (SVM):
Accuracy: 0.891625
Precision: 0.8882309400444115
Recall: 0.8968609865470852
F1-score: 0.8925251022685012


# Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the Logistic Regression model on the TF-IDF features
logreg_model.fit(X_train_tfidf, y_train)

# Predictions on the training set
y_train_pred_logreg = logreg_model.predict(X_train_tfidf)

# Predictions on the test set
y_test_pred_logreg = logreg_model.predict(X_test_tfidf)

# Calculate metrics for the training set
accuracy_train_logreg = accuracy_score(y_train, y_train_pred_logreg)
precision_train_logreg = precision_score(y_train, y_train_pred_logreg)
recall_train_logreg = recall_score(y_train, y_train_pred_logreg)
f1_train_logreg = f1_score(y_train, y_train_pred_logreg)

# Calculate metrics for the test set
accuracy_test_logreg = accuracy_score(y_test, y_test_pred_logreg)
precision_test_logreg = precision_score(y_test, y_test_pred_logreg)
recall_test_logreg = recall_score(y_test, y_test_pred_logreg)
f1_test_logreg = f1_score(y_test, y_test_pred_logreg)

# Print the results
print(f"\nTraining Set Metrics (Logistic Regression):")
print(f"Accuracy: {accuracy_train_logreg}")
print(f"Precision: {precision_train_logreg}")
print(f"Recall: {recall_train_logreg}")
print(f"F1-score: {f1_train_logreg}")

print(f"\nTesting Set Metrics (Logistic Regression):")
print(f"Accuracy: {accuracy_test_logreg}")
print(f"Precision: {precision_test_logreg}")
print(f"Recall: {recall_test_logreg}")
print(f"F1-score: {f1_test_logreg}")


Training Set Metrics (Logistic Regression):
Accuracy: 0.90859375
Precision: 0.9146612483332275
Recall: 0.9011009633429251
F1-score: 0.9078304710887034

Testing Set Metrics (Logistic Regression):
Accuracy: 0.884625
Precision: 0.8813224771773994
Recall: 0.8898854010961634
F1-score: 0.8855832403619686


# Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the Multinomial Naive Bayes model
nb_model = MultinomialNB()

# Train the Multinomial Naive Bayes model on the TF-IDF features
nb_model.fit(X_train_tfidf, y_train)

# Predictions on the training set
y_train_pred_nb = nb_model.predict(X_train_tfidf)

# Predictions on the test set
y_test_pred_nb = nb_model.predict(X_test_tfidf)

# Calculate metrics for the training set
accuracy_train_nb = accuracy_score(y_train, y_train_pred_nb)
precision_train_nb = precision_score(y_train, y_train_pred_nb)
recall_train_nb = recall_score(y_train, y_train_pred_nb)
f1_train_nb = f1_score(y_train, y_train_pred_nb)

# Calculate metrics for the test set
accuracy_test_nb = accuracy_score(y_test, y_test_pred_nb)
precision_test_nb = precision_score(y_test, y_test_pred_nb)
recall_test_nb = recall_score(y_test, y_test_pred_nb)
f1_test_nb = f1_score(y_test, y_test_pred_nb)

# Print the results
print(f"\nTraining Set Metrics (Multinomial Naive Bayes):")
print(f"Accuracy: {accuracy_train_nb}")
print(f"Precision: {precision_train_nb}")
print(f"Recall: {recall_train_nb}")
print(f"F1-score: {f1_train_nb}")

print(f"\nTesting Set Metrics (Multinomial Naive Bayes):")
print(f"Accuracy: {accuracy_test_nb}")
print(f"Precision: {precision_test_nb}")
print(f"Recall: {recall_test_nb}")
print(f"F1-score: {f1_test_nb}")


Training Set Metrics (Multinomial Naive Bayes):
Accuracy: 0.88121875
Precision: 0.8892225132562448
Recall: 0.870699361941699
F1-score: 0.8798634596542242

Testing Set Metrics (Multinomial Naive Bayes):
Accuracy: 0.86225
Precision: 0.860932077342588
Recall: 0.8652217239661186
F1-score: 0.8630715705765408
