In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag, word_tokenize
import re
from bs4 import BeautifulSoup
import contractions
from bs4 import MarkupResemblesLocatorWarning
import warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 

[nltk_data] Downloading package wordnet to /home/omghag/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/omghag/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/omghag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/omghag/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz
#          https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz

"""
downloaded the dataset locally through the above links using terminal wget command    
"""



'\ndownloaded the dataset locally through the above links using terminal wget command    \n'

# Dataset Preparation

## Read Data

In [4]:
df = pd.read_csv(r'data/amazon_reviews_us_Office_Products_v1_00.tsv.gz', sep='\t', on_bad_lines='skip', low_memory=False)

In [5]:
df.shape

(2640254, 15)

In [6]:
df.head(3)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31


## Keep Reviews and Ratings

In [7]:
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='str')

In [8]:
print(df['star_rating'].dtype)
print(df['star_rating'].unique())

str
<ArrowStringArray>
['5', '1', '4', '2', '3', '2015-06-05', '2015-02-11', nan, '2014-02-14']
Length: 9, dtype: str


In [9]:
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
print(df['star_rating'].dtype)
print(df['star_rating'].unique())

float64
[ 5.  1.  4.  2.  3. nan]


In [10]:
df.dropna(subset=['review_body', 'star_rating'], inplace=True)

In [11]:
df.shape

(2640080, 15)

In [12]:
df = df[['review_body', 'star_rating']]  # selecting only relevant columns
print(df.head(3))
print(df['star_rating'].value_counts().sort_index())  # checking distribution of classes

                                         review_body  star_rating
0                                     Great product.          5.0
1  What's to say about this commodity item except...          5.0
2    Haven't used yet, but I am sure I will like it.          5.0
star_rating
1.0     306967
2.0     138381
3.0     193680
4.0     418348
5.0    1582704
Name: count, dtype: int64




 ## Relabeling and Sampling
 
First form three classes and print their statistics. Then randomly select 100,000 reviews from the positive and 100,000 reviews from the negative



In [13]:
df = df[df['star_rating'] != 3]  # removing neutral reviews
df['sentiment'] = np.where(df['star_rating'] > 3, 1, 0)  # positive:1, negative:0

In [14]:
print(df.shape)
df['sentiment'].value_counts()

(2446400, 3)


sentiment
1    2001052
0     445348
Name: count, dtype: int64

In [15]:
random_seed = 42
positive_df = df[df['sentiment'] == 1].sample(n=100000, random_state=random_seed)
negative_df = df[df['sentiment'] == 0].sample(n=100000, random_state=random_seed)
df = pd.concat([positive_df, negative_df])

In [16]:
print(df.shape)
print(df['sentiment'].value_counts())

(200000, 3)
sentiment
1    100000
0    100000
Name: count, dtype: int64


# Data Cleaning



In [17]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [18]:
avg_length_before = df['review_body'].str.len().mean()
avg_length_before

np.float64(318.00717)

In [19]:
df['review_body'] = df['review_body'].apply(preprocess_text)

In [20]:
avg_length_after = df['review_body'].str.len().mean()
avg_length_after

np.float64(301.77087)

In [21]:
print(df['review_body'].head(3))

1049807    just as advertised and quickly shipped very pl...
2439572    this fountain pen has an great feel to it heav...
673985     i have order this replacement toner several ti...
Name: review_body, dtype: str


# Pre-processing

In [22]:
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Convert treebank POS tags to WordNet POS tags"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

## remove the stop words 

In [23]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    """Remove stopwords but keep negation words"""
    stop_words = set(stopwords.words('english'))
    
    # CRITICAL: Keep negation words for sentiment analysis
    negations = {
        'no', 'not', 'nor', 'never', 'neither', 'nobody', 'nothing', 
        'nowhere', 'none', 'hardly', 'scarcely', 'barely'
    }
    # Remove negation words from stopwords list
    stop_words = stop_words - negations
    
    words = text.split()
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)

# Test it on a sample
sample_text = "this is not a good product and i do not recommend it"
print("Original:", sample_text)
print("After stopword removal:", remove_stopwords(sample_text))
# Should keep "not" in the output

Original: this is not a good product and i do not recommend it
After stopword removal: not good product not recommend


In [24]:
samples_before_preprocessing = df['review_body'].head(3).copy()
print("Samples before preprocessing:")
print(samples_before_preprocessing)
avg_length_before_preprocessing = df['review_body'].str.len().mean()
print(f"Average length before preprocessing:{ avg_length_before_preprocessing: .4f}" )

Samples before preprocessing:
1049807    just as advertised and quickly shipped very pl...
2439572    this fountain pen has an great feel to it heav...
673985     i have order this replacement toner several ti...
Name: review_body, dtype: str
Average length before preprocessing: 301.7709


In [25]:
from nltk.corpus import stopwords
# Save before preprocessing
samples_before_stopwords_removal = df['review_body'].head(3).copy()
print("Samples before removing stop words:")
print(samples_before_stopwords_removal)
avg_length_before_stopwords_removal = df['review_body'].str.len().mean()
print("Average length before removing stop words:", avg_length_before_stopwords_removal)
# Now remove stop words
# Apply stopword removal (keeping negations)
df['review_body'] = df['review_body'].apply(remove_stopwords)

# After all preprocessing
samples_after_stopwords_removal = df['review_body'].head(3).copy()
print("Samples after removing stop words:")
print(samples_after_stopwords_removal)
avg_length_after_stopwords_removal = df['review_body'].str.len().mean()
print("Average length after removing stop words:", avg_length_after_stopwords_removal)

Samples before removing stop words:
1049807    just as advertised and quickly shipped very pl...
2439572    this fountain pen has an great feel to it heav...
673985     i have order this replacement toner several ti...
Name: review_body, dtype: str
Average length before removing stop words: 301.77087
Samples after removing stop words:
1049807                   advertised quickly shipped pleased
2439572    fountain pen great feel heavy not much writing...
673985     order replacement toner several times canon sm...
Name: review_body, dtype: str
Average length after removing stop words: 195.60442


## perform lemmatization  

In [26]:
from nltk.stem import WordNetLemmatizer
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_with_pos(text):
    """Lemmatize text with POS tagging"""
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    # Get POS tags for all words
    pos_tags = pos_tag(words)
    # Lemmatize each word with its POS tag
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized)

In [27]:
#save before lemmatization
samples_before_lemmatization = df['review_body'].head(3).copy()
print("Samples before lemmatization:")
print(samples_before_lemmatization)
avg_length_before_lemmatization = df['review_body'].str.len().mean()
print("Average length before lemmatization:", avg_length_before_lemmatization)

# Apply lemmatization
df['review_body'] = df['review_body'].apply(lemmatize_with_pos)

# After lemmatization
samples_after_lemmatization = df['review_body'].head(3).copy()
print("Samples after lemmatization:")
print(samples_after_lemmatization)
avg_length_after_lemmatization = df['review_body'].str.len().mean()
print("Average length after lemmatization:", avg_length_after_lemmatization)

Samples before lemmatization:
1049807                   advertised quickly shipped pleased
2439572    fountain pen great feel heavy not much writing...
673985     order replacement toner several times canon sm...
Name: review_body, dtype: str
Average length before lemmatization: 195.60442
Samples after lemmatization:
1049807                      advertised quickly ship pleased
2439572    fountain pen great feel heavy not much write e...
673985     order replacement toner several time canon sma...
Name: review_body, dtype: str
Average length after lemmatization: 185.87826


In [28]:
samples_after_preprocessing = df['review_body'].head(3).copy()
print("Samples after preprocessing:")
print(samples_after_preprocessing)
avg_length_after_preprocessing = df['review_body'].str.len().mean()
print(f"Average length after preprocessing: {avg_length_after_preprocessing: .4f}")

Samples after preprocessing:
1049807                      advertised quickly ship pleased
2439572    fountain pen great feel heavy not much write e...
673985     order replacement toner several time canon sma...
Name: review_body, dtype: str
Average length after preprocessing:  185.8783


# Bigram Feature Extraction

In [29]:
from nltk import bigrams
from sklearn.feature_extraction.text import CountVectorizer

def extract_bigrams(text):
    words = text.split()
    if len(words) < 2:
        return ""  # Return empty string if can't form bigrams
    bigram_list = list(bigrams(words))
    # Join each bigram with underscore
    bigram_strings = ['_'.join(bigram) for bigram in bigram_list]
    return ' '.join(bigram_strings)

# Re-apply to all reviews
df['bigrams'] = df['review_body'].apply(extract_bigrams)

# Check for any issues
print(f"Any empty bigrams? {(df['bigrams'] == '').sum()}")
print(f"Sample bigram: {df['bigrams'].iloc[0]}")

# Now use CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['bigrams'])
y = df['sentiment']

print(f"Feature matrix shape: {X.shape}")

Any empty bigrams? 6018
Sample bigram: advertised_quickly quickly_ship ship_pleased
Feature matrix shape: (200000, 1742717)


In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
print(f"Training labels: {y_train.shape}")
print(f"Testing labels: {y_test.shape}")

Training set: (160000, 1742717)
Testing set: (40000, 1742717)
Training labels: (160000,)
Testing labels: (40000,)


# Perceptron

In [31]:

#Load and train Perceptron model
perceptron = Perceptron(random_state=42, max_iter=10000)
perceptron.fit(X_train, y_train)

#Predictions
y_train_pred = perceptron.predict(X_train)
y_test_pred = perceptron.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"Perceptron Training Accuracy: {train_acc:.4f}")
print(f"Perceptron Training Precision: {train_prec:.4f}")
print(f"Perceptron Training Recall: {train_rec:.4f}")
print(f"Perceptron Training F1-score: {train_f1:.4f}")
print(f"Perceptron Testing Accuracy: {test_acc:.4f}")
print(f"Perceptron Testing Precision: {test_prec:.4f}")
print(f"Perceptron Testing Recall: {test_rec:.4f}")
print(f"Perceptron Testing F1-score: {test_f1:.4f}")

Perceptron Training Accuracy: 0.9949
Perceptron Training Precision: 0.9908
Perceptron Training Recall: 0.9991
Perceptron Training F1-score: 0.9949
Perceptron Testing Accuracy: 0.8815
Perceptron Testing Precision: 0.8727
Perceptron Testing Recall: 0.8932
Perceptron Testing F1-score: 0.8829


# SVM

In [32]:
#Load and train SVM model
svm = LinearSVC(random_state=42, max_iter=10000)
svm.fit(X_train, y_train)

#Predictions
y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"SVM Training Accuracy: {train_acc:.4f}")
print(f"SVM Training Precision: {train_prec:.4f}")
print(f"SVM Training Recall: {train_rec:.4f}")
print(f"SVM Training F1-score: {train_f1:.4f}")
print(f"SVM Testing Accuracy: {test_acc:.4f}")
print(f"SVM Testing Precision: {test_prec:.4f}")
print(f"SVM Testing Recall: {test_rec:.4f}")
print(f"SVM Testing F1-score: {test_f1:.4f}")

SVM Training Accuracy: 0.9952
SVM Training Precision: 0.9907
SVM Training Recall: 0.9997
SVM Training F1-score: 0.9952
SVM Testing Accuracy: 0.8702
SVM Testing Precision: 0.8460
SVM Testing Recall: 0.9050
SVM Testing F1-score: 0.8745


# Logistic Regression

In [33]:
# load and train Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

#Predictions
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"Logistic Regression Training Accuracy: {train_acc:.4f}")
print(f"Logistic Regression Training Precision: {train_prec:.4f}")
print(f"Logistic Regression Training Recall: {train_rec:.4f}")
print(f"Logistic Regression Training F1-score: {train_f1:.4f}")
print(f"Logistic Regression Testing Accuracy: {test_acc:.4f}")
print(f"Logistic Regression Testing Precision: {test_prec:.4f}")
print(f"Logistic Regression Testing Recall: {test_rec:.4f}")
print(f"Logistic Regression Testing F1-score: {test_f1:.4f}")


Logistic Regression Training Accuracy: 0.9870
Logistic Regression Training Precision: 0.9772
Logistic Regression Training Recall: 0.9973
Logistic Regression Training F1-score: 0.9872
Logistic Regression Testing Accuracy: 0.8845
Logistic Regression Testing Precision: 0.8561
Logistic Regression Testing Recall: 0.9242
Logistic Regression Testing F1-score: 0.8889


# Naive Bayes

In [34]:
# load and train Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

#Predictions
y_train_pred = nb.predict(X_train)
y_test_pred = nb.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"Naive Bayes Training Accuracy: {train_acc:.4f}")
print(f"Naive Bayes Training Precision: {train_prec:.4f}")
print(f"Naive Bayes Training Recall: {train_rec:.4f}")
print(f"Naive Bayes Training F1-score: {train_f1:.4f}")
print(f"Naive Bayes Testing Accuracy: {test_acc:.4f}")
print(f"Naive Bayes Testing Precision: {test_prec:.4f}")
print(f"Naive Bayes Testing Recall: {test_rec:.4f}")
print(f"Naive Bayes Testing F1-score: {test_f1:.4f}")

Naive Bayes Training Accuracy: 0.9697
Naive Bayes Training Precision: 0.9751
Naive Bayes Training Recall: 0.9639
Naive Bayes Training F1-score: 0.9695
Naive Bayes Testing Accuracy: 0.8833
Naive Bayes Testing Precision: 0.8745
Naive Bayes Testing Recall: 0.8949
Naive Bayes Testing F1-score: 0.8846
