In [146]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag, word_tokenize
import re
from bs4 import BeautifulSoup
from bs4 import MarkupResemblesLocatorWarning
import warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 

[nltk_data] Downloading package wordnet to /home/omghag/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/omghag/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/omghag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/omghag/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [147]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz
#          https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz

"""
downloaded the dataset locally through the above links using terminal wget command    
"""



'\ndownloaded the dataset locally through the above links using terminal wget command    \n'

# Dataset Preparation

## Read Data

In [182]:
df = pd.read_csv(r'data/amazon_reviews_us_Office_Products_v1_00.tsv.gz', sep='\t', on_bad_lines='skip', low_memory=False)

In [183]:
df.shape

(2640254, 15)

In [184]:
df.head(3)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31


## Keep Reviews and Ratings

In [185]:
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='str')

In [186]:
print(df['star_rating'].dtype)
print(df['star_rating'].unique())

str
<StringArray>
['5', '1', '4', '2', '3', '2015-06-05', '2015-02-11', nan, '2014-02-14']
Length: 9, dtype: str


In [187]:
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
print(df['star_rating'].dtype)
print(df['star_rating'].unique())

float64
[ 5.  1.  4.  2.  3. nan]


In [188]:
df.dropna(subset=['review_body', 'star_rating'], inplace=True)

In [189]:
df.shape

(2640080, 15)

In [190]:
df = df[['review_body', 'star_rating']]  # selecting only relevant columns
print(df.head(3))
print(df['star_rating'].value_counts().sort_index())  # checking distribution of classes

                                         review_body  star_rating
0                                     Great product.          5.0
1  What's to say about this commodity item except...          5.0
2    Haven't used yet, but I am sure I will like it.          5.0
star_rating
1.0     306967
2.0     138381
3.0     193680
4.0     418348
5.0    1582704
Name: count, dtype: int64




 ## Relabeling and Sampling
 
First form three classes and print their statistics. Then randomly select 100,000 reviews from the positive and 100,000 reviews from the negative



In [191]:
df = df[df['star_rating'] != 3]  # removing neutral reviews
df['sentiment'] = np.where(df['star_rating'] > 3, 1, 0)  # positive:1, negative:0

In [192]:
print(df.shape)
df['sentiment'].value_counts()

(2446400, 3)


sentiment
1    2001052
0     445348
Name: count, dtype: int64

In [193]:
random_seed = 42
positive_df = df[df['sentiment'] == 1].sample(n=100000, random_state=random_seed)
negative_df = df[df['sentiment'] == 0].sample(n=100000, random_state=random_seed)
df = pd.concat([positive_df, negative_df])

In [194]:
print(df.shape)
print(df['sentiment'].value_counts())

(200000, 3)
sentiment
1    100000
0    100000
Name: count, dtype: int64


# Data Cleaning



In [195]:
CONTRACTIONS_MAP = {
    "ain't": "is not",
    "amn't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "daren't": "dare not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "everyone's": "everyone is",
    "gimme": "give me",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "innit": "is it not",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "kinda": "kind of",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "ne'er": "never",
    "o'clock": "of the clock",
    "o'er": "over",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "outta": "out of",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "somebody's": "somebody is",
    "someone's": "someone is",
    "something's": "something is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "tis": "it is",
    "twas": "it was",
    "to've": "to have",
    "wanna": "want to",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "whatcha": "what are you",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who'll've": "who will have",
    "who're": "who are",
    "who's": "who is",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [196]:
def remove_contractions(text):
    # Sort contractions by length (longest first) to handle compound contractions
    contractions_sorted = sorted(CONTRACTIONS_MAP.keys(), key=len, reverse=True)
    
    # Build pattern with word boundaries
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_sorted) + r')\b', 
                        flags=re.IGNORECASE)
    
    def expand_match(contraction):
        match = contraction.group(0)
        match_lower = match.lower()
        
        if match_lower in CONTRACTIONS_MAP:
            expanded = CONTRACTIONS_MAP[match_lower]
            
            # Preserve original capitalization
            if match[0].isupper():
                expanded = expanded[0].upper() + expanded[1:]
            
            return expanded
        
        return match
    
    # Keep expanding until no more contractions found
    prev_text = ""
    while prev_text != text:
        prev_text = text
        text = pattern.sub(expand_match, text)
    
    return text

sample_text = "I can't do this. She's going to the market. Y'all've been great!"
print("Original Text: ", sample_text)
expanded_text = remove_contractions(sample_text)
print("Expanded Text: ", expanded_text)

Original Text:  I can't do this. She's going to the market. Y'all've been great!
Expanded Text:  I cannot do this. She is going to the market. You all have been great!


In [197]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Expand contractions
    text = remove_contractions(text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [198]:
avg_length_before = df['review_body'].str.len().mean()
avg_length_before

np.float64(318.00717)

In [199]:
df['review_body'] = df['review_body'].apply(preprocess_text)

In [200]:
avg_length_after = df['review_body'].str.len().mean()
avg_length_after

np.float64(302.08331)

In [201]:
print(df['review_body'].head(3))

1049807    just as advertised and quickly shipped very pl...
2439572    this fountain pen has an great feel to it heav...
673985     i have order this replacement toner several ti...
Name: review_body, dtype: str


# Pre-processing

In [202]:
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Convert treebank POS tags to WordNet POS tags"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

In [203]:
from nltk.stem import WordNetLemmatizer
def lemmatize_with_pos(text):
    """
    Enhanced lemmatization that tries multiple POS tags for better results
    """
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    
    if not words:
        return ""
    
    # POS tag the available text
    pos_tags = pos_tag(words)
    
    lemmatized = []
    for word, pos in pos_tags:
        # Get primary WordNet POS
        primary_pos = get_wordnet_pos(pos)
        
        # Try lemmatizing with the detected POS
        lemmatized_word = lemmatizer.lemmatize(word, primary_pos)
        
        # If word didn't change and it might be a verb, try verb lemmatization
        if lemmatized_word == word and primary_pos != wordnet.VERB:
            verb_form = lemmatizer.lemmatize(word, wordnet.VERB)
            # Use verb form if it's different (likely was actually a verb)
            if verb_form != word:
                lemmatized_word = verb_form
        
        lemmatized.append(lemmatized_word)
    
    return ' '.join(lemmatized)

## remove the stop words 

In [204]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    """Remove stopwords but keep negation words"""
    stop_words = set(stopwords.words('english'))
    
    # CRITICAL: Keep negation words for sentiment analysis
    negations = {
        'no', 'not', 'nor', 'never', 'neither', 'nobody', 'nothing', 
        'nowhere', 'none', 'hardly', 'scarcely', 'barely'
    }
    # Remove negation words from stopwords list
    stop_words = stop_words - negations
    
    words = text.split()
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)

# Test it on a sample
sample_text = "this is not a good product and i do not recommend it"
print("Original:", sample_text)
print("After stopword removal:", remove_stopwords(sample_text))
# Should keep "not" in the output

Original: this is not a good product and i do not recommend it
After stopword removal: not good product not recommend


In [205]:
samples_before_preprocessing = df['review_body'].head(3).copy()
print("Samples before preprocessing:")
print(samples_before_preprocessing)
avg_length_before_preprocessing = df['review_body'].str.len().mean()
print(f"Average length before preprocessing:{ avg_length_before_preprocessing: .4f}" )

Samples before preprocessing:
1049807    just as advertised and quickly shipped very pl...
2439572    this fountain pen has an great feel to it heav...
673985     i have order this replacement toner several ti...
Name: review_body, dtype: str
Average length before preprocessing: 302.0833


In [206]:
from nltk.corpus import stopwords
# Save before preprocessing
samples_before_stopwords_removal = df['review_body'].head(3).copy()
print("Samples before removing stop words:")
print(samples_before_stopwords_removal)
avg_length_before_stopwords_removal = df['review_body'].str.len().mean()
print("Average length before removing stop words:", avg_length_before_stopwords_removal)
# Now remove stop words
# Apply stopword removal (keeping negations)
df['review_body'] = df['review_body'].apply(remove_stopwords)

# After all preprocessing
samples_after_stopwords_removal = df['review_body'].head(3).copy()
print("Samples after removing stop words:")
print(samples_after_stopwords_removal)
avg_length_after_stopwords_removal = df['review_body'].str.len().mean()
print("Average length after removing stop words:", avg_length_after_stopwords_removal)

Samples before removing stop words:
1049807    just as advertised and quickly shipped very pl...
2439572    this fountain pen has an great feel to it heav...
673985     i have order this replacement toner several ti...
Name: review_body, dtype: str
Average length before removing stop words: 302.08331
Samples after removing stop words:
1049807                   advertised quickly shipped pleased
2439572    fountain pen great feel heavy not much writing...
673985     order replacement toner several times canon sm...
Name: review_body, dtype: str
Average length after removing stop words: 196.017205


## perform lemmatization  

In [207]:
#save before lemmatization
samples_before_lemmatization = df['review_body'].head(3).copy()
print("Samples before lemmatization:")
print(samples_before_lemmatization)
avg_length_before_lemmatization = df['review_body'].str.len().mean()
print("Average length before lemmatization:", avg_length_before_lemmatization)

# Apply lemmatization
df['review_body'] = df['review_body'].apply(lemmatize_with_pos)

# After lemmatization
samples_after_lemmatization = df['review_body'].head(3).copy()
print("Samples after lemmatization:")
print(samples_after_lemmatization)
avg_length_after_lemmatization = df['review_body'].str.len().mean()
print("Average length after lemmatization:", avg_length_after_lemmatization)

Samples before lemmatization:
1049807                   advertised quickly shipped pleased
2439572    fountain pen great feel heavy not much writing...
673985     order replacement toner several times canon sm...
Name: review_body, dtype: str
Average length before lemmatization: 196.017205
Samples after lemmatization:
1049807                        advertise quickly ship please
2439572    fountain pen great feel heavy not much write e...
673985     order replacement toner several time canon sma...
Name: review_body, dtype: str
Average length after lemmatization: 185.24457


In [208]:
samples_after_preprocessing = df['review_body'].head(3).copy()
print("Samples after preprocessing:")
print(samples_after_preprocessing)
avg_length_after_preprocessing = df['review_body'].str.len().mean()
print(f"Average length after preprocessing: {avg_length_after_preprocessing: .4f}")

Samples after preprocessing:
1049807                        advertise quickly ship please
2439572    fountain pen great feel heavy not much write e...
673985     order replacement toner several time canon sma...
Name: review_body, dtype: str
Average length after preprocessing:  185.2446


# Bigram Feature Extraction

In [209]:
from sklearn.feature_extraction import DictVectorizer
from nltk import bigrams

def bigram_features_tuple(text):
    """
    Extract bigrams as binary dictionary with TUPLE keys
    Exactly like your friend's approach
    """
    words = text.split()
    if len(words) < 2:
        return {}
    
    # Create bigrams and use tuples as dictionary keys
    bigram_list = bigrams(words)
    return {bg: 1 for bg in bigram_list}

test_text = "advertise quickly ship please ship"
print("Test text:", test_text)
result = bigram_features_tuple(test_text)
print("Bigram features (tuples):", result)
print(f"Number of features: {len(result)}")
print(f"Type of keys: {type(list(result.keys())[0])}")

Test text: advertise quickly ship please ship
Bigram features (tuples): {('advertise', 'quickly'): 1, ('quickly', 'ship'): 1, ('ship', 'please'): 1, ('please', 'ship'): 1}
Number of features: 4
Type of keys: <class 'tuple'>


In [None]:

df['bigrams_tuple'] = df['review_body'].apply(bigram_features_tuple)

print("\nDone! Checking samples:")
print("\nFirst review bigrams:")
print(df['bigrams_tuple'].iloc[0])
print(f"\nNumber of bigrams in first review: {len(df['bigrams_tuple'].iloc[0])}")

print("\nSecond review bigrams:")
print(dict(list(df['bigrams_tuple'].iloc[1].items())[:5]))  # Show first 5
print(f"Number of bigrams in second review: {len(df['bigrams_tuple'].iloc[1])}")

Extracting bigram features with tuple keys...
This may take a minute...

Done! Checking samples:

First review bigrams:
{('advertise', 'quickly'): 1, ('quickly', 'ship'): 1, ('ship', 'please'): 1}

Number of bigrams in first review: 3

Second review bigrams:
{('fountain', 'pen'): 1, ('pen', 'great'): 1, ('great', 'feel'): 1, ('feel', 'heavy'): 1, ('heavy', 'not'): 1}
Number of bigrams in second review: 44


In [None]:
from sklearn.feature_extraction import DictVectorizer
vectorizer_tuple = DictVectorizer(sparse=True)

# Convert to sparse matrix
X_tuple = vectorizer_tuple.fit_transform(df['bigrams_tuple'])
y = df['sentiment'].values

print(f"\nFeature matrix shape: {X_tuple.shape}")
print(f"Number of samples: {X_tuple.shape[0]}")
print(f"Number of unique bigram features: {X_tuple.shape[1]}")
print(f"Matrix type: {type(X_tuple)}")
print(f"Sparsity: {(1 - X_tuple.nnz / (X_tuple.shape[0] * X_tuple.shape[1])) * 100:.4f}%")

# Check some feature names
print(f"\nSample feature names (first 10):")
feature_names = vectorizer_tuple.get_feature_names_out()
for i in range(min(10, len(feature_names))):
    print(f"  {i}: {feature_names[i]}")


Fitting and transforming...

Feature matrix shape: (200000, 1680349)
Number of samples: 200000
Number of unique bigram features: 1680349
Matrix type: <class 'scipy.sparse._csr.csr_matrix'>
Sparsity: 99.9983%

Sample feature names (first 10):
  0: ('I', 'I')
  1: ('I', 'abandon')
  2: ('I', 'abd')
  3: ('I', 'able')
  4: ('I', 'absolute')
  5: ('I', 'absolutely')
  6: ('I', 'absolutley')
  7: ('I', 'abuse')
  8: ('I', 'academic')
  9: ('I', 'accept')


In [217]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tuple, y, 
    test_size=0.2, 
    random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
print(f"Training labels: {y_train.shape}")
print(f"Testing labels: {y_test.shape}")

Training set: (160000, 1680349)
Testing set: (40000, 1680349)
Training labels: (160000,)
Testing labels: (40000,)


# Perceptron

In [218]:

#Load and train Perceptron model
perceptron = Perceptron(random_state=42, max_iter=10000)
perceptron.fit(X_train, y_train)

#Predictions
y_train_pred = perceptron.predict(X_train)
y_test_pred = perceptron.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"Perceptron Training Accuracy: {train_acc:.4f}")
print(f"Perceptron Training Precision: {train_prec:.4f}")
print(f"Perceptron Training Recall: {train_rec:.4f}")
print(f"Perceptron Training F1-score: {train_f1:.4f}")
print(f"Perceptron Testing Accuracy: {test_acc:.4f}")
print(f"Perceptron Testing Precision: {test_prec:.4f}")
print(f"Perceptron Testing Recall: {test_rec:.4f}")
print(f"Perceptron Testing F1-score: {test_f1:.4f}")

Perceptron Training Accuracy: 0.9945
Perceptron Training Precision: 0.9910
Perceptron Training Recall: 0.9981
Perceptron Training F1-score: 0.9946
Perceptron Testing Accuracy: 0.8828
Perceptron Testing Precision: 0.8781
Perceptron Testing Recall: 0.8889
Perceptron Testing F1-score: 0.8835


# SVM

In [219]:
#Load and train SVM model
svm = LinearSVC(random_state=42, max_iter=10000, C=0.1)
svm.fit(X_train, y_train)

#Predictions
y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"SVM Training Accuracy: {train_acc:.4f}")
print(f"SVM Training Precision: {train_prec:.4f}")
print(f"SVM Training Recall: {train_rec:.4f}")
print(f"SVM Training F1-score: {train_f1:.4f}")
print(f"SVM Testing Accuracy: {test_acc:.4f}")
print(f"SVM Testing Precision: {test_prec:.4f}")
print(f"SVM Testing Recall: {test_rec:.4f}")
print(f"SVM Testing F1-score: {test_f1:.4f}")

SVM Training Accuracy: 0.9890
SVM Training Precision: 0.9803
SVM Training Recall: 0.9981
SVM Training F1-score: 0.9891
SVM Testing Accuracy: 0.8822
SVM Testing Precision: 0.8535
SVM Testing Recall: 0.9226
SVM Testing F1-score: 0.8867


# Logistic Regression

In [None]:
# load and train Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

#Predictions
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"Logistic Regression Training Accuracy: {train_acc:.4f}")
print(f"Logistic Regression Training Precision: {train_prec:.4f}")
print(f"Logistic Regression Training Recall: {train_rec:.4f}")
print(f"Logistic Regression Training F1-score: {train_f1:.4f}")
print(f"Logistic Regression Testing Accuracy: {test_acc:.4f}")
print(f"Logistic Regression Testing Precision: {test_prec:.4f}")
print(f"Logistic Regression Testing Recall: {test_rec:.4f}")
print(f"Logistic Regression Testing F1-score: {test_f1:.4f}")


Logistic Regression Training Accuracy: 0.9871
Logistic Regression Training Precision: 0.9772
Logistic Regression Training Recall: 0.9974
Logistic Regression Training F1-score: 0.9872
Logistic Regression Testing Accuracy: 0.8870
Logistic Regression Testing Precision: 0.8590
Logistic Regression Testing Recall: 0.9257
Logistic Regression Testing F1-score: 0.8911


# Naive Bayes

In [221]:
# load and train Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

#Predictions
y_train_pred = nb.predict(X_train)
y_test_pred = nb.predict(X_test)

# Calculate metrics
# Training metrics
train_acc = accuracy_score(y_train, y_train_pred)
train_prec = precision_score(y_train, y_train_pred)
train_rec = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

# Testing metrics
test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Print results
print(f"Naive Bayes Training Accuracy: {train_acc:.4f}")
print(f"Naive Bayes Training Precision: {train_prec:.4f}")
print(f"Naive Bayes Training Recall: {train_rec:.4f}")
print(f"Naive Bayes Training F1-score: {train_f1:.4f}")
print(f"Naive Bayes Testing Accuracy: {test_acc:.4f}")
print(f"Naive Bayes Testing Precision: {test_prec:.4f}")
print(f"Naive Bayes Testing Recall: {test_rec:.4f}")
print(f"Naive Bayes Testing F1-score: {test_f1:.4f}")

Naive Bayes Training Accuracy: 0.9693
Naive Bayes Training Precision: 0.9747
Naive Bayes Training Recall: 0.9636
Naive Bayes Training F1-score: 0.9691
Naive Bayes Testing Accuracy: 0.8855
Naive Bayes Testing Precision: 0.8785
Naive Bayes Testing Recall: 0.8947
Naive Bayes Testing F1-score: 0.8866
