In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import warnings 
warnings.filterwarnings('ignore') 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

def load_dataset(file_path):
    df = pd.read_csv(file_path, sep='\t', on_bad_lines='skip')
    df = df[['review_body', 'star_rating']].dropna().reset_index(drop=True)
    return df

def sentiment_class(rating):
    rating = float(rating)
    if rating > 3:
        return 1
    elif rating <= 2:
        return 0
    else:
        return 2

def display_sentiment_statistics(df, total_ratings):
    review_count = df['sentiment'].value_counts().sort_index()

    positive_reviews, negative_reviews, neutral_reviews = review_count[1], review_count[0], review_count[2]

    total_reviews = sum(review_count)

    print(f"Number of reviews for positive class: {positive_reviews}, Number of reviews for negative class: {negative_reviews}, Number of reviews for neutral class: {neutral_reviews}")
    

def filter_reviews(df):
    positive_reviews = df[df['sentiment'] == 1].sample(100000, random_state=30)
    negative_reviews = df[df['sentiment'] == 0].sample(100000, random_state=30)

    df_new = pd.concat([positive_reviews, negative_reviews])
    df_new = df_new[df_new['sentiment'] != 2]

    return df_new  

def expand_contractions(s):
    contraction_patterns = {
        r"won't": "will not",
        r"would't": "would not",
        r"could'nt": "could not",
        r"can't": "can not",
        r"n't": " not",
        r"\'re": " are",
        r"\'s": " is",
        r"\'ll": " will",
        r"\'t": " not",
        r"\'ve": " have",
        r"I've": "I have",
        r"I'm": "I am"
    }

    for pattern, replacement in contraction_patterns.items():
        s = re.sub(pattern, replacement, s)

    return s

def clean_reviews(df):
    df['review_body'] = df['review_body'].astype(str)
    df['cleaned_reviews'] = (
        df['review_body']
        .str.lower()
        .apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
        .apply(lambda x: re.sub(r'https?://\S+', '', x))
        .apply(expand_contractions)
        .apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
        .apply(lambda x: x.strip())
    )

    return df

def calculate_average_lengths(df):
    avg_length_before_cleaning = df['review_body'].apply(len).mean()
    avg_length_after_cleaning = df['cleaned_reviews'].apply(len).mean()

    print(f"Average length before cleaning: {avg_length_before_cleaning:.2f} characters")
    print(f"Average length after cleaning: {avg_length_after_cleaning:.2f} characters")

def preprocess_reviews(df):
    custom_stopwords_list = set(stopwords.words('english')) - {'not', 'no', 'nor', 'neither', 'but', 'however', 'although'}
    df['cleaned_reviews'] = df['cleaned_reviews'].apply(lambda x: " ".join([word for word in x.split() if word not in custom_stopwords_list]))

    lemmatizer = WordNetLemmatizer()
    df['cleaned_reviews'] = df['cleaned_reviews'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))

    avg_length_before_cleaning = df['review_body'].apply(len).mean()
    avg_length_after_cleaning = df['cleaned_reviews'].apply(len).mean()

    print(f"\nAverage length before preprocessing: {avg_length_before_cleaning:.2f} characters")
    print(f"Average length after preprocessing: {avg_length_after_cleaning:.2f} characters")

    return df

def vectorize_and_split(df):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['cleaned_reviews'])

    X_train, X_test, Y_train, Y_test = train_test_split(tfidf_matrix, df['sentiment'], test_size=0.2, random_state=30)

    print("Train: ", X_train.shape, Y_train.shape, "Test: ", X_test.shape, Y_test.shape)

    return X_train, X_test, Y_train, Y_test

def print_metrics(prefix, true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='macro')
    recall = recall_score(true_labels, predicted_labels, average='macro')
    f1 = f1_score(true_labels, predicted_labels, average='macro')

    print(f"{prefix} Set Metrics:")
    print("\n")
    print("Accuracy for {} set: {:.2f}%".format(prefix, accuracy * 100))
    print("Precision for {} set: {:.2f}".format(prefix, precision))
    print("Recall for {} set: {:.2f}".format(prefix, recall))
    print("F1-Score for {} set: {:.2f}".format(prefix, f1))
    print()
    
def train_perceptron_model(X_train, Y_train, X_test, Y_test):
    perceptron_model = Perceptron()
    perceptron_model.fit(X_train, Y_train)

    train_predictions = perceptron_model.predict(X_train)
    test_predictions = perceptron_model.predict(X_test)

    print_metrics("Perceptron Training", Y_train, train_predictions)
    print_metrics("Perceptron Testing", Y_test, test_predictions)

    return perceptron_model

def train_svm_model(X_train, Y_train, X_test, Y_test):
    linear_svc_model = LinearSVC()
    linear_svc_model.fit(X_train, Y_train)

    linear_svc_train_predictions = linear_svc_model.predict(X_train)
    linear_svc_test_predictions = linear_svc_model.predict(X_test)

    print_metrics("LinearSVC (Training)", Y_train, linear_svc_train_predictions)
    print_metrics("LinearSVC (Testing)", Y_test, linear_svc_test_predictions)

    return linear_svc_model

def train_logistic_regression_model(X_train, Y_train, X_test, Y_test):
    logistic_regression_model = LogisticRegression()

    logistic_regression_model.fit(X_train, Y_train)

    logistic_regression_train_predictions = logistic_regression_model.predict(X_train)
    logistic_regression_test_predictions = logistic_regression_model.predict(X_test)

    print_metrics("Logistic Regression (Training)", Y_train, logistic_regression_train_predictions)
    print_metrics("Logistic Regression (Testing)", Y_test, logistic_regression_test_predictions)

    return logistic_regression_model

def train_naive_bayes_model(X_train, Y_train, X_test, Y_test):
    naive_bayes_model = MultinomialNB()
    naive_bayes_model.fit(X_train, Y_train)

    naive_bayes_train_predictions = naive_bayes_model.predict(X_train)
    naive_bayes_test_predictions = naive_bayes_model.predict(X_test)

    print_metrics("Multinomial Naive Bayes (Training)", Y_train, naive_bayes_train_predictions)
    print_metrics("Multinomial Naive Bayes (Testing)", Y_test, naive_bayes_test_predictions)

    return naive_bayes_model

    

def main():
    file_path = "amazon_reviews_us_Office_Products_v1_00.tsv"
    
    df = load_dataset(file_path)
    
    df['sentiment'] = df['star_rating'].apply(sentiment_class)
    
    total_ratings = len(df)
    
    display_sentiment_statistics(df, total_ratings)
    print("\n")
    
    df = filter_reviews(df)
    
    cleaned_df = clean_reviews(df)
    
    calculate_average_lengths(df)
    
    df = preprocess_reviews(df)
    print("\n")
    
    X_train, X_test, Y_train, Y_test = vectorize_and_split(df)
    print("\n")
    
    trained_perceptron_model = train_perceptron_model(X_train, Y_train, X_test, Y_test)
    
    trained_svm_model = train_svm_model(X_train, Y_train, X_test, Y_test)
    
    trained_logistic_regression_model = train_logistic_regression_model(X_train, Y_train, X_test, Y_test)
    
    trained_naive_bayes_model = train_naive_bayes_model(X_train, Y_train, X_test, Y_test)
    


if __name__ == "__main__":
    main()


Number of reviews for positive class: 2001122, Number of reviews for negative class: 445349, Number of reviews for neutral class: 193686


Average length before cleaning: 318.93 characters
Average length after cleaning: 305.24 characters

Average length before preprocessing: 318.93 characters
Average length after preprocessing: 194.82 characters


Train:  (160000, 126317) (160000,) Test:  (40000, 126317) (40000,)


Perceptron Training Set Metrics:


Accuracy for Perceptron Training set: 92.62%
Precision for Perceptron Training set: 0.93
Recall for Perceptron Training set: 0.93
F1-Score for Perceptron Training set: 0.93

Perceptron Testing Set Metrics:


Accuracy for Perceptron Testing set: 86.98%
Precision for Perceptron Testing set: 0.87
Recall for Perceptron Testing set: 0.87
F1-Score for Perceptron Testing set: 0.87

LinearSVC (Training) Set Metrics:


Accuracy for LinearSVC (Training) set: 94.94%
Precision for LinearSVC (Training) set: 0.95
Recall for LinearSVC (Training) set: 0.95