In [1]:
import numpy as np
import pandas as pd
import time
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import sys
sys.path.append("../src")
import config


In [2]:
def clean_review_for_ml_method(text):
    r"""Normalise a raw review into plain, single-spaced text.

    Steps, applied in order:
      1. Straight and curly quotes, apostrophes and backticks become spaces.
      2. Hyphens, en dashes and em dashes become spaces
         (so "well-made" turns into "well made").
      3. Every remaining character that is neither a word character
         (``\w``, which includes the underscore) nor whitespace is deleted.
      4. Runs of whitespace collapse to one space; the ends are trimmed.

    Args:
        text: The raw review. ``None`` and float NaN (what ``pd.read_csv``
            yields for an empty field) are treated as an empty review; any
            other non-string value is converted with ``str`` first.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Missing values would otherwise make re.sub raise TypeError.
    # `text != text` is the NaN check and also covers numpy float NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[“”‘’\'"`]', ' ', text)
    text = re.sub(r'[-–—]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [3]:
def run_ml_pipline(
    experiment_name,
    df_train,
    df_test,
    vectorizer_class,
    vectorizer_params,
    model_class,
    model_params,
    key_params_str
):
    """Vectorise the text, fit a classifier, then print and return its metrics.

    Args:
        experiment_name: Human-readable label, printed as the header of the
            report so each output block can be told apart.
        df_train: Training frame; its 'text' and 'label' columns are used.
        df_test: Test frame; its 'text' and 'label' columns are used.
        vectorizer_class: Class instantiated with ``vectorizer_params``; the
            instance must offer ``fit_transform``, ``transform`` and
            ``get_feature_names_out``.
        vectorizer_params: Keyword arguments for ``vectorizer_class``
            (``None`` means no arguments).
        model_class: Class instantiated with ``model_params``; the instance
            must offer ``fit`` and ``predict``.
        model_params: Keyword arguments for ``model_class``
            (``None`` means no arguments).
        key_params_str: Free-form summary of the notable hyper-parameters,
            stored verbatim in the result.

    Returns:
        dict: Model and vectorizer class names, vocabulary size, train/test
        accuracy and test precision/recall for label 1 (all four metrics as
        percentage strings with two decimals), plus ``key_params_str``.
    """
    # perf_counter is monotonic, so the elapsed interval cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    x_train, y_train = df_train['text'], df_train['label']
    x_test, y_test = df_test['text'], df_test['label']

    # The vocabulary is fitted on the training split only; the test split is
    # merely transformed with it.
    vectorizer = vectorizer_class(**(vectorizer_params or {}))
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)

    vocab_size = len(vectorizer.get_feature_names_out())

    model = model_class(**(model_params or {}))
    model.fit(x_train_vec, y_train)

    y_test_pred = model.predict(x_test_vec)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Train accuracy is reported next to test accuracy to expose overfitting.
    y_train_pred = model.predict(x_train_vec)
    train_accuracy = accuracy_score(y_train, y_train_pred)

    # The timed span covers vectorising, fitting and predicting on both
    # splits; the precision/recall computation below is not included.
    duration = time.perf_counter() - start_time

    precision_1 = precision_score(y_test, y_test_pred, pos_label=1)
    recall_1 = recall_score(y_test, y_test_pred, pos_label=1)

    print(f"Experiment: {experiment_name}")
    print(f"  Model: {model_class.__name__}")
    print(f"  Vectorizer: {vectorizer_class.__name__} | Vocabulary size: {vocab_size}")
    print(f"  Train Accuracy: {train_accuracy:.2%}")
    print(f"  Test Accuracy:  {test_accuracy:.2%}")
    print(f"  Precision (class 1): {precision_1:.2%}")
    print(f"  Recall (class 1):    {recall_1:.2%}")
    print(f"  Time taken: {duration:.2f} seconds\n")

    result = {
        "Model": model_class.__name__,
        "Vectorizer": vectorizer_class.__name__,
        "Vocabulary Size": vocab_size,
        "Train Accuracy": f"{train_accuracy:.2%}",
        "Test Accuracy": f"{test_accuracy:.2%}",
        "Precision (class 1)": f"{precision_1:.2%}",
        "Recall (class 1)": f"{recall_1:.2%}",
        "Key Params": key_params_str,
    }
    return result

In [4]:
def load_df(dataset_name = 'imdb'):
    """Load the train/test CSV splits of one dataset and clean their text.

    Args:
        dataset_name: 'imdb' (default) or 'rt'. Selects which pair of CSV
            paths from ``config`` is read.

    Returns:
        tuple: ``(df_train, df_test)`` DataFrames whose 'text' column has been
        passed through ``clean_review_for_ml_method``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported datasets.
    """
    if dataset_name == 'imdb':
        train_path, test_path = config.IMDB_TRAIN_PATH, config.IMDB_TEST_PATH
    elif dataset_name == 'rt':
        train_path, test_path = config.RT_TRAIN_PATH, config.RT_TEST_PATH
    else:
        # Without this branch an unknown name fell through and crashed later
        # with an unhelpful UnboundLocalError on df_train.
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected 'imdb' or 'rt'."
        )

    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    df_train['text'] = df_train['text'].apply(clean_review_for_ml_method)
    df_test['text'] = df_test['text'].apply(clean_review_for_ml_method)
    return df_train, df_test

IMDB

In [5]:
# Load the cleaned IMDB train/test splits and start an empty list that the
# IMDB experiment cells append their result dicts to.
dataset = 'imdb'
df_train, df_test = load_df(dataset_name = dataset)
imdb_results = []

Bag-of-Words (BoW)

In [6]:
# IMDB: unigram bag-of-words counts (English stop words dropped, vocabulary
# capped at 7000 features) fed to logistic regression with the saga solver.
results = run_ml_pipline(
    experiment_name="imdb, BoW, LogReg",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=CountVectorizer,
    vectorizer_params=dict(
        max_features=7000,
        ngram_range=(1, 1),
        stop_words='english',
    ),
    model_class=LogisticRegression,
    model_params=dict(
        solver='saga',
        C=0.1,
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
    ),
    key_params_str="C=0.1,ngram=(1, 1)",
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: CountVectorizer | Vocabulary size: 7000
  Train Accuracy: 93.58%
  Test Accuracy:  88.73%
  Precision (class 1): 88.20%
  Recall (class 1):    89.42%
  Time taken: 33.64 seconds



Weighting Words: TF-IDF (Term Frequency-Inverse Document Frequency)

In [7]:
# IMDB: unigram TF-IDF features (English stop words dropped, vocabulary capped
# at 7000 features) fed to logistic regression with the saga solver.
results = run_ml_pipline(
    experiment_name="imdb, TF–IDF, LogReg",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=TfidfVectorizer,
    vectorizer_params=dict(
        max_features=7000,
        ngram_range=(1, 1),
        stop_words='english',
    ),
    model_class=LogisticRegression,
    model_params=dict(
        solver='saga',
        C=0.1,
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
    ),
    key_params_str="C=0.1,ngram=(1, 1)",
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 7000
  Train Accuracy: 88.07%
  Test Accuracy:  87.39%
  Precision (class 1): 85.89%
  Recall (class 1):    89.48%
  Time taken: 7.95 seconds



TF-IDF with N-grams - Context Window

In [8]:
# IMDB: TF-IDF over unigrams and bigrams (English stop words dropped,
# vocabulary capped at 20000 features) fed to logistic regression (saga).
results = run_ml_pipline(
    experiment_name="imdb, TF–IDF, N-grams, LogReg",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=TfidfVectorizer,
    vectorizer_params=dict(
        max_features=20000,
        ngram_range=(1, 2),
        stop_words='english',
    ),
    model_class=LogisticRegression,
    model_params=dict(
        solver='saga',
        C=0.1,
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
    ),
    key_params_str="C=0.1,ngram=(1, 2)",
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 20000
  Train Accuracy: 88.30%
  Test Accuracy:  87.44%
  Precision (class 1): 85.88%
  Recall (class 1):    89.62%
  Time taken: 26.63 seconds



SVM - TF-IDF with N-grams

In [9]:
# IMDB: TF-IDF over unigrams and bigrams (English stop words dropped,
# vocabulary capped at 20000 features) fed to a linear SVM (LinearSVC).
results = run_ml_pipline(
    experiment_name="imdb, TF–IDF, N-grams, SVM",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=TfidfVectorizer,
    vectorizer_params=dict(
        max_features=20000,
        ngram_range=(1, 2),
        stop_words='english',
    ),
    model_class=LinearSVC,
    model_params=dict(
        C=0.1,
        max_iter=10000,
        random_state=42,
        dual=False,
    ),
    key_params_str="C=0.1,ngram=(1, 2)",
)
imdb_results.append(results)

  Model: LinearSVC
  Vectorizer: TfidfVectorizer | Vocabulary size: 20000
  Train Accuracy: 93.42%
  Test Accuracy:  90.24%
  Precision (class 1): 89.20%
  Recall (class 1):    91.56%
  Time taken: 26.51 seconds



In [10]:
# Turn the list of per-experiment result dicts into one table so the IMDB runs
# can be compared side by side.
# NOTE(review): this rebinds imdb_results from a list to a DataFrame, so
# re-running an experiment cell that calls imdb_results.append(...) afterwards
# no longer appends to the list — re-run the cell that sets imdb_results = []
# first. Confirm whether a separate name for the table would be preferable.
imdb_results = pd.DataFrame(imdb_results)
display(imdb_results)

Unnamed: 0,Model,Vectorizer,Vocabulary Size,Train Accuracy,Test Accuracy,Precision (class 1),Recall (class 1),Key Params
0,LogisticRegression,CountVectorizer,7000,93.58%,88.73%,88.20%,89.42%,"C=0.1,ngram=(1, 1)"
1,LogisticRegression,TfidfVectorizer,7000,88.07%,87.39%,85.89%,89.48%,"C=0.1,ngram=(1, 1)"
2,LogisticRegression,TfidfVectorizer,20000,88.30%,87.44%,85.88%,89.62%,"C=0.1,ngram=(1, 2)"
3,LinearSVC,TfidfVectorizer,20000,93.42%,90.24%,89.20%,91.56%,"C=0.1,ngram=(1, 2)"


Rotten Tomatoes

In [11]:
# Load the cleaned Rotten Tomatoes train/test splits and start an empty list
# for the RT experiment results.
# NOTE: this rebinds dataset, df_train and df_test, so every cell executed
# after this one works on the 'rt' data rather than the IMDB data.
dataset = 'rt'
df_train, df_test = load_df(dataset_name = dataset)
rt_results = []

In [12]:
# Rotten Tomatoes: unigram bag-of-words counts (English stop words dropped,
# vocabulary capped at 6000 features) fed to logistic regression (saga).
results = run_ml_pipline(
    experiment_name="rt, BoW, LogReg",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=CountVectorizer,
    vectorizer_params=dict(
        max_features=6000,
        ngram_range=(1, 1),
        stop_words='english',
    ),
    model_class=LogisticRegression,
    model_params=dict(
        solver='saga',
        C=0.1,
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
    ),
    key_params_str="C=0.1,ngram=(1, 1)",
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: CountVectorizer | Vocabulary size: 6000
  Train Accuracy: 84.79%
  Test Accuracy:  73.51%
  Precision (class 1): 73.48%
  Recall (class 1):    73.55%
  Time taken: 0.22 seconds



In [13]:
# Rotten Tomatoes: unigram TF-IDF features (English stop words dropped,
# vocabulary capped at 6000 features) fed to logistic regression (saga).
results = run_ml_pipline(
    experiment_name="rt, TF–IDF, LogReg",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=TfidfVectorizer,
    vectorizer_params=dict(
        max_features=6000,
        ngram_range=(1, 1),
        stop_words='english',
    ),
    model_class=LogisticRegression,
    model_params=dict(
        solver='saga',
        C=0.1,
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
    ),
    key_params_str="C=0.1,ngram=(1, 1)",
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 6000
  Train Accuracy: 81.21%
  Test Accuracy:  73.28%
  Precision (class 1): 72.26%
  Recall (class 1):    75.52%
  Time taken: 0.22 seconds



In [14]:
# Rotten Tomatoes: TF-IDF over unigrams and bigrams (English stop words
# dropped, vocabulary capped at 15000 features) fed to logistic regression.
results = run_ml_pipline(
    experiment_name="rt, TF–IDF, N-grams, LogReg",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=TfidfVectorizer,
    vectorizer_params=dict(
        max_features=15000,
        ngram_range=(1, 2),
        stop_words='english',
    ),
    model_class=LogisticRegression,
    model_params=dict(
        solver='saga',
        C=0.1,
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
    ),
    key_params_str="C=0.1,ngram=(1, 2)",
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 15000
  Train Accuracy: 83.46%
  Test Accuracy:  73.23%
  Precision (class 1): 72.28%
  Recall (class 1):    75.33%
  Time taken: 0.48 seconds



In [15]:
# Rotten Tomatoes: TF-IDF over unigrams and bigrams (English stop words
# dropped, vocabulary capped at 15000 features) fed to a linear SVM.
results = run_ml_pipline(
    experiment_name="rt, TF–IDF, N-grams, SVM",
    df_train=df_train,
    df_test=df_test,
    vectorizer_class=TfidfVectorizer,
    vectorizer_params=dict(
        max_features=15000,
        ngram_range=(1, 2),
        stop_words='english',
    ),
    model_class=LinearSVC,
    model_params=dict(
        C=0.1,
        max_iter=10000,
        random_state=42,
        dual=False,
    ),
    key_params_str="C=0.1,ngram=(1, 2)",
)
rt_results.append(results)

  Model: LinearSVC
  Vectorizer: TfidfVectorizer | Vocabulary size: 15000
  Train Accuracy: 91.07%
  Test Accuracy:  74.64%
  Precision (class 1): 74.19%
  Recall (class 1):    75.52%
  Time taken: 0.45 seconds



In [16]:
# Turn the list of per-experiment result dicts into one table so the Rotten
# Tomatoes runs can be compared side by side.
# NOTE(review): this rebinds rt_results from a list to a DataFrame, so
# re-running an experiment cell that calls rt_results.append(...) afterwards
# no longer appends to the list — re-run the cell that sets rt_results = []
# first. Confirm whether a separate name for the table would be preferable.
rt_results = pd.DataFrame(rt_results)
display(rt_results)

Unnamed: 0,Model,Vectorizer,Vocabulary Size,Train Accuracy,Test Accuracy,Precision (class 1),Recall (class 1),Key Params
0,LogisticRegression,CountVectorizer,6000,84.79%,73.51%,73.48%,73.55%,"C=0.1,ngram=(1, 1)"
1,LogisticRegression,TfidfVectorizer,6000,81.21%,73.28%,72.26%,75.52%,"C=0.1,ngram=(1, 1)"
2,LogisticRegression,TfidfVectorizer,15000,83.46%,73.23%,72.28%,75.33%,"C=0.1,ngram=(1, 2)"
3,LinearSVC,TfidfVectorizer,15000,91.07%,74.64%,74.19%,75.52%,"C=0.1,ngram=(1, 2)"
