In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import pandas as pd
import time
import os
import sys
notebook_dir = os.path.abspath(os.getcwd())
project_root = os.path.dirname(notebook_dir)

if project_root not in sys.path:
    sys.path.append(project_root)
import src.config as config

In [2]:
def run_classical_ml_experiment(
    experiment_name,
    x_train, y_train, x_test, y_test,
    vectorizer_class,
    vectorizer_params,
    model_class,
    model_params,
    key_params_str
):    
    start_time = time.time()
    
    vectorizer = vectorizer_class(**vectorizer_params)
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)
    
    vocab_size = len(vectorizer.get_feature_names_out())
    
    model = model_class(**model_params)
    model.fit(x_train_vec, y_train)
    
    y_test_pred = model.predict(x_test_vec)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    y_train_pred = model.predict(x_train_vec)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    
    end_time = time.time()
    duration = end_time - start_time
    
    print(f"  Model: {model_class.__name__}")
    print(f"  Vectorizer: {vectorizer_class.__name__} | Vocabulary size: {vocab_size}")
    print(f"  Train Accuracy: {train_accuracy:.2%}")
    print(f"  Test Accuracy:  {test_accuracy:.2%}")
    print(f"  Time taken: {duration:.2f} seconds\n")
    
    result = {
        "Model": model_class.__name__,
        "Vectorizer": vectorizer_class.__name__,
        "Vocabulary Size": vocab_size,
        "Test Accuracy": f"{test_accuracy:.2%}",
        "Key Params": key_params_str
        
    }
    return result

# Data Import

In [3]:
imdb_train = pd.read_csv(config.CLEAN_IMDB_TRAIN_PATH)
imdb_test = pd.read_csv(config.CLEAN_IMDB_TEST_PATH)

imdb_x_train = imdb_train[config.TEXT_COL]
imdb_y_train = imdb_train[config.LABEL_COL]
imdb_x_test = imdb_test[config.TEXT_COL]
imdb_y_test = imdb_test[config.LABEL_COL]

In [4]:
rt_train = pd.read_csv(config.CLEAN_RT_TRAIN_PATH)
rt_test = pd.read_csv(config.CLEAN_RT_TEST_PATH)

rt_x_train = rt_train[config.TEXT_COL]
rt_y_train = rt_train[config.LABEL_COL]
rt_x_test = rt_test[config.TEXT_COL]
rt_y_test = rt_test[config.LABEL_COL]

# Experiments Across Classical ML Approaches

In [5]:
imdb_results = []
rt_results = []

## Baseline: Bag-of-Words (BoW)

### IMDb

In [6]:
results = run_classical_ml_experiment(
    experiment_name="IMDB - BoW + LogReg",
    x_train=imdb_x_train, y_train=imdb_y_train,
    x_test=imdb_x_test, y_test=imdb_y_test,
    
    vectorizer_class=CountVectorizer,
    vectorizer_params={
        'max_features': 20000,
        'ngram_range': (1, 1),
        'stop_words': 'english'
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 0.1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1 
    },
    key_params_str="C=0.1,ngram=(1, 1)"
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: CountVectorizer | Vocabulary size: 20000
  Train Accuracy: 96.73%
  Test Accuracy:  87.26%
  Time taken: 21.17 seconds



### Rotten Tomatoes

In [7]:
results = run_classical_ml_experiment(
    experiment_name="Rotten Tomatoes - BoW + LogReg",
    x_train=rt_x_train, y_train=rt_y_train,
    x_test=rt_x_test, y_test=rt_y_test,
    
    vectorizer_class=CountVectorizer,
    vectorizer_params={
        'max_features': 3000,
        'ngram_range': (1, 1),
        'stop_words': 'english'
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 0.1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1   
    },
    key_params_str="C=0.1,ngram=(1, 1)"
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: CountVectorizer | Vocabulary size: 3000
  Train Accuracy: 82.70%
  Test Accuracy:  73.16%
  Time taken: 0.22 seconds



## Weighting Words: TF-IDF (Term Frequency-Inverse Document Frequency)

### IMDb

In [9]:
results = run_classical_ml_experiment(
    experiment_name="IMDB - TF–IDF + LogReg",
    x_train=imdb_x_train, y_train=imdb_y_train,
    x_test=imdb_x_test, y_test=imdb_y_test,
    
    vectorizer_class= TfidfVectorizer,
    vectorizer_params={
        'max_features': 20000,
        'ngram_range': (1, 1),
        'stop_words': 'english'
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1 
    },
    key_params_str="C=1,ngram_range=(1, 1)"
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 20000
  Train Accuracy: 93.61%
  Test Accuracy:  87.93%
  Time taken: 6.36 seconds



### Rotten Tomatoes

In [10]:
results = run_classical_ml_experiment(
    experiment_name="Rotten Tomatoes - TF–IDF + LogReg",
    x_train=rt_x_train, y_train=rt_y_train,
    x_test=rt_x_test, y_test=rt_y_test,
    
    vectorizer_class=TfidfVectorizer,
    vectorizer_params={
        'max_features': 3000,
        'ngram_range': (1, 1),
        'stop_words': 'english'
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1   
    },
    key_params_str="C=1,ngram_range=(1, 1)"
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 3000
  Train Accuracy: 85.73%
  Test Accuracy:  73.69%
  Time taken: 0.19 seconds



## Adding Local Context: TF-IDF with N-grams

### IMDb

In [11]:
results = run_classical_ml_experiment(
    experiment_name="IMDB - TF–IDF + LogReg",
    x_train=imdb_x_train, y_train=imdb_y_train,
    x_test=imdb_x_test, y_test=imdb_y_test,
    
    vectorizer_class= TfidfVectorizer,
    vectorizer_params={
        'max_features': 20000,
        'ngram_range': (1, 2),
        'stop_words': 'english'
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1 
    },
    key_params_str="C=1,ngram_range=(1, 2)"
    
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 20000
  Train Accuracy: 93.86%
  Test Accuracy:  88.15%
  Time taken: 15.97 seconds



### Rotten Tomatoes

In [12]:
results = run_classical_ml_experiment(
    experiment_name="Rotten Tomatoes - TF–IDF + LogReg",
    x_train=rt_x_train, y_train=rt_y_train,
    x_test=rt_x_test, y_test=rt_y_test,
    
    vectorizer_class=TfidfVectorizer,
    vectorizer_params={
        'max_features': 3000,
        'ngram_range': (1, 2),
        'stop_words': 'english'
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1   
    },
    key_params_str="C=1,ngram_range=(1, 2)"
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 3000
  Train Accuracy: 85.44%
  Test Accuracy:  74.20%
  Time taken: 0.36 seconds



### IMDb

In [13]:
results = run_classical_ml_experiment(
    experiment_name="IMDB - TF–IDF + LogReg",
    x_train=imdb_x_train, y_train=imdb_y_train,
    x_test=imdb_x_test, y_test=imdb_y_test,
    
    vectorizer_class= TfidfVectorizer,
    vectorizer_params={
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 15,
    'max_df': 0.5,
    'sublinear_tf': True,
    'max_features': 20000
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1 
    },
    key_params_str="C=1,ngram_range=(1, 2), min_df=10, max_df=0.5"
)
imdb_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 19536
  Train Accuracy: 93.95%
  Test Accuracy:  88.60%
  Time taken: 15.54 seconds



### Rotten Tomatoes

In [14]:
results = run_classical_ml_experiment(
    experiment_name="Rotten Tomatoes - TF–IDF + LogReg",
    x_train=rt_x_train, y_train=rt_y_train,
    x_test=rt_x_test, y_test=rt_y_test,
    
    vectorizer_class=TfidfVectorizer,
    vectorizer_params={
        'max_features': 3000,
        'ngram_range': (1, 2),
        'stop_words': 'english',
        'min_df': 5,
        'max_df': 0.5,
        'sublinear_tf': True
    },
    
    model_class=LogisticRegression,
    model_params={
        'solver': 'saga',
        'C': 1,
        'max_iter': 1000,
        'random_state': config.RANDOM_SEED,
        'n_jobs': -1   
    },
    key_params_str="C=1,ngram_range=(1, 2), min_df=5, max_df=0.5"
)
rt_results.append(results)

  Model: LogisticRegression
  Vectorizer: TfidfVectorizer | Vocabulary size: 3000
  Train Accuracy: 85.27%
  Test Accuracy:  74.64%
  Time taken: 0.36 seconds



## SVM

### IMDb

In [15]:
results = run_classical_ml_experiment(
    experiment_name="IMDB - TF–IDF + SVM",
    x_train=imdb_x_train, y_train=imdb_y_train,
    x_test=imdb_x_test, y_test=imdb_y_test,
    
    vectorizer_class= TfidfVectorizer,
    vectorizer_params={
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 10,
    'max_df': 0.5,
    'sublinear_tf': True,
    'max_features': 20000
    },
    
    model_class=LinearSVC,
    model_params={
        'C': 0.1,
        'max_iter': 10000,
        'random_state': config.RANDOM_SEED,
        'dual': False
    },
    key_params_str="C=0.1,ngram_range=(1, 2), min_df=10, max_df=5"
)
imdb_results.append(results)

  Model: LinearSVC
  Vectorizer: TfidfVectorizer | Vocabulary size: 20000
  Train Accuracy: 94.28%
  Test Accuracy:  88.72%
  Time taken: 15.37 seconds



### Rotten Tomatoes

In [16]:
results = run_classical_ml_experiment(
    experiment_name="Rotten Tomatoes - TF–IDF + LogReg",
    x_train=rt_x_train, y_train=rt_y_train,
    x_test=rt_x_test, y_test=rt_y_test,
    
    vectorizer_class=TfidfVectorizer,
    vectorizer_params={
        'max_features': 3000,
        'ngram_range': (1, 2),
        'stop_words': 'english',
        'min_df': 5,
    'max_df': 0.5,
    'sublinear_tf': True
    },
    
   model_class=LinearSVC,
   model_params={
        'C': 0.1,
        'max_iter': 10000,
        'random_state': config.RANDOM_SEED,
        'dual': False
    },
    key_params_str="C=1,ngram_range=(1, 2), min_df=5, max_df=0.5"
)
rt_results.append(results)

  Model: LinearSVC
  Vectorizer: TfidfVectorizer | Vocabulary size: 3000
  Train Accuracy: 85.13%
  Test Accuracy:  74.61%
  Time taken: 0.33 seconds



In [19]:
imdb_df = pd.DataFrame(imdb_results)
display(imdb_df)

Unnamed: 0,Model,Vectorizer,Vocabulary Size,Test Accuracy,Key Params
0,LogisticRegression,CountVectorizer,20000,87.26%,"C=0.1,ngram=(1, 1)"
1,LogisticRegression,TfidfVectorizer,20000,87.93%,"C=1,ngram_range=(1, 1)"
2,LogisticRegression,TfidfVectorizer,20000,88.15%,"C=1,ngram_range=(1, 2)"
3,LogisticRegression,TfidfVectorizer,19536,88.60%,"C=1,ngram_range=(1, 2), min_df=10, max_df=0.5"
4,LinearSVC,TfidfVectorizer,20000,88.72%,"C=0.1,ngram_range=(1, 2), min_df=10, max_df=5"


In [20]:
rt_df = pd.DataFrame(rt_results)
display(rt_df)

Unnamed: 0,Model,Vectorizer,Vocabulary Size,Test Accuracy,Key Params
0,LogisticRegression,CountVectorizer,3000,73.16%,"C=0.1,ngram=(1, 1)"
1,LogisticRegression,TfidfVectorizer,3000,73.69%,"C=1,ngram_range=(1, 1)"
2,LogisticRegression,TfidfVectorizer,3000,74.20%,"C=1,ngram_range=(1, 2)"
3,LogisticRegression,TfidfVectorizer,3000,74.64%,"C=1,ngram_range=(1, 2), min_df=5, max_df=0.5"
4,LinearSVC,TfidfVectorizer,3000,74.61%,"C=1,ngram_range=(1, 2), min_df=5, max_df=0.5"
