In [12]:
%pip install -q optuna lightgbm

Note: you may need to restart the kernel to use updated packages.




This experiment compares the performance of several classification algorithms (Logistic Regression, Naive Bayes, SVM, XGBoost, LightGBM). We apply Hyperparameter Tuning (HPT) to the complex models using Optuna, while fixing the feature engineering pipeline based on previous optimal choices:

* **Vectorization:** TF-IDF (Term Frequency-Inverse Document Frequency)
* **N-gram Range:** Bigram `(1, 2)` (Unigrams and Bigrams)
* **Max Features:** 1000
* **Imbalance Handling:** Undersampling (`RandomUnderSampler`)

## 1. Setup and Dependencies

### 1.1 Import Libraries

In [13]:
import optuna
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC # Using LinearSVC for sparse data compatibility
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## 2. MLflow and Data Preparation

### 2.1 MLflow Configuration

In [14]:
# Set the remote tracking server URI
mlflow.set_tracking_uri("http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/")

# Set or create a new experiment
mlflow.set_experiment("Experiment 5 - Model Comparision (TFIDF Bigram 1000 + Undersampling)")

<Experiment: artifact_location='s3://mlfow-bucket-2025/927563970798110109', creation_time=1762945494781, experiment_id='927563970798110109', last_update_time=1762945494781, lifecycle_stage='active', name='Experiment 5 - Model Comparision (TFIDF Bigram 1000 + Undersampling)', tags={}>

### 2.2 Data Loading, Remapping, and Feature Engineering

In [7]:
# Load preprocessed data and clean missing values
df = pd.read_csv('../data/reddit_preprocessing.csv').dropna(subset=['clean_comment'])

# Remap class labels [-1, 0, 1] to [0, 1, 2] for model compatibility
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})
df = df.dropna(subset=['category']) # Final check for NaNs

print(f"Data shape: {df.shape}")

Data shape: (36662, 2)


In [15]:
# Define fixed feature parameters
ngram_range = (1, 2)  # Bigram setting
max_features = 1000   # Fixed Max Features
imbalance_method = "undersampling"

# Split data BEFORE vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], 
                                                              test_size=0.2, random_state=42, stratify=df['category'])

# 1. Vectorization using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)  
X_test_vec = vectorizer.transform(X_test)  

# 2. Undersampling using RandomUnderSampler
sampler = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = sampler.fit_resample(X_train_vec, y_train)

print(f"Original Training Data Shape: {X_train_vec.shape}")
print(f"Resampled Training Data Shape after Undersampling: {X_train_res.shape}")
print(f"Test Data Shape: {X_test_vec.shape}")

Original Training Data Shape: (29329, 1000)
Resampled Training Data Shape after Undersampling: (19794, 1000)
Test Data Shape: (7333, 1000)


## 3. MLflow Logging and Evaluation Helper

In [16]:
def log_mlflow(model_name, model, params=None):
    """Trains and logs a single model run to MLflow with evaluation metrics."""
    with mlflow.start_run():
        # Set tags and log fixed parameters
        mlflow.set_tag("mlflow.runName", f"{model_name}_Undersample_TFIDF(1000)_HPT")
        mlflow.set_tag("experiment_type", "multi_algo_hpt")
        mlflow.log_param("algo_name", model_name)
        mlflow.log_param("vectorizer_type", "TF-IDF")
        mlflow.log_param("ngram_range", str(ngram_range))
        mlflow.log_param("max_features", max_features)
        mlflow.log_param("imbalance_handling", imbalance_method)
        
        # Log specific model hyperparameters
        if params:
            for key, value in params.items():
                mlflow.log_param(key, value)

        # Train model on resampled data
        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test_vec)

        # Log metrics
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix (optional but good practice)
        conf_matrix = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.title(f"Confusion Matrix: {model_name}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.savefig(f"conf_matrix_{model_name}.png")
        mlflow.log_artifact(f"conf_matrix_{model_name}.png")
        plt.close()

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        
        print(f"Logged {model_name} with Accuracy: {accuracy:.4f}")

## 4. Hyperparameter Tuning Objectives (Optuna)

In [17]:
N_TRIALS = 10 # Keep trials low for complexity management

# --- 4.1 Logistic Regression Objective ---
def objective_lr(trial):
    C = trial.suggest_float('C', 1e-3, 10.0, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs'])
    
    model = LogisticRegression(C=C, solver=solver, random_state=42, multi_class='auto', max_iter=1000)
    model.fit(X_train_res, y_train_res)
    return accuracy_score(y_test, model.predict(X_test_vec))

# --- 4.2 Linear SVM Objective ---
def objective_svc(trial):
    C = trial.suggest_float('C', 0.1, 10.0, log=True)
    
    # Use LinearSVC which is better suited for large, sparse datasets
    model = LinearSVC(C=C, random_state=42, max_iter=1000, dual='auto')
    model.fit(X_train_res, y_train_res)
    return accuracy_score(y_test, model.predict(X_test_vec))

# --- 4.3 XGBoost Objective ---
def objective_xgboost(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 7)

    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, 
                          random_state=42, use_label_encoder=False, eval_metric='mlogloss', n_jobs=-1)
    model.fit(X_train_res, y_train_res)
    return accuracy_score(y_test, model.predict(X_test_vec))

# --- 4.4 LightGBM Objective ---
def objective_lightgbm(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.1, log=True)
    num_leaves = trial.suggest_int('num_leaves', 10, 50)

    model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, num_leaves=num_leaves, 
                           random_state=42, verbose=-1)
    model.fit(X_train_res, y_train_res)
    return accuracy_score(y_test, model.predict(X_test_vec))

## 5. Execution and MLflow Logging

In [18]:
def run_tuning_and_log(model_name, objective_func, n_trials):
    """Runs Optuna HPT for a given model and logs the best model to MLflow."""
    print(f"\n--- Starting HPT for {model_name} ({n_trials} trials) ---")
    study = optuna.create_study(direction="maximize", study_name=model_name)
    study.optimize(objective_func, n_trials=n_trials)

    best_params = study.best_params
    print(f"Best {model_name} Params: {best_params}")

    # Re-initialize the best model based on its type
    if model_name == 'LogisticRegression':
        best_model = LogisticRegression(random_state=42, multi_class='auto', max_iter=1000, **best_params)
    elif model_name == 'LinearSVC':
        best_model = LinearSVC(random_state=42, max_iter=1000, dual='auto', **best_params)
    elif model_name == 'XGBoost':
        best_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss', n_jobs=-1, **best_params)
    elif model_name == 'LightGBM':
        best_model = LGBMClassifier(random_state=42, verbose=-1, **best_params)
    else:
        raise ValueError(f"Unknown model type: {model_name}")

    log_mlflow(f"{model_name}_HPT", best_model, params=best_params)


def run_simple_model_and_log(model_name, model_class, params=None):
    """Initializes a simple model (no HPT) and logs it to MLflow."""
    print(f"\n--- Starting baseline run for {model_name} ---")
    if params:
        model = model_class(random_state=42, **params)
    else:
        model = model_class()
    
    # For MNB, alpha should be tuned, but we run a default simple value for comparison
    if model_name == 'MultinomialNB':
        model = MultinomialNB(alpha=1.0) 
        params = {'alpha': 1.0}
    
    log_mlflow(model_name, model, params=params)


# --- Execution Pipeline ---

# 1. Simple Models (Multinomial Naive Bayes)
run_simple_model_and_log('MultinomialNB', MultinomialNB)

# 2. Tuned Models
models_to_tune = [
    ('LogisticRegression', objective_lr),
    ('LinearSVC', objective_svc),
    ('XGBoost', objective_xgboost),
    ('LightGBM', objective_lightgbm)
]

for model_name, objective in models_to_tune:
    run_tuning_and_log(model_name, objective, N_TRIALS)


--- Starting baseline run for MultinomialNB ---




Logged MultinomialNB with Accuracy: 0.7000
üèÉ View run MultinomialNB_Undersample_TFIDF(1000)_HPT at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109/runs/6138376b73db4c21af6c0f255f96bcd4
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109


[I 2025-11-12 16:44:48,288] A new study created in memory with name: LogisticRegression
[I 2025-11-12 16:44:48,347] Trial 0 finished with value: 0.6631665075685258 and parameters: {'C': 0.003554156887954465, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.6631665075685258.
[I 2025-11-12 16:44:48,450] Trial 1 finished with value: 0.7638074457929905 and parameters: {'C': 0.2706288896466599, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7638074457929905.



--- Starting HPT for LogisticRegression (10 trials) ---


[I 2025-11-12 16:44:48,500] Trial 2 finished with value: 0.6650756852584209 and parameters: {'C': 0.0025982079054921326, 'solver': 'liblinear'}. Best is trial 1 with value: 0.7638074457929905.
[I 2025-11-12 16:44:48,641] Trial 3 finished with value: 0.7318969044047456 and parameters: {'C': 0.06854300063095319, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7638074457929905.
[I 2025-11-12 16:44:48,744] Trial 4 finished with value: 0.7494886131187781 and parameters: {'C': 0.1445057242570108, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7638074457929905.
[I 2025-11-12 16:44:48,841] Trial 5 finished with value: 0.778671757807173 and parameters: {'C': 1.278413171841183, 'solver': 'liblinear'}. Best is trial 5 with value: 0.778671757807173.
[I 2025-11-12 16:44:48,916] Trial 6 finished with value: 0.7583526523932906 and parameters: {'C': 0.24415992458386357, 'solver': 'liblinear'}. Best is trial 5 with value: 0.778671757807173.
[I 2025-11-12 16:44:48,962] Trial 7 finished with value: 

Best LogisticRegression Params: {'C': 1.278413171841183, 'solver': 'liblinear'}




Logged LogisticRegression_HPT with Accuracy: 0.7787
üèÉ View run LogisticRegression_HPT_Undersample_TFIDF(1000)_HPT at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109/runs/69897c926c81491c8d0e5555a9d22881
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109


[I 2025-11-12 16:45:41,308] A new study created in memory with name: LinearSVC



--- Starting HPT for LinearSVC (10 trials) ---


[I 2025-11-12 16:45:41,562] Trial 0 finished with value: 0.783035592526933 and parameters: {'C': 6.724496257841721}. Best is trial 0 with value: 0.783035592526933.
[I 2025-11-12 16:45:41,683] Trial 1 finished with value: 0.7849447702168281 and parameters: {'C': 0.5263589688942403}. Best is trial 1 with value: 0.7849447702168281.
[I 2025-11-12 16:45:41,830] Trial 2 finished with value: 0.7834447020319105 and parameters: {'C': 1.5873598423695072}. Best is trial 1 with value: 0.7849447702168281.
[I 2025-11-12 16:45:42,008] Trial 3 finished with value: 0.7837174417018955 and parameters: {'C': 1.9376355032265133}. Best is trial 1 with value: 0.7849447702168281.
[I 2025-11-12 16:45:42,148] Trial 4 finished with value: 0.7834447020319105 and parameters: {'C': 2.1900012916767624}. Best is trial 1 with value: 0.7849447702168281.
[I 2025-11-12 16:45:42,259] Trial 5 finished with value: 0.780035456157098 and parameters: {'C': 0.13232583607003803}. Best is trial 1 with value: 0.7849447702168281.
[

Best LinearSVC Params: {'C': 0.5263589688942403}




Logged LinearSVC_HPT with Accuracy: 0.7849
üèÉ View run LinearSVC_HPT_Undersample_TFIDF(1000)_HPT at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109/runs/4a3eacc03c114d33bf413326b80f5b53
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109


[I 2025-11-12 16:46:32,617] A new study created in memory with name: XGBoost



--- Starting HPT for XGBoost (10 trials) ---


[I 2025-11-12 16:47:14,560] Trial 0 finished with value: 0.588163098322651 and parameters: {'n_estimators': 199, 'learning_rate': 0.0028322158800103484, 'max_depth': 6}. Best is trial 0 with value: 0.588163098322651.
[I 2025-11-12 16:47:45,946] Trial 1 finished with value: 0.7061230055911633 and parameters: {'n_estimators': 160, 'learning_rate': 0.04039843724008596, 'max_depth': 6}. Best is trial 1 with value: 0.7061230055911633.
[I 2025-11-12 16:48:02,803] Trial 2 finished with value: 0.569889540433656 and parameters: {'n_estimators': 71, 'learning_rate': 0.001662230581776005, 'max_depth': 6}. Best is trial 1 with value: 0.7061230055911633.
[I 2025-11-12 16:48:25,636] Trial 3 finished with value: 0.6454384290195009 and parameters: {'n_estimators': 149, 'learning_rate': 0.01598776649910289, 'max_depth': 5}. Best is trial 1 with value: 0.7061230055911633.
[I 2025-11-12 16:48:51,762] Trial 4 finished with value: 0.5779353606982136 and parameters: {'n_estimators': 89, 'learning_rate': 0.0

Best XGBoost Params: {'n_estimators': 118, 'learning_rate': 0.08748905697283263, 'max_depth': 6}




Logged XGBoost_HPT with Accuracy: 0.7281
üèÉ View run XGBoost_HPT_Undersample_TFIDF(1000)_HPT at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109/runs/ba5abf9dc9884bb0a686a0157c0028f2
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109


[I 2025-11-12 16:51:52,556] A new study created in memory with name: LightGBM



--- Starting HPT for LightGBM (10 trials) ---


[I 2025-11-12 16:52:00,311] Trial 0 finished with value: 0.7823537433519705 and parameters: {'n_estimators': 157, 'learning_rate': 0.04099673118007485, 'num_leaves': 45}. Best is trial 0 with value: 0.7823537433519705.
[I 2025-11-12 16:52:02,712] Trial 1 finished with value: 0.7741715532524206 and parameters: {'n_estimators': 80, 'learning_rate': 0.07745214019639735, 'num_leaves': 24}. Best is trial 0 with value: 0.7823537433519705.
[I 2025-11-12 16:52:09,518] Trial 2 finished with value: 0.7273966998499932 and parameters: {'n_estimators': 163, 'learning_rate': 0.004405280808174647, 'num_leaves': 36}. Best is trial 0 with value: 0.7823537433519705.
[I 2025-11-12 16:52:18,085] Trial 3 finished with value: 0.7828992226919406 and parameters: {'n_estimators': 180, 'learning_rate': 0.032509495822559915, 'num_leaves': 43}. Best is trial 3 with value: 0.7828992226919406.
[I 2025-11-12 16:52:21,370] Trial 4 finished with value: 0.7548070366834856 and parameters: {'n_estimators': 165, 'learning

Best LightGBM Params: {'n_estimators': 180, 'learning_rate': 0.032509495822559915, 'num_leaves': 43}




Logged LightGBM_HPT with Accuracy: 0.7829
üèÉ View run LightGBM_HPT_Undersample_TFIDF(1000)_HPT at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109/runs/2136aeac43a24180b4f7131843c3d81b
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/927563970798110109


## 6. Conclusion and Next Steps
The complete set of model performances, including optimized hyperparameters, is now logged in the MLflow UI. The next step is typically stacking or selecting the single best performing model based on comprehensive evaluation metrics, especially F1-scores for the minority classes.

In [20]:
OPTIMAL_METRIC = "weighted avg_f1-score"
MLFLOW_TRACKING_URI = "http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/"
EXPERIMENT_NAME = "Experiment 5 - Model Comparision (TFIDF Bigram 1000 + Undersampling)"


try:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    
    if experiment is None:
        print(f"Error: Experiment '{EXPERIMENT_NAME}' not found on the server.")
    else:
        experiment_id = experiment.experiment_id
        print(f"Fetching runs for Experiment ID: {experiment_id}")

        # Fetch all runs in the experiment
        runs = client.search_runs(experiment_ids=experiment_id)

        run_data = []
        for run in runs:
            # Extract relevant metrics and parameters
            metrics = {k: v for k, v in run.data.metrics.items() if k.endswith('f1-score') or k == 'accuracy'}
            params = run.data.params
            
            run_data.append({
                'run_id': run.info.run_id,
                'status': run.info.status,
                'algo_name': params.get('algo_name', 'N/A'),
                'accuracy': run.data.metrics.get('accuracy', 0.0),
                **metrics,
                'max_features': params.get('max_features', 'N/A'),
                'imbalance_handling': params.get('imbalance_handling', 'N/A'),
                'run_name': run.data.tags.get('mlflow.runName')
            })

        if not run_data:
            print("No completed runs found in the experiment.")
        else:
            df_results = pd.DataFrame(run_data)
            
            # --- EVALUATE AND FIND BEST MODEL ---
            
            # Ensure the optimal metric column is numeric before sorting
            if OPTIMAL_METRIC in df_results.columns:
                df_results[OPTIMAL_METRIC] = pd.to_numeric(df_results[OPTIMAL_METRIC], errors='coerce')
                
                # Sort by the optimal metric (descending)
                df_best = df_results.sort_values(by=OPTIMAL_METRIC, ascending=False)
                best_model_run = df_best.iloc[0]

                print("\n" + "="*50)
                print(f"RESULTS SORTED BY: {OPTIMAL_METRIC}")
                print("="*50)
                # Display the top 5 models
                print(df_best[['run_name', 'algo_name', 'accuracy', OPTIMAL_METRIC, '0_f1-score', '1_f1-score', '2_f1-score']].head())
                print("="*50)

                # --- BEST MODEL SUMMARY ---
                print("\n" + "="*50)
                print("BEST MODEL FOUND:")
                print("="*50)
                print(f"Algorithm: {best_model_run['algo_name']}")
                print(f"Run Name: {best_model_run['run_name']}")
                print(f"Run ID: {best_model_run['run_id']}")
                print(f"Overall Accuracy: {best_model_run['accuracy']:.4f}")
                print(f"{OPTIMAL_METRIC}: {best_model_run[OPTIMAL_METRIC]:.4f}")
                
                # Retrieve all parameters for the best run
                best_run_params = client.get_run(best_model_run['run_id']).data.params
                print("\nBest Model Hyperparameters:")
                # Filter for core hyperparameters (excluding fixed feature/imbalance params)
                hp_keys = ['C', 'learning_rate', 'n_estimators', 'max_depth', 'num_leaves', 'solver', 'alpha']
                best_hps = {k: v for k, v in best_run_params.items() if k in hp_keys}
                
                for k, v in best_hps.items():
                    print(f"  {k}: {v}")
                print("="*50)

            else:
                print(f"Optimal Metric '{OPTIMAL_METRIC}' not found in run metrics. Please check your logging setup.")

except Exception as e:
    print(f"An error occurred while connecting to MLflow or processing data: {e}")

Fetching runs for Experiment ID: 927563970798110109

RESULTS SORTED BY: weighted avg_f1-score
                                            run_name               algo_name  \
2          LinearSVC_HPT_Undersample_TFIDF(1000)_HPT           LinearSVC_HPT   
0           LightGBM_HPT_Undersample_TFIDF(1000)_HPT            LightGBM_HPT   
3  LogisticRegression_HPT_Undersample_TFIDF(1000)...  LogisticRegression_HPT   
1            XGBoost_HPT_Undersample_TFIDF(1000)_HPT             XGBoost_HPT   
4          MultinomialNB_Undersample_TFIDF(1000)_HPT           MultinomialNB   

   accuracy  weighted avg_f1-score  0_f1-score  1_f1-score  2_f1-score  
2  0.784945               0.782771    0.835863    0.800831    0.666874  
0  0.782899               0.780663    0.835173    0.800698    0.658816  
3  0.778672               0.776854    0.831210    0.796804    0.655405  
1  0.728079               0.724808    0.781575    0.741100    0.606658  
4  0.699986               0.703944    0.735410    0.737449  