In [1]:
# Imports

import optuna
import mlflow
import dagshub
import mlflow.sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Setting up DagsHub

dagshub.init(repo_owner='SushrutGaikwad', repo_name='youtube-comments-analyzer', mlflow=True)

# Data

In [3]:
PREPROCESSED_DATA_PATH = "../data/processed/reddit_preprocessed.csv"
df = pd.read_csv(PREPROCESSED_DATA_PATH)
df.dropna(subset=["clean_comment"], inplace=True)
df.shape

(36662, 2)

# Running the experiment

In [4]:
# Setting experiment name

mlflow.set_experiment("Exp 4: ML algorithms with hyperparameter tuning")

<Experiment: artifact_location='mlflow-artifacts:/3f66177ebab24d6392ce8f143c28100b', creation_time=1749046747756, experiment_id='5', last_update_time=1749046747756, lifecycle_stage='active', name='Exp 4: ML algorithms with hyperparameter tuning', tags={}>

## Preprocessing

In [None]:
# Remapping class labels from {-1, 0, 1} to {2, 0, 1}
mapping = {
    -1: 2,
    0: 0,
    1: 1
}
df["category"] = df["category"].map(mapping)

# Removing missing sentiments
df.dropna(subset=["category"], inplace=True)

# Feature engineering
ngram_range = (1, 2)
max_features = 1000
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_comment"],
    df["category"],
    test_size=0.2,
    random_state=42,
    stratify=df["category"]
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_train_vectorized, y_train = rus.fit_resample(
    X_train_vectorized,
    y_train
)

## Running the experiment

In [6]:
# Function to log results to MLFlow
def log_to_mlflow(
    model_name,
    model,
    X_train,
    X_test,
    y_train,
    y_test
):
    with mlflow.start_run():
        # Tags
        mlflow.set_tag(
            "mlflow.runName", f"{model_name}_BoW_bigrams_1000_undersampling"
        )
        mlflow.set_tag("experiment_type", "ML_models_comparison")
        
        # Logging model name as a parameter
        mlflow.log_param("model_name", model_name)
        
        # Initializing and training the model
        model.fit(X_train, y_train)
        
        # Making predictions on the test set and logging metrics
        y_pred = model.predict(X_test)
        
        # Logging accuracy
        accuracy = accuracy_score(
            y_true=y_test,
            y_pred=y_pred
        )
        mlflow.log_metric("accuracy", accuracy)
        
        # Logging classification report metrics
        classification_rep = classification_report(
            y_true=y_test,
            y_pred=y_pred,
            output_dict=True
        )
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}: {metric}", value)
        
        # Logging the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [7]:
# Optuna objective function
def objective(trial):
    C = trial.suggest_float("C", 1e-4, 10.0, log=True)
    kernel = trial.suggest_categorical(
        "kernel", ["linear", "rbf", "poly"]
    )
    
    model = SVC(C=C, kernel=kernel, random_state=42)
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    return accuracy_score(
        y_true=y_test,
        y_pred=y_pred
    )

In [8]:
def run_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)
    
    # Getting the best parameters and logging the best model
    best_params = study.best_params
    best_model = SVC(
        C=best_params["C"],
        kernel=best_params["kernel"],
        random_state=42
    )
    log_to_mlflow(
        model_name="SVM",
        model=best_model,
        X_train=X_train_vectorized,
        X_test=X_test_vectorized,
        y_train=y_train,
        y_test=y_test
    )

In [9]:
run_experiment()

[I 2025-06-04 20:44:42,008] A new study created in memory with name: no-name-556903de-05f9-4284-b5c4-e58affbf0e7f
[I 2025-06-04 20:45:05,493] Trial 0 finished with value: 0.7059866357561707 and parameters: {'C': 0.017190911054901854, 'kernel': 'linear'}. Best is trial 0 with value: 0.7059866357561707.
[I 2025-06-04 20:45:37,227] Trial 1 finished with value: 0.4119732715123415 and parameters: {'C': 0.0003919795567207438, 'kernel': 'linear'}. Best is trial 0 with value: 0.7059866357561707.
[I 2025-06-04 20:47:03,187] Trial 2 finished with value: 0.7755352516023456 and parameters: {'C': 3.178404042273176, 'kernel': 'linear'}. Best is trial 2 with value: 0.7755352516023456.
[I 2025-06-04 20:48:42,406] Trial 3 finished with value: 0.7755352516023456 and parameters: {'C': 3.6554479424644857, 'kernel': 'linear'}. Best is trial 2 with value: 0.7755352516023456.
[I 2025-06-04 20:49:32,257] Trial 4 finished with value: 0.7717168962225556 and parameters: {'C': 6.665191538516008, 'kernel': 'rbf'}.

🏃 View run SVM_BoW_bigrams_1000_undersampling at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/5/runs/83fa547981aa40a89fa4c0569d07fc31
🧪 View experiment at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/5
