In [1]:
import dagshub
dagshub.init(repo_owner='Shubhamraut97', repo_name='experemntracking', mlflow=True)

In [2]:
import mlflow
import mlflow.sklearn


In [3]:
mlflow.set_tracking_uri("https://dagshub.com/Shubhamraut97/experemntracking.mlflow")

In [4]:
import optuna
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
mlflow.set_experiment("mdoel selection experiment")

<Experiment: artifact_location='mlflow-artifacts:/c6df735b6d4d46aa9236a6a6da7dd0f6', creation_time=1753701898670, experiment_id='6', last_update_time=1753701898670, lifecycle_stage='active', name='mdoel selection experiment', tags={}>

In [6]:
df= pd.read_csv('processed_data.csv').dropna(subset=['clean_comment'])
df.shape

(36661, 2)

In [7]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})


In [8]:
df=df.dropna(subset=['category'])

In [9]:
n_grams = (1, 3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=n_grams, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)
def log_mlflow(model_name, model, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_grams", n_grams)
        mlflow.log_param("max_features", max_features)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", acc)

        report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)
def objective_multinomial_nb(trial):
    alpha = trial.suggest_float("alpha", 0.01, 2.0)
    fit_prior = trial.suggest_categorical("fit_prior", [True, False])

    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return accuracy_score(y_test, y_pred)

# Run the tuning + logging
def run_optuna_multinomial_nb():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_multinomial_nb, n_trials=30)

    best_params = study.best_params
    best_model = MultinomialNB(**best_params)

    log_mlflow("MultinomialNB", best_model, X_train, y_train, X_test, y_test)

# Call it
run_optuna_multinomial_nb()



[I 2025-07-28 17:59:47,605] A new study created in memory with name: no-name-0b1acb0b-f797-41b8-97bf-1b1ca6fca4ad
[I 2025-07-28 17:59:47,615] Trial 0 finished with value: 0.7244768547875713 and parameters: {'alpha': 1.1723702147167556, 'fit_prior': False}. Best is trial 0 with value: 0.7244768547875713.
[I 2025-07-28 17:59:47,622] Trial 1 finished with value: 0.7272246882265906 and parameters: {'alpha': 0.5337633940546134, 'fit_prior': False}. Best is trial 1 with value: 0.7272246882265906.
[I 2025-07-28 17:59:47,691] Trial 2 finished with value: 0.7217290213485521 and parameters: {'alpha': 1.979405384336242, 'fit_prior': True}. Best is trial 1 with value: 0.7272246882265906.
[I 2025-07-28 17:59:47,698] Trial 3 finished with value: 0.7240541111815684 and parameters: {'alpha': 1.4030674153528104, 'fit_prior': True}. Best is trial 1 with value: 0.7272246882265906.
[I 2025-07-28 17:59:47,704] Trial 4 finished with value: 0.7293384062566054 and parameters: {'alpha': 0.21197128043911648, 'f

🏃 View run MultinomialNB at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/6/runs/97e612b2ab354875ab56e200d40bdba9
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/6
