In [1]:
import dagshub
dagshub.init(repo_owner='Shubhamraut97', repo_name='experemntracking', mlflow=True)

In [2]:
import mlflow
import mlflow.sklearn


In [3]:
mlflow.set_tracking_uri("https://dagshub.com/Shubhamraut97/experemntracking.mlflow")

In [4]:
import optuna
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
mlflow.set_experiment("mdoel selection experiment")

<Experiment: artifact_location='mlflow-artifacts:/c6df735b6d4d46aa9236a6a6da7dd0f6', creation_time=1753701898670, experiment_id='6', last_update_time=1753701898670, lifecycle_stage='active', name='mdoel selection experiment', tags={}>

In [6]:
df= pd.read_csv('processed_data.csv').dropna(subset=['clean_comment'])
df.shape

(36661, 2)

In [7]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})


In [8]:
df=df.dropna(subset=['category'])

In [None]:
n_grams = (1, 3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=n_grams, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)
def log_mlflow(model_name, model, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_grams", n_grams)
        mlflow.log_param("max_features", max_features)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", acc)

        report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)
def objective_logistic_regression(trial):
    C = trial.suggest_float("C", 0.01, 10.0, log=True)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    solver = trial.suggest_categorical("solver", ["liblinear", "saga"])
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # Some solvers don't support all penalties
    if penalty == "l1" and solver not in ["liblinear", "saga"]:
        raise optuna.exceptions.TrialPruned()

    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        class_weight=class_weight,
        max_iter=100,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def run_optuna_logistic_regression():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_logistic_regression, n_trials=30)

    best_params = study.best_params
    best_model = LogisticRegression(**best_params, max_iter=1000, random_state=42)

    log_mlflow("LogisticRegression", best_model, X_train, y_train, X_test, y_test)

# Run it
run_optuna_logistic_regression()

[I 2025-07-28 18:04:02,671] A new study created in memory with name: no-name-d3ec6fd1-4a9d-48ad-8453-c25a21e32993
[I 2025-07-28 18:04:14,509] Trial 0 finished with value: 0.7828154724159797 and parameters: {'C': 0.7227397256621134, 'penalty': 'l1', 'solver': 'saga', 'class_weight': None}. Best is trial 0 with value: 0.7828154724159797.
[I 2025-07-28 18:04:14,711] Trial 1 finished with value: 0.7508983301627563 and parameters: {'C': 0.18624391943381197, 'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7828154724159797.
[I 2025-07-28 18:04:14,858] Trial 2 finished with value: 0.7601986894948214 and parameters: {'C': 0.46634419731266247, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': None}. Best is trial 0 with value: 0.7828154724159797.
[I 2025-07-28 18:04:14,933] Trial 3 finished with value: 0.7444514901712111 and parameters: {'C': 0.1043044904487482, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': None}. Best is trial 0 with