In [1]:
import dagshub
dagshub.init(repo_owner='Shubhamraut97', repo_name='experemntracking', mlflow=True)

In [2]:
import mlflow
import mlflow.sklearn


In [3]:
mlflow.set_tracking_uri("https://dagshub.com/Shubhamraut97/experemntracking.mlflow")

In [11]:
import optuna
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier



In [None]:
mlflow.set_experiment("mdoel selection experiment")

<Experiment: artifact_location='mlflow-artifacts:/c6df735b6d4d46aa9236a6a6da7dd0f6', creation_time=1753701898670, experiment_id='6', last_update_time=1753701898670, lifecycle_stage='active', name='mdoel selection experiment', tags={}>

In [6]:
df= pd.read_csv('processed_data.csv').dropna(subset=['clean_comment'])
df.shape

(36661, 2)

In [None]:
df['category']

0        1
1        1
2       -1
3        0
4        1
        ..
36788    0
36789    1
36790    0
36791    1
36792    0
Name: category, Length: 36661, dtype: int64

In [8]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})


In [None]:
df['category']

0        1
1        1
2        2
3        0
4        1
        ..
36788    0
36789    1
36790    0
36791    1
36792    0
Name: category, Length: 36661, dtype: int64

In [10]:
df=df.dropna(subset=['category'])

In [12]:
# TF-IDF and resampling
n_grams = (1, 3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=n_grams, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# MLflow logging
def log_mlflow(model_name, model, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_grams", n_grams)
        mlflow.log_param("max_features", max_features)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", acc)

        report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)

# Optuna objective function for XGBoost
def objective_xgboost(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    # Removed class_weight
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

# Run optimization and log best
def run_optuna_xgboost():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_xgboost, n_trials=30)
    best_params = study.best_params

    best_model = XGBClassifier(
        **best_params,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )

    log_mlflow("XGBoost", best_model, X_train, y_train, X_test, y_test)

run_optuna_xgboost()


[I 2025-07-28 17:25:36,425] A new study created in memory with name: no-name-8ec6e7b3-4c74-4378-8270-8514d56b67e6
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-28 17:26:30,130] Trial 0 finished with value: 0.8224476854787571 and parameters: {'n_estimators': 195, 'max_depth': 14, 'learning_rate': 0.22131207528165736}. Best is trial 0 with value: 0.8224476854787571.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-28 17:26:34,691] Trial 1 finished with value: 0.7974001268230818 and parameters: {'n_estimators': 180, 'max_depth': 3, 'learning_rate': 0.28196710504476696}. Best is trial 0 with value: 0.8224476854787571.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-28 17:27:26,985] Trial 2 finished with value: 0.6924540266328472 and parameters: {'n_estimators': 70, 'max_depth': 14, 'learning_rate': 0.01436803787179901

🏃 View run XGBoost at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/6/runs/bdc6874fc470452ba705679a84655352
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/6
