In [1]:
import mlflow

In [2]:
import dagshub
dagshub.init(repo_owner='Shubhamraut97', repo_name='experemntracking', mlflow=True)

In [3]:
mlflow.set_tracking_uri("https://dagshub.com/Shubhamraut97/experemntracking.mlflow")

In [4]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.combine import SMOTEENN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow.sklearn


In [5]:
mlflow.set_experiment("Imbalanced Data Handling Experiment v2")

2025/07/28 19:00:34 INFO mlflow.tracking.fluent: Experiment with name 'Imbalanced Data Handling Experiment v2' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/7fd935efa4254264a8887e973a006745', creation_time=1753708534637, experiment_id='8', last_update_time=1753708534637, lifecycle_stage='active', name='Imbalanced Data Handling Experiment v2', tags={}>

In [6]:
df=pd.read_csv('processed_data.csv').dropna(subset=['clean_comment'])
df.shape

(36661, 2)

In [7]:
def run_imbalanced_exp(imbalance_method):
    ngram_range = (1, 3)
    max_features = 10000

    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

    X = df['clean_comment']
    y = df['category']

    # Initial split (before vectorization)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Vectorize
    X_train_vectorized = vectorizer.fit_transform(X_train_raw)
    X_test_vectorized = vectorizer.transform(X_test_raw)

    mlflow_sampler = 'None'
    class_weight = None

    # Handle imbalance
    if imbalance_method == 'class_weight':
        class_weight = 'balanced'
        X_train_final = X_train_vectorized
        y_train_final = y_train

    else:
        if imbalance_method == 'oversampling':
            sampler = SMOTE(random_state=42)
            mlflow_sampler = 'SMOTE'
        elif imbalance_method == 'undersampling':
            sampler = RandomUnderSampler(random_state=42)
            mlflow_sampler = 'RandomUnderSampler'
        elif imbalance_method == 'smoteenn':
            sampler = SMOTEENN(random_state=42)
            mlflow_sampler = 'SMOTEENN'
        elif imbalance_method == 'adasyn':
            sampler = ADASYN(random_state=42)
            mlflow_sampler = 'ADASYN'
        else:
            sampler = None

        if sampler:
            X_train_final, y_train_final = sampler.fit_resample(X_train_vectorized, y_train)
        else:
            X_train_final, y_train_final = X_train_vectorized, y_train

    # Set up MLflow experiment
    mlflow.set_experiment("imbalanced_experiment")
    with mlflow.start_run(run_name=f"{imbalance_method}_run"):
        mlflow.set_tag("imbalance_method", imbalance_method)
        mlflow.set_tag("experiment_type", "imbalanced_handling")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("max_features", max_features)
        mlflow.log_param("class_weight", class_weight)
        mlflow.log_param("imbalance_sampler", mlflow_sampler)

        # Model parameters
        n_estimators = 200
        max_depth = 15
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        # Train model
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            class_weight=class_weight
        )
        model.fit(X_train_final, y_train_final)
        y_pred = model.predict(X_test_vectorized)

        # Log metrics
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        class_report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in class_report.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix - {imbalance_method}')

        os.makedirs("artifacts", exist_ok=True)
        cm_path = f"artifacts/confusion_matrix_{imbalance_method}.png"
        plt.savefig(cm_path)
        plt.close()

        mlflow.log_artifact(cm_path)


# Run all methods
imbalance_methods = ['class_weight', 'oversampling', 'undersampling', 'smoteenn', 'adasyn']
for method in imbalance_methods:
    run_imbalanced_exp(method)


🏃 View run class_weight_run at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5/runs/771f0977e7ef429186a2e0d0fccbe20b
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5
🏃 View run oversampling_run at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5/runs/cd2aa88135944cce8817c618edb4b7b9
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5
🏃 View run undersampling_run at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5/runs/89cdfd8f3bb44dee9df69db2d52f105c
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5
🏃 View run smoteenn_run at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5/runs/a195257d85f54c9db6834c41553bf246
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/5
🏃 View run adasyn_run at: https://dagshub.c