In [1]:

import dagshub
dagshub.init(repo_owner='Vaibha3246', repo_name='influence_mirror', mlflow=True)

import mlflow
# Step 2: Set up the MLflow tracking server
mlflow.set_tracking_uri("https://dagshub.com/Vaibha3246/influence_mirror.mlflow")

In [2]:
# set or create an experiment
mlflow.set_experiment("exp_5 ml_algo_with_hp_tunning")


<Experiment: artifact_location='mlflow-artifacts:/adb82bab710d416190b0fea77cabca06', creation_time=1760159677050, experiment_id='3', last_update_time=1760159677050, lifecycle_stage='active', name='exp_5 ml_algo_with_hp_tunning', tags={}>

In [3]:
import pandas as pd
df=pd.read_csv('preprocessing.csv').dropna(subset=['text_clean'])

In [4]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
import optuna
import mlflow

# Optional (for better MLflow visualization)
import mlflow.sklearn

# To avoid warnings clutter
import warnings
warnings.filterwarnings("ignore")


In [5]:
# -------------------------
# Step 1: Clean target
# -------------------------
df['sentiment_numeric'] = df['sentiment_numeric'].map({-1: 2, 0: 0, 1: 1})
df = df.dropna(subset=['sentiment_numeric'])

# -------------------------
# Step 2: Select features
# -------------------------
numeric_cols = [col for col in df.columns if col not in [
    'video_id', 'category', 'text', 'text_clean', 'sentiment',
    'dominant_emotion', 'published_at', 'sentiment_numeric'
]]

X = df[numeric_cols]
y = df['sentiment_numeric']

# -------------------------
# Step 3: Train/test split
# -------------------------
X_train_raw, X_test_raw, y_train, y_test, train_idx, test_idx = train_test_split(
    X, y, df.index, test_size=0.2, random_state=42, stratify=y
)

# -------------------------
# Step 4: Scale numeric features
# -------------------------
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_raw)
X_test_num = scaler.transform(X_test_raw)

# -------------------------
# Step 5: TF-IDF for text
# -------------------------
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X_train_text = tfidf.fit_transform(df.loc[train_idx, 'text_clean'])
X_test_text = tfidf.transform(df.loc[test_idx, 'text_clean'])

# Combine numeric + text
X_train = sp.hstack([X_train_text, sp.csr_matrix(X_train_num)])
X_test = sp.hstack([X_test_text, sp.csr_matrix(X_test_num)])

# -------------------------
# Step 6: Apply SMOTE
# -------------------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# -------------------------
# Step 7: Define Optuna objective for Random Forest
# -------------------------
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train_res, y_train_res)
    preds = model.predict(X_test)
    f1 = f1_score(y_test, preds, average="macro")
    return f1

# -------------------------
# Step 8: Run Optuna tuning
# -------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

# -------------------------
# Step 9: Train best Random Forest model
# -------------------------
best_params = study.best_params
best_params.update({"random_state": 42, "n_jobs": -1})

best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_res, y_train_res)
y_pred = best_model.predict(X_test)

# -------------------------
# Step 10: Log metrics + model in MLflow
# -------------------------
with mlflow.start_run(run_name="RandomForest_TFIDF_SMOTE_Optuna_Best"):
    # Log algorithm name
    mlflow.log_param("algorithm", "RandomForest")

    # Log hyperparameters
    mlflow.log_params(best_params)
    mlflow.log_param("vectorizer_type", "TF-IDF")
    mlflow.log_param("ngram_range", "(1, 3)")
    mlflow.log_param("max_features_tfidf", "10000")
    mlflow.log_param("imbalance_method", "SMOTE")

    # Log metrics
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
    mlflow.log_metric("f1_macro", f1_score(y_test, y_pred, average="macro"))

    # Log classification report
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix: Random Forest Best Model")
    plt.savefig("confusion_matrix_rf_best.png")
    mlflow.log_artifact("confusion_matrix_rf_best.png")
    plt.close()

    # Save model
    model_path = "random_forest_best_model.pkl"
    joblib.dump(best_model, model_path)
    mlflow.log_artifact(model_path)

    # Save TF-IDF vocabulary
    vocab_json = {k: int(v) for k, v in tfidf.vocabulary_.items()}
    with open("tfidf_vocab.json", "w") as f:
        json.dump(vocab_json, f, indent=4)
    mlflow.log_artifact("tfidf_vocab.json")

print("üéØ MLflow run logged successfully for Random Forest ‚úÖ")


[I 2025-10-22 22:23:19,650] A new study created in memory with name: no-name-4cfaedff-88a1-42a2-895c-9cb2b4943019
[I 2025-10-22 22:23:31,074] Trial 0 finished with value: 0.6375616077491663 and parameters: {'n_estimators': 291, 'max_depth': 17, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.6375616077491663.
[I 2025-10-22 22:23:33,171] Trial 1 finished with value: 0.6031983705707057 and parameters: {'n_estimators': 235, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.6375616077491663.
[I 2025-10-22 22:23:34,793] Trial 2 finished with value: 0.6050406162239538 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.6375616077491663.
[I 2025-10-22 22:23:41,290] Trial 3 finished with value: 0.62949783232905

üèÉ View run RandomForest_TFIDF_SMOTE_Optuna_Best at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/3/runs/e60c21b0d4444f91b690dc3ae41abcfa
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/3
üéØ MLflow run logged successfully for Random Forest ‚úÖ
