In [1]:

import dagshub
dagshub.init(repo_owner='Vaibha3246', repo_name='influence_mirror', mlflow=True)

import mlflow
# Step 2: Set up the MLflow tracking server
mlflow.set_tracking_uri("https://dagshub.com/Vaibha3246/influence_mirror.mlflow")

In [2]:
# set or create an experiment
mlflow.set_experiment("exp_5 ml_algo_with_hp_tunning")


<Experiment: artifact_location='mlflow-artifacts:/adb82bab710d416190b0fea77cabca06', creation_time=1760159677050, experiment_id='3', last_update_time=1760159677050, lifecycle_stage='active', name='exp_5 ml_algo_with_hp_tunning', tags={}>

In [3]:
import pandas as pd
df=pd.read_csv('preprocessing.csv').dropna(subset=['text_clean'])

In [6]:
# -----------------------------
#  Optimized Naive Bayes (Optuna + SMOTE + MLflow)
# -----------------------------
import optuna
import mlflow
import mlflow.sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import scipy.sparse as sp
import warnings
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# -----------------------------
# Step 1: Clean target
# -----------------------------
df['sentiment_numeric'] = df['sentiment_numeric'].map({-1: 2, 0: 0, 1: 1})
df = df.dropna(subset=['sentiment_numeric'])

# -----------------------------
# Step 2: Select features
# -----------------------------
numeric_cols = [col for col in df.columns if col not in [
    'video_id', 'category', 'text', 'text_clean', 'sentiment',
    'dominant_emotion', 'published_at', 'sentiment_numeric'
]]

X = df[numeric_cols]
y = df['sentiment_numeric']

# -----------------------------
# Step 3: Train/Test Split
# -----------------------------
X_train_raw, X_test_raw, y_train, y_test, train_idx, test_idx = train_test_split(
    X, y, df.index, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Step 4: Scale numeric features (0‚Äì1 range for NB)
# -----------------------------
scaler = MinMaxScaler()
X_train_num = scaler.fit_transform(X_train_raw)
X_test_num = scaler.transform(X_test_raw)

# -----------------------------
# Step 5: TF-IDF for text
# -----------------------------
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=7000)
X_train_text = tfidf.fit_transform(df.loc[train_idx, 'text_clean'])
X_test_text = tfidf.transform(df.loc[test_idx, 'text_clean'])

# Combine numeric + text
X_train = sp.hstack([X_train_text, sp.csr_matrix(X_train_num)])
X_test = sp.hstack([X_test_text, sp.csr_matrix(X_test_num)])

# -----------------------------
# Step 6: Apply SMOTE
# -----------------------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# -----------------------------
# Step 7: Optuna Objective (subset for tuning)
# -----------------------------
def objective(trial):
    sample_size = min(2000, X_train_res.shape[0])
    idx = np.random.choice(X_train_res.shape[0], sample_size, replace=False)
    
    X_sample = X_train_res[idx]
    y_sample = np.array(y_train_res)[idx]

    params = {
        "alpha": trial.suggest_float("alpha", 0.001, 2.0, log=True),
        "fit_prior": trial.suggest_categorical("fit_prior", [True, False])
    }

    model = MultinomialNB(**params)
    model.fit(X_sample, y_sample)
    preds = model.predict(X_test)
    return f1_score(y_test, preds, average="macro")

# -----------------------------
# Step 8: Run Optuna Tuning
# -----------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# -----------------------------
# Step 9: Train Full Best Model
# -----------------------------
best_params = study.best_params
best_model = MultinomialNB(**best_params)
best_model.fit(X_train_res, y_train_res)
y_pred = best_model.predict(X_test)

# -----------------------------
# Step 10: MLflow Logging
# -----------------------------
with mlflow.start_run(run_name="NaiveBayes_TFIDF_SMOTE_Optuna_Fast"):
    mlflow.log_param("algorithm", "MultinomialNB")
    mlflow.log_params(best_params)
    mlflow.log_param("vectorizer_type", "TF-IDF")
    mlflow.log_param("ngram_range", "(1,2)")
    mlflow.log_param("max_features", 7000)
    mlflow.log_param("imbalance_method", "SMOTE")

    # Metrics
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
    mlflow.log_metric("f1_macro", f1_score(y_test, y_pred, average="macro"))

    # Detailed report
    report = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in report.items():
        if isinstance(metrics, dict):
            for metric, val in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", val)

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix: Naive Bayes Best Model")
    plt.savefig("confusion_matrix_naivebayes.png")
    mlflow.log_artifact("confusion_matrix_naivebayes.png")
    plt.close()

    # Save model & preprocessors
    joblib.dump(best_model, "naivebayes_best_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    joblib.dump(tfidf, "tfidf_vectorizer.pkl")
    mlflow.log_artifact("naivebayes_best_model.pkl")
    mlflow.log_artifact("scaler.pkl")
    mlflow.log_artifact("tfidf_vectorizer.pkl")

print(" MLflow run logged successfully for Naive Bayes (TF-IDF + SMOTE + Optuna)")


[I 2025-10-24 12:05:13,746] A new study created in memory with name: no-name-f788bbc3-b8ad-4d13-b47d-ffb7f0b0f569
[I 2025-10-24 12:05:14,265] Trial 0 finished with value: 0.6635349144667767 and parameters: {'alpha': 0.060707414027673295, 'fit_prior': True}. Best is trial 0 with value: 0.6635349144667767.
[I 2025-10-24 12:05:14,277] Trial 1 finished with value: 0.644001019363842 and parameters: {'alpha': 0.0010238917491002817, 'fit_prior': False}. Best is trial 0 with value: 0.6635349144667767.
[I 2025-10-24 12:05:14,291] Trial 2 finished with value: 0.656969071691214 and parameters: {'alpha': 0.0036520975067028524, 'fit_prior': True}. Best is trial 0 with value: 0.6635349144667767.
[I 2025-10-24 12:05:14,308] Trial 3 finished with value: 0.6696268690463592 and parameters: {'alpha': 0.05063857017983732, 'fit_prior': True}. Best is trial 3 with value: 0.6696268690463592.
[I 2025-10-24 12:05:14,325] Trial 4 finished with value: 0.644045356101064 and parameters: {'alpha': 0.001052033803725

üèÉ View run NaiveBayes_TFIDF_SMOTE_Optuna_Fast at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/3/runs/3eff6eecfbb04025a12e710ffc1650c5
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/3
 MLflow run logged successfully for Naive Bayes (TF-IDF + SMOTE + Optuna)
