In [3]:
#05_rf_adasyn

import mlflow
import optuna
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import make_pipeline

# --- Load Data ---
# Read the preprocessed comments, discard the stray index column when the CSV
# carries one, and drop every row that has a missing value in any column.
df = (
    pd.read_csv('reddit_preprocessed.csv')
    .drop('Unnamed: 0', axis=1, errors='ignore')
    .dropna()
)
# Force plain Python strings so the text vectorizer never sees a non-str cell.
df['clean_comment'] = df['clean_comment'].astype(str)

# Map the `category` values to consecutive integer codes; keep the original
# values as strings for use as report headers and metric-name prefixes.
le = LabelEncoder()
y = le.fit_transform(df['category'])
class_names = list(map(str, le.classes_))

# Stratified 80/20 hold-out split on the raw text (seeded for repeatability).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['clean_comment'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Optuna ---
# Select the MLflow experiment that later runs are filed under, and keep
# Optuna quiet (WARNING and above only) while trials execute.
# NOTE(review): the saved execution counts show this cell ran as In [3], after
# the install (In [1]) and dagshub.init (In [2]) cells — run those first.
mlflow.set_experiment("EXP 5 - ML Algos with optuna")
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    """Score one Random Forest configuration proposed by Optuna.

    Samples `n_estimators`, `max_depth` and `min_samples_split` from the trial,
    builds a TF-IDF -> ADASYN -> RandomForest pipeline, and evaluates it with
    3-fold cross-validation on the training split.

    Args:
        trial: the Optuna trial used to draw hyperparameter values.

    Returns:
        Mean cross-validated accuracy across the 3 folds.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        n_jobs=1,
        random_state=42,
    )

    # Vectorizer and oversampler live inside the pipeline, so cross_val_score
    # refits both on each training fold rather than on the full training set.
    candidate = make_pipeline(
        TfidfVectorizer(max_features=3000, ngram_range=(1, 2)),
        ADASYN(random_state=42),
        forest,
    )

    fold_scores = cross_val_score(candidate, X_train_raw, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

print("🚀 Tuning Random Forest...")
# Seed the sampler explicitly: every other stochastic component in this
# notebook is pinned to 42, but an unseeded study draws different trials on
# each run, so `study.best_params` (and every downstream metric) would not be
# reproducible. TPE is what Optuna uses by default for a single-objective
# study, so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

# --- Final Training ---
# Rebuild the same TF-IDF -> ADASYN -> RandomForest pipeline with the best
# hyperparameters found by the study, this time letting the forest use every
# available core (n_jobs=-1).
best_params = study.best_params
final_forest = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
final_pipeline = make_pipeline(
    TfidfVectorizer(max_features=3000, ngram_range=(1, 2)),
    ADASYN(random_state=42),
    final_forest,
)

# Fit on the full training split, then predict the held-out test comments.
y_pred = final_pipeline.fit(X_train_raw, y_train).predict(X_test_raw)

# --- Logging ---
# Build the report before opening the MLflow run, so a failure while computing
# metrics does not leave behind an empty run marked as failed.
report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)

print("\n📊 Random Forest Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Collect every metric first, then send them in a single batched call instead
# of one tracking-server request per metric.
metrics = {
    "accuracy": report['accuracy'],
    "macro_f1": report['macro avg']['f1-score'],
}
for label in class_names:
    # Only classes that actually appear as a row in the report get metrics.
    if label in report:
        metrics[f"{label}_precision"] = report[label]['precision']
        metrics[f"{label}_recall"] = report[label]['recall']
        metrics[f"{label}_f1"] = report[label]['f1-score']

with mlflow.start_run(run_name="RF_ADASYN"):
    mlflow.log_params(best_params)
    mlflow.log_metrics(metrics)

🚀 Tuning Random Forest...

📊 Random Forest Report:
              precision    recall  f1-score   support

          -1       0.67      0.52      0.59      1650
           0       0.69      0.93      0.79      2529
           1       0.82      0.70      0.75      3154

    accuracy                           0.74      7333
   macro avg       0.73      0.71      0.71      7333
weighted avg       0.74      0.74      0.73      7333

🏃 View run RF_ADASYN at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/5/runs/b92c7b4ac41e4f5ab1852114dda14ead
🧪 View experiment at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/5


In [1]:
!pip install mlflow dagshub optuna imbalanced-learn

Collecting mlflow
  Downloading mlflow-3.9.0-py3-none-any.whl.metadata (31 kB)
Collecting dagshub
  Downloading dagshub-0.6.5-py3-none-any.whl.metadata (12 kB)
Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.9.0 (from mlflow)
  Downloading mlflow_skinny-3.9.0-py3-none-any.whl.metadata (32 kB)
Collecting mlflow-tracing==3.9.0 (from mlflow)
  Downloading mlflow_tracing-3.9.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.4 (from mlflow)
  Downloading huey-2.6.0-py3-none-any.whl.metadata (4.3 kB)
Collecting skops<

In [2]:
# Connect this session to the DagsHub repository. With mlflow=True the MLflow
# runs logged by the training cell show up under the repo's dagshub.com
# tracking URL; the call prompts for browser OAuth authorization.
import dagshub

dagshub.init(
    repo_owner='Ritk-Raikwar',
    repo_name='reddit-comment-sentiment-analysis',
    mlflow=True,
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=223b7dbe-39af-4b8f-b242-f3d5d2f9772e&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=fa3aba286fc378b75759f01d42d142e5233e7c1778cf8d1e30c08ee23cfd0c96


