In [3]:
import mlflow
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import pandas as pd

# --- STEP 1: Data Preparation ---
# Clean data
df = pd.read_csv('/content/reddit_preprocessed.csv')
df = df.drop('Unnamed: 0', axis=1)
df = df.dropna()
df['clean_comment'] = df['clean_comment'].astype(str)

# Check class distribution
print("Original Class Distribution:")
print(df['category'].value_counts())

# Vectorize
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df.clean_comment)
y = df.category

# Split Data (Stratified split is important for imbalanced data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nData ready. Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# --- STEP 2: Define Techniques ---
# We define a dictionary of techniques.
# 'resampler': The imblearn object to resample data (or None).
# 'class_weight': Parameter for the Random Forest model (or None).
techniques = {
    "Baseline": {
        "resampler": None,
        "class_weight": None
    },
    "Class_Weight_Balanced": {
        "resampler": None,
        "class_weight": "balanced"
    },
    "Random_Oversampling": {
        "resampler": RandomOverSampler(random_state=42),
        "class_weight": None
    },
    "Random_Undersampling": {
        "resampler": RandomUnderSampler(random_state=42),
        "class_weight": None
    },
    "SMOTE": {
        "resampler": SMOTE(random_state=42),
        "class_weight": None
    },
    "ADASYN": {
        "resampler": ADASYN(random_state=42),
        "class_weight": None
    }
}

# Set Experiment Name
mlflow.set_experiment("EXP 4 - Imbalanced Data Techniques")

print(f"\n⚖️ Starting Experiment 4 (with ADASYN)...\n")

# --- STEP 3: Experiment Loop ---
for tech_name, config in techniques.items():
    print(f"🚀 Running: {tech_name} ...")

    with mlflow.start_run(run_name=tech_name):
        # 1. Log Parameters
        mlflow.log_param("technique", tech_name)
        mlflow.log_param("vectorizer", "TfidfVectorizer")
        mlflow.log_param("max_features", 3000)

        # 2. Apply Resampling (only on Training Data!)
        X_train_resampled, y_train_resampled = X_train, y_train

        if config['resampler'] is not None:
            print(f"   Applying {config['resampler']}...")
            try:
                X_train_resampled, y_train_resampled = config['resampler'].fit_resample(X_train, y_train)
                mlflow.log_param("resampled_train_size", X_train_resampled.shape[0])
            except RuntimeError as e:
                # ADASYN can sometimes fail if the minority class is too sparse/small for neighbors
                print(f"   ⚠️ ADASYN failed: {e}. Skipping this technique.")
                continue

        # 3. Train Model
        rf = RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            random_state=42,
            class_weight=config['class_weight']
        )
        rf.fit(X_train_resampled, y_train_resampled)

        # 4. Evaluate (Always on the original, untouched Test set)
        y_pred = rf.predict(X_test)

        # 5. Log Metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1_weighted = f1_score(y_test, y_pred, average='weighted')

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_weighted", f1_weighted)

        # Log precision/recall for minority class usually reveals the impact of these techniques
        report = classification_report(y_test, y_pred, output_dict=True)
        # Assuming '-1' might be a minority class, let's log its specific recall
        if '-1' in report:
            mlflow.log_metric("recall_class_-1", report['-1']['recall'])
            mlflow.log_metric("precision_class_-1", report['-1']['precision'])

        print(f"   ✅ Accuracy: {accuracy:.4f} | F1 (Weighted): {f1_weighted:.4f}")

print("\n🎉 Experiment 4 Finished!")

Original Class Distribution:
category
 1    15770
 0    12644
-1     8248
Name: count, dtype: int64

Data ready. Train shape: (29329, 3000), Test shape: (7333, 3000)


2026/02/13 18:29:43 INFO mlflow.tracking.fluent: Experiment with name 'EXP 4 - Imbalanced Data Techniques' does not exist. Creating a new experiment.



⚖️ Starting Experiment 4 (with ADASYN)...

🚀 Running: Baseline ...
   ✅ Accuracy: 0.6546 | F1 (Weighted): 0.5860
🏃 View run Baseline at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/4/runs/173635c6394d4fbf9b2c07923dfb34f1
🧪 View experiment at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/4
🚀 Running: Class_Weight_Balanced ...
   ✅ Accuracy: 0.6842 | F1 (Weighted): 0.6751
🏃 View run Class_Weight_Balanced at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/4/runs/c692484aa9644b1787e95157c728f4df
🧪 View experiment at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/4
🚀 Running: Random_Oversampling ...
   Applying RandomOverSampler(random_state=42)...
   ✅ Accuracy: 0.6911 | F1 (Weighted): 0.6828
🏃 View run Random_Oversampling at: https://dagshub.com/Ritk-Raikwar/reddit-comment-sentiment-analysis.mlflow/#/experiments/4/runs/a

In [1]:
!pip install mlflow dagshub imbalanced-learn



In [2]:
import dagshub
dagshub.init(repo_owner='Ritk-Raikwar', repo_name='reddit-comment-sentiment-analysis', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=c21e6013-f94e-426d-9c3a-21afe0546a1a&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=38f1c3e23a755048e8c83c93ee125a4ea413931e270b86e3369a03cbf832c337




Output()