### Handling imbalanced data

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import os

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("./preprocessed_data.csv")
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


### Model Training

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import scipy

In [5]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [None]:
import mlflow
import mlflow.sklearn
import joblib
import pickle
import dagshub

import logging
import time

In [None]:
# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Set up DagsHub credentials for MLflow tracking
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")

if not username or not token:
    raise ValueError("Missing DagsHub credentials in environment variables")

# Construct the authenticated MLflow tracking URI
mlflow_uri = f"https://{username}:{token}@dagshub.com/{username}/YouTube-Sentiment-Insights-Plugin.mlflow"

dagshub.init(repo_owner=username, repo_name="YouTube-Sentiment-Insights-Plugin", mlflow=True)
mlflow.set_tracking_uri(mlflow_uri)

In [9]:
# Set or create an experiment
# Set or create an experiment
experiment = 'Exp 4 - Handling Imbalanced Data'
mlflow.set_experiment(experiment)

2025/10/18 02:56:59 INFO mlflow.tracking.fluent: Experiment with name 'Exp 4 - Handling Imbalanced Data' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/5060d452563b4218ad886674d1c4a697', creation_time=1760736422348, experiment_id='11', last_update_time=1760736422348, lifecycle_stage='active', name='Exp 4 - Handling Imbalanced Data', tags={}>

In [10]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting MLFlow run ....")

def run_experiment(imbalance_method):
    # TF-IDF Params
    ngram_range = (1, 3)  
    vec_max_features = 1000

    start_time = time.time()
    try:
        # Initialize the Vectorizer
        logging.info("Initialize Vectorizer ....")
        
        vec = TfidfVectorizer(ngram_range=ngram_range, max_features=vec_max_features)
    
        # Train-Test Split
        logging.info("Train test split ....")
        
        X_train, X_test, y_train, y_test = train_test_split(
            df['clean_comment'], df['category'], 
            test_size=0.2, random_state=42, stratify=df['category']
        )
    
        # Text Vectorization
        logging.info("Vectorizing the X_train & X_test ....")
        
        X_train_vec = vec.fit_transform(X_train)
        X_test_vec = vec.transform(X_test)
    
    
        # Handle class imbalance based on the selected method (only applied to the training set)
        logging.info("Initialize the Data Balancing method ....")
        
        if imbalance_method == 'class_weights':
            # Use class_weight in Random Forest
            class_weight = 'balanced'
        else:
            class_weight = None  # Do not apply class_weight if using resampling
            
            # Resampling Techniques (only apply to the training set)
            if imbalance_method == 'oversampling':
                smote = SMOTE(random_state=42)
                X_train_vec, y_train = smote.fit_resample(X_train_vec, y_train)
            elif imbalance_method == 'adasyn':
                adasyn = ADASYN(random_state=42)
                X_train_vec, y_train = adasyn.fit_resample(X_train_vec, y_train)
            elif imbalance_method == 'undersampling':
                rus = RandomUnderSampler(random_state=42)
                X_train_vec, y_train = rus.fit_resample(X_train_vec, y_train)
            elif imbalance_method == 'smote_enn':
                smote_enn = SMOTEENN(random_state=42)
                X_train_vec, y_train = smote_enn.fit_resample(X_train_vec, y_train)

        try: 
            # Start MLflow Run
            with mlflow.start_run() as run:
                
                # ------------------ Metadata ------------------
                logging.info("Logging Metadata ....")
                
                mlflow.set_tag('mlflow.runName', f'Imbalance_{imbalance_method}_RandomForest_TFIDF_TriGrams')
                mlflow.set_tag('experiment_type', 'imbalance_handling')
                mlflow.set_tag('model_type', 'RandomForestClassifier')
                mlflow.set_tag('description', f"RandomForest with TF-IDF BIgrams, imbalance handling method={imbalance_method}")
        
                # ------------------ Log Vectorizer Params ------------------
                logging.info("Logging Vectorizer Params ....")
                
                mlflow.log_param("vectorizer_type", "TF-IDF")
                mlflow.log_param("ngram_range", ngram_range)
                mlflow.log_param("vectorizer_max_features", vec_max_features)
        
                # ------------------ Log Model Params ------------------
                logging.info("Logging Model Params ....")
                
                n_estimators = 200
                max_depth = 15
                mlflow.log_param("n_estimators", n_estimators)
                mlflow.log_param("max_depth", max_depth)
                mlflow.log_param("imbalance_method", imbalance_method)
        
                # ------------------ Train Model ------------------
                logging.info("Model Training & Predicion Started ....")
                
                model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
                model.fit(X_train_vec, y_train)
        
                # ------------------ Predictions ------------------
                y_pred = model.predict(X_test_vec)

                logging.info("Model Training & Predicion Ended ....")
                
                # ------------------ Metrics ------------------
                logging.info("Logging Metrics ....")
                
                accuracy = accuracy_score(y_test, y_pred)
                mlflow.log_metric("accuracy", accuracy)
        
                # Log classification report
                classification_rep = classification_report(y_test, y_pred, output_dict=True)
                for label, metrics in classification_rep.items():
                    if isinstance(metrics, dict):
                        for metric, value in metrics.items():
                            mlflow.log_metric(f"{label}_{metric}", value)
            
                # ------------------ Confusion Matrix ------------------
                conf_matrix = confusion_matrix(y_test, y_pred)
                plt.figure(figsize=(8, 6))
                sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
                plt.xlabel("Predicted")
                plt.ylabel("Actual")
                plt.title("Confusion Matrix")
                plt.tight_layout()
        
                mlflow.log_figure(plt.gcf(), "Confusion_Matrix.png")  # log the plot
                plt.close()
                
                # ------------------ Log the Model Properly ------------------
                logging.info("Logging the model ....")
                
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="model",  # will create artifacts/model folder
                    # registered_model_name=f"rfc_TFIDF_BIgram_{vec_max_features}"
                )
        
                end_time = time.time()
        
                logging.info(f"Completed the Experiment in {end_time-start_time} seconds")
        
                logging.info(f"Accuracy -> {accuracy:.2f}")
        except Exception as e:
            logging.error(f"Error occured while Model Trainig: {e}")
            raise
            
    except Exception as e:
        logging.error(f"Error occured while Vectorizing: {e}")
        raise

2025-10-18 03:01:50,131 - INFO - Starting MLFlow run ....


In [11]:
# Run 5 Experiments Results on MLFlow
imbalance_methods = ['class_weights', 'oversampling', 'adasyn', 'undersampling', 'smote_enn']

for method in imbalance_methods:
    # TF-IDF Experiments
    run_experiment(imbalance_method=method)

2025-10-18 03:01:52,865 - INFO - Initialize Vectorizer ....
2025-10-18 03:01:52,865 - INFO - Train test split ....
2025-10-18 03:01:52,883 - INFO - Vectorizing the X_train & X_test ....
2025-10-18 03:01:57,215 - INFO - Initialize the Data Balancing method ....
2025-10-18 03:01:58,623 - INFO - Logging Metadata ....
2025-10-18 03:02:00,102 - INFO - Logging Vectorizer Params ....
2025-10-18 03:02:01,183 - INFO - Logging Model Params ....
2025-10-18 03:02:02,251 - INFO - Model Training & Predicion Started ....
2025-10-18 03:02:08,616 - INFO - Model Training & Predicion Ended ....
2025-10-18 03:02:08,616 - INFO - Logging Metrics ....
2025-10-18 03:02:21,742 - INFO - Logging the model ....
2025-10-18 03:03:12,867 - INFO - Completed the Experiment in 80.00163078308105 seconds
2025-10-18 03:03:12,867 - INFO - Accuracy -> 0.67
2025-10-18 03:03:13,262 - INFO - Initialize Vectorizer ....
2025-10-18 03:03:13,262 - INFO - Train test split ....
2025-10-18 03:03:13,297 - INFO - Vectorizing the X_trai

## Experiment 4 – Handling Class Imbalance

### Objective
To analyze how different **class imbalance handling methods** affect the performance of the **RandomForestClassifier** trained on TF-IDF features.

### Experiment Setup
- **Model:** RandomForestClassifier  
- **Vectorizer:** TF-IDF (ngram_range = (1,2), max_features = 2000)  
- **Imbalance handling methods tested:**
  - SMOTE-ENN
  - Random Undersampling
  - ADASYN
  - Random Oversampling
  - Class Weights (`balanced` parameter in RandomForest)
- **Metrics logged:** Accuracy, Precision, Recall, F1-score (per class and weighted average)  

---

### Results Summary

| Imbalance Method | Accuracy | Weighted F1-score | -1 F1-score | 1 F1-score |
|------------------|-----------|------------------|-------------|-------------|
| SMOTE-ENN        | 0.672     | 0.665            | 0.469       | 0.002       |
| Undersampling    | 0.674     | 0.664            | 0.534       | 0.685       |
| ADASYN           | 0.674     | 0.666            | 0.509       | 0.693       |
| Oversampling     | 0.674     | 0.666            | 0.519       | 0.692       |
| Class Weights    | 0.657     | 0.607            | 0.218       | 0.711       |

---

### Conclusion
- **ADASYN** and **Random Oversampling** provided the **best overall performance**, both achieving **0.674 accuracy** and **0.666 weighted F1-score**.  
- **Class Weights** alone did not improve minority class recall (-1 class) significantly, although it performed well for the positive class (1).  
- **SMOTE-ENN**, while effective for recall of class -1, led to extremely poor results for class 1, likely due to data noise introduced during ENN cleaning.  
- **Undersampling** achieved balanced but slightly lower macro F1 compared to ADASYN and Oversampling.

---

### Recommendations / Next Steps
- Use **ADASYN** or **Random Oversampling** as the preferred imbalance handling strategy.  
- Next experiment: Combine **ADASYN** with **hyperparameter tuning** (e.g., n_estimators, max_depth).  
- Evaluate using **confusion matrices** and **ROC-AUC** to better understand class-wise trade-offs.  
- Consider experimenting with **XGBoost or LightGBM** with built-in class imbalance handling options.

---

### Artifacts
- MLflow logs for imbalance methods  
- Confusion matrix visualizations  
- Feature importance comparison across imbalance methods  
