### How many max features?

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed comments dataset from the working directory.
# Per the preview shown below, it carries a leftover "Unnamed: 0" index column,
# a `clean_comment` text column and a `category` label column.
df = pd.read_csv("./preprocessed_data.csv")
# Preview the first rows (rendered as the cell output in a notebook).
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


### Model Training

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import scipy

In [None]:
import mlflow
import mlflow.sklearn
import joblib
import pickle
import dagshub

import logging
import time

In [None]:
# Load environment variables (e.g. from a local .env file)
from dotenv import load_dotenv
load_dotenv()

# Read the DagsHub credentials used for MLflow tracking
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")

if not username or not token:
    raise ValueError("Missing DagsHub credentials in environment variables")

# Build the tracking URI WITHOUT embedding the credentials in it: a URI of the
# form https://user:token@host leaks the token wherever the URI is printed,
# logged, echoed as cell output, or included in an exception message.
mlflow_uri = f"https://dagshub.com/{username}/YouTube-Sentiment-Insights-Plugin.mlflow"

dagshub.init(repo_owner=username, repo_name="YouTube-Sentiment-Insights-Plugin", mlflow=True)

# Hand the credentials to MLflow through its standard HTTP basic-auth
# environment variables. Set after dagshub.init() so the explicitly configured
# credentials are the ones MLflow ends up using.
os.environ["MLFLOW_TRACKING_USERNAME"] = username
os.environ["MLFLOW_TRACKING_PASSWORD"] = token
mlflow.set_tracking_uri(mlflow_uri)

In [7]:
# Select the MLflow experiment that every run started below is recorded under;
# the experiment is created on the tracking server if it does not exist yet.
experiment = 'Exp 3 - TfIdf Trigram max_features'
# As the last expression of the cell, the returned Experiment object is echoed.
mlflow.set_experiment(experiment)

<Experiment: artifact_location='mlflow-artifacts:/4f6dd3dd818943c7bbcbba49a221fe88', creation_time=1760735056124, experiment_id='10', last_update_time=1760735056124, lifecycle_stage='active', name='Exp 3 - TfIdf Trigram max_features', tags={}>

In [8]:
# Configure root logging: INFO level, lines formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# NOTE: this message is emitted when this cell executes, i.e. before any
# experiment has actually been started by a call to run_experiment().
logging.info("Starting MLFlow run ....")

def _log_confusion_matrix(y_true, y_pred):
    """Plot the confusion matrix as a heatmap and log it to the active MLflow run.

    The figure is always closed, even when plotting or uploading fails, so
    repeated experiment runs do not accumulate open matplotlib figures.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        plt.tight_layout()

        mlflow.log_figure(fig, "Confusion_Matrix.png")  # log the plot
    finally:
        plt.close(fig)


def run_experiment(vec_max_features):
    """Train a RandomForest on TF-IDF (1-3 gram) features and log the run to MLflow.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 train/test split, fits a ``TfidfVectorizer`` on the
    training text only, trains a ``RandomForestClassifier`` and logs tags,
    parameters, accuracy, the per-label classification-report metrics, a
    confusion-matrix figure and the fitted model to a new MLflow run.

    Args:
        vec_max_features: Value passed to ``TfidfVectorizer(max_features=...)``;
            also used in the run name and logged as ``vectorizer_max_features``.

    Raises:
        Exception: Any failure is logged once, with a message naming the phase
            that failed (vectorizing vs. model training/logging), and then
            re-raised unchanged.
    """
    ngram_range = (1, 3)  # Trigram setting

    start_time = time.time()

    # ------------------ Phase 1: data preparation ------------------
    # Kept in its own try block so that a failure here is reported as a
    # vectorizing error, and a failure in phase 2 is NOT reported as one.
    try:
        # Initialize the Vectorizer
        logging.info("Initialize Vectorizer ....")

        vec = TfidfVectorizer(ngram_range=ngram_range, max_features=vec_max_features)

        # Train-Test Split
        logging.info("Train test split ....")

        text_train, text_test, y_train, y_test = train_test_split(
            df['clean_comment'], df['category'],
            test_size=0.2, random_state=42, stratify=df['category']
        )

        # Text Vectorization (vocabulary is learned from the training split only)
        logging.info("Vectorizing the X_train & X_test ....")

        X_train = vec.fit_transform(text_train)
        X_test = vec.transform(text_test)
    except Exception as e:
        logging.error(f"Error occured while Vectorizing: {e}")
        raise

    # ------------------ Phase 2: training + MLflow logging ------------------
    try:
        # Start MLflow Run
        with mlflow.start_run():

            # ------------------ Metadata ------------------
            logging.info("Logging Metadata ....")

            mlflow.set_tag('mlflow.runName', f'TFIDF_TriGrams_max_features_{vec_max_features}')
            mlflow.set_tag('experiment_type', 'feature_engineering')
            mlflow.set_tag('model_type', 'RandomForestClassifier')
            mlflow.set_tag('description', f"RandomForest with TF-IDF TriGrams, max_features={vec_max_features}")

            # ------------------ Log Vectorizer Params ------------------
            logging.info("Logging Vectorizer Params ....")

            mlflow.log_param("vectorizer_type", "TF-IDF")
            mlflow.log_param("ngram_range", ngram_range)
            mlflow.log_param("vectorizer_max_features", vec_max_features)

            # ------------------ Log Model Params ------------------
            logging.info("Logging Model Params ....")

            n_estimators = 200
            max_depth = 15
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("max_depth", max_depth)

            # ------------------ Train Model ------------------
            logging.info("Model Training & Predicion Started ....")

            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
            model.fit(X_train, y_train)

            # ------------------ Predictions ------------------
            y_pred = model.predict(X_test)

            logging.info("Model Training & Predicion Ended ....")

            # ------------------ Metrics ------------------
            logging.info("Logging Metrics ....")

            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("accuracy", accuracy)

            # Log classification report: only the dict-valued entries (per-label
            # and averaged rows) are expanded; the scalar "accuracy" entry is
            # skipped because it is already logged above.
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, metrics in classification_rep.items():
                if isinstance(metrics, dict):
                    for metric, value in metrics.items():
                        mlflow.log_metric(f"{label}_{metric}", value)

            # ------------------ Confusion Matrix ------------------
            _log_confusion_matrix(y_test, y_pred)

            # ------------------ Log the Model Properly ------------------
            logging.info("Logging the model ....")

            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model",  # will create artifacts/model folder
            )

            end_time = time.time()

            logging.info(f"Completed the Experiment in {end_time-start_time} seconds")

            logging.info(f"Accuracy -> {accuracy:.2f}")
    except Exception as e:
        logging.error(f"Error occured while Model Trainig: {e}")
        raise

2025-10-18 02:41:26,823 - INFO - Starting MLFlow run ....


In [9]:
# Sweep the TF-IDF vocabulary cap from 1,000 to 10,000 in steps of 1,000;
# each value produces one MLflow run (10 runs in total).
max_features_values = list(range(1000, 10001, 1000))

for max_features in max_features_values:
    run_experiment(vec_max_features=max_features)

2025-10-18 02:41:26,828 - INFO - Initialize Vectorizer ....
2025-10-18 02:41:26,828 - INFO - Train test split ....
2025-10-18 02:41:26,847 - INFO - Vectorizing the X_train & X_test ....
2025-10-18 02:41:31,961 - INFO - Logging Metadata ....
2025-10-18 02:41:33,450 - INFO - Logging Vectorizer Params ....
2025-10-18 02:41:34,504 - INFO - Logging Model Params ....
2025-10-18 02:41:35,230 - INFO - Model Training & Predicion Started ....
2025-10-18 02:41:40,865 - INFO - Model Training & Predicion Ended ....
2025-10-18 02:41:40,865 - INFO - Logging Metrics ....
2025-10-18 02:41:54,095 - INFO - Logging the model ....
2025-10-18 02:42:44,798 - INFO - Completed the Experiment in 77.96922636032104 seconds
2025-10-18 02:42:44,798 - INFO - Accuracy -> 0.67
2025-10-18 02:42:45,185 - INFO - Initialize Vectorizer ....
2025-10-18 02:42:45,201 - INFO - Train test split ....
2025-10-18 02:42:45,239 - INFO - Vectorizing the X_train & X_test ....
2025-10-18 02:42:49,890 - INFO - Logging Metadata ....
2025

## Experiment 3 – RandomForest with TF-IDF and Different Max Features

### Objective
Evaluate the effect of **`max_features`** in the **TF-IDF vectorizer** (ngram_range = (1,3)) on the performance of **RandomForestClassifier** for sentiment analysis.

### Experiment Setup
- **Model:** RandomForestClassifier  
- **Vectorizer:** TF-IDF  
- **N-gram range:** (1,3)  
- **Max features tested:** 10000, 9000, 8000, 7000, 6000, 5000, 4000, 3000, 2000, 1000  
- **Metrics logged:** Accuracy, Precision, Recall, F1-score (per class and weighted average)  

### Results Summary

| Max Features | Accuracy | Weighted F1-score |
|--------------|----------|-----------------|
| 10000        | 0.635    | 0.557           |
| 9000         | 0.638    | 0.561           |
| 8000         | 0.641    | 0.564           |
| 7000         | 0.643    | 0.566           |
| 6000         | 0.640    | 0.564           |
| 5000         | 0.646    | 0.571           |
| 4000         | 0.640    | 0.567           |
| 3000         | 0.645    | 0.575           |
| 2000         | 0.655    | 0.598           |
| 1000         | 0.657    | 0.608           |

### Conclusion
- Reducing `max_features` to **1000–2000** improves model performance, especially weighted F1-score and accuracy.  
- A smaller vocabulary focuses on the most important features and reduces noise.  
- Very high `max_features` values (9000–10000) do not necessarily improve performance.  

### Recommendations / Next Steps
- Consider combining **TF-IDF (max_features=1000–2000)** with hyperparameter tuning for RandomForest (n_estimators, max_depth, min_samples_split).  
- Test other classifiers like **XGBoost or LGBM** on this optimized feature set.  
- Evaluate the model on **class imbalance metrics** to ensure the minority class (-1) is reasonably predicted.  

### Artifacts
- Confusion matrix plots  
- Trained model pickle files  
- Dataset CSV  
- MLflow logs for experiment tracking  
