### Which Vectorizer?

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import os

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("./preprocessed_data.csv")
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


### Model Training

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import scipy

In [None]:
import mlflow
import mlflow.sklearn
import joblib
import pickle
import dagshub

import logging
import time

In [None]:
# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Set up DagsHub credentials for MLflow tracking
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")

if not username or not token:
    raise ValueError("Missing DagsHub credentials in environment variables")

# Construct the authenticated MLflow tracking URI
mlflow_uri = f"https://{username}:{token}@dagshub.com/{username}/YouTube-Sentiment-Insights-Plugin.mlflow"

dagshub.init(repo_owner=username, repo_name="YouTube-Sentiment-Insights-Plugin", mlflow=True)
mlflow.set_tracking_uri(mlflow_uri)

2025-10-18 02:20:32,174 - INFO - HTTP Request: GET https://dagshub.com/api/v1/repos/shreekoshti199/YouTube-Sentiment-Insights-Plugin "HTTP/1.1 200 OK"


2025-10-18 02:20:32,179 - INFO - Initialized MLflow to track repo "shreekoshti199/YouTube-Sentiment-Insights-Plugin"


2025-10-18 02:20:32,181 - INFO - Repository shreekoshti199/YouTube-Sentiment-Insights-Plugin initialized!


In [20]:
# Set Experiment name
experiment = 'Exp 2 - BoW or Tfidf'
mlflow.set_experiment(experiment)

2025/10/18 02:20:36 INFO mlflow.tracking.fluent: Experiment with name 'Exp 2 - BoW or Tfidf' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/43d18caf969b4c6bb42e39dad7c74be2', creation_time=1760734239294, experiment_id='9', last_update_time=1760734239294, lifecycle_stage='active', name='Exp 2 - BoW or Tfidf', tags={}>

In [21]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting MLFlow run ....")

def run_experiment(vec_type, ngram_range, vec_max_features, vec_name):
    start_time = time.time()
    try:
        # Initialize the Vectorizer
        logging.info("Initialize Vectorizer ....")
        
        if vec_type == 'BoW':
            vec = CountVectorizer(ngram_range=ngram_range, max_features=vec_max_features)
        else:
            vec = TfidfVectorizer(ngram_range=ngram_range, max_features=vec_max_features)
    
        # Train-Test Split
        logging.info("Train test split ....")
        
        X_train, X_test, y_train, y_test = train_test_split(
            df['clean_comment'], df['category'], 
            test_size=0.2, random_state=42, stratify=df['category']
        )
    
        # Text Vectorization
        logging.info("Vectorizing the X_train & X_test ....")
        
        X_train = vec.fit_transform(X_train)
        X_test = vec.transform(X_test)

        try: 
            # Start MLflow Run
            with mlflow.start_run() as run:
                
                # ------------------ Metadata ------------------
                logging.info("Logging Metadata ....")
                
                mlflow.set_tag('mlflow.runName', f'{vec_type}_{ngram_range}_RandomForest')
                mlflow.set_tag('experiment_type', 'feature_engineering')
                mlflow.set_tag('model_type', 'RandomForestClassifier')
                mlflow.set_tag('description', f"RandomForest with {vec_name}, ngram_range={ngram_range}, max_features={vec_max_features}")
        
                # ------------------ Log Vectorizer Params ------------------
                logging.info("Logging Vectorizer Params ....")
                
                mlflow.log_param("vectorizer_type", vec_type)
                mlflow.log_param("ngram_range", ngram_range)
                mlflow.log_param("vectorizer_max_features", vec_max_features)
        
                # ------------------ Log Model Params ------------------
                logging.info("Logging Model Params ....")
                
                n_estimators = 200
                max_depth = 15
                mlflow.log_param("n_estimators", n_estimators)
                mlflow.log_param("max_depth", max_depth)
                
                # ------------------ Train Model ------------------
                logging.info("Model Training & Predicion Started ....")
                
                model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
                model.fit(X_train, y_train)
        
                # ------------------ Predictions ------------------
                y_pred = model.predict(X_test)

                logging.info("Model Training & Predicion Ended ....")
        
                # ------------------ Metrics ------------------
                logging.info("Logging Metrics ....")
                
                accuracy = accuracy_score(y_test, y_pred)
                mlflow.log_metric("accuracy", accuracy)
        
                # Log classification report
                classification_rep = classification_report(y_test, y_pred, output_dict=True)
                for label, metrics in classification_rep.items():
                    if isinstance(metrics, dict):
                        for metric, value in metrics.items():
                            mlflow.log_metric(f"{label}_{metric}", value)
            
                # ------------------ Confusion Matrix ------------------
                conf_matrix = confusion_matrix(y_test, y_pred)
                plt.figure(figsize=(8, 6))
                sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
                plt.xlabel("Predicted")
                plt.ylabel("Actual")
                plt.title("Confusion Matrix")
                plt.tight_layout()
        
                mlflow.log_figure(plt.gcf(), "Confusion_Matrix.png")  # log the plot
                plt.close()
                
                # ------------------ Log the Model Properly ------------------
                logging.info("Logging the model ....")
                
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="model",  # will create artifacts/model folder
                    # registered_model_name=f"rfc_{vec_name}_{ngram_range}_{vec_max_features}"
                )
        
                end_time = time.time()

                logging.info(f"Completed the Experiment in {end_time-start_time} seconds")
        
                logging.info(f"Accuracy -> {accuracy:.2f}")
                
        except Exception as e:
            logging.error(f"Error occured while Model Trainig: {e}")
            raise
            
    except Exception as e:
        logging.error(f"Error occured while Vectorizing: {e}")
        raise

2025-10-18 02:20:37,115 - INFO - Starting MLFlow run ....


In [22]:
# Run 6 Experiments Results on MLFlow
ngram_ranges = [(1, 1), (1, 2), (1, 3)]  # unigrams, bigrams, trigrams
max_features = 5000  # example

for ngram_range in ngram_ranges:
    # BoW Experiments
    run_experiment(
        vec_type="BoW",
        ngram_range=ngram_range,
        vec_max_features=max_features,
        vec_name="BoW"
    )

    # TF-IDF Experiments
    run_experiment(
        vec_type="TF-IDF",
        ngram_range=ngram_range,
        vec_max_features=max_features,
        vec_name="TF-IDF"
    )

2025-10-18 02:20:37,556 - INFO - Initialize Vectorizer ....
2025-10-18 02:20:37,557 - INFO - Train test split ....
2025-10-18 02:20:37,573 - INFO - Vectorizing the X_train & X_test ....
2025-10-18 02:20:38,740 - INFO - Logging Metadata ....
2025-10-18 02:20:40,223 - INFO - Logging Vectorizer Params ....
2025-10-18 02:20:41,335 - INFO - Logging Model Params ....
2025-10-18 02:20:42,037 - INFO - Model Training & Predicion Started ....
2025-10-18 02:20:45,659 - INFO - Model Training & Predicion Ended ....
2025-10-18 02:20:45,659 - INFO - Logging Metrics ....
2025-10-18 02:21:01,451 - INFO - Logging the model ....
2025-10-18 02:21:45,161 - INFO - Completed the Experiment in 67.60477805137634 seconds
2025-10-18 02:21:45,176 - INFO - Accuracy -> 0.65
2025-10-18 02:21:45,564 - INFO - Initialize Vectorizer ....
2025-10-18 02:21:45,564 - INFO - Train test split ....
2025-10-18 02:21:45,601 - INFO - Vectorizing the X_train & X_test ....
2025-10-18 02:21:46,693 - INFO - Logging Metadata ....
2025

## Experiment 2 – RandomForest with Different Vectorizers

### Objective
Evaluate the performance of **RandomForestClassifier** using different **vectorizers** and **n-gram ranges** for sentiment analysis on YouTube comments.

### Experiment Setup
- **Model:** RandomForestClassifier  
- **Vectorizers tested:** CountVectorizer (BoW), TF-IDF  
- **N-gram ranges tested:** (1,1), (1,2), (1,3)  
- **Metrics logged:** Accuracy, Precision, Recall, F1-score (per class and weighted average)  
- **Data split:** 80% train, 20% test  

### Results Summary

| Vectorizer | Ngram | Accuracy | Weighted F1-score |
|------------|-------|----------|-----------------|
| TF-IDF     | (1,3) | 0.646    | 0.572           |
| BoW        | (1,3) | 0.644    | 0.570           |
| TF-IDF     | (1,2) | 0.651    | 0.577           |
| BoW        | (1,2) | 0.649    | 0.576           |
| TF-IDF     | (1,1) | 0.646    | 0.571           |
| BoW        | (1,1) | 0.644    | 0.568           |

### Conclusion
- The **best performing model** is **RandomForestClassifier with TF-IDF vectorizer and ngram_range=(1,2)**.  
- TF-IDF generally outperforms BoW.  
- Using ngram_range of (1,2) gives slightly better results than (1,1) or (1,3).  

### Recommendations / Next Steps
- **Next experiment:** Optimize `max_features` for the **TF-IDF vectorizer (1,2)** to further improve model performance.  
- Consider hyperparameter tuning for RandomForest (n_estimators, max_depth, etc.) to improve F1-score.  
- Explore other classifiers (e.g., XGBoost, LGBM) on the same feature set.  

### Artifacts
- Confusion matrix plots  
- Trained model pickle files  
- Dataset CSV  
- MLflow logs for experiment tracking  
