In [2]:
import torch

# Print the name of CUDA device 0 when PyTorch reports that CUDA is available.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print(torch.cuda.get_device_name(0))

NVIDIA GeForce GTX 1660 Ti


In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the comment dataset, then discard rows whose comment text is missing.
raw_comments = pd.read_csv("./data.csv")
df = raw_comments.dropna(subset=['clean_comment'])
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [3]:
df.shape

(36625, 2)

In [4]:
# Keep only the rows whose comment is non-empty once surrounding whitespace is trimmed.
has_text = df['clean_comment'].str.strip().ne('')
df = df[has_text]

### Data Preprocessing

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
# Download the NLTK corpora that the stopword filter and the WordNet lemmatizer read.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Koshti's
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Koshti's
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Data Preprocessing
# These objects are identical for every comment, so they are built once here instead
# of on each call (the stopword list is otherwise re-read for every row).
_NOISE_PATTERN = re.compile(r"http\S+|www\S+|@\S+|#\S+|[^a-zA-Z\s]")
_WHITESPACE_PATTERN = re.compile(r"\s+")
# Stopwords that carry sentiment (negation / contrast) and must survive filtering.
_SENTIMENT_STOPWORDS = {'not', 'but', 'however', 'no', 'yet'}
_STOP_WORDS = frozenset(stopwords.words('english')) - _SENTIMENT_STOPWORDS
_LEMMATIZER = WordNetLemmatizer()


def preprocess_comment(comment):
    """Normalize one raw comment into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Remove URLs (``http...`` / ``www...``), ``@mentions``, ``#hashtags`` and every
         character that is not an ASCII letter or whitespace. Digits and punctuation
         are therefore dropped as well.
      3. Collapse whitespace runs (including newlines) to single spaces and trim.
      4. Drop English stopwords, except 'not', 'but', 'however', 'no' and 'yet'.
      5. Lemmatize each remaining token with ``WordNetLemmatizer`` (default arguments).

    Args:
        comment: Raw comment text (``str``).

    Returns:
        The cleaned comment; an empty string when no token survives.
    """
    comment = comment.lower()

    # Removing Noise
    comment = _NOISE_PATTERN.sub("", comment)
    comment = _WHITESPACE_PATTERN.sub(" ", comment).strip()

    # No separate newline or "keep punctuation" pass is needed: after the two
    # substitutions above the text contains only ASCII letters and single spaces,
    # so those later substitutions could never match anything.

    # Filter stopwords and lemmatize in a single pass over the tokens.
    return ' '.join(
        _LEMMATIZER.lemmatize(word)
        for word in comment.split()
        if word not in _STOP_WORDS
    )

In [8]:
# Run every comment through the text-cleaning function, element by element.
df['clean_comment'] = df['clean_comment'].map(preprocess_comment)

In [9]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [12]:
# Remove fully duplicated rows (same comment text and same category).
df = df.drop_duplicates()

In [13]:
# Write the current frame to disk without the index column.
preprocessed_path = 'preprocessed_data.csv'
df.to_csv(preprocessed_path, index=False)

### MLOps

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import scipy

In [None]:
import mlflow
import mlflow.sklearn
import joblib
import pickle
import dagshub

import logging
import time

In [None]:
# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Read the DagsHub credentials used to authenticate MLflow tracking requests
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")

if not username or not token:
    raise ValueError("Missing DagsHub credentials in environment variables")

repo_name = "YouTube-Sentiment-Insights-Plugin"

# Keep the token OUT of the tracking URI: a URI with embedded credentials can surface
# in logs, error messages and `mlflow.get_tracking_uri()` output, and it breaks when
# the username or token contains URL-special characters such as "@" or "/".
mlflow_uri = f"https://dagshub.com/{username}/{repo_name}.mlflow"

dagshub.init(repo_owner=username, repo_name=repo_name, mlflow=True)

# MLflow reads these variables and sends them as HTTP basic-auth credentials.
# They are set after dagshub.init so the explicitly configured token is the one used.
os.environ["MLFLOW_TRACKING_USERNAME"] = username
os.environ["MLFLOW_TRACKING_PASSWORD"] = token
mlflow.set_tracking_uri(mlflow_uri)

In [18]:
# Select (or create) the MLflow experiment that groups the baseline runs
experiment = 'Exp 1 - BaseLine RandomForest Classifier'
mlflow.set_experiment(experiment)

2025/10/18 01:37:43 INFO mlflow.tracking.fluent: Experiment with name 'Exp 1 - BaseLine RandomForest Classifier' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/3e4df1b94cc94611acc0fdb2737874b4', creation_time=1760731666350, experiment_id='8', last_update_time=1760731666350, lifecycle_stage='active', name='Exp 1 - BaseLine RandomForest Classifier', tags={}>

In [19]:
# Step 1: Bag of Words (CountVectorizer) limited to the 10000 most frequent terms
vocabulary_size = 10000
vec = CountVectorizer(max_features=vocabulary_size)

In [20]:
# Keep the document-term matrix sparse. Densifying ~36k rows x 10k columns with
# `.toarray()` needs roughly 2.9 GB at CountVectorizer's default int64 dtype, while
# train_test_split and RandomForestClassifier both accept scipy sparse matrices.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split, so
# test-set text influences the feature space; consider fitting on the training split only.
X = vec.fit_transform(df['clean_comment'])
y = df['category']

X.shape, y.shape

((36188, 10000), (36188,))

In [21]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [27]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting MLFlow run ....")

# Train the baseline RandomForest inside one MLflow run and log its tags, parameters,
# metrics, confusion-matrix figure and fitted model. `model`, `y_pred` and `accuracy`
# are assigned at notebook scope, so later cells can reuse them.
with mlflow.start_run() as run:
    start_time = time.time()

    try:
        # ------------------ Metadata ------------------
        logging.info("Logging Metadata ....")

        mlflow.set_tag("mlflow.runName", "RandomForest_Baseline")
        mlflow.set_tag("experiment_type", "baseline")
        mlflow.set_tag("model_type", "RandomForestClassifier")
        mlflow.set_tag("description", "Baseline RandomForest model for sentiment analysis using Bag of Words (BoW)")

        # ------------------ Log Vectorizer Params ------------------
        logging.info("Logging Vectorizer Params ....")

        mlflow.log_param("vectorizer_type", "CountVectorizer")
        mlflow.log_param("vectorizer_max_features", vec.max_features)

        # ------------------ Log Model Params ------------------
        logging.info("Logging Model Params ....")

        n_estimators = 200
        max_depth = 15
        random_state = 42
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        # The seed is part of the model configuration; log it so the run can be reproduced.
        mlflow.log_param("random_state", random_state)

        # ------------------ Train Model ------------------
        logging.info("Model Training & Predicion Started ....")

        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
        model.fit(X_train, y_train)

        # ------------------ Predictions ------------------
        y_pred = model.predict(X_test)

        logging.info("Model Training & Predicion Ended ....")

        # ------------------ Metrics ------------------
        logging.info("Logging Metrics ....")

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Only the per-class and averaged entries are dicts; the scalar "accuracy"
        # entry of the report is skipped here because it is logged above.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # ------------------ Confusion Matrix ------------------
        # Fix the row/column order to the fitted class labels and show them on the
        # axes; without tick labels the heatmap reads 0/1/2 instead of the real labels.
        class_labels = list(model.classes_)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",
                cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            fig.tight_layout()

            mlflow.log_figure(fig, "Confusion_Matrix.png")  # log the plot
        finally:
            # Release the figure even when logging it fails.
            plt.close(fig)

        # ------------------ Log the Model Properly ------------------
        logging.info("Logging the model ....")

        # The input example must be dense; convert only the five sampled rows.
        input_example = X_test[ :5] if not scipy.sparse.issparse(X_test) else X_test[ :5].toarray()
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",  # will create artifacts/model folder
            input_example=input_example
            # registered_model_name="RandomForest_Baseline"
        )

        end_time = time.time()

        logging.info(f"Completed the Experiment in {end_time-start_time} seconds")

        logging.info(f"Accuracy -> {accuracy:.2f}")

    except Exception as e:
        # logging.exception records the traceback as well as the message; the error is
        # re-raised so the run still ends in a failed state.
        logging.exception(f"Unexpected Error Occured: {e}")
        raise

2025-10-18 01:59:42,822 - INFO - Starting MLFlow run ....
2025-10-18 01:59:44,073 - INFO - Logging Metadata ....
2025-10-18 01:59:45,531 - INFO - Logging Vectorizer Params ....
2025-10-18 01:59:46,234 - INFO - Logging Model Params ....
2025-10-18 01:59:46,953 - INFO - Model Training & Predicion Started ....
2025-10-18 02:01:19,296 - INFO - Model Training & Predicion Ended ....
2025-10-18 02:01:19,296 - INFO - Logging Metrics ....
2025-10-18 02:01:31,826 - INFO - Logging the model ....
2025-10-18 02:02:20,919 - INFO - Completed the Experiment in 156.84683442115784 seconds
2025-10-18 02:02:20,919 - INFO - Accuracy -> 0.64


In [29]:
# Per-class precision, recall and F1 for the test-split predictions made in the run above.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.00      0.01      1640
           0       0.70      0.76      0.73      2466
           1       0.60      0.88      0.71      3132

    accuracy                           0.64      7238
   macro avg       0.77      0.55      0.48      7238
weighted avg       0.73      0.64      0.56      7238



## Conclusion

1. **Model Performance**
   - The RandomForestClassifier + Bag-of-Words (BoW) baseline performs reasonably on the **positive** and **neutral** classes (F1 of 0.71 and 0.73, respectively).
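   - The **negative (-1) class** is almost entirely ignored (values from the classification report above):  
     - Recall ≈ 0.00 — almost none of the 1,640 negative test comments are detected.  
     - F1 ≈ 0.01  
     - The precision of 1.00 is not a good sign here: it only means the model almost never predicts this class.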

2. **Misleading Metrics**
   - **Accuracy** = 0.638 and **Weighted F1** = 0.560 look acceptable, but they **mask the poor performance on the minority (negative) class**, because both are dominated by the two larger classes.

3. **Macro F1**
   - Macro F1 = 0.485 highlights that the model is **not fair across all classes**.

4. **Metric Focus**
   - **Primary:** Macro F1 → ensures balanced performance across positive, neutral, and negative.  
   - **Secondary:** Class-specific recall for negative → ensures minority class is captured.