In [1]:
# Imports

import re
import nltk
import spacy
import plotly
import optuna
import mlflow
import dagshub
import mlflow.sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
nlp = spacy.load("en_core_web_sm")

Note: To be able to run the above cell, make sure you have downloaded `en_core_web_sm` using the following command in your terminal inside your environment:
```bash
python -m spacy download en_core_web_sm
```

In [3]:
# Setting up DagsHub

dagshub.init(repo_owner='SushrutGaikwad', repo_name='youtube-comments-analyzer', mlflow=True)

# Data

In [4]:
RAW_DATA_PATH = "../data/raw/Reddit_Data.csv"
df = pd.read_csv(RAW_DATA_PATH)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df = df[~(df["clean_comment"].str.strip() == "")]
df.shape

(36793, 2)

# Running the experiment

In [5]:
# Setting experiment name

mlflow.set_experiment("Improving LightGBM")

<Experiment: artifact_location='mlflow-artifacts:/eb66f0b362cf4a6e9e8119850de3216b', creation_time=1749135817604, experiment_id='7', last_update_time=1749135817604, lifecycle_stage='active', name='Improving LightGBM', tags={}>

## Preprocessing

In [6]:
mapping = {
    -1: 2,
    0: 0,
    1: 1
}
df["category"] = df["category"].map(mapping)

In [7]:
def preprocess_comment_for_feature_extraction(comment):
    comment = comment.lower()
    comment = comment.strip()
    return comment

In [8]:
df["clean_comment"] = df["clean_comment"].apply(
    preprocess_comment_for_feature_extraction
)
df.shape

(36793, 2)

In [9]:
def extract_custom_features(text):
    doc = nlp(text)
    word_list = [token.text for token in doc]
    
    # Comment length
    comment_length = len(text)
    
    # Word count
    word_count = len(word_list)
    
    # Average word length
    if word_count > 0:
        avg_word_length = sum(len(word) for word in word_list) / word_count
    else:
        avg_word_length = 0
    
    # Unique word count
    unique_word_count = len(set(word_list))
    
    # Lexical diversity
    if word_count > 0:
        lexical_diversity = unique_word_count / word_count
    else:
        lexical_diversity = 0
    
    # Count of POS tags
    pos_count = len([token.pos_ for token in doc])
    
    # Proportion of POS tags
    pos_tags = [token.pos_ for token in doc]
    if word_count > 0:
        pos_proportion = {
            tag: pos_tags.count(tag) / word_count for tag in set(pos_tags)
        }
    else:
        pos_proportion = {}
    
    return {
        "comment_length": comment_length,
        "word_count": word_count,
        "avg_word_length": avg_word_length,
        "unique_word_count": unique_word_count,
        "lexical_diversity": lexical_diversity,
        "pos_count": pos_count,
        **pos_proportion  # Flattening the POS proportions
    }

In [10]:
custom_features = pd.DataFrame([
    extract_custom_features(comment) for comment in df["clean_comment"]
])
custom_features.head()

Unnamed: 0,comment_length,word_count,avg_word_length,unique_word_count,lexical_diversity,pos_count,VERB,ADV,DET,AUX,...,NOUN,PROPN,NUM,INTJ,SCONJ,PART,X,PUNCT,SPACE,SYM
0,259,39,5.666667,34,0.871795,39,0.179487,0.076923,0.102564,0.076923,...,0.333333,0.025641,,,,,,,,
1,1268,196,5.47449,136,0.693878,196,0.214286,0.112245,0.02551,0.05102,...,0.219388,0.081633,0.005102,0.005102,0.035714,,,,,
2,459,86,4.348837,67,0.77907,86,0.174419,0.104651,0.069767,0.023256,...,0.186047,0.046512,0.011628,,0.034884,,,,,
3,167,29,4.793103,24,0.827586,29,0.137931,0.034483,0.103448,0.068966,...,0.275862,,,,,0.034483,,,,
4,690,112,5.169643,82,0.732143,112,0.223214,0.0625,0.035714,0.089286,...,0.142857,0.044643,,,0.026786,0.008929,,,,


In [11]:
custom_features.isnull().sum()

comment_length           0
word_count               0
avg_word_length          0
unique_word_count        0
lexical_diversity        0
pos_count                0
VERB                  6130
ADV                  16269
DET                  15488
AUX                  16988
CCONJ                22872
ADP                  18132
PRON                 15286
ADJ                  11863
NOUN                  3587
PROPN                16363
NUM                  30660
INTJ                 33212
SCONJ                24252
PART                 30277
X                    35674
PUNCT                35397
SPACE                36469
SYM                  36769
dtype: int64

In [12]:
custom_features.fillna(0, inplace=True)
custom_features.isnull().sum()

comment_length       0
word_count           0
avg_word_length      0
unique_word_count    0
lexical_diversity    0
pos_count            0
VERB                 0
ADV                  0
DET                  0
AUX                  0
CCONJ                0
ADP                  0
PRON                 0
ADJ                  0
NOUN                 0
PROPN                0
NUM                  0
INTJ                 0
SCONJ                0
PART                 0
X                    0
PUNCT                0
SPACE                0
SYM                  0
dtype: int64

In [13]:
custom_features.shape

(36793, 24)

In [14]:
df = pd.concat([df.reset_index(drop=True), custom_features.reset_index(drop=True)], axis=1)
df.shape

(36793, 26)

In [15]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df = df[~(df["clean_comment"].str.strip() == "")]

stop_words_to_include = {"not", "but", "however", "no", "yet"}

def preprocess_comment(comment):
    """
    This function performs the following tasks on a comment:
        1) Converts the comment to lowercase,
        2) Strips the trailing and leading whitespaces,
        3) Removes newline characters,
        4) Removes non-alphanumeric characters except punctuations,
        5) Removes stopwords except a few important ones for sentiment analysis,
        6) Lemmatizes the comment.
    """
    comment = comment.lower()
    comment = comment.strip()
    comment = re.sub(r"\n", " ", comment)
    comment = re.sub(r"[^A-Za-z0-9\s!?.,]", "", comment)
    stop_words = set(stopwords.words("english")) - stop_words_to_include
    comment = " ".join(
        [word for word in comment.split() if word not in stop_words]
    )
    lemmatizer = WordNetLemmatizer()
    comment = " ".join(
        [lemmatizer.lemmatize(word) for word in comment.split()]
    )
    return comment

In [16]:
df["clean_comment"] = df["clean_comment"].apply(preprocess_comment)
df.shape

(36607, 26)

In [17]:
df.columns

Index(['clean_comment', 'category', 'comment_length', 'word_count',
       'avg_word_length', 'unique_word_count', 'lexical_diversity',
       'pos_count', 'VERB', 'ADV', 'DET', 'AUX', 'CCONJ', 'ADP', 'PRON', 'ADJ',
       'NOUN', 'PROPN', 'NUM', 'INTJ', 'SCONJ', 'PART', 'X', 'PUNCT', 'SPACE',
       'SYM'],
      dtype='object')

In [18]:
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["category"]
)

In [19]:
train_df.shape, test_df.shape

((29285, 26), (7322, 26))

In [20]:
train_df.columns

Index(['clean_comment', 'category', 'comment_length', 'word_count',
       'avg_word_length', 'unique_word_count', 'lexical_diversity',
       'pos_count', 'VERB', 'ADV', 'DET', 'AUX', 'CCONJ', 'ADP', 'PRON', 'ADJ',
       'NOUN', 'PROPN', 'NUM', 'INTJ', 'SCONJ', 'PART', 'X', 'PUNCT', 'SPACE',
       'SYM'],
      dtype='object')

In [21]:
X_train = train_df["clean_comment"]
y_train = train_df["category"]

X_test = test_df["clean_comment"]
y_test = test_df["category"]

In [22]:
ngram_range = (1, 2)
max_features = 1000
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [23]:
X_train_vectorized_df = pd.DataFrame(
    X_train_vectorized.toarray(),
    columns=vectorizer.get_feature_names_out()
)
X_test_vectorized_df = pd.DataFrame(
    X_test_vectorized.toarray(),
    columns=vectorizer.get_feature_names_out()
)

In [24]:
X_train_vectorized_df.shape, X_test_vectorized_df.shape

((29285, 1000), (7322, 1000))

In [25]:
X_train_combined = pd.concat(
    [
        X_train_vectorized_df.reset_index(drop=True),
        train_df.drop(columns=["clean_comment", "category"]).reset_index(drop=True)
    ],
    axis=1
)

X_test_combined = pd.concat(
    [
        X_test_vectorized_df.reset_index(drop=True),
        test_df.drop(columns=["clean_comment", "category"]).reset_index(drop=True)
    ],
    axis=1
)

In [26]:
X_train_combined.shape, X_test_combined.shape

((29285, 1024), (7322, 1024))

In [27]:
# Undersampling
rus = RandomUnderSampler(random_state=42)
X_train_combined, y_train = rus.fit_resample(
    X_train_combined,
    y_train
)

X_train_combined = X_train_combined.astype(np.float32)
X_test_combined = X_test_combined.astype(np.float32)

In [28]:
X_train_combined.shape, X_test_combined.shape

((19758, 1024), (7322, 1024))

In [30]:
def objective(trial):
    # Suggest hyperparameters to be tuned
    params = {
        "objective": "multiclass",
        "num_class": 3,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "metric": "multi_logloss",
        # "is_unbalance": True,
        # "class_weight": "balanced",
    }

    # Initialize the LightGBM model with suggested parameters
    model = LGBMClassifier(**params, random_state=42, n_jobs=-1)

    # Perform cross-validation to evaluate the model performance
    scores = cross_val_score(
        model,
        X_train_combined,
        y_train,
        cv=3,
        scoring="accuracy",
        n_jobs=-1
    )

    # Return the mean accuracy score across folds
    return scores.mean()

In [31]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2025-06-07 12:11:07,505] A new study created in memory with name: no-name-bf4397fb-f475-4138-bd7d-69d42e33083b
[I 2025-06-07 12:11:16,007] Trial 0 finished with value: 0.6718797449134527 and parameters: {'learning_rate': 0.03680924831843726, 'min_child_samples': 149, 'max_depth': 13, 'n_estimators': 383}. Best is trial 0 with value: 0.6718797449134527.
[I 2025-06-07 12:11:26,854] Trial 1 finished with value: 0.761716772952728 and parameters: {'learning_rate': 0.08410481947899216, 'min_child_samples': 41, 'max_depth': 14, 'n_estimators': 365}. Best is trial 1 with value: 0.761716772952728.
[I 2025-06-07 12:11:37,655] Trial 2 finished with value: 0.64799068731653 and parameters: {'learning_rate': 0.03908200506114505, 'min_child_samples': 197, 'max_depth': 14, 'n_estimators': 473}. Best is trial 1 with value: 0.761716772952728.
[I 2025-06-07 12:11:43,868] Trial 3 finished with value: 0.7106994635084524 and parameters: {'learning_rate': 0.04257188022073709, 'min_child_samples': 102, 'ma

In [32]:
best_params = study.best_trial.params
best_params

{'learning_rate': 0.04812659274087647,
 'min_child_samples': 14,
 'max_depth': 28,
 'n_estimators': 193}

In [33]:
best_model = LGBMClassifier(
    objective="multiclass",
    num_class=3,
    metric="multi_logloss",
    # is_unbalance=True,
    # class_weight="balanced",
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    min_child_samples=best_params["min_child_samples"],
    random_state=42,
    n_jobs=-1
)

In [34]:
best_model.fit(X_train_combined, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9875
[LightGBM] [Info] Number of data points in the train set: 19758, number of used features: 1009
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [36]:
y_train_pred = best_model.predict(X_train_combined)
accuracy_train = accuracy_score(
    y_true=y_train,
    y_pred=y_train_pred
)
accuracy_train

0.8358639538414819

In [38]:
classification_report_train = classification_report(
    y_true=y_train,
    y_pred=y_train_pred
)
print(classification_report_train)

              precision    recall  f1-score   support

           0       0.80      0.94      0.86      6586
           1       0.89      0.79      0.84      6586
           2       0.84      0.77      0.80      6586

    accuracy                           0.84     19758
   macro avg       0.84      0.84      0.83     19758
weighted avg       0.84      0.84      0.83     19758



In [39]:
# Make predictions on the test data
y_test_pred = best_model.predict(X_test_combined)
accuracy_test = accuracy_score(
    y_true=y_test,
    y_pred=y_test_pred
)
accuracy_test

0.792952745151598

In [40]:
classification_report_test = classification_report(
    y_true=y_test,
    y_pred=y_test_pred
)
print(classification_report_test)

              precision    recall  f1-score   support

           0       0.80      0.92      0.85      2530
           1       0.89      0.73      0.81      3145
           2       0.64      0.71      0.67      1647

    accuracy                           0.79      7322
   macro avg       0.78      0.79      0.78      7322
weighted avg       0.80      0.79      0.79      7322



In [41]:
# Function to log results to MLFlow
def log_to_mlflow(
    model_name,
    improvement_technique,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    best_params
):
    with mlflow.start_run():
        # Tags
        mlflow.set_tag(
            "mlflow.runName", f"{model_name}_{improvement_technique}"
        )
        mlflow.set_tag("experiment_type", "Improving LightGBM")
        
        # Logging improvement technique as a parameter
        mlflow.log_param("improvement_technique", improvement_technique)
        
        # Initializing and training the model
        model.fit(X_train, y_train)
        
        # Making predictions on the test set and logging metrics
        y_pred = model.predict(X_test)
        
        # Logging cross-val accuracy
        scores = cross_val_score(
            model,
            X_train,
            y_train,
            cv=3,
            scoring="accuracy",
            n_jobs=-1
        )
        mlflow.log_metric("cross_val_accuracy", scores.mean())
        
        # Logging accuracy
        accuracy = accuracy_score(
            y_true=y_test,
            y_pred=y_pred
        )
        mlflow.log_metric("test_accuracy", accuracy)
        
        # Logging classification report metrics
        classification_rep = classification_report(
            y_true=y_test,
            y_pred=y_pred,
            output_dict=True
        )
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}: {metric} - test", value)
        
        # Logging the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        
        # Logging the best parameters
        mlflow.log_params(best_params)

In [42]:
log_to_mlflow(
    model_name="LightGBM",
    improvement_technique="custom_features",
    model=best_model,
    X_train=X_train_combined,
    X_test=X_test_combined,
    y_train=y_train,
    y_test=y_test,
    best_params=best_params
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9875
[LightGBM] [Info] Number of data points in the train set: 19758, number of used features: 1009
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_custom_features at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7/runs/d5060cc82d0d462c86f7578b55196087
🧪 View experiment at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7
