In [3]:
import numpy as np
import pandas as pd
import os
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import os
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import optuna
from imblearn.over_sampling import SMOTE


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df = pd.read_csv(r"E:\DATA SCIENCE & AI\DATASET\youtube_comment_analysis\final_processed_df.csv")

In [5]:
df.dropna(subset=['clean_comment'],inplace=True)

In [6]:
import dagshub
dagshub.init(repo_owner='Pravat-21', repo_name='ML-Project-YouTube-Comment-Analysis', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/Pravat-21/ML-Project-YouTube-Comment-Analysis.mlflow")

In [7]:
df['category'] = df['category'].map({-1:2,0:0,1:1})

In [8]:
df.sample()

Unnamed: 0,clean_comment,category
28733,uber driver today insulted muslim manmohan sin...,1


In [9]:
ngram_range = (1,3)
max_features = 800

vectorizer = TfidfVectorizer(ngram_range=ngram_range,max_features=max_features)

X = vectorizer.fit_transform(df['clean_comment'])
y= df["category"]

smote = SMOTE(random_state=21)
X,y = smote.fit_resample(X,y)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=21,stratify=y)

### **Model choosing:**

In [10]:

def model_objective(trial,classifier_name):

    #classifier_name = trial.suggest_categorical('classifier',['RF','LGBM'])

    if classifier_name == "xgboost":

        n_estimators = trial.suggest_int("n_estimators",50,400)
        learning_rate = trial.suggest_float("learning_rate", 1e-4,1e-1, log = True)
        max_depth = trial.suggest_int("max_depth",3,15)

        model = XGBClassifier(n_estimators=n_estimators,learning_rate = learning_rate,max_depth=max_depth)

    elif classifier_name == "SVC":
        C = trial.suggest_float("C",0.1,100,log=True)
        kernel = trial.suggest_categorical("kernel",['linear','rbf','poly','sigmoid'])
        gamma = trial.suggest_categorical("gamma",['auto','scale'])

        model = SVC(C=C,kernel=kernel,gamma=gamma)

    elif classifier_name == "LGBM":
        n_estimators = trial.suggest_int("n_estimators",50,400)
        learning_rate = trial.suggest_float("learning_rate", 1e-4,1e-1, log = True)
        max_depth = trial.suggest_int("max_depth",3,15)

        model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth= max_depth)

    elif classifier_name == "NB":
        alpha = trial.suggest_float('alpha',0.01,1)

        model = MultinomialNB(alpha=alpha)

    elif classifier_name == 'RF':
        n_estimators = trial.suggest_int("n_estimators",50,400)
        max_depth = trial.suggest_int("max_depth",3,15)
        criterion = trial.suggest_categorical("criterion",['gini','entropy','log_loss'])

        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion= criterion)

    model.fit(X_train,y_train)
    
    y_pred=model.predict(X_test)
        
    accuracy=accuracy_score(y_test,y_pred)

    return accuracy


In [19]:
study = optuna.create_study(direction='maximize')
study.optimize(model_objective,n_trials=50)

[I 2025-11-21 20:46:35,394] A new study created in memory with name: no-name-7164d5e7-0751-44dd-ab9f-8a575066cc4d
[I 2025-11-21 20:46:36,831] Trial 0 finished with value: 0.6030437539632213 and parameters: {'classifier': 'RF', 'n_estimators': 177, 'max_depth': 3, 'criterion': 'entropy'}. Best is trial 0 with value: 0.6030437539632213.
[I 2025-11-21 20:46:39,746] Trial 1 finished with value: 0.6236525047558655 and parameters: {'classifier': 'RF', 'n_estimators': 316, 'max_depth': 5, 'criterion': 'gini'}. Best is trial 1 with value: 0.6236525047558655.
[I 2025-11-21 20:46:44,203] Trial 2 finished with value: 0.6418304798139928 and parameters: {'classifier': 'RF', 'n_estimators': 330, 'max_depth': 6, 'criterion': 'entropy'}. Best is trial 2 with value: 0.6418304798139928.
[I 2025-11-21 20:46:46,264] Trial 3 finished with value: 0.6256605368843796 and parameters: {'classifier': 'RF', 'n_estimators': 164, 'max_depth': 5, 'criterion': 'entropy'}. Best is trial 2 with value: 0.641830479813992

In [20]:
study.best_params

{'classifier': 'LGBM',
 'n_estimators': 353,
 'learning_rate': 0.09844151722507702,
 'max_depth': 14}

In [None]:
study.

In [14]:
classifier_name = ['xgboost','RF']
for model in classifier_name:

    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: model_objective(trial, model),n_trials=5)

    print(study.best_params)
    
    if model == 'xgboost':

        classifier = XGBClassifier(**study.best_params)
        
    elif model == 'RF':
        classifier = RandomForestClassifier(**study.best_params)

    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_pred,y_test)

    print(accuracy)


[I 2025-11-21 23:08:48,599] A new study created in memory with name: no-name-c0a5a0eb-9fc3-4078-8dd4-7a6f70cc46ab
[I 2025-11-21 23:09:29,282] Trial 0 finished with value: 0.7395899387021772 and parameters: {'n_estimators': 80, 'learning_rate': 0.0743116597534753, 'max_depth': 9}. Best is trial 0 with value: 0.7395899387021772.
[I 2025-11-21 23:16:12,135] Trial 1 finished with value: 0.6491228070175439 and parameters: {'n_estimators': 344, 'learning_rate': 0.0010408639198874463, 'max_depth': 13}. Best is trial 0 with value: 0.7395899387021772.
[I 2025-11-21 23:21:04,862] Trial 2 finished with value: 0.7364193616571549 and parameters: {'n_estimators': 363, 'learning_rate': 0.011829912726465098, 'max_depth': 11}. Best is trial 0 with value: 0.7395899387021772.
[I 2025-11-21 23:23:35,416] Trial 3 finished with value: 0.7715070809554005 and parameters: {'n_estimators': 128, 'learning_rate': 0.051064816164656496, 'max_depth': 15}. Best is trial 3 with value: 0.7715070809554005.
[I 2025-11-21

{'n_estimators': 128, 'learning_rate': 0.051064816164656496, 'max_depth': 15}


[I 2025-11-21 23:28:59,754] A new study created in memory with name: no-name-389f3952-799d-4767-8463-007c9684da85


0.7715070809554005


[I 2025-11-21 23:29:03,014] Trial 0 finished with value: 0.637708729655464 and parameters: {'n_estimators': 331, 'max_depth': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.637708729655464.
[I 2025-11-21 23:29:03,894] Trial 1 finished with value: 0.6257662227858803 and parameters: {'n_estimators': 56, 'max_depth': 6, 'criterion': 'entropy'}. Best is trial 0 with value: 0.637708729655464.
[I 2025-11-21 23:29:08,899] Trial 2 finished with value: 0.6609596279856267 and parameters: {'n_estimators': 186, 'max_depth': 11, 'criterion': 'log_loss'}. Best is trial 2 with value: 0.6609596279856267.
[I 2025-11-21 23:29:13,224] Trial 3 finished with value: 0.6526104417670683 and parameters: {'n_estimators': 232, 'max_depth': 8, 'criterion': 'entropy'}. Best is trial 2 with value: 0.6609596279856267.
[I 2025-11-21 23:29:23,713] Trial 4 finished with value: 0.67712957091524 and parameters: {'n_estimators': 320, 'max_depth': 13, 'criterion': 'entropy'}. Best is trial 4 with value: 0.677129570

{'n_estimators': 320, 'max_depth': 13, 'criterion': 'entropy'}
0.6733248784612132


In [None]:
model = 