In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from scipy.sparse import hstack
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from model_selection_functions import train_models, prepare_train_test



  if not hasattr(np, "object"):


In [2]:
df = pd.read_csv('../data/youtube_comments_preprocessed_v2.csv')

In [8]:
df.isna().sum()

CommentText       0
Sentiment         0
Likes             0
Comment_Length    0
Month             0
DayOfWeek         0
Hour              0
IsWeekend         0
dtype: int64

In [3]:
df.dropna(inplace=True)

In [5]:
df_small = df.sample(50000, random_state=42)

In [4]:
from sklearn.model_selection import train_test_split

In [21]:
df_small.isna().sum()

CommentText       0
Sentiment         0
Likes             0
Replies           0
Comment_Length    0
Month             0
DayOfWeek         0
Hour              0
IsWeekend         0
dtype: int64

In [15]:
X_train, X_test, y_train, y_test, tfidf = prepare_train_test(df_small)

In [6]:
from sklearn.svm import LinearSVC

In [24]:
lr = LogisticRegression(
    max_iter=4000,
    solver="saga",
    penalty="l2",
    C=2.0,
    class_weight="balanced"
)

svc = LinearSVC(
    C=1.0,
    class_weight="balanced"
)

lgbm = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    objective="multiclass",
    num_class=3,
    random_state=42
)

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

list_of_models = [lr, svc, lgbm, xgb,]

In [25]:
best_models = train_models(list_of_models, X_train, X_test,y_train ,y_test)

===== LogisticRegression =====




Accuracy   : 0.3815
Macro F1   : 0.3354
Confusion matrix:
[[1263  112 2030]
 [ 851  276 2145]
 [ 822  225 2276]]
Classification report:
              precision    recall  f1-score   support

           0       0.43      0.37      0.40      3405
           1       0.45      0.08      0.14      3272
           2       0.35      0.68      0.47      3323

    accuracy                           0.38     10000
   macro avg       0.41      0.38      0.34     10000
weighted avg       0.41      0.38      0.34     10000

------------------------------------------------------------
===== LinearSVC =====
Accuracy   : 0.6431
Macro F1   : 0.6430
Confusion matrix:
[[2208  778  419]
 [ 789 1929  554]
 [ 448  581 2294]]
Classification report:
              precision    recall  f1-score   support

           0       0.64      0.65      0.64      3405
           1       0.59      0.59      0.59      3272
           2       0.70      0.69      0.70      3323

    accuracy                           0.64   



Accuracy   : 0.6828
Macro F1   : 0.6830
Confusion matrix:
[[2325  662  418]
 [ 587 2193  492]
 [ 420  593 2310]]
Classification report:
              precision    recall  f1-score   support

           0       0.70      0.68      0.69      3405
           1       0.64      0.67      0.65      3272
           2       0.72      0.70      0.71      3323

    accuracy                           0.68     10000
   macro avg       0.68      0.68      0.68     10000
weighted avg       0.68      0.68      0.68     10000

------------------------------------------------------------
===== XGBClassifier =====
Accuracy   : 0.6751
Macro F1   : 0.6760
Confusion matrix:
[[2263  776  366]
 [ 530 2296  446]
 [ 396  735 2192]]
Classification report:
              precision    recall  f1-score   support

           0       0.71      0.66      0.69      3405
           1       0.60      0.70      0.65      3272
           2       0.73      0.66      0.69      3323

    accuracy                           0.6

In [26]:
df_small2=df_small.copy()
df_small2["Likes"] = np.log1p(df["Likes"])
df_small2["Replies"] = np.log1p(df["Replies"])

In [27]:
X_train2, X_test2, y_train2, y_test2, tfidf2 = prepare_train_test(df_small2)

In [28]:
lr = LogisticRegression(
    solver="saga",
    max_iter=8000,
    C=2.0,
    class_weight="balanced"
)

In [29]:
lr.fit(X_train2, y_train2)
y_pred =  lr.predict(X_test)
print(classification_report(y_test2, y_pred))
print("Accuracy:", accuracy_score(y_test2, y_pred))
print("F1 Score:", f1_score(y_test2, y_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test2, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.55      0.56      3405
           1       0.55      0.48      0.51      3272
           2       0.57      0.67      0.62      3323

    accuracy                           0.57     10000
   macro avg       0.57      0.57      0.57     10000
weighted avg       0.57      0.57      0.57     10000

Accuracy: 0.5688
F1 Score: 0.5661226719642591
Confusion Matrix:
 [[1870  726  809]
 [ 838 1575  859]
 [ 514  566 2243]]


In [30]:
svc.fit(X_train2, y_train2)
y_pred = svc.predict(X_test2)
print(classification_report(y_test2, y_pred))
print("Accuracy:", accuracy_score(y_test2, y_pred))
print("F1 Score:", f1_score(y_test2, y_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test2, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.66      0.65      3405
           1       0.59      0.60      0.59      3272
           2       0.72      0.69      0.71      3323

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000

Accuracy: 0.6501
F1 Score: 0.6507267045731183
Confusion Matrix:
 [[2241  803  361]
 [ 788 1965  519]
 [ 448  580 2295]]


In [31]:
lgbm = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    random_state=42,
    n_estimators=1500 
)

In [32]:
param_dist = {
    "learning_rate": [0.02, 0.03, 0.05, 0.08, 0.1],
    "num_leaves": [31, 63, 127],
    "min_data_in_leaf": [20, 30, 50, 80, 120],
    "feature_fraction": [0.6, 0.7, 0.8, 0.9, 1.0],
    "bagging_fraction": [0.6, 0.7, 0.8, 0.9, 1.0],
    "bagging_freq": [0, 5, 10],
    "lambda_l2": [0.0, 0.5, 1.0, 2.0, 5.0]
}

In [33]:
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=25, 
    scoring="f1_macro",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)

print("Best Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)
best_lgbm = search.best_estimator_

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.178831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 173743
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 5985
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792
Best Macro F1: 0.6731740289148461
Best Params: {'num_leaves': 63, 'min_data_in_leaf': 20, 'learning_rate': 0.02, 'lambda_l2': 5.0, 'feature_fraction': 0.7, 'bagging_freq': 10, 'bagging_fraction': 0.6}


In [34]:
y_pred = best_lgbm.predict(X_test)
print("Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test, y_pred))



Macro F1: 0.6906568998178252
Accuracy: 0.6905


In [7]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [8]:
from model_selection_functions import prepare_train_test_delete_stop_words

In [9]:
X_train3, X_test3, y_train3, y_test3, tfidf3 = prepare_train_test_delete_stop_words(df_small)

In [7]:
best_lgbm = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    random_state=42,
    n_estimators=1500,
    #best params from RandomizedSearch
    num_leaves=63,
    min_data_in_leaf=20,
    learning_rate=0.02,
    reg_lambda=5.0,
    feature_fraction=0.7,
    bagging_freq=10,
    bagging_fraction=0.6
)

In [11]:
best_lgbm.fit(X_train3, y_train3)
y_pred = best_lgbm.predict(X_test3)
print("Macro F1:", f1_score(y_test3, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test3, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83606
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 3094
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792




Macro F1: 0.6750780022787247
Accuracy: 0.6744


In [13]:
print(confusion_matrix(y_test3, y_pred))

[[2302  730  373]
 [ 553 2183  536]
 [ 363  701 2259]]


In [12]:
df_small['Sentiment'].value_counts()

Sentiment
0    17023
2    16614
1    16363
Name: count, dtype: int64

In [21]:
from preprocessing_functions import lemmatize_spacy_series

In [23]:
df_lemma=df_small.copy()
df_lemma['CommentText'] = lemmatize_spacy_series(df_lemma['CommentText'].tolist())

In [24]:
X_train_lemma, X_test_lemma, y_train_lemma, y_test_lemma, tfidf_lemma = prepare_train_test_delete_stop_words(df_lemma)

In [15]:
best_lgbm_lemma = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    random_state=42,
    n_estimators=1500,
    #best params from RandomizedSearch
    num_leaves=63,
    min_data_in_leaf=20,
    learning_rate=0.02,
    reg_lambda=5.0,
    feature_fraction=0.7,
    bagging_freq=10,
    bagging_fraction=0.6
)

In [26]:
best_lgbm_lemma.fit(X_train_lemma, y_train_lemma)
y_pred = best_lgbm_lemma.predict(X_test_lemma)
print("Macro F1:", f1_score(y_test_lemma, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test_lemma, y_pred))
print(confusion_matrix(y_test_lemma, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79981
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 2770
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792




Macro F1: 0.6721684296283601
Accuracy: 0.6716
[[2296  734  375]
 [ 554 2176  542]
 [ 398  681 2244]]


In [7]:
from model_selection_functions import prepare_train_test_v2

In [8]:
X_train4, X_test4, y_train4, y_test4, tfidf4 = prepare_train_test_v2(df_small)

In [10]:
best_lgbm.fit(X_train4, y_train4)
y_pred = best_lgbm.predict(X_test4)
print("Macro F1:", f1_score(y_test4, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test4, y_pred))
print(confusion_matrix(y_test4, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.020596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1290402
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 23452
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792




Macro F1: 0.6991476078522595
Accuracy: 0.6992
[[2433  613  359]
 [ 555 2191  526]
 [ 386  569 2368]]


In [1]:
from model_selection_functions import prepare_train_test_v3

In [9]:
X_train4, X_test4, y_train4, y_test4, tfidf4 = prepare_train_test_v3(df_small)

In [10]:
best_lgbm.fit(X_train4, y_train4)
y_pred = best_lgbm.predict(X_test4)
print("Macro F1:", f1_score(y_test4, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test4, y_pred))
print(confusion_matrix(y_test4, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.195608 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1290427
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 23452
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792




Macro F1: 0.6978756653182788
Accuracy: 0.6979
[[2426  620  359]
 [ 558 2188  526]
 [ 385  573 2365]]


In [1]:
from model_selection_functions import prepare_train_test_only_text

In [8]:
X_train5, X_test5, y_train5, y_test5, tfidf5 = prepare_train_test_only_text(df_small)

lr = LogisticRegression(
    solver="saga",
    max_iter=8000,
    C=2.0,
    class_weight="balanced"
)

svc = LinearSVC(
    C=1.0,
    class_weight="balanced"
)

In [9]:
lr.fit(X_train5, y_train5)
y_pred =  lr.predict(X_test5)
print(classification_report(y_test5, y_pred))
print("Accuracy:", accuracy_score(y_test5, y_pred))
print("F1 Score:", f1_score(y_test5, y_pred, average='macro'))


              precision    recall  f1-score   support

           0       0.66      0.68      0.67      3405
           1       0.60      0.62      0.61      3272
           2       0.73      0.68      0.70      3323

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000

Accuracy: 0.6577
F1 Score: 0.658101118548118


In [10]:
svc.fit(X_train5, y_train5)
y_pred = svc.predict(X_test5)
print(classification_report(y_test5, y_pred))
print("Accuracy:", accuracy_score(y_test5, y_pred))
print("F1 Score:", f1_score(y_test5, y_pred, average='macro'))

              precision    recall  f1-score   support

           0       0.64      0.66      0.65      3405
           1       0.59      0.59      0.59      3272
           2       0.71      0.69      0.70      3323

    accuracy                           0.65     10000
   macro avg       0.65      0.64      0.64     10000
weighted avg       0.65      0.65      0.65     10000

Accuracy: 0.645
F1 Score: 0.6448429536459587


Permutation Importance

In [11]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer

In [14]:
y = df_small["Sentiment"]
df_train, df_test, y_train, y_test = train_test_split(df_small, y, test_size=0.2, stratify=y, random_state=42)
numeric_features = ["Likes", "Replies", "Comment_Length", "Month", "DayOfWeek", "Hour", "IsWeekend"]

In [18]:
X_other_train = df_train[numeric_features].astype(float)
X_other_test  = df_test[numeric_features].astype(float)

best_lgbm.fit(X_other_train, y_train)

f1_macro = make_scorer(f1_score, average="macro")

perm = permutation_importance(
    best_lgbm,
    X_other_test,
    y_test,
    scoring=f1_macro,
    n_repeats=5,
    random_state=42,
    n_jobs=-1
)

imp_df = pd.DataFrame({
    "feature": numeric_features,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False).reset_index(drop=True)

print(imp_df.head(20))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 676
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 7
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792
          feature  importance_mean  importance_std
0           Month         0.066638        0.001661
1           Likes         0.056785        0.004278
2       DayOfWeek         0.055044        0.003073
3  Comment_Length         0.034385        0.001756
4            Hour         0.020038        0.001780
5         Replies         0.009483        0.001611
6       IsWeekend         0.000277        0.001092


Trying Keras for better results

In [8]:
import tensorflow as tf
from keras import layers
from sklearn.model_selection import train_test_split
from tensorflow import keras

from model_selection_functions import prepare_text_datasets

In [7]:
train_ds, test_ds, vectorizer = prepare_text_datasets(df_small)

In [9]:
inputs = keras.Input(shape=(1,), dtype=tf.string)
x = vectorizer(inputs)
x = layers.Embedding(40000, 128)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(3, activation="softmax")(x)

model = keras.Model(inputs, outputs)

model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [10]:
from keras.callbacks import EarlyStopping

callbacks = [EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)]

model.fit(train_ds,validation_data=test_ds,epochs=6,callbacks=callbacks)





Epoch 1/6
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 63ms/step - accuracy: 0.5442 - loss: 0.9447 - val_accuracy: 0.6451 - val_loss: 0.8145
Epoch 2/6
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 63ms/step - accuracy: 0.7014 - loss: 0.7065 - val_accuracy: 0.6474 - val_loss: 0.8122
Epoch 3/6
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.7907 - loss: 0.5294 - val_accuracy: 0.6320 - val_loss: 0.8988
Epoch 4/6
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.8630 - loss: 0.3732 - val_accuracy: 0.6144 - val_loss: 1.0388


<keras.src.callbacks.history.History at 0x1838aa4d550>

In [12]:
y_pred_proba = model.predict(test_ds)
y_pred = np.argmax(y_pred_proba, axis=1)

y_true = np.concatenate([y for x, y in test_ds], axis=0)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Macro F1:", f1_score(y_true, y_pred, average="macro"))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
Accuracy: 0.6474
Macro F1: 0.6462212320354186

Confusion Matrix:
[[2183  751  471]
 [ 765 1888  619]
 [ 437  483 2403]]


Trying two_channel for lgbm

In [1]:
from model_selection_functions import prepare_train_test_two_channel

  if not hasattr(np, "object"):


In [9]:
X_train, X_test, y_train, y_test, (tfidf_word, tfidf_char) = prepare_train_test_two_channel(df_small)

In [10]:
best_lgbm.fit(X_train, y_train)

y_pred = best_lgbm.predict(X_test)

print("Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.200898 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372328
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 26503
[LightGBM] [Info] Start training from score -1.077487
[LightGBM] [Info] Start training from score -1.116954
[LightGBM] [Info] Start training from score -1.101792




Macro F1: 0.695601707801284
Accuracy: 0.6957
[[2427  610  368]
 [ 564 2175  533]
 [ 393  575 2355]]


In [7]:
df.drop(columns=['Replies'],inplace=True)


In [6]:
from model_selection_functions import prepare_train_test_v3_for_train

In [7]:
X_train4, X_test4, y_train4, y_test4, tfidf4 = prepare_train_test_v3_for_train(df)

In [8]:
best_lgbm = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    random_state=42,
    n_estimators=5000,
    n_jobs=-1,

    num_leaves=63,
    min_data_in_leaf=20,
    learning_rate=0.02,
    reg_lambda=5.0,
    feature_fraction=0.7,
    bagging_freq=10,
    bagging_fraction=0.6
)

In [9]:
best_lgbm.fit(X_train4, y_train4,eval_set=[(X_test4, y_test4)],eval_metric="multi_logloss",callbacks=[lgb.early_stopping(50, verbose=True)])
y_pred = best_lgbm.predict(X_test4,num_iteration=best_lgbm.best_iteration_)
print("Macro F1:", f1_score(y_test4, y_pred, average="macro"))
print("Accuracy:", accuracy_score(y_test4, y_pred))
print(confusion_matrix(y_test4, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 49.690841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3708180
[LightGBM] [Info] Number of data points in the train set: 795083, number of used features: 50000
[LightGBM] [Info] Start training from score -1.082869
[LightGBM] [Info] Start training from score -1.110904
[LightGBM] [Info] Start training from score -1.102270
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's multi_logloss: 0.580502




Macro F1: 0.7508330271610036
Accuracy: 0.7507131321973527
[[51568 10516  5224]
 [ 9205 47683  8560]
 [ 6428  9618 49969]]


In [10]:
import joblib

In [12]:
bundle = {
    "model": best_lgbm,
    "tfidf": tfidf4,
}

joblib.dump(bundle, "../Models/sentiment_bundle_lgbm.joblib")

['../Models/sentiment_bundle_lgbm.joblib']