In [1]:
import pandas as pd
from consts import *


df_embeds = pd.read_parquet(get_filename(TT.ALL, EMBED_PROVIDER.DISTILBERT, MEAN_METHOD.CENTROID))
df_train_all = pd.read_parquet('data/training-dataset.parquet')

data/embeddings/distilbert/uc_em_mean.parquet


In [2]:
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from sklearn.model_selection import train_test_split

df_train = pd.merge(df_embeds, df_train_all[['username','category_enum','label']], how='left', on='username')
df_train = df_train.dropna(subset=['label'])
df_train = df_train.drop(columns=['username'])
df_train['category_enum'] = df_train['category_enum'].fillna('Missing')



labels = ['entertainment', 'food', 'travel', 'health and lifestyle', 'mom and children', 'fashion', 'tech', 'sports', 'art', 'gaming']

label_encoder = LabelEncoder()
label_encoder.fit(labels)

df_train['target'] = label_encoder.transform(df_train['label'])
df_train = df_train.drop(columns=['label'])


X = df_train.drop(columns=['target'])
y = df_train[['target']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


encoder = TargetEncoder()
cat_encoded_train = encoder.fit_transform(X_train["category_enum"].to_numpy().reshape(-1,1), y_train['target'].tolist())
X_train['category_encoded'] = [row.tolist() for row in cat_encoded_train]
X_train = X_train[['embeds_wavg','category_encoded']]

X_train = pd.concat([pd.DataFrame(X_train['category_encoded'].tolist()), pd.DataFrame(X_train['embeds_wavg'].tolist())],axis=1)



cat_encoded_test = encoder.transform(X_test['category_enum'].to_numpy().reshape(-1,1))
X_test['category_encoded'] = [row.tolist() for row in cat_encoded_test]
X_test = pd.concat([pd.DataFrame(X_test['category_encoded'].tolist()), pd.DataFrame(X_test['embeds_wavg'].tolist())],axis=1)

X_train, X_test, y_train, y_test = X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()





In [3]:
import numpy as np
from lightgbm import LGBMClassifier, Dataset, train

train_data = Dataset(X_train, label=y_train)
test_data = Dataset(X_test, label=y_test)

print(test_data)

params = {
    'objective':'multiclass',
    'num_class':10,
    'n_estimator':200,
    'learning_rate':0.1,
    'random_state':42,
    'min_child_samples': 5,  # Try a smaller value (default is 20)
    'min_data_in_leaf': 5    # Similar parameter, also try adjusting
}



#model = LGBMClassifier(
#    objective='multiclass',
#    num_class=10,
#    n_estimators=200,
#    learning_rate=0.1,
#    random_state=42,
#)

#model = XGBClassifier(
#    objective='multi:softmax',
#    num_class=10,
#    learning_rate=0.01,  # Lower learning rate
#    max_depth=8,         # Try deeper trees
#    min_child_weight=3,  # Increase to prevent overfitting
#    n_estimators=200,    # More trees
#    subsample=0.8,       # Add some randomness
#    colsample_bytree=0.8,# Feature sampling
#    eval_metrix=['mlogloss','merror'],
#    early_stopping_rounds = 10,
#    random_state=42
#)

bst = train(params, train_data, 10, valid_sets=[test_data])


y_pred = bst.predict(X_test)
#y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_pred_classes

<lightgbm.basic.Dataset object at 0x75f0f0249eb0>
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 131238
[LightGBM] [Info] Number of data points in the train set: 1370, number of used features: 522
[LightGBM] [Info] Start training from score -2.700777
[LightGBM] [Info] Start training from score -2.110578
[LightGBM] [Info] Start training from score -2.232133
[LightGBM] [Info] Start training from score -1.725398
[LightGBM] [Info] Start training from score -5.836272
[LightGBM] [Info] Start training from score -1.658046
[LightGBM] [Info] Start training from score -2.840539
[LightGBM] [Info] Start training from score -3.290740
[LightGBM] [Info] Start training from score -2.024069
[LightGBM] [Info] Start training from score -2.252753






array([5, 5, 8, ..., 3, 5, 3])

In [4]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Print detailed metrics
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_classes))

# Calculate additional metrics
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_recall = recall_score(y_test, y_pred, average='macro')

print("\nAdditional Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall: {macro_recall:.4f}")

# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()




Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.23      0.31        99
           1       0.40      0.38      0.39       156
           2       0.66      0.61      0.63       152
           3       0.85      0.93      0.89       267
           4       0.00      0.00      0.00         9
           5       0.53      0.70      0.60       241
           6       0.60      0.36      0.45        69
           7       0.79      0.66      0.72        62
           8       0.65      0.70      0.67       165
           9       0.69      0.59      0.63       150

    accuracy                           0.63      1370
   macro avg       0.56      0.52      0.53      1370
weighted avg       0.63      0.63      0.62      1370



ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets