In [6]:
from sklearn.preprocessing import LabelEncoder

# Closed set of content categories the classifier is trained to predict.
labels = [
    "entertainment", "food", "travel",
    "health and lifestyle", "mom and children",
    "fashion", "tech", "sports", "art", "gaming",
]

# Fit once on the full label vocabulary so the label -> integer mapping is
# fixed up front and does not depend on which labels occur in a given dataset.
label_encoder = LabelEncoder().fit(labels)

In [7]:
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from consts import *
from sklearn.decomposition import PCA


# Build the training matrices: target-encoded category features followed by
# the embedding vector of each user, plus the integer class target.

# Path of the embeddings parquet, resolved by the `consts` helper from the
# chosen split / embedding provider / averaging method.
FILENAME = get_filename(TT.ALL, EMBED_PROVIDER.OPENAI, MEAN_METHOD.CENTROID)
print(FILENAME)

df_embeds = pd.read_parquet(FILENAME)
df_train_all = pd.read_parquet("data/training-dataset.parquet")

# Attach category and label to every embedding row by username.
# NOTE(review): a left join duplicates embedding rows if `username` is not
# unique in the training dataset - confirm it is a unique key there.
df_train = pd.merge(
    df_embeds,
    df_train_all[["username", "category_enum", "label"]],
    how="left",
    on="username",
)
# Rows without a label cannot be used for supervised training.
df_train = df_train.dropna(subset=["label"])
df_train = df_train.drop(columns=["username"])
# Treat an absent category as its own explicit category value.
df_train["category_enum"] = df_train["category_enum"].fillna("Missing")

# Map the string labels to the integer ids of the pre-fitted label encoder.
df_train["target"] = label_encoder.transform(df_train["label"])
df_train = df_train.drop(columns=["label"])

X_train = df_train.drop(columns=["target"])
y_train = df_train[["target"]]

# fit_transform uses shuffled internal cross fitting; pinning random_state
# makes the encoded features (and therefore the trained model) reproducible
# across kernel restarts instead of changing on every run.
encoder = TargetEncoder(random_state=42)
cat_encoded_train = encoder.fit_transform(
    X_train["category_enum"].to_numpy().reshape(-1, 1), y_train["target"].tolist()
)

# Final feature layout: encoded category columns first, then one column per
# embedding dimension. Both frames are freshly built, so they share a clean
# 0..n-1 index and concatenate row-by-row positionally.
X_train = pd.concat(
    [
        pd.DataFrame(cat_encoded_train),
        pd.DataFrame(X_train["embeds_wavg"].tolist()),
    ],
    axis=1,
)

X_train, y_train = (
    X_train.to_numpy(),
    y_train.to_numpy(),
)

data/embeddings/openai-t3-large/ue_em_w_centroid.parquet


In [8]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Wrap the prepared feature matrix and integer targets for XGBoost's native API.
dtrain = xgb.DMatrix(X_train, label=y_train)

# Populated by xgb.train with the per-round history of every eval metric.
evals_result = {}

# NOTE(review): the original note here read "OpenAI 73" - presumably a ~73%
# score obtained with the OpenAI embeddings; confirm what it was measured on.
params = dict(
    objective="multi:softmax",
    num_class=10,  # one class per entry in `labels`
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    reg_alpha=0,
    device="cuda",
    random_state=42,
    eval_metric=["mlogloss", "merror"],
)

# The only watchlist entry is the training set itself, so the logged
# mlogloss / merror are training metrics, not a generalization estimate.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=150,
    verbose_eval=True,
    evals_result=evals_result,
    evals=[(dtrain, "train")],
)



[0]	train-mlogloss:2.06536	train-merror:0.27883
[1]	train-mlogloss:1.88604	train-merror:0.22336
[2]	train-mlogloss:1.74147	train-merror:0.19307
[3]	train-mlogloss:1.62067	train-merror:0.17810
[4]	train-mlogloss:1.51178	train-merror:0.17117
[5]	train-mlogloss:1.41825	train-merror:0.15876
[6]	train-mlogloss:1.33341	train-merror:0.15255
[7]	train-mlogloss:1.25504	train-merror:0.14307
[8]	train-mlogloss:1.18629	train-merror:0.13686
[9]	train-mlogloss:1.12302	train-merror:0.13139
[10]	train-mlogloss:1.06377	train-merror:0.12372
[11]	train-mlogloss:1.00925	train-merror:0.12007
[12]	train-mlogloss:0.95887	train-merror:0.11350
[13]	train-mlogloss:0.91269	train-merror:0.10766
[14]	train-mlogloss:0.86935	train-merror:0.09964
[15]	train-mlogloss:0.82911	train-merror:0.09927
[16]	train-mlogloss:0.79020	train-merror:0.09489
[17]	train-mlogloss:0.75509	train-merror:0.09161
[18]	train-mlogloss:0.72275	train-merror:0.08650
[19]	train-mlogloss:0.69059	train-merror:0.08212
[20]	train-mlogloss:0.66114	tr

In [9]:
import os

# Persist the trained booster as JSON. The output directory is created first
# so the save does not fail on a fresh checkout where it does not exist yet.
# NOTE(review): only the booster is written here; the fitted TargetEncoder
# (`encoder`) and `label_encoder` are needed to rebuild features and decode
# predictions at inference time - confirm they are persisted elsewhere.
model_path = 'trained_models/xgb_openai_centroid.json'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save_model(model_path)