In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from safetensors.torch import save_file, load_file

from utils import (
    load_gpc_hierarchical_classifier, 
    gpc_hierarchical_classifier_train,
    gpc_hierarchical_classifier_inference, 
    load_embedding_model
)
from constants import (
    GPC_HIERARCHICAL_CLASSIFIER_CONFIG, 
    FULL_TRAIN_DATASET_PATH, 
    FULL_TEST_DATASET_PATH, 
    E5_LARGE_INSTRUCT_CONFIG_PATH
)

In [2]:
# Read the train and test CSVs (in that order), then stack them row-wise so
# that later cells can fit the label encoders on the union of both splits.
# The original row labels are kept (no index reset), so labels repeat in
# df_merged; later cells slice it positionally with .iloc.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (FULL_TRAIN_DATASET_PATH, FULL_TEST_DATASET_PATH)
)
df_merged = pd.concat([df_train, df_test], axis=0)

In [3]:
# One independent LabelEncoder per hierarchy level.
seg_encoder, fam_encoder, cls_encoder = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit each encoder on the merged (train + test) column and store the integer
# codes next to the original labels, in segment -> family -> class order.
for source_col, target_col, encoder in (
    ("segment", "encoded_segment", seg_encoder),
    ("family", "encoded_family", fam_encoder),
    ("class", "encoded_class", cls_encoder),
):
    df_merged[target_col] = encoder.fit_transform(df_merged[source_col].tolist())

In [11]:
def _parent_to_children(frame, parent_col, child_col):
    """Map every distinct value of ``parent_col`` to its distinct ``child_col`` values.

    Args:
        frame: DataFrame containing both columns.
        parent_col: Name of the column whose values become the dict keys.
        child_col: Name of the column whose unique values (per parent) become
            the list stored under each key, in order of first appearance.

    Returns:
        dict mapping ``int(parent)`` -> list of ``int(child)``. Keys and items
        are cast with ``int()`` so the result holds built-in ints (the
        mappings are written out with ``json.dump`` in a later cell).
    """
    children_by_parent = frame.groupby(parent_col)[child_col].unique()
    return {
        int(parent): [int(child) for child in children]
        for parent, children in children_by_parent.items()
    }


# Each mapping is built exactly once. The previous version computed both dicts
# twice and immediately overwrote the first result.
segment_to_families = _parent_to_children(df_merged, "encoded_segment", "encoded_family")
family_to_classes = _parent_to_children(df_merged, "encoded_family", "encoded_class")



In [13]:
import json

# Persist both hierarchy mappings as pretty-printed JSON in the current working
# directory. Note that json.dump serialises the integer dict keys as strings.
for json_path, mapping in (
    ("segment_to_families.json", segment_to_families),
    ("family_to_classes.json", family_to_classes),
):
    with open(json_path, "w") as f:
        json.dump(mapping, f, indent=4)

In [6]:
# Rows contributed by the training split. df_merged was built as
# concat([train, test]), so the first df_train_len rows are the training rows.
df_train_len = len(df_train)

# Split the merged frame (which now carries the encoded_* columns) back into
# train / test by position. .copy() makes each split an independent frame, so
# adding columns to it later does not raise SettingWithCopyWarning or touch
# df_merged.
df_train = df_merged.iloc[:df_train_len, :].copy()
df_test = df_merged.iloc[df_train_len:, :].copy()

In [7]:
# Target columns, ordered segment -> family -> class; each y row is therefore
# a 3-item list [segment_code, family_code, class_code].
label_columns = ["encoded_segment", "encoded_family", "encoded_class"]

# Raw product names as strings plus their encoded labels, per split.
# NOTE: X_train / X_test are rebound to precomputed embeddings in a later cell.
X_train = df_train["product_name"].astype(str).tolist()
y_train = df_train[label_columns].values.tolist()

X_test = df_test["product_name"].astype(str).tolist()
y_test = df_test[label_columns].values.tolist()

In [8]:
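# NOTE(review): loading the embedding model is disabled here; X_train / X_test
# are read from precomputed *.safetensors files in the next cell instead.
# Presumably this only needs re-enabling to regenerate embeddings (TODO confirm).
# embed_model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)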

In [9]:
# Replace the raw-text feature lists with the tensors stored under the
# "input" key of the precomputed safetensors files (paths are relative to the
# current working directory).
train_tensors = load_file("train_embeddings.safetensors")
X_train = train_tensors["input"]

test_tensors = load_file("test_embeddings.safetensors")
X_test = test_tensors["input"]

In [10]:
# Build the hierarchical classifier from its config, then move it to the GPU.
# The device string is hard-coded, so this cell requires a CUDA device.
gpc_model = load_gpc_hierarchical_classifier(GPC_HIERARCHICAL_CLASSIFIER_CONFIG)
gpc_model = gpc_model.to("cuda")

In [None]:
# Training hyper-parameters, named so they are easy to spot and tune.
EPOCHS = 500
LEARNING_RATE = 0.01

# Train the classifier; the call returns the model and a `best_state` object.
# NOTE(review): the test split is passed into training as x_test / y_test.
# If it is used to pick `best_state`, reported test metrics will be
# optimistic -- TODO confirm against the implementation in utils.
model, best_state = gpc_hierarchical_classifier_train(
    model=gpc_model,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    epochs=EPOCHS,
    lr=LEARNING_RATE,
)

In [13]:
# Display the value stored under "epoch" in the best_state returned by training.
# NOTE(review): presumably the epoch at which the best checkpoint was taken; if
# it equals the configured epoch count, training may not have converged yet --
# TODO confirm what gpc_hierarchical_classifier_train records here.
best_state["epoch"]

500

In [20]:
# Put the test embeddings on the same device as the trained model before
# running inference.
target_device = model.device
X_test = X_test.to(target_device)

In [21]:
# Run the trained model on the test embeddings.
# NOTE(review): the next cell indexes y_pred[0], y_pred[1] and y_pred[2] as the
# segment, family and class predictions; the exact return type of the inference
# helper is defined in utils -- TODO confirm.
y_pred = gpc_hierarchical_classifier_inference(model, X_test)

In [29]:
# Attach the per-level predictions to the test frame. y_pred is indexed as
# [0] segment, [1] family, [2] class.
# DataFrame.assign builds a new frame rather than setting columns on df_test
# in place. df_test is an .iloc slice of df_merged, so in-place column
# assignment can raise SettingWithCopyWarning.
df_test = df_test.assign(
    pred_segment=y_pred[0].tolist(),
    pred_family=y_pred[1].tolist(),
    pred_class=y_pred[2].tolist(),
)