In [1]:
import pandas as pd
import lightgbm as lgb

In [2]:
# Relative path to the dataset file, resolved against the current working directory.
# NOTE(review): the ".gzip" suffix presumably denotes a gzip-compressed parquet file — confirm.
data_path = './Data/data_lgbm_db1_mp1_pt1_aw1_tw8.gzip'
# Read the whole file into a single DataFrame; it is split by `prepare_data` below.
data_lgbm = pd.read_parquet(data_path)

def prepare_data(data: pd.DataFrame):
    """Partition ``data`` into train, validation, and test frames.

    Rows are selected by the value of the ``group`` column ("train",
    "valid", "test"). The ``group`` and ``id`` columns are then removed
    from each partition. The input frame itself is not modified.

    Args:
        data: DataFrame containing at least the ``group`` and ``id`` columns.

    Returns:
        A ``(train, valid, test)`` tuple of DataFrames.
    """
    bookkeeping_cols = ["group", "id"]

    def _subset(label):
        # Keep only the rows tagged with `label`, minus the bookkeeping columns.
        mask = data["group"] == label
        return data[mask].drop(bookkeeping_cols, axis=1)

    return _subset("train"), _subset("valid"), _subset("test")

# Split the loaded frame into the three partitions used by the cells below.
train_dataset, valid_dataset, test_dataset = prepare_data(data_lgbm)

In [3]:
# Load an already-saved LightGBM model from disk; no training is performed in this cell.
booster = lgb.Booster(
    model_file="./output/lgbm/model_lgbm_gbdt_db1_mp1_pt1_aw1_tw8.txt"
)

In [4]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


def get_X_y(df, target_col="target_mro"):
    """Split a DataFrame into a feature matrix and a target vector.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_col: Name of the target column. Defaults to ``"target_mro"``,
            the column this notebook predicts.

    Returns:
        ``(X, y)`` where ``X`` is a copy of ``df`` without ``target_col`` and
        ``y`` is the ``target_col`` Series.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y


# Separate the features (X_*) from the target (y_*) for each split.
X_train, y_train = get_X_y(train_dataset)
X_valid, y_valid = get_X_y(valid_dataset)
X_test, y_test = get_X_y(test_dataset)


def predict_and_eval(booster, X, y_true, dataset_name="dataset", threshold=0.5):
    """Score ``X`` with ``booster``, print classification metrics, and return them.

    Args:
        booster: Model object exposing ``predict(X)``. Its output is compared
            against ``threshold``, so it is assumed to yield one score per
            row (presumably the positive-class probability — confirm against
            the model's objective).
        X: Feature matrix passed unchanged to ``booster.predict``.
        y_true: Ground-truth labels; must expose ``.values`` (e.g. a pandas
            Series).
        dataset_name: Label shown in the header of the printed report.
        threshold: Scores greater than or equal to this value are predicted
            as class 1. Defaults to 0.5, the cut-off that was previously
            hard-coded.

    Returns:
        ``(acc, precision, recall, f1, result_df)`` where ``result_df`` is a
        DataFrame with the columns ``y_true``, ``y_prob`` and ``y_pred``.
    """
    y_prob = booster.predict(X)
    # Turn the raw scores into hard 0/1 labels at the chosen cut-off.
    y_pred = (y_prob >= threshold).astype(int)

    # zero_division=0 makes an undefined metric (no positive predictions or
    # no positive labels) come back as 0 instead of triggering a warning.
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"\nEvaluation on {dataset_name}:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    # Use the bare values of y_true so the rows line up positionally with the
    # prediction arrays rather than by the original index.
    result_df = pd.DataFrame(
        {"y_true": y_true.values, "y_prob": y_prob, "y_pred": y_pred}
    )

    return acc, precision, recall, f1, result_df


# Only the test split is evaluated here; the commented-out calls below can be
# re-enabled to print the same report for the other splits.
# predict_and_eval(booster, X_train, y_train, "Train Set")
# predict_and_eval(booster, X_valid, y_valid, "Validation Set")
# predict_and_eval(booster, X_test, y_test, "Test Set")
acc, precision, recall, f1, df_result = predict_and_eval(
    booster, X_test, y_test, "Test Set"
)


Evaluation on Test Set:
Accuracy:  0.7108
Precision: 0.0726
Recall:    0.4317
F1 Score:  0.1244
