# LGBM - Training

Sources
1. [LGBM & Deberta Explained by ZULQARNAIN ALI](https://www.kaggle.com/code/zulqarnainalipk/lgbm-deberta-explained)

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt
import pickle as pkl
import wandb
import re

In [2]:
# Anchor on the directory the kernel started in, so that re-running this cell
# does not climb two more levels each time (a bare relative chdir is not
# idempotent). On a fresh kernel this is equivalent to os.chdir("../../").
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.model.utils import get_score
from lib.criterion.metrics import log_metrics

In [4]:
# Project helper from lib.utils.utils, called with its default arguments.
# Presumably fixes the RNG seeds for reproducibility — confirm in lib.
seed_everything()

### Setting Up WandB

In [5]:
# Weights & Biases project and run name passed to wandb.init.
WANDB_PROJECT = "Kaggle_ASE_2.0"
WANDB_NAME = "LGBM-ASE-1"  # plain literal: the f-prefix had no placeholders

In [6]:
# Authenticate with the key from the WANDB_API_KEY environment variable
# (os.environ.get yields None when the variable is unset).
wandb.login(key=os.environ.get('WANDB_API_KEY'))
# Start the training run, passing the project config as the run's config.
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_NAME,
    save_code=True,
    job_type="train",
    config=config,
)
# NOTE: this rebinds `config` from the object imported from lib.config to
# wandb.config; every later cell reads its settings through the wandb run.
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


## Global Definitions

In [7]:
# Directory holding the per-fold CSV inputs. The path is relative, so it
# resolves against the current working directory (changed by os.chdir earlier).
root_data_dir = "data/lgbm_deberta"

In [8]:
def read_data_part(fold, part):
    """Load the train/valid CSV pair for one (fold, part) split.

    Files are read from ``<root_data_dir>/fold_<fold>/part_<part>/`` and are
    named ``train_lgbm_<fold>_<part>.csv`` and ``valid_lgbm_<fold>_<part>.csv``.

    Returns:
        tuple: ``(train_df, valid_df)`` with the ``essay_id`` column removed
        from both frames.
    """
    split_dir = os.path.join(root_data_dir, f"fold_{fold}", f"part_{part}")
    train_path = os.path.join(split_dir, f"train_lgbm_{fold}_{part}.csv")
    valid_path = os.path.join(split_dir, f"valid_lgbm_{fold}_{part}.csv")

    raw_train = pd.read_csv(train_path)
    raw_valid = pd.read_csv(valid_path)

    # The essay identifier is not a model feature, so strip it from both frames.
    id_columns = ["essay_id"]
    return raw_train.drop(columns=id_columns), raw_valid.drop(columns=id_columns)

## Model Training

### Callbacks

In [9]:
from lightgbm import log_evaluation, early_stopping

In [10]:
# LightGBM training callbacks shared by every model.fit call in this notebook.
callbacks = [
    # Print the evaluation results every 50 boosting rounds.
    log_evaluation(period=50),
    # Stop when the validation score has not improved for 100 rounds,
    # considering only the first evaluation metric.
    early_stopping(stopping_rounds=100, first_metric_only=True),
]

### Scoring Functions

In [11]:
# Constants of the custom QWK objective, read from the run config:
#   a - offset subtracted from the raw scores before training and added back
#       to the predictions afterwards
#   b - constant added inside the denominator term of qwk_obj
a, b = config.lgbm_a, config.lgbm_b

In [12]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Custom LightGBM eval metric: quadratic weighted kappa on the original score scale.

    Labels and predictions are shifted back by adding ``a``; predictions are
    then clipped to [1, 6] and rounded before being scored.

    Returns:
        tuple: ``("QWK", score, True)`` - metric name, metric value and the
        higher-is-better flag.
    """
    restored_labels = y_true + a
    restored_preds = np.round(np.clip(y_pred + a, 1, 6))
    score = cohen_kappa_score(restored_labels, restored_preds, weights="quadratic")
    return "QWK", score, True


def qwk_obj(y_true, y_pred):
    """Custom LightGBM objective: a smooth surrogate used in place of QWK.

    With ``p`` the prediction shifted by ``a`` and clipped to [1, 6], and ``y``
    the label shifted by ``a``::

        f = 0.5 * sum((p - y)^2)
        g = 0.5 * sum((p - a)^2 + b)

    the returned gradient is ``(f' / g - f * g' / g^2)`` scaled by the number
    of samples.

    Returns:
        tuple: ``(grad, hess)``; the Hessian is a constant vector of ones.
    """
    n_samples = len(y_true)
    targets = y_true + a
    clipped = (y_pred + a).clip(1, 6)

    residual = clipped - targets  # derivative of f w.r.t. each prediction
    centered = clipped - a        # derivative of g w.r.t. each prediction

    numerator = 0.5 * np.sum(residual ** 2)
    denominator = 0.5 * np.sum(centered ** 2 + b)

    grad = (residual / denominator - numerator * centered / denominator**2) * n_samples
    hess = np.ones(n_samples)
    return grad, hess

### Training Loop

In [13]:
def get_feature_and_labels(fold, part):
    """Split the (fold, part) train/valid frames into features and targets.

    Returns:
        tuple: ``(X_train, y_train, X_valid, y_valid)`` where each target is
        the ``score`` column and each feature frame is every other column.
    """
    target = "score"
    pieces = []
    for frame in read_data_part(fold, part):
        pieces.append(frame.drop(columns=[target]))
        pieces.append(frame[target])

    X_train, y_train, X_valid, y_valid = pieces
    return X_train, y_train, X_valid, y_valid

In [14]:
def train_loop(fold, part):
    """Train one LightGBM regressor on a (fold, part) split and log its validation scores.

    The targets are shifted by ``-a`` before fitting; predictions are shifted
    back, clipped to [1, 6] and rounded before scoring. Feature importances are
    pickled to ``output/LGBM/feature_importance/<fold>_<part>.pkl`` and the
    weighted F1 / quadratic weighted kappa are logged to wandb.

    Args:
        fold: Fold index, used to locate the data and to name the artefacts.
        part: Part index within the fold.

    Returns:
        The fitted ``lgb.LGBMRegressor``.
    """
    X_train, y_train, X_valid, y_valid = get_feature_and_labels(fold, part)
    y_valid_int = y_valid.astype(np.int64)
    y_train = y_train.astype(np.float32) - a
    y_valid = y_valid.astype(np.float32) - a

    # Create the artefact directory up front: without it, a missing folder would
    # only surface after training, when the importances are written below.
    importance_dir = "output/LGBM/feature_importance"
    os.makedirs(importance_dir, exist_ok=True)

    # NOTE(review): class_weight="balanced" is passed to a regressor that uses a
    # custom objective — confirm it has the intended effect.
    model = lgb.LGBMRegressor(
        objective=qwk_obj,
        metrics="None",
        learning_rate=0.01,
        max_depth=5,
        num_leaves=10,
        colsample_bytree=0.3,
        reg_alpha=0.7,
        reg_lambda=0.1,
        n_estimators=1000,
        random_state=config.random_seed,
        extra_trees=True,
        class_weight="balanced",
        n_jobs=6,
        verbosity=0,
    )

    # Train model; the first metric (QWK) drives early stopping via `callbacks`.
    predictor = model.fit(
        X_train,
        y_train,
        eval_names=["train", "valid"],
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric=quadratic_weighted_kappa,
        callbacks=callbacks,
    )

    # Validate model on the original score scale.
    preds = predictor.predict(X_valid)
    preds = preds + a
    preds = preds.clip(1, 6).round()
    f1 = f1_score(y_valid_int, preds, average="weighted")
    qwk = cohen_kappa_score(y_valid_int, preds, weights="quadratic")

    # Persist a {feature name: importance} mapping for later inspection.
    with open(f"{importance_dir}/{fold}_{part}.pkl", "wb") as file:
        pkl.dump(dict(zip(model.feature_name_, model.feature_importances_)), file)

    wandb.log({f"eval/f1_f{fold}_p{part}": f1, f"eval/qwk_f{fold}_p{part}": qwk})

    return predictor

In [15]:
def get_valid_data(fold):
    """Load the fold-level validation set.

    Reads ``<root_data_dir>/fold_<fold>/lgbm_valid_<fold>.csv``.

    Args:
        fold: Fold index used to build the file path.

    Returns:
        tuple: ``(X_valid, y_valid_int)`` - every column except ``score`` and
        ``essay_id``, and the ``score`` column cast to int64.
    """
    valid_df = pd.read_csv(
        os.path.join(root_data_dir, f"fold_{fold}/lgbm_valid_{fold}.csv"),
    )

    X_valid = valid_df.drop(columns=["score", "essay_id"])
    y_valid_int = valid_df["score"].astype(np.int64)
    return X_valid, y_valid_int

In [16]:
def ensemble_prediction(fold, fold_output_dir):
    """Combine the per-part models of one fold by taking the median prediction.

    Each ``part_<i>.pkl`` model in ``fold_output_dir`` predicts the fold's
    validation set; predictions are shifted by ``a``, clipped to [1, 6] and
    rounded, then the element-wise median across parts is cast to int64.

    Args:
        fold: Fold index whose validation set is scored.
        fold_output_dir: Directory containing the pickled per-part models.

    Returns:
        tuple: ``(all_preds, y_valid_int)`` - the ensembled integer predictions
        and the integer validation targets.
    """
    # The validation set is the same for every part, so read it once instead of
    # re-parsing the CSV on each iteration.
    X_valid, y_valid_int = get_valid_data(fold)

    all_preds = []
    for part in range(config.n_folds):
        # NOTE: unpickling executes arbitrary code; these files are expected to
        # be the models this notebook itself wrote for the fold.
        with open(os.path.join(fold_output_dir, f"part_{part}.pkl"), "rb") as file:
            predictor = pkl.load(file)

        # The file handle is no longer needed once the model is loaded.
        preds = predictor.predict(X_valid) + a
        all_preds.append(preds.clip(1, 6).round())

    all_preds = np.median(np.array(all_preds), axis=0).astype(np.int64)

    return all_preds, y_valid_int

### Fold Loop

In [17]:
# For every fold: train one model per part, persist it, then score the median
# ensemble of the parts on the fold's validation set.
for fold in range(config.n_folds):
    fold_output_dir = os.path.join("output/LGBM", f"fold_{fold}")
    # exist_ok replaces the separate existence check and its check-then-create race.
    os.makedirs(fold_output_dir, exist_ok=True)

    # The number of parts per fold is taken from config.n_folds as well.
    for part in range(config.n_folds):
        predictor = train_loop(fold, part)

        with open(os.path.join(fold_output_dir, f"part_{part}.pkl"), "wb") as file:
            pkl.dump(predictor, file)

    all_preds, y_valid_int = ensemble_prediction(fold, fold_output_dir)

    f1 = f1_score(y_valid_int, all_preds, average="weighted")
    qwk = cohen_kappa_score(y_valid_int, all_preds, weights="quadratic")

    print(f"All Parts\t\tF1 Score: {f1:<8.7f}\t\tQWK Score: {qwk:>8.7f}")

    wandb.log({f"eval/f1_f{fold}": f1, f"eval/qwk_f{fold}": qwk})

    # log_metrics receives both columns shifted down by one.
    log_metrics(
        pd.DataFrame(
            {
                "score": y_valid_int - 1,
                "pred_score": all_preds.flatten().astype(np.int64) - 1,
            }
        ),
        f"Fold {fold}",
    )

[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 100 rounds
[50]	train's QWK: 0.654391	valid's QWK: 0.647706
[100]	train's QWK: 0.727518	valid's QWK: 0.717295
[150]	train's QWK: 0.764087	valid's QWK: 0.754369
[200]	train's QWK: 0.787268	valid's QWK: 0.775318
[250]	train's QWK: 0.797778	valid's QWK: 0.789897
[300]	train's QWK: 0.805545	valid's QWK: 0.794394
[350]	train's QWK: 0.810216	valid's QWK: 0.798039
[400]	train's QWK: 0.814032	valid's QWK: 0.799426
[450]	train's QWK: 0.817477	valid's QWK: 0.802546
[500]	train's QWK: 0.820039	valid's QWK: 0.803386
[550]	train's QWK: 0.822303	valid's QWK: 0.803783
[600]	train's QWK: 0.82466	valid's QWK: 0.805804
[650]	train's QWK: 0.826124	valid's QWK: 0.806403
[700]	train's QWK: 0.827731	valid's QWK: 0.807483
[750]	train's QWK: 0.829286	valid's QWK: 0.807738
[800]	train's QWK: 0.831891	valid's QWK: 0.808805
[850]	train's QWK: 0.833276	valid's QWK: 0.809648
[900]	train's QWK: 0.835963	valid'

In [18]:
# Mark the W&B run started by wandb.init as finished.
wandb.finish()

VBox(children=(Label(value='0.358 MB of 0.358 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1_f0,▁
eval/f1_f0_p0,▁
eval/f1_f0_p1,▁
eval/f1_f0_p2,▁
eval/f1_f0_p3,▁
eval/f1_f0_p4,▁
eval/f1_f1,▁
eval/f1_f1_p0,▁
eval/f1_f1_p1,▁
eval/f1_f1_p2,▁

0,1
eval/f1_f0,0.64132
eval/f1_f0_p0,0.63501
eval/f1_f0_p1,0.64265
eval/f1_f0_p2,0.6307
eval/f1_f0_p3,0.62486
eval/f1_f0_p4,0.63138
eval/f1_f1,0.65834
eval/f1_f1_p0,0.63548
eval/f1_f1_p1,0.63635
eval/f1_f1_p2,0.62956
