# LGBM - Training

Sources
1. [LGBM & Deberta Explained by ZULQARNAIN ALI](https://www.kaggle.com/code/zulqarnainalipk/lgbm-deberta-explained)

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import lightgbm as lgb
import pickle as pkl
import wandb
from lightgbm import log_evaluation, early_stopping
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

In [2]:
# Move the working directory two levels up so that the `lib.*` imports and the
# relative "output/..." paths used later resolve (presumably the repository root).
# NOTE: this is relative to the *current* directory, so re-running the cell
# without restarting the kernel moves up two more levels.
os.chdir(os.path.join(os.pardir, os.pardir))

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.criterion.metrics import log_metrics
from lib.model.utils import quadratic_weighted_kappa, qwk_obj, get_score
from lib.utils.find_threshold import find_thresholds

In [4]:
# Seed via the project helper (called with its defaults) before any data
# splitting or training happens. Which RNGs it seeds is defined in
# lib.utils.utils, not in this notebook.
seed_everything()

### Setting Up WandB

In [5]:
# Weights & Biases project that collects the runs, and the display name of this run.
WANDB_PROJECT = "Kaggle_ASE_2.0"
WANDB_NAME = "LGBM-ASE-13"

In [6]:
# Authenticate with the key taken from the WANDB_API_KEY environment variable
# (os.environ.get yields None when the variable is not set).
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Start the training run; the project config is attached to the run.
run_kwargs = {
    "project": WANDB_PROJECT,
    "name": WANDB_NAME,
    "job_type": "train",
    "save_code": True,
    "config": config,
}
wandb.init(**run_kwargs)

# From here on `config` refers to the run's config object rather than the
# object imported from lib.config. Re-running this cell therefore passes the
# already-rebound object back into wandb.init.
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


## Model Training

### Dataset Creation

In [7]:
# Load the CSV located at Paths.FEATURE_ENGG_CSV_PATH (presumably the output of
# the feature-engineering step) and display its (rows, columns) shape.
df = pd.read_csv(Paths.FEATURE_ENGG_CSV_PATH)
df.shape

(17307, 147)

In [8]:
# Separate the identifier and target columns from the feature matrix.
ID_COL, TARGET_COL = "essay_id", "score"

ids = df[ID_COL]
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL, ID_COL])

# The full frame is no longer needed once it has been split.
del df

X.shape, y.shape

((17307, 145), (17307,))

### Callbacks

In [9]:
# Print evaluation results every LOG_EVERY_N_ROUNDS boosting rounds, and stop
# once the first evaluation metric has not improved for EARLY_STOPPING_ROUNDS rounds.
LOG_EVERY_N_ROUNDS = 25
EARLY_STOPPING_ROUNDS = 75

callbacks = [
    log_evaluation(period=LOG_EVERY_N_ROUNDS),
    early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, first_metric_only=True),
]

### Constants

In [10]:
# `a` is the offset subtracted from the targets in train_loop and added back to
# the predictions in validate_model.
a = config.lgbm_a
# `b` is read from the config but not referenced by any cell visible in this
# notebook; presumably it pairs with `a` inside the QWK objective — TODO confirm.
b = config.lgbm_b

### Training Loop

In [11]:
def train_loop(X_train, y_train, X_valid, y_valid):
    """Fit one LightGBM regressor for a single cross-validation fold.

    Both target vectors are cast to float32 and shifted down by the
    notebook-level constant ``a`` before fitting (``validate_model`` adds ``a``
    back to the predictions). Training uses the custom ``qwk_obj`` objective,
    reports ``quadratic_weighted_kappa`` on the train and validation sets, and
    runs the notebook-level ``callbacks``.

    Args:
        X_train: Training features.
        y_train: Training targets (unshifted).
        X_valid: Validation features.
        y_valid: Validation targets (unshifted).

    Returns:
        tuple: ``(model, predictor)`` where ``model`` is the ``LGBMRegressor``
        and ``predictor`` is whatever ``model.fit`` returned.
    """
    shifted_train = y_train.astype(np.float32) - a
    shifted_valid = y_valid.astype(np.float32) - a

    params = dict(
        objective=qwk_obj,
        metrics="None",  # the string "None", not the None object
        learning_rate=0.01,
        max_depth=5,
        num_leaves=10,
        colsample_bytree=0.3,
        reg_alpha=0.7,
        reg_lambda=0.1,
        n_estimators=700,
        random_state=config.random_seed,
        extra_trees=True,
        class_weight="balanced",
        n_jobs=6,
        verbosity=0,
    )
    model = lgb.LGBMRegressor(**params)

    # Evaluate on both splits so the log shows train and valid QWK side by side.
    eval_pairs = [(X_train, shifted_train), (X_valid, shifted_valid)]
    predictor = model.fit(
        X_train,
        shifted_train,
        eval_set=eval_pairs,
        eval_names=["train", "valid"],
        eval_metric=quadratic_weighted_kappa,
        callbacks=callbacks,
    )

    return model, predictor

In [12]:
def validate_model(idx, X_valid, y_valid, id_valid, predictor):
    """Build and log the out-of-fold prediction table for one fold.

    Predictions are shifted back up by the notebook-level constant ``a``; the
    hard prediction is that raw value clipped to [1, 6] and rounded. In the
    returned frame ``score`` and ``pred_score`` are stored minus 1, while
    ``raw`` keeps the unshifted, unclipped prediction.

    Args:
        idx: Fold number, used only in the ``log_metrics`` label.
        X_valid: Validation features.
        y_valid: Validation targets.
        id_valid: Essay ids aligned with ``X_valid``.
        predictor: Fitted object exposing ``predict``.

    Returns:
        pandas.DataFrame with columns ``essay_id``, ``score``, ``pred_score``
        and ``raw``.
    """
    true_scores = y_valid.astype(np.int64).to_numpy()
    raw_scores = predictor.predict(X_valid) + a
    hard_scores = raw_scores.clip(1, 6).round().astype(np.int64)

    columns = {
        "essay_id": id_valid,
        "score": true_scores - 1,
        "pred_score": hard_scores - 1,
        "raw": raw_scores,
    }
    oof = pd.DataFrame(columns)

    log_metrics(oof, f"Fold {idx}")
    return oof

### Fold Loop

In [13]:
# Shuffled, stratified K-fold splitter; the number of folds and the seed both
# come from the run config so the split is reproducible.
skf = StratifiedKFold(
    n_splits=config.lgbm_n_folds,
    random_state=config.random_seed,
    shuffle=True,
)

In [14]:
# Make sure the artefact directory exists before the first model is pickled.
os.makedirs("output/LGBM", exist_ok=True)

# Collect the per-fold frames and concatenate once at the end instead of
# re-copying a growing frame on every iteration.
fold_oofs = []

for idx, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # skf.split yields *positional* indices, so select with .iloc; .loc would
    # only be equivalent while X / y / ids carry a default RangeIndex.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
    id_valid = ids.iloc[valid_idx].to_numpy()

    model, predictor = train_loop(X_train, y_train, X_valid, y_valid)

    # Persist the fitted objects of this fold.
    with open(f"output/LGBM/{idx}.pkl", "wb") as file:
        pkl.dump({"predictor": predictor, "model": model}, file)

    fold_oofs.append(validate_model(idx, X_valid, y_valid, id_valid, predictor))

# Out-of-fold predictions for every essay (each fold keeps its own 0..n-1 index).
oof_df = pd.concat(fold_oofs)

[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.45289	valid's QWK: 0.480398
[50]	train's QWK: 0.736666	valid's QWK: 0.755467
[75]	train's QWK: 0.79928	valid's QWK: 0.811051
[100]	train's QWK: 0.83597	valid's QWK: 0.842451
[125]	train's QWK: 0.858038	valid's QWK: 0.863786
[150]	train's QWK: 0.867819	valid's QWK: 0.872742
[175]	train's QWK: 0.871897	valid's QWK: 0.881577
[200]	train's QWK: 0.874616	valid's QWK: 0.88415
[225]	train's QWK: 0.878254	valid's QWK: 0.891688
[250]	train's QWK: 0.880652	valid's QWK: 0.891362
[275]	train's QWK: 0.881875	valid's QWK: 0.894474
[300]	train's QWK: 0.882067	valid's QWK: 0.895535
[325]	train's QWK: 0.88304	valid's QWK: 0.895919
[350]	train's QWK: 0.883929	valid's QWK: 0.895017
[375]	train's QWK: 0.884472	valid's QWK: 0.895292
Early stopping, best iteration is:
[320]	train's QWK: 0.882743	valid's QWK: 0.897251
Evaluated only: QWK
[LightGBM] [Info] Using self-defined

  precision = tp / (tp + fp)


[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.451614	valid's QWK: 0.45348
[50]	train's QWK: 0.738468	valid's QWK: 0.707614
[75]	train's QWK: 0.799379	valid's QWK: 0.784095
[100]	train's QWK: 0.83626	valid's QWK: 0.825894
[125]	train's QWK: 0.859432	valid's QWK: 0.850339
[150]	train's QWK: 0.868698	valid's QWK: 0.854371
[175]	train's QWK: 0.87379	valid's QWK: 0.861317
[200]	train's QWK: 0.877098	valid's QWK: 0.861092
[225]	train's QWK: 0.879782	valid's QWK: 0.860236
[250]	train's QWK: 0.881733	valid's QWK: 0.862745
[275]	train's QWK: 0.883383	valid's QWK: 0.865184
[300]	train's QWK: 0.884492	valid's QWK: 0.864928
[325]	train's QWK: 0.885447	valid's QWK: 0.86774
[350]	train's QWK: 0.885939	valid's QWK: 0.869225
[375]	train's QWK: 0.88664	valid's QWK: 0.869225
[400]	train's QWK: 0.88693	valid's QWK: 0.870877
[425]	train's QWK: 0.887176	valid's QWK: 0.870463
[450]	train's QWK: 0.887624	valid's QWK: 0

  precision = tp / (tp + fp)


[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.439313	valid's QWK: 0.461778
[50]	train's QWK: 0.735467	valid's QWK: 0.74041
[75]	train's QWK: 0.802273	valid's QWK: 0.798621
[100]	train's QWK: 0.835096	valid's QWK: 0.840658
[125]	train's QWK: 0.860459	valid's QWK: 0.864656
[150]	train's QWK: 0.869524	valid's QWK: 0.872679
[175]	train's QWK: 0.873507	valid's QWK: 0.873838
[200]	train's QWK: 0.875937	valid's QWK: 0.876536
[225]	train's QWK: 0.879117	valid's QWK: 0.877112
[250]	train's QWK: 0.881262	valid's QWK: 0.87721
[275]	train's QWK: 0.882811	valid's QWK: 0.877473
Early stopping, best iteration is:
[205]	train's QWK: 0.876868	valid's QWK: 0.878261
Evaluated only: QWK


  precision = tp / (tp + fp)


[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
[25]	train's QWK: 0.441893	valid's QWK: 0.427064
[50]	train's QWK: 0.738826	valid's QWK: 0.728052
[75]	train's QWK: 0.800429	valid's QWK: 0.802548
[100]	train's QWK: 0.837187	valid's QWK: 0.838022
[125]	train's QWK: 0.858944	valid's QWK: 0.862998
[150]	train's QWK: 0.868227	valid's QWK: 0.866535
[175]	train's QWK: 0.873118	valid's QWK: 0.869911
[200]	train's QWK: 0.87611	valid's QWK: 0.875867
[225]	train's QWK: 0.878636	valid's QWK: 0.879438
[250]	train's QWK: 0.880502	valid's QWK: 0.880438
[275]	train's QWK: 0.882278	valid's QWK: 0.885108
[300]	train's QWK: 0.883498	valid's QWK: 0.884204
[325]	train's QWK: 0.883833	valid's QWK: 0.883771
[350]	train's QWK: 0.884344	valid's QWK: 0.88367
Early stopping, best iteration is:
[275]	train's QWK: 0.882278	valid's QWK: 0.885108
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation s

In [15]:
# Score the hard out-of-fold predictions over all folds at once, then print the
# value and log it to the W&B run under "CV/qwk_score".
score = get_score(oof_df["score"], oof_df["pred_score"])
print(f'Overall Score: {score:<.4f}')
wandb.log({"CV/qwk_score": score})

Overall Score: 0.8862


In [16]:
# Upload the full out-of-fold prediction frame to the run as a W&B table.
wandb.log({"oof_table": wandb.Table(dataframe=oof_df)})

In [17]:
# Persist the out-of-fold predictions (without the index column).
oof_df.to_csv("output/oof.csv", index=False)

In [None]:
# NOTE(review): this cell repeats the search that is run again under
# "Finding Optimal Thresholds" below and can probably be deleted.
# `oof_df["score"]` is stored minus 1 (see validate_model) while `raw` is not
# shifted, so add the 1 back to compare labels and raw predictions on the same
# scale, exactly as the later cell does.
best, threshold, xs, ys = find_thresholds(oof_df["score"] + 1, oof_df["raw"], steps=500)

In [18]:
# Log the metrics over all folds together. The frame is rebuilt from plain
# arrays, so it has a fresh 0..n-1 index and only the two score columns.
overall_oof = pd.DataFrame(
    {
        "score": oof_df["score"].to_numpy(),
        "pred_score": oof_df["pred_score"].to_numpy(),
    }
)
log_metrics(overall_oof, "Overall")

### Finding Optimal Thresholds

In [None]:
# Search for the cut points that turn the raw regression output into scores.
# `oof_df["score"]` is stored minus 1 (see validate_model), so 1 is added back
# to put the labels on the same scale as the unshifted `raw` predictions.
# `threshold`, `xs` and `ys` are consumed by the plotting cell below.
best, threshold, xs, ys = find_thresholds(oof_df["score"] + 1, oof_df["raw"], steps=500)

In [None]:
diff = 0.5  # half-width of the x-window shown around each nominal boundary
nrows, ncols = 1, 5  # one panel per boundary: 1|2, 2|3, 3|4, 4|5, 5|6

fig, axes = plt.subplots(nrows, ncols, figsize=(15, 3))

for k in range(ncols):
    ax = axes.flat[k]

    # Convert once; the arrays are reused for plotting and for the y-range.
    xk = np.asarray(xs[k])
    yk = np.asarray(ys[k])
    ax.scatter(xk, yk, s=3)

    # Centre the panel on the nominal boundary between score k+1 and k+2.
    m = k + 1.5
    ax.set_xlim((m - diff, m + diff))

    # Fit the y-axis to the points that are actually visible in the x-window.
    in_window = (xk > m - diff) & (xk < m + diff)
    if in_window.any():
        mn = yk[in_window].min()
        mx = yk[in_window].max()
        ax.set_ylim((mn, mx))
    else:
        # No candidate fell inside the window: keep the automatic limits
        # instead of failing on the min/max of an empty selection.
        mn, mx = ax.get_ylim()

    # Vertical dashed line at the selected threshold for this boundary.
    ax.plot(
        [threshold[k], threshold[k]],
        [mn, mx],
        "--",
        color="black",
        label="optimal threshold",
    )

    ax.set_title(f"Optimal Thresh: {threshold[k]:4.3f}", size=12)
    ax.set_xlabel(f"Threshold between {k+1} and {k+2}", size=10)
    ax.set_ylabel("QWK CV score", size=10)

# Common title for all subplots (no legend is drawn).
plt.suptitle("Optimal Thresholds", size=12)
plt.tight_layout()
plt.show()

In [None]:
# Store the selected thresholds next to the per-fold model pickles.
# The output/LGBM directory must already exist at this point.
with open("output/LGBM/threshold.pkl", "wb") as file:
    pkl.dump(threshold, file)

In [19]:
# Close the W&B run started by wandb.init above.
wandb.finish()

VBox(children=(Label(value='1.190 MB of 1.190 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
CV/qwk_score,▁

0,1
CV/qwk_score,0.88625
