# LGBM - Hyperparameter Tuning

## Setup

### Environment Variables

In [1]:
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


### Import Libraries

In [2]:
import os
import random
from typing import Tuple, List
import warnings
import pickle as pkl
import torch

from dotenv import load_dotenv
import wandb
from tqdm import tqdm
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import cohen_kappa_score

In [3]:
# Move the working directory two levels up so the relative paths in `Paths`
# and the `lib.*` imports below resolve (presumably the repository root —
# TODO confirm).
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves another two levels up.
os.chdir("../../")
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Load variables from a .env file into os.environ (presumably this is where
# WANDB_API_KEY comes from — confirm).
load_dotenv()

True

In [4]:
from lib.criterion.metrics import log_metrics
from lib.utils.find_threshold import find_thresholds
from lib.model.utils import quadratic_weighted_kappa, qwk_obj, get_score

### Paths

In [5]:
class Paths:
    """Central place for the file-system locations used by the notebook.

    All paths are relative to the working directory set in the setup cell.
    The entries built on ``model_path`` keep a literal ``{model_name}``
    placeholder in the string (presumably filled in with ``str.format`` by
    whoever consumes them — TODO confirm; this notebook does not format them).
    """

    # Competition data with added topic column
    train_csv: str = "data/processed/train.csv"
    test_csv: str = "data/processed/test.csv"
    # Engineered feature table; read in the data-preparation cell together
    # with `train_csv`.
    feature_csv: str = "data/feature_engg/all_features.csv"

    # Output path
    output_path: str = "output/model_dir_ht"
    # Per-model directory template: "<output_path>/{model_name}".
    model_path: str = os.path.join(output_path, "{model_name}")
    # Everything below lives inside the per-model directory template.
    tokenizer_path: str = os.path.join(model_path, "{model_name}_tokenizer")
    threshold_path: str = os.path.join(model_path, "threshold.pkl")
    logging_path: str = os.path.join(model_path, "logging")

### Configurations

In [6]:
class CFG:
    """Static configuration shared by the cells of this notebook."""

    # Number of score labels (presumably the essay scores 1..6 that predictions
    # are clipped to — TODO confirm; not read elsewhere in this notebook).
    num_labels: int = 6
    # Seed used for the Python/NumPy/PyTorch RNGs, LightGBM's `random_state`
    # and the cross-validation splitter.
    seed: int = 20
    # Offset subtracted from the targets before fitting and added back to the
    # raw predictions at validation time.
    lgbm_a: float = 2.998
    # Copied into the global `b`; not used in this notebook (presumably a
    # companion constant of the QWK objective in lib.model.utils — TODO confirm).
    lgbm_b: float = 1.092
    # Number of cross-validation folds (the splitter in main() uses 7 splits).
    lgbm_n_folds: int = 7

### Setting Random Seed

In [7]:
def seed_everything() -> None:
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device) with ``CFG.seed``, exports ``PYTHONHASHSEED`` and switches cuDNN
    to deterministic mode.

    Sources:
    1. https://www.kaggle.com/code/alejopaullier/aes-2-multi-class-classification-train
    2. https://www.kaggle.com/code/hashidoyuto/deberta-baseline-aes2-0-train
    """
    random.seed(CFG.seed)
    # The hash seed of the already-running interpreter cannot be changed; the
    # variable is exported so that child processes inherit it.
    os.environ["PYTHONHASHSEED"] = str(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    # Seed all GPUs, not only the current one; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(CFG.seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can pick
    # different kernels between runs and defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Seed the RNGs once, before any data is split or any model is trained.
seed_everything()

### Sweep Configuration

In [8]:
# Search space handed to wandb.sweep(); each agent run samples one combination.
sweep_config = {
    # How to perform hyperparameter tuning
    "method": "random",
    # How to evaluate which hyperparameter combination is good
    # (the name must match the key logged with wandb.log in main()).
    "metric": {
        "name": "QWK",
        "goal": "maximize",
    },
    # Hyperparameters to tune
    "parameters": {
        "learning_rate": {"distribution": "uniform", "min": 0.01, "max": 0.1},
        "max_depth": {"distribution": "int_uniform", "min": 3, "max": 10},
        "num_leaves": {"distribution": "int_uniform", "min": 5, "max": 25},
        "n_estimators": {"distribution": "int_uniform", "min": 100, "max": 2000},
        "class_weight": {"values": ["balanced", None]},
        "reg_alpha": {"distribution": "uniform", "min": 0.0, "max": 1.0},
        "reg_lambda": {"distribution": "uniform", "min": 0.0, "max": 1.0},
        # LightGBM requires 0 < colsample_bytree (feature_fraction) <= 1, so the
        # lower bound must stay strictly positive; values near zero would also
        # leave almost no features per tree.
        "colsample_bytree": {"distribution": "uniform", "min": 0.1, "max": 1.0},
    },
}

### WandB setup

In [9]:
# W&B project under which the sweep and its agent runs are created.
WANDB_PROJECT = "Kaggle_ASE_2.0"

In [10]:
# Authenticate with W&B using the key from the environment; .get() yields None
# when WANDB_API_KEY is unset.
wandb.login(key=os.environ.get('WANDB_API_KEY'))
# Register the sweep definition and keep its id for the wandb.agent call below.
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


Create sweep with ID: sg50uz5j
Sweep URL: https://wandb.ai/laplacesdemon43/Kaggle_ASE_2.0/sweeps/sg50uz5j


## Data Preparation

In [11]:
# Targets and grouping column come from the processed training file.
df = pd.read_csv(Paths.train_csv, usecols=["score", "topic"])
topic = df.loc[:, "topic"]
y = df.loc[:, "score"]
del df

# Engineered features; the essay id is split off so it is not used as a feature.
# NOTE(review): features and labels are paired purely by row position — this
# assumes both CSVs list the essays in the same order; confirm.
X = pd.read_csv(Paths.feature_csv)
ids = X.loc[:, "essay_id"]
X = X.drop(columns=["essay_id"])

X.shape, y.shape

((17307, 104), (17307,))

## Training

In [12]:
# Module-level shortcuts for the LightGBM constants: `a` is the target offset
# read by train_loop and validate_model; `b` is not referenced in this notebook.
a, b = CFG.lgbm_a, CFG.lgbm_b

In [13]:
# Callbacks shared by every fold: stop once the first evaluation metric has not
# improved for 75 rounds. Append log_evaluation(period=25) to this list to also
# print the metric every 25 rounds.
callbacks = [early_stopping(first_metric_only=True, stopping_rounds=75)]

### Utility Functions

In [14]:
def validate_model(idx, X_valid, y_valid, id_valid, predictor):
    """Score one validation fold and return its out-of-fold predictions.

    The predictor's output has the global offset ``a`` added back, is clipped
    to the range [1, 6] and rounded to integer scores.

    Args:
        idx: Fold number, used only in the label passed to ``log_metrics``.
        X_valid: Validation features passed to ``predictor.predict``.
        y_valid: True scores for the fold (a pandas Series).
        id_valid: Essay ids of the validation rows.
        predictor: Fitted model exposing ``predict``.

    Returns:
        DataFrame with columns ``essay_id``, ``score`` and ``pred_score``
        (both shifted down by one) and ``raw`` (the unrounded prediction).
    """
    true_scores = y_valid.astype(np.int64).to_numpy()

    # Undo the target shift applied at training time.
    raw_scores = predictor.predict(X_valid) + a
    rounded_scores = np.round(np.clip(raw_scores, 1, 6)).astype(np.int64)

    fold_columns = {
        "essay_id": id_valid,
        "score": true_scores - 1,
        "pred_score": rounded_scores - 1,
        "raw": raw_scores,
    }
    oof = pd.DataFrame(fold_columns)

    log_metrics(oof, f"Fold {idx}")
    return oof

In [15]:
def train_loop(sweep_config, X_train, y_train, X_valid, y_valid):
    """Fit a LightGBM regressor on one fold with the sampled hyperparameters.

    Targets are cast to float32 and shifted down by the global offset ``a``
    before fitting; ``validate_model`` adds the offset back at prediction time.

    Args:
        sweep_config: Object exposing the tuned hyperparameters as attributes
            (``learning_rate``, ``max_depth``, ``num_leaves``,
            ``colsample_bytree``, ``reg_alpha``, ``reg_lambda``,
            ``n_estimators``, ``class_weight``).
        X_train, y_train: Training features and scores.
        X_valid, y_valid: Validation features and scores, used for the
            evaluation metric and the shared early-stopping callbacks.

    Returns:
        Tuple ``(model, predictor)``: the estimator and the value returned by
        its ``fit`` call.
    """
    shifted_train = y_train.astype(np.float32) - a
    shifted_valid = y_valid.astype(np.float32) - a

    # Values sampled by the sweep for this run.
    tuned_names = (
        "learning_rate",
        "max_depth",
        "num_leaves",
        "colsample_bytree",
        "reg_alpha",
        "reg_lambda",
        "n_estimators",
        "class_weight",
    )
    tuned_params = {name: getattr(sweep_config, name) for name in tuned_names}

    # Settings that stay the same across all runs of the sweep.
    fixed_params = {
        "objective": qwk_obj,
        "metrics": "None",
        "random_state": CFG.seed,
        "extra_trees": True,
        "n_jobs": 6,
        "verbosity": 0,
    }

    model = lgb.LGBMRegressor(**fixed_params, **tuned_params)

    # Train model
    eval_pairs = [(X_train, shifted_train), (X_valid, shifted_valid)]
    predictor = model.fit(
        X_train,
        shifted_train,
        eval_set=eval_pairs,
        eval_names=["train", "valid"],
        eval_metric=quadratic_weighted_kappa,
        callbacks=callbacks,
    )

    return model, predictor

### Training Model

In [16]:
def main(sweep_config=None):
    """Run one sweep trial: cross-validate LightGBM and log the overall QWK.

    Called by ``wandb.agent`` without arguments; the sampled hyperparameters
    are then read from ``wandb.config`` after ``wandb.init``. Folds are built
    with ``StratifiedGroupKFold`` (stratified on the score, grouped by topic),
    every fold is trained with ``train_loop`` and scored with
    ``validate_model``, and the out-of-fold predictions of all folds are
    combined into a single score logged under the key ``"QWK"``.

    Args:
        sweep_config: Optional configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=sweep_config):
        sweep_config = wandb.config

        # Fold count comes from the shared configuration instead of a literal.
        skf = StratifiedGroupKFold(
            n_splits=CFG.lgbm_n_folds, random_state=CFG.seed, shuffle=True
        )

        # Collect per-fold frames and concatenate once, rather than growing a
        # DataFrame (starting from an empty one) on every iteration.
        fold_oofs = []
        for idx, (train_idx, valid_idx) in enumerate(skf.split(X, y, groups=topic)):
            # split() yields positional indices, hence .iloc rather than .loc.
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
            id_valid = ids.iloc[valid_idx].to_numpy()

            _, predictor = train_loop(sweep_config, X_train, y_train, X_valid, y_valid)

            fold_oofs.append(validate_model(idx, X_valid, y_valid, id_valid, predictor))

        oof_df = pd.concat(fold_oofs)

        score = get_score(oof_df["score"], oof_df["pred_score"])
        print(f"Overall QWK Score: {score}")
        wandb.log({"QWK": score})

### Initiate Sweep

In [17]:
# Start an agent for the sweep created above; it calls main() for 3 sampled
# configurations (count=3).
wandb.agent(sweep_id, main, count=3, project=WANDB_PROJECT)

[34m[1mwandb[0m: Agent Starting Run: 6qmwlcpj with config:
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	colsample_bytree: 0.5840402580407215
[34m[1mwandb[0m: 	learning_rate: 0.08335541530874528
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 1856
[34m[1mwandb[0m: 	num_leaves: 11
[34m[1mwandb[0m: 	reg_alpha: 0.30461892741484864
[34m[1mwandb[0m: 	reg_lambda: 0.2847663024551428
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[336]	train's QWK: 0.826831	valid's QWK: 0.806407
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[455]	train's QWK: 0.835826	valid's QWK: 0.756917
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[357]	train's QWK: 0.825525	valid's QWK: 0.828182
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[245]	train's QWK: 0.824961	valid's QWK: 0.614158
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 ro

VBox(children=(Label(value='0.027 MB of 0.027 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
QWK,▁

0,1
QWK,0.78967


[34m[1mwandb[0m: Agent Starting Run: 3fm84egg with config:
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	colsample_bytree: 0.37953532216975494
[34m[1mwandb[0m: 	learning_rate: 0.06077595577180699
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 469
[34m[1mwandb[0m: 	num_leaves: 19
[34m[1mwandb[0m: 	reg_alpha: 0.4511734857169445
[34m[1mwandb[0m: 	reg_lambda: 0.6837531639048126
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[238]	train's QWK: 0.821854	valid's QWK: 0.806923
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Did not meet early stopping. Best iteration is:
[442]	train's QWK: 0.837152	valid's QWK: 0.748614
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[384]	train's QWK: 0.830323	valid's QWK: 0.82581
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[349]	train's QWK: 0.833354	valid's QWK: 0.60288
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improv

VBox(children=(Label(value='0.027 MB of 0.027 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
QWK,▁

0,1
QWK,0.78825


[34m[1mwandb[0m: Agent Starting Run: gf08c6ev with config:
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	colsample_bytree: 0.8160224862340288
[34m[1mwandb[0m: 	learning_rate: 0.04891955852092683
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 698
[34m[1mwandb[0m: 	num_leaves: 18
[34m[1mwandb[0m: 	reg_alpha: 0.7717803718818617
[34m[1mwandb[0m: 	reg_lambda: 0.80020756977463
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[315]	train's QWK: 0.828405	valid's QWK: 0.802173
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Did not meet early stopping. Best iteration is:
[632]	train's QWK: 0.846736	valid's QWK: 0.75861
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[398]	train's QWK: 0.829969	valid's QWK: 0.830823
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[150]	train's QWK: 0.818161	valid's QWK: 0.625042
Evaluated only: QWK
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't impro

VBox(children=(Label(value='0.027 MB of 0.027 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
QWK,▁

0,1
QWK,0.78993


## Wrapping up

In [18]:
# Close any W&B run that is still active in this kernel.
wandb.finish()