# Light Gradient Boosting Machine for predictions
This Notebook attempts to create the best possible lightgbm classifier for the earthquake dataset.  

## Imports

In [25]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from imblearn.over_sampling import SMOTE


# Hyperparameter Optimization
import optuna

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer
from category_encoders import LeaveOneOutEncoder, TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin

from pathlib import Path
import pickle
from os import PathLike
import torch

## Constants

In [2]:
TRAINING_FEATURES_PATH="D:/ml_competitions/gorkha_earthquake/data/raw/train_values.csv"
TRAINING_LABELS_PATH="D:/ml_competitions/gorkha_earthquake/data/raw/train_labels.csv"
TEST_FEATURES_PATH="D:/ml_competitions/gorkha_earthquake/data/raw/test_values.csv"
SUBMISSION_FORMAT_PATH="D:/ml_competitions/gorkha_earthquake/data/raw/submission_format.csv"

SUBMISSION_DIR="D:/ml_competitions/gorkha_earthquake/submissions"

## Data Loading

In [3]:
features_df         = pd.read_csv(TRAINING_FEATURES_PATH,   index_col=0)
labels_df           = pd.read_csv(TRAINING_LABELS_PATH,     index_col=0) - 1
test_features_df    = pd.read_csv(TEST_FEATURES_PATH,       index_col=0)

## Data Preprocessing

In [4]:
# Setup

geo_level_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
numerical_columns = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']
categorical_columns = ['foundation_type', 'ground_floor_type', 'land_surface_condition', 
                       'legal_ownership_status', 'other_floor_type',
                       'plan_configuration', 'position', 'roof_type']

In [5]:

class DREncoder(torch.nn.Module):

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        self.geo_lv1_embedder = torch.nn.Embedding(geo_lv1_size, 16)
        self.geo_lv2_embedder = torch.nn.Embedding(geo_lv2_size, 128)
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, 128) 
        self.compressor = torch.nn.Linear(16+128+128, latent_dim)

    def forward(self, x):
        x_1 = self.geo_lv1_embedder(x[:, 0])
        x_2 = self.geo_lv2_embedder(x[:, 1])
        x_3 = self.geo_lv3_embedder(x[:, 2])
        x = torch.concat((x_1, x_2, x_3), dim=1)
        x = torch.nn.functional.relu(x)
        return self.compressor(x)


class GeoDimensionReduction(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv1_size: int=31,
            geo_lv2_size: int=1418,
            geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.path = path
        self.model = DREncoder(
            latent_dim, 
            geo_lv1_size,
            geo_lv2_size,
            geo_lv3_size
        )
        self.latent_dim = latent_dim
        self.geo_lv1_size = geo_lv1_size
        self.geo_lv2_size = geo_lv2_size
        self.geo_lv3_size = geo_lv3_size
        self.model.load_state_dict(torch.load(path))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        # Convert pd to numpy
        if isinstance(X, pd.DataFrame):
            X = X.values # type: ignore
        # Apply encoder
        self.model.eval()
        X = torch.from_numpy(X).type(torch.long) # type: ignore
        return self.model(X).detach().numpy()
    
class RollUpGeoLv3Encoder(torch.nn.Module):

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, 128)
        self.compressor = torch.nn.Linear(128, latent_dim)

    def forward(self, x):
        x = self.geo_lv3_embedder(x).squeeze(1)
        x = torch.nn.functional.relu(x)
        return self.compressor(x)


class GeoLv3Rollup(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.path = path
        self.model = RollUpGeoLv3Encoder(
            latent_dim, 
            geo_lv3_size
        )
        self.latent_dim = latent_dim
        self.geo_lv3_size = geo_lv3_size
        self.model.load_state_dict(torch.load(path))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        # Convert pd to numpy
        if isinstance(X, pd.DataFrame):
            X = X.values # type: ignore
        # Apply encoder
        self.model.eval()
        X = torch.from_numpy(X).type(torch.long) # type: ignore
        return self.model(X).detach().numpy()

In [22]:
# Load All Label Encoders
with open(Path.cwd().parent / 'models' / 'geo-lv-1-label-encoder.pickle', 'rb') as f:
    le1 = pickle.load(f)
with open(Path.cwd().parent / 'models' / 'geo-lv-2-label-encoder.pickle', 'rb') as f:
    le2 = pickle.load(f)
with open(Path.cwd().parent / 'models' / 'geo-lv-3-label-encoder.pickle', 'rb') as f:
    le3 = pickle.load(f)

# Prepare Transformers
geo_lv1_le = FunctionTransformer(
    func=lambda x: np.array(le1.transform(x.values.ravel())).reshape(-1, 1),
    feature_names_out='one-to-one'
)

geo_lv2_le = FunctionTransformer(
    func=lambda x: np.array(le2.transform(x.values.ravel())).reshape(-1, 1), 
    feature_names_out='one-to-one'
)

geo_lv3_le = FunctionTransformer(
    func=lambda x: np.array(le3.transform(x.values.ravel())).reshape(-1, 1), 
    feature_names_out='one-to-one'
)

# Dim Reducer
geo_dim_reduction_preprocessor = ColumnTransformer([
        ('geo1_le', geo_lv1_le, ['geo_level_1_id']),
        ('geo2_le', geo_lv2_le, ['geo_level_2_id']),
        ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
    ], 
    remainder='drop', 
    verbose_feature_names_out=False
).set_output(transform='pandas')

geo_dim_reduction_pipe = Pipeline([
    ('label_encoder', geo_dim_reduction_preprocessor),
    ('embedder', GeoDimensionReduction(path=Path.cwd().parent / 'models' / 'dim-reduction-32', latent_dim=32)),
])

# Rollup
geo3_rollup_preprocessor = ColumnTransformer([
        ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
    ], 
    remainder='drop', 
    verbose_feature_names_out=False,
).set_output(transform='pandas')

geo_rollup_pipe = Pipeline([
    ('label_encoder', geo3_rollup_preprocessor),
    ('embedder', GeoLv3Rollup(path=Path.cwd().parent / 'models' / 'geo3-rollup-16')),
])

preprocessor_lgbm = ColumnTransformer(
    transformers=[
        # ('bool', FunctionTransformer(lambda x: np.log(1+x), feature_names_out='one-to-one'), numerical_columns),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),
        ('geo_dim_reduction', geo_dim_reduction_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
        ('geo_rollup', geo_rollup_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
        ('geos', TargetEncoder(cols=geo_level_columns), ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
    ],
    remainder='passthrough'
)
# preprocessor_lgbm.set_output(transform='pandas')

## Evaluate Model

In [49]:
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier


hyperparams = {
    "objective": "multiclass",
    "verbosity": -1,
    # "boosting_type": "dart",
    # 'colsample_bytree': 0.6841610995217867,
    # 'min_child_samples': 5,
    # 'num_leaves': 255,
    # 'reg_alpha': 0.8062643812740887,
    # 'reg_lambda': 0.0007637307639175299,
    # 'subsample': 0.8490626929837635,
    # 'subsample_freq': 5,
    # 'n_estimators': 500,
}

lgbm_clf = LGBMClassifier(force_row_wise=True, **hyperparams)

lgbm_pipeline = Pipeline([
    ('preprocessor', preprocessor_lgbm),
    ('classifier', lgbm_clf),
])


results = cross_val_score(
    lgbm_pipeline, 
    features_df, 
    labels_df.to_numpy().squeeze(), 
    cv=StratifiedKFold(n_splits=5), 
    scoring='f1_micro',
    verbose=100
)

# X_train, X_valid, y_train, y_valid = train_test_split(
#     features_df, 
#     labels_df.values.ravel(), 
#     stratify=labels_df.values.ravel(),
#     random_state=69
# )

# X_train = preprocessor_lgbm.fit_transform(X_train, y_train)
# X_valid = preprocessor_lgbm.transform(X_valid)

# lgbm_clf.fit(X_train, y_train)
# y_pred_train = lgbm_clf.predict(X_train)
# y_pred_valid = lgbm_clf.predict(X_valid)

# f1_train = f1_score(y_train, y_pred_train, average='micro')
# f1_valid = f1_score(y_valid, y_pred_valid, average='micro')

# print(f'{f1_train :.5f}')
# print(f'{f1_valid :.5f}')

print(f'{results.mean():.5f}')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] START .....................................................................
[CV] END ................................ score: (test=0.742) total time=  11.3s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.5s remaining:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.738) total time=  13.9s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.5s remaining:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.742) total time=  11.2s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   36.9s remaining:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.741) total time=  12.7s
[Parallel(n_jobs=1)]: Done   4 

0.721 - DR  
0.73735 - Target + DR  
0.72343 - No transform + DR  
0.72722 - DR + OHE  
0.74009 - No log transform on numeric  
0.74071 - No log, GeoDR + GeoRollUp  

## Hyperparameter Optimization

### Objective Function

In [36]:
# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(
        trial: optuna.Trial, 
        X_train: pd.DataFrame, 
        y_train: np.ndarray, 
        X_valid: pd.DataFrame, 
        y_valid: np.ndarray):

    hyperparams = {
        "objective": "multiclass",
        "verbosity": -1,
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "learning_rate": trial.suggest_float("subsample", 1e-8, 1.0, log=True),
    }

    # Add a callback for pruning.
    lgbm_clf = LGBMClassifier(force_row_wise=True, **hyperparams)
        
    # lgbm_pipeline = Pipeline([
    #     ('preprocessor', preprocessor_lgbm),
    #     ('scaler', StandardScaler()),
    #     ('classifier', lgbm_clf),
    # ]).set_output(transform='default')
    lgbm_pipeline = Pipeline([
        ('preprocessor', preprocessor_lgbm),
        ('classifier', LGBMClassifier(force_row_wise=True, **hyperparams)),
    ]).set_output(transform='default')

    results = cross_val_score(
        lgbm_pipeline, 
        X, y, cv=StratifiedKFold(n_splits=5), 
        scoring='f1_micro')
    return results.mean()

### Study

In [37]:
from sklearn.model_selection import train_test_split


study = optuna.create_study(
    study_name='lgbm-study', 
    storage='sqlite:///lgbm-32.db', 
    load_if_exists=True,
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), 
)

X_train, X_valid, y_train, y_valid = train_test_split(features_df, labels_df.values.ravel(), stratify=labels_df.values.ravel())

study.optimize(
    lambda trial: objective(trial, X=features_df, y=labels_df.to_numpy().squeeze()), 
    n_trials=100)

[32m[I 2023-05-06 19:29:20,038][0m A new study created in RDB with name: lgbm-study[0m
[32m[I 2023-05-06 19:31:06,552][0m Trial 0 finished with value: 0.7486080225123477 and parameters: {'reg_alpha': 0.0028518221171421787, 'reg_lambda': 0.8801527240793902, 'num_leaves': 219, 'colsample_bytree': 0.5979600947256924, 'subsample': 0.5697442274995929, 'subsample_freq': 2, 'min_child_samples': 44}. Best is trial 0 with value: 0.7486080225123477.[0m
[32m[I 2023-05-06 19:32:13,840][0m Trial 1 finished with value: 0.7462519367420956 and parameters: {'reg_alpha': 6.259343675120804e-05, 'reg_lambda': 7.37497864798079e-05, 'num_leaves': 108, 'colsample_bytree': 0.43927258392987956, 'subsample': 0.42565778179358565, 'subsample_freq': 7, 'min_child_samples': 68}. Best is trial 0 with value: 0.7486080225123477.[0m
[32m[I 2023-05-06 19:33:08,685][0m Trial 2 finished with value: 0.7419695202569179 and parameters: {'reg_alpha': 3.471572961818051e-07, 'reg_lambda': 4.095549636473527e-05, 'num_

KeyboardInterrupt: 

In [38]:
print(f"Best score: {study.best_trial.value}")
study.best_trial.params

Best score: 0.7491836128026256


{'colsample_bytree': 0.6841610995217867,
 'min_child_samples': 5,
 'num_leaves': 255,
 'reg_alpha': 0.8062643812740887,
 'reg_lambda': 0.0007637307639175299,
 'subsample': 0.8490626929837635,
 'subsample_freq': 5}

### Evaluate Best Model

In [41]:
hyperparams = {
    "objective": "binary",
    "metric": "auc",
    "verbose": -1,
    "boosting_type": "gbdt",
    'colsample_bytree': 0.6841610995217867,
    'min_child_samples': 5,
    'num_leaves': 255,
    'reg_alpha': 0.8062643812740887,
    'reg_lambda': 0.0007637307639175299,
    'subsample': 0.8490626929837635,
    'subsample_freq': 5
}
    
h1n1_lgbm_pipeline = Pipeline([
    ('preprocessor', preprocessor_lgbm),
    ('classifier', LGBMClassifier(force_row_wise=True, **hyperparams)),
]).set_output(transform='default')

results = cross_val_score(
    h1n1_lgbm_pipeline, 
    features_df, 
    labels_df, 
    cv=StratifiedKFold(n_splits=5), scoring='f1_micro'
)

print(f'{results.mean():.5f}') # 0.87217

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\sklearn.py", line 967, in fit
    super().fit(X, _y, sample_weight=sample_weight, init_score=init_score, eval_set=valid_sets,
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\sklearn.py", line 748, in fit
    self._Booster = train(
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\basic.py", line 2605, in __init__
    train_set.construct()
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\basic.py", line 1815, in construct
    self._lazy_init(self.data, label=self.label,
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\basic.py", line 1538, in _lazy_init
    self.__init_from_np2d(data, params_str, ref_dataset)
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\basic.py", line 1659, in __init_from_np2d
    _safe_call(_LIB.LGBM_DatasetCreateFromMat(
  File "d:\USC\College\Courses\CSCI-544 Applied Natural Language Processing\homework\3\src\env\lib\site-packages\lightgbm\basic.py", line 125, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))
lightgbm.basic.LightGBMError: Multiclass objective and metrics don't match


# Submission

In [20]:
h1n1_lgbm_pipeline.fit(features_df, labels_df)

test_features_df = pd.read_csv(TEST_FEATURES_PATH, index_col="respondent_id")

submission_df = pd.read_csv(SUBMISSION_FORMAT_PATH, index_col="respondent_id")

np.testing.assert_array_equal(test_features_df.index.values, submission_df.index.values)

# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = h1n1_lgbm_pipeline.predict_proba(test_features_df)[:, 1]

submission_df.to_csv(Path(SUBMISSION_DIR) / 'lgbm-h1n1.csv', index=True)