In [1]:
import os
import random
import warnings
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import (
    roc_auc_score,
    mean_squared_error,
    average_precision_score,
    log_loss,
)
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Silence every warning category for the rest of the session.
warnings.simplefilter(action="ignore")

# Input locations, relative to the notebook's working directory.
train_file, test_file = './data/train.pgt', './data/test.pgt'

In [2]:
# Read both splits from disk and replace every missing value with 0.
train = pd.read_parquet(train_file).fillna(0)
test = pd.read_parquet(test_file).fillna(0)

In [3]:
# Show the column labels of the training frame.
train.columns

Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'O00391', 'O00533', 'O14773',
       ...
       'TSTDLQVLAAR', 'EAEEHQETQC(UniMod_4)LR', 'NVDGVNYASITR', 'ELDESLQVAER',
       'AQC(UniMod_4)GGGLLGVR', 'GAQTQTEEEMTR', 'ELDLNSVLLK', 'PALEDLR',
       'NSWGEEWGMGGYVK', 'SSNTYTLTDVRR'],
      dtype='object', length=1173)

In [4]:
# Identifier and target columns; every other column of `train` is a model input.
non_feature_columns = {
    'visit_id', 'patient_id', 'visit_month',
    'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4',
}
raw_features = [
    column for column in train.columns if column not in non_feature_columns
]

# One single-element label list per target score.
label_1, label_2, label_3, label_4 = (
    ['updrs_1'],
    ['updrs_2'],
    ['updrs_3'],
    ['updrs_4'],
)


In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O14773,...,TSTDLQVLAAR,EAEEHQETQC(UniMod_4)LR,NVDGVNYASITR,ELDESLQVAER,AQC(UniMod_4)GGGLLGVR,GAQTQTEEEMTR,ELDLNSVLLK,PALEDLR,NSWGEEWGMGGYVK,SSNTYTLTDVRR
0,55_0,55,0,10.0,6.0,15.0,0.0,11254.3,732430.0,31238.0,...,13189.3,22021.6,23122.5,283103.0,10698.4,11808.5,27229.3,0.0,8833.09,13929.2
1,55_3,55,3,10.0,7.0,25.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55_6,55,6,8.0,10.0,34.0,0.0,13163.6,630465.0,26219.9,...,12739.3,23109.0,23499.8,89583.1,0.0,10336.0,23305.4,50231.1,9106.86,15427.3
3,55_9,55,9,8.0,9.0,30.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,30703.6,...,11830.0,22730.6,21860.1,59812.8,10465.3,11592.7,28367.0,24770.9,11701.7,15499.3


In [9]:
# Number of rows and columns in the training frame.
train.shape

(2615, 1173)

In [6]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,visit_id,visit_month,patient_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,SGSAHEYSSSPDDAIFQSLAR,FLPSYQAVEYMR,EAEEETTNDNGVLVLEPARK,SNVSDAVAQSTR,STSSFPC(UniMod_4)PAGHFNGFR,GAQTQTEEEMTR,EGDMLTLFDGDGPSAR,FAALDNEEEDK,IC(UniMod_4)LEDNVLM(UniMod_35)SGVK,GNSYFMVEVK
0,3342_0,0,3342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3342_0,0,3342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3342_0,0,3342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3342_0,0,3342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50423_0,0,50423,33127.9,490742.0,43615.3,0.0,16486.6,2882.42,124344.0,...,128080.0,17743.7,5472.04,14379.8,50484.8,11051.1,15938.2,14768.7,14798.6,46049.4


In [7]:
class LightGBMWrapper:
    """K-fold LightGBM training / prediction helper that logs to an experiment folder.

    Every instance creates ``<root_path>/<name>_<timestamp>/`` and writes into it:
    ``train.log``, one ``<fold>.ckpt`` model file per fold,
    ``feature_importance.csv`` and, from ``predict``, ``submission.csv.zip``.

    The instance keeps ``train.log`` open; call ``close()`` (or use the instance
    as a context manager) when finished to release the file handle.
    """

    def __init__(self, model_config):
        """Read the configuration, create the output folder and open the train log.

        Args:
            model_config: dict with the required keys ``name``, ``root_path``,
                ``epoch``, ``seed``, ``n_folds``, ``early_stopping``,
                ``lgb_hyper_params``, ``features``, ``label``, ``verbose_eval``
                and ``remark``. The optional key ``id`` names the identifier
                column(s) copied into the submission file and defaults to
                ``["visit_id"]``.

        Raises:
            KeyError: if a required key is missing from ``model_config``.
        """
        # retrieve config from model_config
        self.model_config = model_config
        self.name = model_config["name"]
        self.root_path = model_config["root_path"]
        self.epoch = model_config["epoch"]
        self.seed = model_config["seed"]
        self.n_folds = model_config["n_folds"]
        self.early_stopping = model_config["early_stopping"]
        self.lgb_hyper_params = model_config["lgb_hyper_params"]
        self.features = model_config["features"]
        self.label = model_config["label"]
        # misspelt historical attribute name, kept as an alias for existing callers
        self.lablel = self.label
        self.verbose_eval = model_config["verbose_eval"]
        self.remark = model_config["remark"]

        # identifier column(s) for the submission file; always stored as a list so
        # that data[self.id] is a DataFrame rather than a Series
        id_columns = model_config.get("id", ["visit_id"])
        self.id = [id_columns] if isinstance(id_columns, str) else list(id_columns)

        # init experiment id and output folder
        self.experiment_id = self.name + "_" + datetime.now().strftime("%Y%m%d_%H%M%S")
        self.output_path = os.path.join(self.root_path, self.experiment_id)

        # create folder if not exist
        os.makedirs(self.output_path, exist_ok=True)

        # create train log file (line buffered so entries reach disk as they are written)
        self.train_log = open(
            os.path.join(self.output_path, "train.log"), "w", buffering=1
        )

    def __enter__(self):
        """Return the wrapper itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the train log when leaving a ``with`` block; never suppresses errors."""
        self.close()
        return False

    def close(self):
        """Close the train log file; safe to call more than once."""
        log_file = getattr(self, "train_log", None)
        if log_file is not None and not log_file.closed:
            log_file.close()

    def train(self, data):
        """Train one model per fold and save models, log and feature importance.

        Args:
            data: frame indexable by ``self.features`` and ``self.label`` and
                supporting ``.iloc`` row selection.

        Raises:
            AssertionError: if ``data`` is None.
        """
        # data should not be null (checked before anything is written to the log)
        assert data is not None, "training data should not be null"

        self._write_train_log_header()

        # init variables
        feature_importance = []

        # split data by StratifiedKFold
        skf = StratifiedKFold(
            n_splits=self.n_folds, shuffle=True, random_state=self.seed
        )
        X = data[self.features]
        y = data[self.label]

        # loop split data
        for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
            # init evaluation result dict
            eval_dict = {}
            # init lgb training and validation dataset
            train_data = lgb.Dataset(X.iloc[train_index], label=y.iloc[train_index])
            val_data = lgb.Dataset(X.iloc[val_index], label=y.iloc[val_index])
            # train model
            # NOTE(review): `evals_result`, `early_stopping_rounds` and `verbose_eval`
            # were removed from lgb.train() in LightGBM 4.0; on 4.x they have to be
            # replaced by the equivalent callbacks.
            model = lgb.train(
                self.lgb_hyper_params,
                train_set=train_data,
                num_boost_round=self.epoch,
                valid_sets=[train_data, val_data],
                evals_result=eval_dict,
                early_stopping_rounds=self.early_stopping,
                verbose_eval=self.verbose_eval,
            )
            # save model
            model.save_model(os.path.join(self.output_path, f"{fold}.ckpt"))
            # save model performance
            self._write_train_log_body(eval_dict, fold)
            # append feature importance
            feature_importance.append(
                pd.DataFrame({
                    "feature_name": model.feature_name(),
                    "importance_gain": model.feature_importance(importance_type="gain"),
                    "importance_split": model.feature_importance(importance_type="split"),
                })
            )
        # output feature importance
        self._write_feature_importance(feature_importance)

    def predict(self, data, model_path=None):
        """Average the fold models' predictions and write the submission file.

        Args:
            data: frame containing the ``self.id`` and ``self.features`` columns.
            model_path: folder holding the ``<fold>.ckpt`` files; defaults to this
                experiment's output folder. The submission is written there too.

        Returns:
            The submission DataFrame (id column(s) plus ``prediction``) that was
            written to ``submission.csv.zip``.
        """
        # resolve the model folder once, before the loop
        model_path = self.output_path if model_path is None else model_path
        # copy so that adding the prediction column never writes into `data`
        submission = data[self.id].copy()
        submission["prediction"] = 0.0
        # looping all the folds
        for fold in range(self.n_folds):
            model_file = os.path.join(model_path, f"{fold}.ckpt")
            # loading model
            model = lgb.Booster(model_file=model_file)
            test_preds = model.predict(
                data[self.features], num_iteration=model.best_iteration
            )
            # each fold contributes an equal share of the final prediction
            submission["prediction"] += test_preds / self.n_folds
        # save to local disk
        submission.to_csv(
            os.path.join(model_path, "submission.csv.zip"),
            compression="zip",
            index=False,
        )
        return submission

    def _write_feature_importance(self, feature_importance):
        """Average the per-fold importances, sort by gain and save them as CSV.

        Args:
            feature_importance: list of per-fold DataFrames with the columns
                ``feature_name``, ``importance_gain`` and ``importance_split``.
        """
        mean_importance = (
            pd.concat(feature_importance)
            .groupby(["feature_name"])
            .mean()
            .reset_index()
            .sort_values(by=["importance_gain"], ascending=False)
        )
        mean_importance.to_csv(
            os.path.join(self.output_path, "feature_importance.csv"), index=False
        )

    def _write_train_log_header(self):
        """Write the model configuration, framed by marker lines, to the train log."""
        self._write_log(self.train_log, "================================Model Config Start================================\n")
        self._write_log(self.train_log, str(self.model_config) + "\n")
        self._write_log(self.train_log,"================================Model Config End================================\n")

    def _write_train_log_body(self, eval_dict, fold):
        """Write one fold's train/validation metric history to the train log.

        One line is written every ``verbose_eval`` rounds, starting at round 0
        and including the last, possibly partial, step.

        Args:
            eval_dict: evaluation history with the keys ``"training"`` and
                ``"valid_1"``, each mapping the configured metric name to a list
                of per-round values.
            fold: fold number, used in the marker lines.
        """
        # retrieve validation result for dict
        metric = self.lgb_hyper_params["metric"]
        train_metrics = eval_dict["training"][metric]
        validate_metrics = eval_dict["valid_1"][metric]
        # a step of at least 1 avoids a zero step when verbose_eval is 0 or False
        step = max(int(self.verbose_eval), 1)
        self._write_log(self.train_log,f"================================Fold {fold} start================================\n")
        for round_idx in range(0, len(validate_metrics), step):
            self._write_log(self.train_log, f" - {round_idx} round - train_metric: {train_metrics[round_idx]:.6f} - valid_metric: {validate_metrics[round_idx]:.6f}\n")
        self._write_log(self.train_log,f"================================Fold {fold} End================================\n")

    def _write_log(self, f, log, is_print=True):
        """Write ``log`` to the file object ``f`` and, if ``is_print``, print it too."""
        if is_print:
            print(log)
        f.write(log)
        return None

In [11]:
# define lightGBM config
# NOTE: the original literal listed "verbose_eval" twice (50, then 10). Python keeps
# the last value for a repeated key, so 10 was always the effective setting; the dead
# first entry is removed and the key keeps its original position in the dict.
lgb_config = {
    "name": "lightGBM_with_raw_features",
    "root_path": "./model/",
    "seed": 42,
    "epoch": 10,
    "early_stopping": 10,
    "verbose_eval": 10,
    "n_folds": 3,
    "features": raw_features,
    "label": label_1,
    "remark": "lightGBM_with_raw_features",
    "lgb_hyper_params": {
        "objective": "regression",
        "metric": "rmse",
        "boosting": "dart",
        "max_depth": -1,
        "num_leaves": 5,
        "learning_rate": 0.01,
        'bagging_freq': 5,
        'bagging_fraction' : 0.75,
        "feature_fraction": 0.1,
        "min_data_in_leaf": 10,
        "max_bin": 5,
        "min_data_in_bin": 20,
        "tree_learner": "serial",
        "boost_from_average": "false",
        "lambda_l1": 0.1,
        "lambda_l2": 30,
        "num_threads": 14,
        "verbosity": 1,
    },
}
# construct lightGBM model for the updrs_1 target and train it on the training frame
lightGBMModel_updrs1 = LightGBMWrapper(lgb_config)
lightGBMModel_updrs1.train(train)


{'name': 'lightGBM_with_raw_features', 'root_path': './model/', 'seed': 42, 'epoch': 10, 'early_stopping': 10, 'verbose_eval': 10, 'n_folds': 3, 'features': ['O00391', 'O00533', 'O14773', 'O14791', 'O15240', 'O15394', 'O43505', 'O60888', 'O75144', 'O94919', 'P00450', 'P00734', 'P00738', 'P00746', 'P00747', 'P00748', 'P00751', 'P01008', 'P01009', 'P01011', 'P01019', 'P01023', 'P01024', 'P01033', 'P01034', 'P01042', 'P01344', 'P01591', 'P01608', 'P01621', 'P01717', 'P01780', 'P01834', 'P01857', 'P01859', 'P01860', 'P01861', 'P01876', 'P01877', 'P02452', 'P02647', 'P02649', 'P02652', 'P02656', 'P02671', 'P02675', 'P02679', 'P02747', 'P02748', 'P02749', 'P02750', 'P02751', 'P02753', 'P02760', 'P02763', 'P02765', 'P02766', 'P02768', 'P02774', 'P02787', 'P02790', 'P04004', 'P04156', 'P04180', 'P04196', 'P04207', 'P04217', 'P04275', 'P04433', 'P05060', 'P05067', 'P05090', 'P05155', 'P05156', 'P05452', 'P05546', 'P06396', 'P06681', 'P06727', 'P07225', 'P07339', 'P07602', 'P07711', 'P07858', '