In [1]:
import sys

sys.path.append("../")

import numpy as np
import pandas as pd

# from src.dataloader_ import *
# from src.network_ import *
from src.utils import *

import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from pytorch_tabnet.tab_model import TabNetRegressor

train_series_dir = "../../../inputs/series_train.parquet/"
test_series_dir = "../../../inputs/series_test.parquet/"

data_dic_path = "../../../inputs/data_dictionary.csv"
sample_submission_path = "../../../inputs/sample_submission.csv"
train_path = "../../../inputs/train.csv"
test_path = "../../../inputs/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)
data_dic = pd.read_csv(data_dic_path)

import os
import random

import numpy as np
import torch


def seed_torch(seed=1029):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


nb_name = os.path.basename(os.getcwd())  # notebook name
seed_torch(seed=42)

In [2]:
# データの前処理

from sklearn.impute import SimpleImputer, KNNImputer

add_features = [
    "BMI_Age",
    "Internet_Hours_Age",
    "BMI_Internet_Hours",
    "BFP_BMI",
    "FFMI_BFP",
    "FMI_BFP",
    "LST_TBW",
    "BFP_BMR",
    "BFP_DEE",
    "BMR_Weight",
    "DEE_Weight",
    "SMM_Height",
    "Muscle_to_Fat",
    "Hydration_Status",
    "ICW_TBW",
]

double_columns = [
    "FGC-FGC_SRR_Zone",
    "BIA-BIA_SMM",
    "Physical-Waist_Circumference",
    "BIA-BIA_FFMI",
    "FGC-FGC_CU",
    "PreInt_EduHx-computerinternet_hoursday",
    "BIA-BIA_ECW",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_SRL_Zone",
    "BIA-BIA_DEE",
    "Physical-Weight",
    "Fitness_Endurance-Time_Mins",
    "FGC-FGC_SRR",
    "SDS-SDS_Total_T",
    "FGC-FGC_PU",
    "BIA-BIA_FFM",
    "FGC-FGC_TL_Zone",
    "Physical-BMI",
    "Physical-Systolic_BP",
    "Physical-HeartRate",
    "BIA-BIA_ICW",
    "Physical-Height",
    "FGC-FGC_SRL",
    "BIA-BIA_BMC",
    "Fitness_Endurance-Time_Sec",
    "BIA-BIA_Frame_num",
    "Basic_Demos-Age",
    "FGC-FGC_GSND_Zone",
    "Basic_Demos-Sex",
    "FGC-FGC_GSND",
    "BIA-BIA_LST",
    "FGC-FGC_TL",
    "BIA-BIA_BMI",
    "BIA-BIA_FMI",
    "PAQ_C-PAQ_C_Total",
    "BIA-BIA_Activity_Level_num",
    "FGC-FGC_GSD",
    "BIA-BIA_BMR",
    "BIA-BIA_Fat",
    "SDS-SDS_Total_Raw",
    "CGAS-CGAS_Score",
    "FGC-FGC_PU_Zone",
    "BIA-BIA_LDM",
    "Fitness_Endurance-Max_Stage",
    "PAQ_A-PAQ_A_Total",
    "BIA-BIA_TBW",
    "FGC-FGC_GSD_Zone",
    "Physical-Diastolic_BP",
]


def feature_engineering(df):
    # season_cols = [col for col in df.columns if "Season" in col]
    # df = df.drop(season_cols, axis=1)
    df["BMI_Age"] = df["Physical-BMI"] * df["Basic_Demos-Age"]
    df["Internet_Hours_Age"] = (
        df["PreInt_EduHx-computerinternet_hoursday"] * df["Basic_Demos-Age"]
    )
    df["BMI_Internet_Hours"] = (
        df["Physical-BMI"] * df["PreInt_EduHx-computerinternet_hoursday"]
    )
    df["BFP_BMI"] = df["BIA-BIA_Fat"] / df["BIA-BIA_BMI"]
    df["FFMI_BFP"] = df["BIA-BIA_FFMI"] / df["BIA-BIA_Fat"]
    df["FMI_BFP"] = df["BIA-BIA_FMI"] / df["BIA-BIA_Fat"]
    df["LST_TBW"] = df["BIA-BIA_LST"] / df["BIA-BIA_TBW"]
    df["BFP_BMR"] = df["BIA-BIA_Fat"] * df["BIA-BIA_BMR"]
    df["BFP_DEE"] = df["BIA-BIA_Fat"] * df["BIA-BIA_DEE"]
    df["BMR_Weight"] = df["BIA-BIA_BMR"] / df["Physical-Weight"]
    df["DEE_Weight"] = df["BIA-BIA_DEE"] / df["Physical-Weight"]
    df["SMM_Height"] = df["BIA-BIA_SMM"] / df["Physical-Height"]
    df["Muscle_to_Fat"] = df["BIA-BIA_SMM"] / df["BIA-BIA_FMI"]
    df["Hydration_Status"] = df["BIA-BIA_TBW"] / df["Physical-Weight"]
    df["ICW_TBW"] = df["BIA-BIA_ICW"] / df["BIA-BIA_TBW"]

    return df


train = feature_engineering(train)
train = train.replace([np.inf, -np.inf], np.nan)
for add_ in add_features:
    train[add_] = train[add_].fillna(0.0)
train = train.dropna(thresh=10, axis=0)

test = feature_engineering(test)
test = test.replace([np.inf, -np.inf], np.nan)
for add_ in add_features:
    test[add_] = test[add_].fillna(0.0)
test = test.dropna(thresh=10, axis=0)

In [3]:
# AutoEncoderの学習

import os
from sklearn.preprocessing import StandardScaler


def read_parquet(base_dir, id_):
    path = os.path.join(base_dir, f"id={id_}", "part-0.parquet")
    return pd.read_parquet(path)


def get_valid_ids(base_dir):
    return [f.split("=")[1].split(".")[0] for f in os.listdir(base_dir)]


p = read_parquet(base_dir="../../../inputs/series_train.parquet/", id_="ffcd4dbd")
# p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="10e46254")

scale_columns = [
    "X",
    "Y",
    "Z",
    "enmo",
    "anglez",
    "light",
    "battery_voltage",
]

masked_columns = [
    "masked_X",
    "masked_Y",
    "masked_Z",
    "masked_enmo",
    "masked_anglez",
    "masked_light",
]

original_columns = ["battery_voltage", "non-wear_flag"]

p["non-wear_flag"] = 1 - p["non-wear_flag"]
scaler_features = p[scale_columns].values
scaler = StandardScaler()
p[scale_columns] = scaler.fit_transform(scaler_features)

for mask_col in masked_columns:
    p[mask_col] = p[mask_col.replace("masked_", "")] * p["non-wear_flag"]

p = p.fillna(0.0)

groups = p.groupby("relative_date_PCIAT")
# グループごとにデータフレームのリストに分割
chunks = [group.reset_index(drop=True) for _, group in groups]

use_cols = masked_columns + original_columns + scale_columns
watch_day = len(chunks)
active_logs = np.zeros((31, 17280, len(use_cols)), dtype=np.float32)
active_mask = np.zeros((31), dtype=np.int32)

for i, chunk in enumerate(chunks):
    if i == 0:  #
        active_logs[i, -len(chunk) :, :] = chunk[use_cols].values
    elif i == watch_day:
        active_logs[i, : len(chunk), :] = chunk[use_cols].values
    else:
        array = chunk[use_cols].values
        active_logs[i, : len(array), :] = array

    active_mask[i] = 1

    if i == 30:
        break

active_logs = active_logs.reshape(31, 24, 60, 12, 15)  # 12は1時間の分割数
active_logs_mean = active_logs.mean(axis=3)  # 1時間の分割数で平均を取る # 31, 1440, 15
# active_logs_var = active_logs.var(axis=3)  # 1時間の分割数で分散を取る # 31, 1440, 15
active_logs = np.concatenate([active_logs_mean], axis=-1)  # (31, 24, 30)
# print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)

print(active_logs_mean.shape, active_logs.shape)

active_logs_mean = active_logs.mean(axis=2)  # 1時間の分割数で平均を取る # 31, 1440, 15
active_logs_var = active_logs.var(axis=2)  # 1時間の分割数で分散を取る # 31, 1440, 15
active_logs = np.concatenate(
    [active_logs_mean, active_logs_var], axis=-1
)  # (31, 24, 30)
print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)
active_logs = active_logs.reshape(-1, 30)
print(active_logs.shape)

# active_logs = active_logs.unsqueeze(0)
active_logs = torch.tensor(active_logs, dtype=torch.float32).unsqueeze(0).to("cuda")
print(active_logs.shape)

import torch.nn as nn


class CNNAutoEncoder(nn.Module):
    def __init__(self):
        super(CNNAutoEncoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(
                in_channels=30, out_channels=64, kernel_size=3, stride=2, padding=1
            ),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(
                128, 64, kernel_size=3, stride=2, padding=1, output_padding=1
            ),
            nn.ReLU(),
            nn.ConvTranspose1d(
                64, 30, kernel_size=3, stride=2, padding=1, output_padding=1
            ),
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (batch, channel, time)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded.permute(0, 2, 1), encoded  # (batch, time, channel)


# 実行例
model = CNNAutoEncoder()
input_data = torch.randn(1, 744, 30)
output, embedding = model(input_data)
print("CNN AutoEncoder output shape:", output.shape, embedding.shape)

(31, 24, 60, 15) (31, 24, 60, 15)
(31, 24, 15) (31, 24, 15) (31, 24, 30)
(744, 30)
torch.Size([1, 744, 30])
CNN AutoEncoder output shape: torch.Size([1, 744, 30]) torch.Size([1, 128, 186])


In [4]:
from torch.utils.data import Dataset, DataLoader


class CMIDataset(Dataset):
    def __init__(self, table_df, valid_ids, base_dir, save_filename):
        self.base_dir = base_dir
        self.table_df = table_df
        self.valid_ids = valid_ids
        self.save_filename = save_filename
        self.scale_columns = [
            "X",
            "Y",
            "Z",
            "enmo",
            "anglez",
            "light",
            "battery_voltage",
        ]

        self.masked_columns = [
            "masked_X",
            "masked_Y",
            "masked_Z",
            "masked_enmo",
            "masked_anglez",
            "masked_light",
        ]

        self.original_columns = ["battery_voltage", "non-wear_flag"]

    def __len__(self):
        return len(self.valid_ids)

    def __getitem__(self, idx):
        # テーブルデータの抽出
        id_ = self.valid_ids[idx]

        save_dir = f"/home/tatsuya/code/projects/kaggle/ChildMindInstitute2024/precreated_dataset/{self.save_filename}/"
        save_path = os.path.join(save_dir, id_)

        table = self.table_df.loc[self.table_df["id"] == self.valid_ids[idx], :]
        table_feature = table.drop(columns=["id", "sii"]).values
        sii = table["sii"].values

        # 時系列データの抽出
        use_cols = self.masked_columns + self.original_columns + self.scale_columns
        p = read_parquet(self.base_dir, self.valid_ids[idx])

        if p is not None:
            p["non-wear_flag"] = 1 - p["non-wear_flag"]
            scaler_features = p[scale_columns].values
            scaler = StandardScaler()
            p[scale_columns] = scaler.fit_transform(scaler_features)

            for mask_col in masked_columns:
                p[mask_col] = p[mask_col.replace("masked_", "")] * p["non-wear_flag"]

            p = p.fillna(0.0)

            groups = p.groupby("relative_date_PCIAT")
            # グループごとにデータフレームのリストに分割
            chunks = [group.reset_index(drop=True) for _, group in groups]

            use_cols = masked_columns + original_columns + scale_columns
            watch_day = len(chunks)
            active_logs = np.zeros((31, 17280, len(use_cols)), dtype=np.float32)
            active_mask = np.zeros((31), dtype=np.int32)

            for i, chunk in enumerate(chunks):
                if i == 0:  #
                    active_logs[i, -len(chunk) :, :] = chunk[use_cols].values
                elif i == watch_day:
                    active_logs[i, : len(chunk), :] = chunk[use_cols].values
                else:
                    array = chunk[use_cols].values
                    active_logs[i, : len(array), :] = array

                active_mask[i] = 1

                if i == 30:
                    break

            active_logs = active_logs.reshape(31, 24, 60, 12, 15)  # 12は1時間の分割数
            active_logs_mean = active_logs.mean(
                axis=3
            )  # 1時間の分割数で平均を取る # 31, 1440, 15
            # active_logs_var = active_logs.var(axis=3)  # 1時間の分割数で分散を取る # 31, 1440, 15
            active_logs = np.concatenate([active_logs_mean], axis=-1)  # (31, 24, 30)
            # print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)

            # print(active_logs_mean.shape, active_logs.shape)

            active_logs_mean = active_logs.mean(
                axis=2
            )  # 1時間の分割数で平均を取る # 31, 1440, 15
            active_logs_var = active_logs.var(
                axis=2
            )  # 1時間の分割数で分散を取る # 31, 1440, 15
            active_logs = np.concatenate(
                [active_logs_mean, active_logs_var], axis=-1
            )  # (31, 24, 30)
            # print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)
            active_logs = active_logs.reshape(-1, 30)

        else:
            active_logs = np.zeros((744, 30), dtype=np.float32)
            active_mask = np.zeros((744), dtype=np.int32)

        dataset_ = {
            "id": id_,
            # "table_input": torch.tensor(table_feature, dtype=torch.float32),
            "time_input": torch.tensor(active_logs, dtype=torch.float32),
            "mask": torch.tensor(active_mask, dtype=torch.int32),
            "output": torch.tensor(sii, dtype=torch.float32),
        }

        return dataset_


def read_parquet(base_dir, id_):
    path = os.path.join(base_dir, f"id={id_}", "part-0.parquet")
    if not os.path.exists(path):
        return None
    return pd.read_parquet(path)


dataset = CMIDataset(
    table_df=train,
    valid_ids=get_valid_ids(train_series_dir),
    base_dir=train_series_dir,
    save_filename="train",
)

# AutoEncoderのモデルのインスタンス化
cnn_model = CNNAutoEncoder().to("cuda")
cnn_model.load_state_dict(torch.load("./assets/cnn_autoencoder.pth"))

# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.0001)
# # データセットからデータを取り出す

# from tqdm import tqdm

# best_model = None
# minimum_loss = 1000000

# for epoch in range(10):
#     print(f"Epoch {epoch}")
#     dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
#     epoch_loss = []
#     tq = tqdm(dataloader)
#     for data in dataloader:
#         optimizer.zero_grad()
#         # table_input = data["table_input"]
#         time_input = data["time_input"].to("cuda")
#         mask = data["mask"]

#         # モデルにデータを入力し、出力を取得
#         cnn_output, embedding = cnn_model(time_input)
#         # 損失の計算
#         loss = criterion(cnn_output, time_input)
#         loss.backward()

#         optimizer.step()

#         epoch_loss.append(loss.item())

#         tq.set_postfix(loss=np.mean(epoch_loss))
#         tq.update()

#     if np.mean(epoch_loss) < minimum_loss:
#         minimum_loss = np.mean(epoch_loss)
#         best_model = cnn_model
#         cnn_model.eval()
#         torch.save(cnn_model.state_dict(), "./assets/cnn_autoencoder.pth")
#         cnn_model.train()

#     print(f"Epoch {epoch} Loss: {np.mean(epoch_loss)}")
#     tq.close()

<All keys matched successfully>

In [5]:
dataset = CMIDataset(
    table_df=train,
    valid_ids=get_valid_ids(train_series_dir),
    base_dir=train_series_dir,
    save_filename="train",
)

# AutoEncoderのモデルのインスタンス化
cnn_model = CNNAutoEncoder().to("cuda")
cnn_model.load_state_dict(torch.load("./assets/cnn_autoencoder.pth"))
# データセットからデータを取り出す

from tqdm import tqdm

best_model = None
minimum_loss = 1000000

print(f"Create Embedding")
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
epoch_loss = []
tq = tqdm(dataloader)

embedding_result = []

for data in dataloader:
    id_ = data["id"][0]
    # table_input = data["table_input"]
    time_input = data["time_input"].to("cuda")
    mask = data["mask"]

    # モデルにデータを入力し、出力を取得
    cnn_output, embedding = cnn_model(time_input)
    # 損失の計算

    mean_embedding = embedding.squeeze(0).mean(axis=-1).cpu().detach().numpy()
    # mean_embedding = embedding.cpu().detach().numpy()

    embedding_result.append({"id": id_, "embedding": mean_embedding})

    tq.update()

tq.close()

Create Embedding


100%|██████████| 996/996 [01:33<00:00, 10.71it/s]


In [6]:
embedding_df_all = None

for row in embedding_result:
    id_ = row["id"]
    embedding = row["embedding"]
    embedding_cols = [f"embedding_{i}" for i in range(embedding.shape[-1])]
    embedding_df = pd.DataFrame(embedding.reshape(1, -1), columns=embedding_cols)
    embedding_df["id"] = id_

    if embedding_df_all is None:
        embedding_df_all = embedding_df
    else:
        embedding_df_all = pd.concat([embedding_df_all, embedding_df], axis=0)

embedding_df_all = embedding_df_all[["id"] + embedding_cols]
embedding_df_all

Unnamed: 0,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,23dafdab,0.934273,1.058295,1.028002,1.107266,1.511564,1.678031,0.614336,0.847055,0.827912,...,1.049939,1.311329,0.996335,1.217725,0.973498,0.754627,1.093903,0.841220,1.063418,1.293948
0,e4614ec6,1.292569,1.470513,1.542059,1.298862,1.744134,1.957554,0.864446,1.181660,0.996835,...,1.476123,1.654581,1.056002,1.295083,1.551957,0.754289,1.351622,1.053791,1.531604,1.537732
0,56ef356c,1.030161,1.136603,1.320513,1.339276,1.893055,1.956167,0.807934,1.109800,1.112787,...,1.311299,1.835630,1.093886,1.421084,1.230976,0.685264,1.468262,1.255411,1.170305,1.344505
0,dcfcd574,1.437756,1.521565,1.574320,1.582690,1.890268,2.237839,0.957044,1.491451,1.149451,...,1.768229,1.925022,1.165750,1.321340,1.601160,0.861113,1.609951,1.270303,1.577081,1.733535
0,338146bd,0.833982,0.835131,1.005347,1.011853,1.326406,1.422274,0.602682,0.865930,0.832643,...,0.994025,1.280672,0.815593,0.957112,0.948529,0.525334,1.072017,0.860638,0.921265,1.005224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2a9e0dee,1.118476,1.154102,1.328735,1.243005,1.651012,1.804135,0.703245,1.077119,0.960325,...,1.295312,1.530142,1.013229,1.196613,1.293185,0.606831,1.258096,0.982478,1.243394,1.284215
0,0eddd8e5,1.003418,1.115285,1.469960,1.305835,2.142858,2.006252,0.750761,1.160900,1.059262,...,1.395912,1.999641,1.101297,1.405424,1.463978,0.790238,1.587342,1.133078,1.314270,1.498254
0,a49eda7f,1.112953,1.273634,1.532017,1.440813,1.984239,1.999086,0.782063,1.181599,1.059615,...,1.443727,1.882844,1.108292,1.392762,1.356991,0.621868,1.475014,1.189176,1.345740,1.492343
0,fa34f945,0.885967,0.950511,0.895033,1.111232,1.495834,1.656162,0.655794,0.761335,0.848918,...,0.853265,1.006434,0.973718,1.090717,0.844414,0.705840,1.027594,0.745894,0.964948,1.118776


In [7]:
train = train.merge(embedding_df_all, on="id", how="left")

***Tabnet***

In [8]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback
import os
import torch
from pytorch_tabnet.callbacks import Callback


class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy="median")
        self.best_model_path = "best_tabnet_model.pt"

    def fit(self, X, y):
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)

        if hasattr(y, "values"):
            y = y.values

        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, y, test_size=0.2, random_state=42
        )

        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=["valid"],
            eval_metric=["mse"],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor="valid_mse",
                    mode="min",
                    save_best_only=True,
                    verbose=True,
                )
            ],
        )

        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file

        return self

    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()

    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result


# TabNet hyperparameters
TabNet_Params = {
    "n_d": 64,  # Width of the decision prediction layer
    "n_a": 64,  # Width of the attention embedding for each step
    "n_steps": 5,  # Number of steps in the architecture
    "gamma": 1.5,  # Coefficient for feature selection regularization
    "n_independent": 2,  # Number of independent GLU layer in each GLU block
    "n_shared": 2,  # Number of shared GLU layer in each GLU block
    "lambda_sparse": 1e-4,  # Sparsity regularization
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": dict(lr=2e-2, weight_decay=1e-5),
    "mask_type": "entmax",
    "scheduler_params": dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
    "verbose": 1,
    "device_name": "cuda" if torch.cuda.is_available() else "cpu",
}


class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(
        self, filepath, monitor="val_loss", mode="min", save_best_only=True, verbose=1
    ):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float("inf") if mode == "min" else -float("inf")

    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        # Check if current metric is better than best
        if (self.mode == "min" and current < self.best) or (
            self.mode == "max" and current > self.best
        ):
            if self.verbose:
                print(
                    f"\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}"
                )
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)  # Save the entire model

In [9]:
import warnings

warnings.filterwarnings("ignore")
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    VotingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
)


def extract_features(df):
    return df[double_columns + add_features + embedding_cols]


seed = 42
oof = []
cv_scores = []
y = None

# Model parameters for LightGBM
Params = {
    "learning_rate": 0.046,
    "max_depth": 12,
    "num_leaves": 478,
    "min_data_in_leaf": 13,
    "feature_fraction": 0.893,
    "bagging_fraction": 0.784,
    "bagging_freq": 4,
    "lambda_l1": 10,  # Increased from 6.59
    "lambda_l2": 0.01,  # Increased from 2.68e-06
}
XGB_Params = {
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1,  # Increased from 0.1
    "reg_lambda": 5,  # Increased from 1
    "random_state": seed,
}


CatBoost_Params = {
    "learning_rate": 0.05,
    "depth": 6,
    "iterations": 200,
    "random_seed": seed,
    "verbose": 0,
    "l2_leaf_reg": 10,  # Increase this value
}

for fold in range(5):
    Light = LGBMRegressor(**Params, random_state=seed, verbose=-1, n_estimators=300)
    XGB_Model = XGBRegressor(**XGB_Params)
    CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
    TabNet_Model = TabNetWrapper(**TabNet_Params)  # New

    model = VotingRegressor(
        estimators=[
            ("lightgbm", Light),
            ("xgboost", XGB_Model),
            ("catboost", CatBoost_Model),
            ("tabnet", TabNet_Model),
        ]
    )
    # model = LGBMRegressor(**Params, random_state=seed, verbose=-1, n_estimators=300)

    with open(f"../divided-datasets/fold_train_ids_{fold}.pkl", "rb") as f:
        fold_train_ids = pickle.load(f)

    with open(f"../divided-datasets/fold_valid_ids_{fold}.pkl", "rb") as f:
        fold_valid_ids = pickle.load(f)

    train_fold = train[train["id"].isin(fold_train_ids)].reset_index(drop=True)
    valid_fold = train[train["id"].isin(fold_valid_ids)].reset_index(drop=True)

    mode = "drop"

    if mode == "impute":
        numeric_cols = train.select_dtypes(include=["float64", "int64"]).columns
        imputer = KNNImputer(n_neighbors=5)
        train_fold[numeric_cols] = imputer.fit_transform(train_fold[numeric_cols])
        test[numeric_cols] = imputer.transform(test[numeric_cols])
    elif mode == "drop":
        train_fold = train_fold[train_fold["sii"].notnull()].reset_index(drop=True)

    train_fold_x = extract_features(train_fold)
    valid_fold_x = extract_features(valid_fold)

    train_fold_y = train_fold["sii"].astype(int)
    valid_fold_y = valid_fold["sii"].astype(int)

    model.fit(train_fold_x, train_fold_y)

    # save model
    with open(f"./assets/model02_{fold}.pkl", "wb") as f:
        pickle.dump(model, f)

    train_pred = model.predict(train_fold_x)
    valid_pred = model.predict(valid_fold_x)

    train_kappa = quadratic_weighted_kappa(
        train_fold_y, train_pred.round(0).astype(int)
    )
    val_kappa = quadratic_weighted_kappa(valid_fold_y, valid_pred.round(0).astype(int))
    cv_scores.append(val_kappa)

    print(
        f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}"
    )

    for i, id_ in enumerate(fold_valid_ids):
        oof.append({"id": id_, "sii": valid_pred[i]})

    if y is None:
        y = valid_fold[["id", "sii"]]
    else:
        y = pd.concat([y, valid_fold[["id", "sii"]]], axis=0).reset_index(drop=True)

oof = pd.DataFrame(oof)

KappaOPtimizer = minimize(
    evaluate_predictions,
    x0=[0.5, 1.5, 2.5],
    args=(y["sii"].astype(int), oof["sii"]),
    method="Nelder-Mead",
)

oof_tuned = threshold_Rounder(oof["sii"], KappaOPtimizer.x)
tKappa = quadratic_weighted_kappa(y["sii"], oof_tuned)
print(f"CV: {np.mean(cv_scores):.4f}")
print(f"tuned Kappa: {tKappa:.3f}")

epoch 0  | loss: 2.78555 | valid_mse: 8.69106 |  0:00:00s

Epoch 0: valid_mse improved from inf to 8.6911
Successfully saved model at best_tabnet_model.pt.zip
epoch 1  | loss: 1.79328 | valid_mse: 1.17087 |  0:00:00s

Epoch 1: valid_mse improved from 8.6911 to 1.1709
Successfully saved model at best_tabnet_model.pt.zip
epoch 2  | loss: 1.32625 | valid_mse: 1.39013 |  0:00:00s
epoch 3  | loss: 1.07773 | valid_mse: 1.12234 |  0:00:00s

Epoch 3: valid_mse improved from 1.1709 to 1.1223
Successfully saved model at best_tabnet_model.pt.zip
epoch 4  | loss: 0.82884 | valid_mse: 14.89091|  0:00:00s
epoch 5  | loss: 0.76702 | valid_mse: 1.39176 |  0:00:00s
epoch 6  | loss: 0.7533  | valid_mse: 1.07529 |  0:00:00s

Epoch 6: valid_mse improved from 1.1223 to 1.0753
Successfully saved model at best_tabnet_model.pt.zip
epoch 7  | loss: 0.77908 | valid_mse: 1.03437 |  0:00:00s

Epoch 7: valid_mse improved from 1.0753 to 1.0344
Successfully saved model at best_tabnet_model.pt.zip
epoch 8  | loss: 0.

In [10]:
oof.to_csv("./oof/oof.csv", index=False)