In [1]:
import numpy as np
import pandas as pd
from src.dataloader_ import *
from src.network_ import *
from src.utils import *

import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer

train_series_dir = "../../inputs/series_train.parquet/"
test_series_dir = "../../inputs/series_test.parquet/"

data_dic_path = "../../inputs/data_dictionary.csv"
sample_submission_path = "../../inputs/sample_submission.csv"
train_path = "../../inputs/train.csv"
test_path = "../../inputs/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)
data_dic = pd.read_csv(data_dic_path)

import os
import random

import numpy as np
import torch


def seed_torch(seed=1029):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


nb_name = os.path.basename(os.getcwd())  # notebook name
seed_torch(seed=42)

In [2]:
import os


def read_parquet(base_dir, id_):
    path = os.path.join(base_dir, f"id={id_}", "part-0.parquet")
    return pd.read_parquet(path)


def get_valid_ids(base_dir):
    return [f.split("=")[1].split(".")[0] for f in os.listdir(base_dir)]


p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="ffcd4dbd")
# p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="10e46254")

scale_columns = [
    "X",
    "Y",
    "Z",
    "enmo",
    "anglez",
    "light",
    "battery_voltage",
]

masked_columns = [
    "masked_X",
    "masked_Y",
    "masked_Z",
    "masked_enmo",
    "masked_anglez",
    "masked_light",
]

original_columns = ["battery_voltage", "non-wear_flag"]

p["non-wear_flag"] = 1 - p["non-wear_flag"]
scaler_features = p[scale_columns].values
scaler = StandardScaler()
p[scale_columns] = scaler.fit_transform(scaler_features)

for mask_col in masked_columns:
    p[mask_col] = p[mask_col.replace("masked_", "")] * p["non-wear_flag"]

p = p.fillna(0.0)

groups = p.groupby("relative_date_PCIAT")
# グループごとにデータフレームのリストに分割
chunks = [group.reset_index(drop=True) for _, group in groups]

use_cols = masked_columns + original_columns + scale_columns
watch_day = len(chunks)
active_logs = np.zeros((31, 17280, len(use_cols)), dtype=np.float32)
active_mask = np.zeros((31), dtype=np.int32)

for i, chunk in enumerate(chunks):
    if i == 0:  #
        active_logs[i, -len(chunk) :, :] = chunk[use_cols].values
    elif i == watch_day:
        active_logs[i, : len(chunk), :] = chunk[use_cols].values
    else:
        array = chunk[use_cols].values
        active_logs[i, : len(array), :] = array

    active_mask[i] = 1

    if i == 30:
        break

active_logs = active_logs.reshape(31, 24, 60, 12, 15)  # 12は1時間の分割数
active_logs_mean = active_logs.mean(axis=3)  # 1時間の分割数で平均を取る # 31, 1440, 15
# active_logs_var = active_logs.var(axis=3)  # 1時間の分割数で分散を取る # 31, 1440, 15
active_logs = np.concatenate([active_logs_mean], axis=-1)  # (31, 24, 30)
# print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)

print(active_logs_mean.shape, active_logs.shape)

active_logs_mean = active_logs.mean(axis=2)  # 1時間の分割数で平均を取る # 31, 1440, 15
active_logs_var = active_logs.var(axis=2)  # 1時間の分割数で分散を取る # 31, 1440, 15
active_logs = np.concatenate(
    [active_logs_mean, active_logs_var], axis=-1
)  # (31, 24, 30)
print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)
active_logs = active_logs.reshape(-1, 30)
print(active_logs.shape)

# active_logs = active_logs.unsqueeze(0)
active_logs = torch.tensor(active_logs, dtype=torch.float32).unsqueeze(0).to("cuda")
print(active_logs.shape)

(31, 24, 60, 15) (31, 24, 60, 15)
(31, 24, 15) (31, 24, 15) (31, 24, 30)
(744, 30)
torch.Size([1, 744, 30])


In [3]:
import torch
import torch.nn as nn


class TransformerAutoEncoder(nn.Module):
    def __init__(self, d_model=128, nhead=4, num_layers=2):
        super(TransformerAutoEncoder, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)
        self.embedding = nn.Linear(30, d_model)
        self.output_layer = nn.Linear(d_model, 30)

    def forward(self, x):
        x = self.embedding(x)  # (batch, day*time, d_model)
        encoded = self.encoder(x.permute(1, 0, 2))  # (day*time, batch, d_model)
        decoded = self.decoder(encoded, encoded)  # (day*time, batch, d_model)
        return (
            self.output_layer(decoded.permute(1, 0, 2)),
            encoded,
        )  # (batch, day*time, hidden)


# Example
model = TransformerAutoEncoder().to("cuda")
input_data = torch.randn(1, 744, 30).to("cuda")
output = model(input_data)


import torch
import torch.nn as nn


class LSTMAutoEncoder(nn.Module):
    def __init__(self, input_size=30, hidden_size=64, num_layers=2):
        super(LSTMAutoEncoder, self).__init__()
        # Encoder LSTM
        self.encoder_lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Decoder LSTM
        self.decoder_lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=input_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        # Encode
        _, (h, _) = self.encoder_lstm(x)
        embedding = h[-1]
        # Decode
        h = (
            h[-1].unsqueeze(1).repeat(1, x.size(1), 1)
        )  # Repeat hidden state for each timestep
        decoded, _ = self.decoder_lstm(h)
        return decoded, embedding


# 実行例
model = LSTMAutoEncoder()
input_data = torch.randn(1, 744, 30)
output, embedding = model(input_data)
print("LSTM AutoEncoder output shape:", output.shape, embedding.shape)

LSTM AutoEncoder output shape: torch.Size([1, 744, 30]) torch.Size([1, 64])


In [4]:
class CNNAutoEncoder(nn.Module):
    def __init__(self):
        super(CNNAutoEncoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(
                in_channels=30, out_channels=64, kernel_size=3, stride=2, padding=1
            ),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(
                128, 64, kernel_size=3, stride=2, padding=1, output_padding=1
            ),
            nn.ReLU(),
            nn.ConvTranspose1d(
                64, 30, kernel_size=3, stride=2, padding=1, output_padding=1
            ),
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (batch, channel, time)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded.permute(0, 2, 1), encoded  # (batch, time, channel)


# 実行例
model = CNNAutoEncoder()
input_data = torch.randn(1, 744, 30)
output, embedding = model(input_data)
print("CNN AutoEncoder output shape:", output.shape, embedding.shape)

CNN AutoEncoder output shape: torch.Size([1, 744, 30]) torch.Size([1, 128, 186])


In [5]:
# 各モデルのインスタンス化
# transformer_model = TransformerAutoEncoder()

# # 正規分布からランダムに(1, 31, 17280, 15)の形状でデータを生成
# input_data = torch.randn(1, 31, 17280, 15)

# # 各モデルにデータを入力し、出力形状を確認
# print("Input shape:", input_data.shape)

# # Transformerモデル
# transformer_output = transformer_model(input_data)
# print("Transformer Model output shape:", transformer_output.shape)

In [6]:
train

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,Winter,0.0,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0


### テーブルデータセット

In [7]:
add_features = [
    "BMI_Age",
    "Internet_Hours_Age",
    "BMI_Internet_Hours",
    "BFP_BMI",
    "FFMI_BFP",
    "FMI_BFP",
    "LST_TBW",
    "BFP_BMR",
    "BFP_DEE",
    "BMR_Weight",
    "DEE_Weight",
    "SMM_Height",
    "Muscle_to_Fat",
    "Hydration_Status",
    "ICW_TBW",
]


def feature_engineering(df):
    # season_cols = [col for col in df.columns if "Season" in col]
    # df = df.drop(season_cols, axis=1)
    df["BMI_Age"] = df["Physical-BMI"] * df["Basic_Demos-Age"]
    df["Internet_Hours_Age"] = (
        df["PreInt_EduHx-computerinternet_hoursday"] * df["Basic_Demos-Age"]
    )
    df["BMI_Internet_Hours"] = (
        df["Physical-BMI"] * df["PreInt_EduHx-computerinternet_hoursday"]
    )
    df["BFP_BMI"] = df["BIA-BIA_Fat"] / df["BIA-BIA_BMI"]
    df["FFMI_BFP"] = df["BIA-BIA_FFMI"] / df["BIA-BIA_Fat"]
    df["FMI_BFP"] = df["BIA-BIA_FMI"] / df["BIA-BIA_Fat"]
    df["LST_TBW"] = df["BIA-BIA_LST"] / df["BIA-BIA_TBW"]
    df["BFP_BMR"] = df["BIA-BIA_Fat"] * df["BIA-BIA_BMR"]
    df["BFP_DEE"] = df["BIA-BIA_Fat"] * df["BIA-BIA_DEE"]
    df["BMR_Weight"] = df["BIA-BIA_BMR"] / df["Physical-Weight"]
    df["DEE_Weight"] = df["BIA-BIA_DEE"] / df["Physical-Weight"]
    df["SMM_Height"] = df["BIA-BIA_SMM"] / df["Physical-Height"]
    df["Muscle_to_Fat"] = df["BIA-BIA_SMM"] / df["BIA-BIA_FMI"]
    df["Hydration_Status"] = df["BIA-BIA_TBW"] / df["Physical-Weight"]
    df["ICW_TBW"] = df["BIA-BIA_ICW"] / df["BIA-BIA_TBW"]

    return df


train = feature_engineering(train)
train = train.replace([np.inf, -np.inf], np.nan)
for add_ in add_features:
    train[add_] = train[add_].fillna(0.0)
train = train.dropna(thresh=10, axis=0)

test = feature_engineering(test)
test = test.replace([np.inf, -np.inf], np.nan)
for add_ in add_features:
    test[add_] = test[add_].fillna(0.0)
test = test.dropna(thresh=10, axis=0)

In [8]:
# onehotEncoderの作成
from sklearn.preprocessing import OneHotEncoder


double_columns = [
    "Basic_Demos-Age",
    "Basic_Demos-Sex",
    "CGAS-CGAS_Score",
    "Physical-BMI",
    "Physical-Height",
    "Physical-Weight",
    "Physical-Waist_Circumference",
    "Physical-Diastolic_BP",
    "Physical-HeartRate",
    "Physical-Systolic_BP",
    "Fitness_Endurance-Max_Stage",
    "Fitness_Endurance-Time_Mins",
    "Fitness_Endurance-Time_Sec",
    "FGC-FGC_CU",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_GSND",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD",
    "FGC-FGC_GSD_Zone",
    "FGC-FGC_PU",
    "FGC-FGC_PU_Zone",
    "FGC-FGC_SRL",
    "FGC-FGC_SRL_Zone",
    "FGC-FGC_SRR",
    "FGC-FGC_SRR_Zone",
    "FGC-FGC_TL",
    "FGC-FGC_TL_Zone",
    "BIA-BIA_Activity_Level_num",
    "BIA-BIA_BMC",
    "BIA-BIA_BMI",
    "BIA-BIA_BMR",
    "BIA-BIA_DEE",
    "BIA-BIA_ECW",
    "BIA-BIA_FFM",
    "BIA-BIA_FFMI",
    "BIA-BIA_FMI",
    "BIA-BIA_Fat",
    "BIA-BIA_Frame_num",
    "BIA-BIA_ICW",
    "BIA-BIA_LDM",
    "BIA-BIA_LST",
    "BIA-BIA_SMM",
    "BIA-BIA_TBW",
    "PAQ_A-PAQ_A_Total",
    "PAQ_C-PAQ_C_Total",
    "SDS-SDS_Total_Raw",
    "SDS-SDS_Total_T",
    "PreInt_EduHx-computerinternet_hoursday",
]
from sklearn.preprocessing import StandardScaler


def create_dataset_(df, scaler=None, train=True):

    if scaler is None:
        print("create new scaler")
        scaler = StandardScaler()
        scaler.fit(df[double_columns + add_features])
        with open("./assets/scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)

    double_feature = scaler.transform(df[double_columns + add_features])
    # 欠損値の補完
    double_feature = np.nan_to_num(double_feature)

    ids = df["id"].values.reshape(-1, 1)
    X = double_feature

    # DataFrameの作成
    ids_df = pd.DataFrame(ids, columns=["id"])
    X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])

    if train:
        y = df["sii"].fillna(-1).values.reshape(-1, 1)
        y_df = pd.DataFrame(y, columns=["sii"])
        df = pd.concat([ids_df, X_df, y_df], axis=1)
    else:
        df = pd.concat([ids_df, X_df], axis=1)
    return df, scaler


train, scaler = create_dataset_(train)
test = create_dataset_(test, scaler=scaler, train=False)[0]

create new scaler


In [None]:
class CMIDataset(Dataset):
    def __init__(self, table_df, valid_ids, base_dir, save_filename):
        self.base_dir = base_dir
        self.table_df = table_df
        self.valid_ids = valid_ids
        self.save_filename = save_filename
        self.scale_columns = [
            "X",
            "Y",
            "Z",
            "enmo",
            "anglez",
            "light",
            "battery_voltage",
        ]

        self.masked_columns = [
            "masked_X",
            "masked_Y",
            "masked_Z",
            "masked_enmo",
            "masked_anglez",
            "masked_light",
        ]

        self.original_columns = ["battery_voltage", "non-wear_flag"]

    def __len__(self):
        return len(self.valid_ids)

    def __getitem__(self, idx):
        # テーブルデータの抽出
        id_ = self.valid_ids[idx]

        save_dir = f"/home/tatsuya/code/projects/kaggle/ChildMindInstitute2024/precreated_dataset/{self.save_filename}/"
        save_path = os.path.join(save_dir, id_)

        table = self.table_df.loc[self.table_df["id"] == self.valid_ids[idx], :]
        table_feature = table.drop(columns=["id", "sii"]).values
        sii = table["sii"].values

        # 時系列データの抽出
        use_cols = self.masked_columns + self.original_columns + self.scale_columns
        p = read_parquet(self.base_dir, self.valid_ids[idx])

        if p is not None:
            p["non-wear_flag"] = 1 - p["non-wear_flag"]
            scaler_features = p[scale_columns].values
            scaler = StandardScaler()
            p[scale_columns] = scaler.fit_transform(scaler_features)

            for mask_col in masked_columns:
                p[mask_col] = p[mask_col.replace("masked_", "")] * p["non-wear_flag"]

            p = p.fillna(0.0)

            groups = p.groupby("relative_date_PCIAT")
            # グループごとにデータフレームのリストに分割
            chunks = [group.reset_index(drop=True) for _, group in groups]

            use_cols = masked_columns + original_columns + scale_columns
            watch_day = len(chunks)
            active_logs = np.zeros((31, 17280, len(use_cols)), dtype=np.float32)
            active_mask = np.zeros((31), dtype=np.int32)

            for i, chunk in enumerate(chunks):
                if i == 0:  #
                    active_logs[i, -len(chunk) :, :] = chunk[use_cols].values
                elif i == watch_day:
                    active_logs[i, : len(chunk), :] = chunk[use_cols].values
                else:
                    array = chunk[use_cols].values
                    active_logs[i, : len(array), :] = array

                active_mask[i] = 1

                if i == 30:
                    break

            active_logs = active_logs.reshape(31, 24, 60, 12, 15)  # 12は1時間の分割数
            active_logs_mean = active_logs.mean(
                axis=3
            )  # 1時間の分割数で平均を取る # 31, 1440, 15
            # active_logs_var = active_logs.var(axis=3)  # 1時間の分割数で分散を取る # 31, 1440, 15
            active_logs = np.concatenate([active_logs_mean], axis=-1)  # (31, 24, 30)
            # print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)

            # print(active_logs_mean.shape, active_logs.shape)

            active_logs_mean = active_logs.mean(
                axis=2
            )  # 1時間の分割数で平均を取る # 31, 1440, 15
            active_logs_var = active_logs.var(
                axis=2
            )  # 1時間の分割数で分散を取る # 31, 1440, 15
            active_logs = np.concatenate(
                [active_logs_mean, active_logs_var], axis=-1
            )  # (31, 24, 30)
            # print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)
            active_logs = active_logs.reshape(-1, 30)

        else:
            active_logs = np.zeros((744, 30), dtype=np.float32)
            active_mask = np.zeros((744), dtype=np.int32)

        dataset_ = {
            "id": id_,
            "table_input": torch.tensor(table_feature, dtype=torch.float32),
            "time_input": torch.tensor(active_logs, dtype=torch.float32),
            "mask": torch.tensor(active_mask, dtype=torch.int32),
            "output": torch.tensor(sii, dtype=torch.float32),
        }

        return dataset_


def read_parquet(base_dir, id_):
    path = os.path.join(base_dir, f"id={id_}", "part-0.parquet")
    if not os.path.exists(path):
        return None
    return pd.read_parquet(path)


dataset = CMIDataset(
    table_df=train,
    valid_ids=get_valid_ids(train_series_dir),
    base_dir=train_series_dir,
    save_filename="train",
)

# AutoEncoderのモデルのインスタンス化
# transformer_model = TransformerAutoEncoder().to("cuda")
# transformer_model.load_state_dict(torch.load("./assets/transformer_autoencoder.pth"))
# lstm_model = LSTMAutoEncoder().to("cuda")
cnn_model = CNNAutoEncoder().to("cuda")
cnn_model.load_state_dict(torch.load("./assets/cnn_autoencoder.pth"))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.0001)
# データセットからデータを取り出す

from tqdm import tqdm

best_model = None
minimum_loss = 1000000

# for epoch in range(10):
#     print(f"Epoch {epoch}")
#     dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
#     epoch_loss = []
#     tq = tqdm(dataloader)
#     for data in dataloader:
#         optimizer.zero_grad()
#         table_input = data["table_input"]
#         time_input = data["time_input"].to("cuda")
#         mask = data["mask"]

#         # モデルにデータを入力し、出力を取得
#         cnn_output, embedding = cnn_model(time_input)
#         # 損失の計算
#         loss = criterion(cnn_output, time_input)
#         loss.backward()

#         optimizer.step()

#         epoch_loss.append(loss.item())

#         tq.set_postfix(loss=np.mean(epoch_loss))
#         tq.update()

#     if np.mean(epoch_loss) < minimum_loss:
#         minimum_loss = np.mean(epoch_loss)
#         best_model = cnn_model
#         cnn_model.eval()
#         torch.save(cnn_model.state_dict(), "./assets/cnn_autoencoder.pth")
#         cnn_model.train()

#     print(f"Epoch {epoch} Loss: {np.mean(epoch_loss)}")
#     tq.close()

In [10]:
dataset = CMIDataset(
    table_df=train,
    valid_ids=get_valid_ids(train_series_dir),
    base_dir=train_series_dir,
    save_filename="train",
)

# AutoEncoderのモデルのインスタンス化
# transformer_model = TransformerAutoEncoder().to("cuda")
# transformer_model.load_state_dict(torch.load("./assets/transformer_autoencoder.pth"))
cnn_model = CNNAutoEncoder().to("cuda")
cnn_model.load_state_dict(torch.load("./assets/cnn_autoencoder.pth"))
# データセットからデータを取り出す

from tqdm import tqdm

best_model = None
minimum_loss = 1000000

print(f"Create Embedding")
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
epoch_loss = []
tq = tqdm(dataloader)

embedding_result = []

for data in dataloader:
    id_ = data["id"][0]
    table_input = data["table_input"]
    time_input = data["time_input"].to("cuda")
    mask = data["mask"]

    # モデルにデータを入力し、出力を取得
    cnn_output, embedding = cnn_model(time_input)
    # 損失の計算

    mean_embedding = embedding.squeeze(0).mean(axis=-1).cpu().detach().numpy()
    # mean_embedding = embedding.cpu().detach().numpy()

    embedding_result.append({"id": id_, "embedding": mean_embedding})

    tq.update()

tq.close()

Create Embedding


100%|██████████| 996/996 [01:30<00:00, 11.03it/s]


In [11]:
embedding_df_all = None

for row in embedding_result:
    id_ = row["id"]
    embedding = row["embedding"]
    embedding_cols = [f"embedding_{i}" for i in range(embedding.shape[-1])]
    embedding_df = pd.DataFrame(embedding.reshape(1, -1), columns=embedding_cols)
    embedding_df["id"] = id_

    if embedding_df_all is None:
        embedding_df_all = embedding_df
    else:
        embedding_df_all = pd.concat([embedding_df_all, embedding_df], axis=0)

embedding_df_all = embedding_df_all[["id"] + embedding_cols]
embedding_df_all

Unnamed: 0,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,23dafdab,1.728628,1.331131,1.257535,1.118445,0.987019,1.087559,1.131072,0.995585,1.255212,...,1.302553,0.938135,0.814442,2.029479,1.367574,1.924622,1.463870,1.350334,2.242436,2.051528
0,e4614ec6,2.314725,1.688984,1.519484,1.735348,1.193666,1.536869,1.550757,1.390435,1.811116,...,1.619692,1.358370,1.046536,2.621617,1.780002,2.258282,2.161840,1.815999,3.007010,2.582609
0,56ef356c,2.159015,1.613339,1.231065,1.499338,1.035320,1.470763,1.512265,1.431377,1.769337,...,1.495479,0.998973,0.892488,2.294397,1.778150,2.344225,2.104154,1.832239,2.810589,2.482788
0,dcfcd574,2.620850,1.587554,1.519351,1.526752,1.178476,1.720902,1.462330,1.542035,2.176839,...,1.697147,1.257415,0.930574,2.445918,1.790776,2.725111,2.195935,2.077631,2.984224,2.751604
0,338146bd,1.702317,1.146785,0.947993,1.113863,0.825012,1.121790,1.108613,1.016863,1.336934,...,1.141957,0.823104,0.675855,1.746763,1.330033,1.805014,1.522724,1.314972,2.145939,1.884954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2a9e0dee,2.158872,1.343930,1.242806,1.386952,0.977405,1.362802,1.344970,1.286474,1.639083,...,1.352335,1.042948,0.783055,2.138570,1.547521,2.187941,1.923652,1.662507,2.642068,2.314295
0,0eddd8e5,2.484117,1.468063,1.276847,1.519080,1.297554,1.425604,1.605949,1.533385,1.932054,...,1.541252,1.163978,0.947115,2.526440,1.801957,2.367331,2.304687,2.076990,2.887928,2.770179
0,a49eda7f,2.429227,1.728591,1.312529,1.631665,1.108303,1.534565,1.537943,1.418095,1.846664,...,1.545845,1.069983,0.899543,2.499636,1.783232,2.476268,2.219812,1.915287,3.120052,2.707149
0,fa34f945,1.557142,0.875563,1.173156,0.954732,0.988366,1.082507,1.034887,0.900241,1.134255,...,1.228432,0.866287,0.633695,1.558471,1.345874,1.759833,1.225766,1.281289,1.923254,1.814930


## Metric

In [12]:
from sklearn.metrics import *


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")


def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(
        oof_non_rounded < thresholds[0],
        0,
        np.where(
            oof_non_rounded < thresholds[1],
            1,
            np.where(oof_non_rounded < thresholds[2], 2, 3),
        ),
    )


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

## Model, Dataset

In [13]:
train = train[train["sii"] != -1].reset_index(drop=True)
train = train.merge(embedding_df_all, on="id", how="left")
# train.fillna(0.0, inplace=True)

In [14]:
feature_imputer = KNNImputer(n_neighbors=5)
sii_imputer = KNNImputer(n_neighbors=5)

numeric_cols = train.columns.drop(
    ["id", "sii"]
)  # test.select_dtypes(include=["float64", "int64"]).columns
numeric_feature_cols = numeric_cols.copy()
# numeric_feature_cols = numeric_feature_cols.drop("sii")

# numeric_sii_cols = train.select_dtypes(include=["float64", "int64"]).columns

# sii_inputed = sii_imputer.fit_transform(train[numeric_sii_cols])

train[numeric_cols] = feature_imputer.fit_transform(train[numeric_feature_cols])

with open("feature_imputer.pkl", "wb") as f:
    pickle.dump(feature_imputer, f)

train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,00008ff9,-1.520226,-0.770846,-0.647115,-0.480065,-1.331104,-0.858103,0.0,0.0,0.0,...,1.428142,1.037569,0.784901,2.058612,1.580202,2.141743,1.709615,1.614125,2.443338,2.158067
1,000fd460,-0.401093,-0.770846,0.0,-1.035839,-1.063457,-0.965819,-0.948658,0.393202,-0.848813,...,1.160853,0.845095,0.668588,1.787926,1.312256,1.872245,1.547791,1.371346,2.154535,1.842573
2,00105258,-0.12131,1.297277,0.24825,-0.524777,0.074043,-0.301573,0.0,-0.34161,0.90777,...,1.404206,1.106323,0.821356,2.140345,1.584761,2.223369,1.837487,1.689799,2.566232,2.28033
3,00115b9f,-0.401093,-0.770846,0.24825,-0.203318,0.007131,-0.166928,0.0,-0.709017,1.127343,...,0.635673,0.428215,0.316652,0.767731,0.634796,0.851805,0.541776,0.496939,0.895844,0.785507
4,001f3379,0.718039,1.297277,-0.691883,0.576564,0.475513,0.519759,0.0,-0.709017,-0.62924,...,1.504753,0.889019,0.727498,1.811945,1.743254,2.050748,1.620137,1.58883,2.485625,2.320382


In [15]:
# train.fillna(0.0, inplace=True)

In [16]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# from keras.models import Model
# from keras.layers import Input, Dense
# from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    VotingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

SEED = 42


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")


def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(
        oof_non_rounded < thresholds[0],
        0,
        np.where(
            oof_non_rounded < thresholds[1],
            1,
            np.where(oof_non_rounded < thresholds[2], 2, 3),
        ),
    )


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)


def TrainML(model_class, test_data, model_name):  # -> Series[Any] | Any:
    id_ = train["id"]
    X = train.drop(["sii", "id"], axis=1)
    y = train["sii"]

    y_df = train[["id", "sii"]]

    n_splits = 5
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(
        tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)
    ):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        train_id = id_.iloc[train_idx]
        val_id = id_.iloc[test_idx]

        # 各idを保存
        with open(f"./assets/train_id_{fold}.pkl", "wb") as f:
            pickle.dump(train_id, f)

        with open(f"./assets/val_id_{fold}.pkl", "wb") as f:
            pickle.dump(val_id, f)

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(
            y_train, y_train_pred.round(0).astype(int)
        )
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # test_preds[:, fold] = model.predict(test_data.drop(columns=["id"]))

        print(
            f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}"
        )
        # clear_output(wait=True)

        voting_model = model
        # modelの保存

        if model_name != "lgbm":
            with open(f"./assets/{model_name}_{fold}.pkl", "wb") as f:
                pickle.dump(voting_model, f)
        else:
            # lgbmの場合
            model.booster_.save_model(f"./assets/{model_name}_{fold}.txt")

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"CV: {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(y, oof_non_rounded),
        method="Nelder-Mead",
    )
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"tuned Kappa: {tKappa:.3f}")

    # tpm = test_preds.mean(axis=1)
    # tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    # y["pred"] = oof_non_rounded.values
    y_df["pred"] = oof_non_rounded
    return y_df


# Model parameters for LightGBM
Params = {
    "learning_rate": 0.046,
    "max_depth": 12,
    "num_leaves": 478,
    "min_data_in_leaf": 13,
    "feature_fraction": 0.893,
    "bagging_fraction": 0.784,
    "bagging_freq": 4,
    "lambda_l1": 10,  # Increased from 6.59
    "lambda_l2": 0.01,  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1,  # Increased from 0.1
    "reg_lambda": 5,  # Increased from 1
    "random_state": SEED,
}


CatBoost_Params = {
    "learning_rate": 0.05,
    "depth": 6,
    "iterations": 200,
    "random_seed": SEED,
    "verbose": 0,
    "l2_leaf_reg": 10,  # Increase this value
}

# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

lgbm = Light
xgb = XGB_Model
ctb = CatBoost_Model

# Train the ensemble model
lgbm_oof = TrainML(lgbm, test, model_name="lgbm")
xgb_oof = TrainML(xgb, test, model_name="xgb")
ctb_oof = TrainML(ctb, test, model_name="ctb")

Training Folds:   0%|          | 0/5 [00:00<?, ?it/s]



Training Folds:  20%|██        | 1/5 [00:00<00:02,  1.94it/s]

Fold 1 - Train QWK: 0.7856, Validation QWK: 0.3923


Training Folds:  40%|████      | 2/5 [00:00<00:01,  2.35it/s]

Fold 2 - Train QWK: 0.7970, Validation QWK: 0.4294


Training Folds:  60%|██████    | 3/5 [00:01<00:00,  2.50it/s]

Fold 3 - Train QWK: 0.8029, Validation QWK: 0.4079


Training Folds:  80%|████████  | 4/5 [00:01<00:00,  2.60it/s]

Fold 4 - Train QWK: 0.8035, Validation QWK: 0.3694


Training Folds: 100%|██████████| 5/5 [00:01<00:00,  2.52it/s]


Fold 5 - Train QWK: 0.8019, Validation QWK: 0.3606
Mean Train QWK --> 0.7982
CV: 0.3919
tuned Kappa: 0.453


Training Folds:  20%|██        | 1/5 [00:00<00:02,  1.67it/s]

Fold 1 - Train QWK: 0.9233, Validation QWK: 0.3892


Training Folds:  40%|████      | 2/5 [00:01<00:01,  1.64it/s]

Fold 2 - Train QWK: 0.9206, Validation QWK: 0.4264


Training Folds:  60%|██████    | 3/5 [00:01<00:01,  1.64it/s]

Fold 3 - Train QWK: 0.9282, Validation QWK: 0.3561


Training Folds:  80%|████████  | 4/5 [00:02<00:00,  1.66it/s]

Fold 4 - Train QWK: 0.9319, Validation QWK: 0.3586


Training Folds: 100%|██████████| 5/5 [00:03<00:00,  1.66it/s]


Fold 5 - Train QWK: 0.9282, Validation QWK: 0.3519
Mean Train QWK --> 0.9264
CV: 0.3765
tuned Kappa: 0.436


Training Folds:  20%|██        | 1/5 [00:00<00:02,  1.36it/s]

Fold 1 - Train QWK: 0.5194, Validation QWK: 0.3686


Training Folds:  40%|████      | 2/5 [00:01<00:02,  1.22it/s]

Fold 2 - Train QWK: 0.5474, Validation QWK: 0.4242


Training Folds:  60%|██████    | 3/5 [00:02<00:01,  1.27it/s]

Fold 3 - Train QWK: 0.5345, Validation QWK: 0.4020


Training Folds:  80%|████████  | 4/5 [00:03<00:00,  1.32it/s]

Fold 4 - Train QWK: 0.5412, Validation QWK: 0.3329


Training Folds: 100%|██████████| 5/5 [00:03<00:00,  1.30it/s]

Fold 5 - Train QWK: 0.5465, Validation QWK: 0.3330
Mean Train QWK --> 0.5378
CV: 0.3721
tuned Kappa: 0.462





In [18]:
with open("./assets/lstm_oof_preds.pkl", "rb") as f:
    lstm_oof = pickle.load(f)

lgbm_oof = lgbm_oof.sort_values("id").reset_index(drop=True)
xgb_oof = xgb_oof.sort_values("id").reset_index(drop=True)
ctb_oof = ctb_oof.sort_values("id").reset_index(drop=True)
lstm_oof = lstm_oof.sort_values("id").reset_index(drop=True)

oof = lstm_oof.rename(columns={"pred_sii": "lstm_oof"})
oof["lgbm_oof"] = lgbm_oof["pred"]
oof["xgb_oof"] = xgb_oof["pred"]
oof["ctb_oof"] = ctb_oof["pred"]


y = xgb_oof["sii"].values
oof = oof[["id", "lstm_oof", "lgbm_oof", "xgb_oof", "ctb_oof"]]

In [19]:
from sklearn.metrics import *


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")


def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(
        oof_non_rounded < thresholds[0],
        0,
        np.where(
            oof_non_rounded < thresholds[1],
            1,
            np.where(oof_non_rounded < thresholds[2], 2, 3),
        ),
    )


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    mean_weight = thresholds[:4]
    thresholds = thresholds[4:]

    lstm_pred = oof_non_rounded[:, 0]
    lgbm_pred = oof_non_rounded[:, 1]
    xgb_pred = oof_non_rounded[:, 2]
    ctb_pred = oof_non_rounded[:, 3]
    oof_non_rounded = (
        mean_weight[0] * lstm_pred
        + mean_weight[1] * lgbm_pred
        + mean_weight[2] * xgb_pred
        + mean_weight[3] * ctb_pred
    )
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)


KappaOPtimizer = minimize(
    evaluate_predictions,
    x0=[0.25, 0.25, 0.25, 0.25, 0.5, 1.5, 2.5],
    args=(y, oof.drop(columns=["id"]).values),
    method="Nelder-Mead",
)

pred = oof.drop(columns=["id"]).values
pred = (
    pred[:, 0] * KappaOPtimizer.x[0]
    + pred[:, 1] * KappaOPtimizer.x[1]
    + pred[:, 2] * KappaOPtimizer.x[2]
    + pred[:, 3] * KappaOPtimizer.x[3]
)

oof_tuned = threshold_Rounder(pred, KappaOPtimizer.x[4:])
tKappa = quadratic_weighted_kappa(y, oof_tuned)

print(f"tuned Kappa: {tKappa:.3f}")

tuned Kappa: 0.471


In [20]:
KappaOPtimizer.x

array([0.25978536, 0.25245911, 0.26024378, 0.25166782, 0.58545671,
       1.09664355, 2.63238037])

In [22]:
import lightgbm as lgb

model_path = f"./assets/lgbm_0.txt"
loaded_model = lgb.Booster(model_file=model_path)

In [26]:
loaded_model.predict(train.drop(["id", "sii"], axis=1))

array([0.99616248, 0.14798295, 0.42020302, ..., 1.09092573, 0.67819407,
       0.43826146])

In [None]:
final_submission["sii"] = (
    final_submission["lstm"] * 0.25978536
    + final_submission["lgbm"] * 0.25245911
    + final_submission["xgb"] * 0.26024378
    + final_submission["ctb"] * 0.25166782
)