In [1]:
import numpy as np
import pandas as pd
from src.dataloader_ import *
from src.network_ import *
from src.utils import *

import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer

train_series_dir = "../../inputs/series_train.parquet/"
test_series_dir = "../../inputs/series_test.parquet/"

data_dic_path = "../../inputs/data_dictionary.csv"
sample_submission_path = "../../inputs/sample_submission.csv"
train_path = "../../inputs/train.csv"
test_path = "../../inputs/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)
data_dic = pd.read_csv(data_dic_path)

import os
import random

import numpy as np
import torch


def seed_torch(seed=1029):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


nb_name = os.path.basename(os.getcwd())  # notebook name
seed_torch(seed=42)

In [2]:
import os


def read_parquet(base_dir, id_):
    path = os.path.join(base_dir, f"id={id_}", "part-0.parquet")
    return pd.read_parquet(path)


def get_valid_ids(base_dir):
    return [f.split("=")[1].split(".")[0] for f in os.listdir(base_dir)]


p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="ffcd4dbd")
# p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="10e46254")

scale_columns = [
    "X",
    "Y",
    "Z",
    "enmo",
    "anglez",
    "light",
    "battery_voltage",
]

masked_columns = [
    "masked_X",
    "masked_Y",
    "masked_Z",
    "masked_enmo",
    "masked_anglez",
    "masked_light",
]

original_columns = ["battery_voltage", "non-wear_flag"]

p["non-wear_flag"] = 1 - p["non-wear_flag"]
scaler_features = p[scale_columns].values
scaler = StandardScaler()
p[scale_columns] = scaler.fit_transform(scaler_features)

for mask_col in masked_columns:
    p[mask_col] = p[mask_col.replace("masked_", "")] * p["non-wear_flag"]

p = p.fillna(0.0)

groups = p.groupby("relative_date_PCIAT")
# グループごとにデータフレームのリストに分割
chunks = [group.reset_index(drop=True) for _, group in groups]

use_cols = masked_columns + original_columns + scale_columns
watch_day = len(chunks)
active_logs = np.zeros((31, 17280, len(use_cols)), dtype=np.float32)
active_mask = np.zeros((31), dtype=np.int32)

for i, chunk in enumerate(chunks):
    if i == 0:  #
        active_logs[i, -len(chunk) :, :] = chunk[use_cols].values
    elif i == watch_day:
        active_logs[i, : len(chunk), :] = chunk[use_cols].values
    else:
        array = chunk[use_cols].values
        active_logs[i, : len(array), :] = array

    active_mask[i] = 1

    if i == 30:
        break

active_logs = active_logs.reshape(31, 24, 60, 12, 15)  # 12は1時間の分割数
active_logs_mean = active_logs.mean(axis=3)  # 1時間の分割数で平均を取る # 31, 1440, 15
# active_logs_var = active_logs.var(axis=3)  # 1時間の分割数で分散を取る # 31, 1440, 15
active_logs = np.concatenate([active_logs_mean], axis=-1)  # (31, 24, 30)
# print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)

print(active_logs_mean.shape, active_logs.shape)

active_logs_mean = active_logs.mean(axis=2)  # 1時間の分割数で平均を取る # 31, 1440, 15
active_logs_var = active_logs.var(axis=2)  # 1時間の分割数で分散を取る # 31, 1440, 15
active_logs = np.concatenate(
    [active_logs_mean, active_logs_var], axis=-1
)  # (31, 24, 30)
print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)
active_logs = active_logs.reshape(-1, 30)
print(active_logs.shape)

# active_logs = active_logs.unsqueeze(0)
active_logs = torch.tensor(active_logs, dtype=torch.float32).unsqueeze(0).to("cuda")
print(active_logs.shape)

(31, 24, 60, 15) (31, 24, 60, 15)
(31, 24, 15) (31, 24, 15) (31, 24, 30)
(744, 30)
torch.Size([1, 744, 30])


In [3]:
import torch
import torch.nn as nn


class TransformerAutoEncoder(nn.Module):
    def __init__(self, d_model=128, nhead=4, num_layers=2):
        super(TransformerAutoEncoder, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)
        self.embedding = nn.Linear(30, d_model)
        self.output_layer = nn.Linear(d_model, 30)

    def forward(self, x):
        x = self.embedding(x)  # (batch, day*time, d_model)
        encoded = self.encoder(x.permute(1, 0, 2))  # (day*time, batch, d_model)
        decoded = self.decoder(encoded, encoded)  # (day*time, batch, d_model)
        return (
            self.output_layer(decoded.permute(1, 0, 2)),
            encoded,
        )  # (batch, day*time, hidden)


# Example
model = TransformerAutoEncoder().to("cuda")
input_data = torch.randn(1, 744, 30).to("cuda")
output = model(input_data)


import torch
import torch.nn as nn


class LSTMAutoEncoder(nn.Module):
    def __init__(self, input_size=30, hidden_size=64, num_layers=2):
        super(LSTMAutoEncoder, self).__init__()
        # Encoder LSTM
        self.encoder_lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Decoder LSTM
        self.decoder_lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=input_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        # Encode
        _, (h, _) = self.encoder_lstm(x)
        embedding = h[-1]
        # Decode
        h = (
            h[-1].unsqueeze(1).repeat(1, x.size(1), 1)
        )  # Repeat hidden state for each timestep
        decoded, _ = self.decoder_lstm(h)
        return decoded, embedding


# 実行例
model = LSTMAutoEncoder()
input_data = torch.randn(1, 744, 30)
output, embedding = model(input_data)
print("LSTM AutoEncoder output shape:", output.shape, embedding.shape)

LSTM AutoEncoder output shape: torch.Size([1, 744, 30]) torch.Size([1, 64])


In [4]:
# 各モデルのインスタンス化
# transformer_model = TransformerAutoEncoder()

# # 正規分布からランダムに(1, 31, 17280, 15)の形状でデータを生成
# input_data = torch.randn(1, 31, 17280, 15)

# # 各モデルにデータを入力し、出力形状を確認
# print("Input shape:", input_data.shape)

# # Transformerモデル
# transformer_output = transformer_model(input_data)
# print("Transformer Model output shape:", transformer_output.shape)

In [5]:
train

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,Winter,0.0,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0


### テーブルデータセット

In [6]:
add_features = [
    "BMI_Age",
    "Internet_Hours_Age",
    "BMI_Internet_Hours",
    "BFP_BMI",
    "FFMI_BFP",
    "FMI_BFP",
    "LST_TBW",
    "BFP_BMR",
    "BFP_DEE",
    "BMR_Weight",
    "DEE_Weight",
    "SMM_Height",
    "Muscle_to_Fat",
    "Hydration_Status",
    "ICW_TBW",
]


def feature_engineering(df):
    # season_cols = [col for col in df.columns if "Season" in col]
    # df = df.drop(season_cols, axis=1)
    df["BMI_Age"] = df["Physical-BMI"] * df["Basic_Demos-Age"]
    df["Internet_Hours_Age"] = (
        df["PreInt_EduHx-computerinternet_hoursday"] * df["Basic_Demos-Age"]
    )
    df["BMI_Internet_Hours"] = (
        df["Physical-BMI"] * df["PreInt_EduHx-computerinternet_hoursday"]
    )
    df["BFP_BMI"] = df["BIA-BIA_Fat"] / df["BIA-BIA_BMI"]
    df["FFMI_BFP"] = df["BIA-BIA_FFMI"] / df["BIA-BIA_Fat"]
    df["FMI_BFP"] = df["BIA-BIA_FMI"] / df["BIA-BIA_Fat"]
    df["LST_TBW"] = df["BIA-BIA_LST"] / df["BIA-BIA_TBW"]
    df["BFP_BMR"] = df["BIA-BIA_Fat"] * df["BIA-BIA_BMR"]
    df["BFP_DEE"] = df["BIA-BIA_Fat"] * df["BIA-BIA_DEE"]
    df["BMR_Weight"] = df["BIA-BIA_BMR"] / df["Physical-Weight"]
    df["DEE_Weight"] = df["BIA-BIA_DEE"] / df["Physical-Weight"]
    df["SMM_Height"] = df["BIA-BIA_SMM"] / df["Physical-Height"]
    df["Muscle_to_Fat"] = df["BIA-BIA_SMM"] / df["BIA-BIA_FMI"]
    df["Hydration_Status"] = df["BIA-BIA_TBW"] / df["Physical-Weight"]
    df["ICW_TBW"] = df["BIA-BIA_ICW"] / df["BIA-BIA_TBW"]

    return df


train = feature_engineering(train)
train = train.replace([np.inf, -np.inf], np.nan)
for add_ in add_features:
    train[add_] = train[add_].fillna(0.0)
train = train.dropna(thresh=10, axis=0)

test = feature_engineering(test)
test = test.replace([np.inf, -np.inf], np.nan)
for add_ in add_features:
    test[add_] = test[add_].fillna(0.0)
test = test.dropna(thresh=10, axis=0)

In [7]:
# onehotEncoderの作成
from sklearn.preprocessing import OneHotEncoder


double_columns = [
    "FGC-FGC_SRR_Zone",
    "BIA-BIA_SMM",
    "Physical-Waist_Circumference",
    "BIA-BIA_FFMI",
    "FGC-FGC_CU",
    "PreInt_EduHx-computerinternet_hoursday",
    "BIA-BIA_ECW",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_SRL_Zone",
    "BIA-BIA_DEE",
    "Physical-Weight",
    "Fitness_Endurance-Time_Mins",
    "FGC-FGC_SRR",
    "SDS-SDS_Total_T",
    "FGC-FGC_PU",
    "BIA-BIA_FFM",
    "FGC-FGC_TL_Zone",
    "Physical-BMI",
    "Physical-Systolic_BP",
    "Physical-HeartRate",
    "BIA-BIA_ICW",
    "Physical-Height",
    "FGC-FGC_SRL",
    "BIA-BIA_BMC",
    "Fitness_Endurance-Time_Sec",
    "BIA-BIA_Frame_num",
    "Basic_Demos-Age",
    "FGC-FGC_GSND_Zone",
    "Basic_Demos-Sex",
    "FGC-FGC_GSND",
    "BIA-BIA_LST",
    "FGC-FGC_TL",
    "BIA-BIA_BMI",
    "BIA-BIA_FMI",
    "PAQ_C-PAQ_C_Total",
    "BIA-BIA_Activity_Level_num",
    "FGC-FGC_GSD",
    "BIA-BIA_BMR",
    "BIA-BIA_Fat",
    "SDS-SDS_Total_Raw",
    "CGAS-CGAS_Score",
    "FGC-FGC_PU_Zone",
    "BIA-BIA_LDM",
    "Fitness_Endurance-Max_Stage",
    "PAQ_A-PAQ_A_Total",
    "BIA-BIA_TBW",
    "FGC-FGC_GSD_Zone",
    "Physical-Diastolic_BP",
]
from sklearn.preprocessing import StandardScaler


def create_dataset_(df, scaler=None, train=True):

    if scaler is None:
        print("create new scaler")
        scaler = StandardScaler()
        scaler.fit(df[double_columns + add_features])
        with open("./assets/scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)

    double_feature = scaler.transform(df[double_columns + add_features])
    # 欠損値の補完
    double_feature = np.nan_to_num(double_feature)

    ids = df["id"].values.reshape(-1, 1)
    X = double_feature

    # DataFrameの作成
    ids_df = pd.DataFrame(ids, columns=["id"])
    X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])

    if train:
        y = df["sii"].fillna(-1).values.reshape(-1, 1)
        y_df = pd.DataFrame(y, columns=["sii"])
        df = pd.concat([ids_df, X_df, y_df], axis=1)
    else:
        df = pd.concat([ids_df, X_df], axis=1)
    return df, scaler


train, scaler = create_dataset_(train)
test = create_dataset_(test, scaler=scaler, train=False)[0]

create new scaler


In [8]:
feature_imputer = KNNImputer(n_neighbors=5)
sii_imputer = KNNImputer(n_neighbors=5)

numeric_cols = test.select_dtypes(include=["float64", "int64"]).columns
numeric_feature_cols = numeric_cols.copy()
# numeric_feature_cols = numeric_feature_cols.drop("sii")

numeric_sii_cols = train.select_dtypes(include=["float64", "int64"]).columns

sii_inputed = sii_imputer.fit_transform(train[numeric_sii_cols])
feature_imputer.fit(test[numeric_feature_cols])
feature_inputed = feature_imputer.fit_transform(train[numeric_feature_cols])

train_imputed = pd.DataFrame(feature_inputed, columns=numeric_feature_cols)

for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]

train_imputed["sii"] = train["sii"]
train = train_imputed

# train = train[train["sii"] > -1].reset_index(drop=True)
train = train[train["sii"].notnull()].reset_index(drop=True)

# sii_impute = pd.DataFrame(sii_inputed, columns=numeric_sii_cols)
# sii_impute["sii"] = sii_impute["sii"].round().astype(int)
# train["sii"] = sii_impute["sii"]

with open("feature_imputer.pkl", "wb") as f:
    pickle.dump(feature_imputer, f)

train.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,id,sii
0,-1.277596,-0.176702,0.0,-0.209436,-0.953788,1.771623,-0.1716,-0.953742,-1.274301,-0.20197,...,0.015691,0.01563,0.531031,0.540327,0.113954,0.083339,0.25165,1.296492,00008ff9,2.0
1,0.78272,-0.225858,-0.948658,-0.380787,-0.699663,-0.96883,-0.202128,-0.953742,0.784744,-0.199625,...,0.01527,0.015181,0.629987,0.643793,0.021863,0.304658,0.209698,1.388511,000fd460,0.0
2,0.78272,0.0,0.0,0.0,0.740376,0.858138,0.0,1.048502,0.784744,0.0,...,0.014948,0.014837,-0.37456,-0.406553,-0.263102,-0.139659,-0.235959,-0.993169,00105258,0.0
3,-1.277596,-0.09413,0.0,-0.165178,0.570959,-0.96883,-0.07144,1.048502,-1.274301,-0.049816,...,0.016791,0.016926,0.309486,0.353385,0.156596,0.079541,0.191155,1.031686,00115b9f,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014948,0.014837,-0.37456,-0.406553,-0.263102,-0.139659,-0.235959,-0.993169,0016bb22,-1.0


In [9]:
class CMIDataset(Dataset):
    def __init__(self, table_df, valid_ids, base_dir, save_filename):
        self.base_dir = base_dir
        self.table_df = table_df
        self.valid_ids = valid_ids
        self.save_filename = save_filename
        self.scale_columns = [
            "X",
            "Y",
            "Z",
            "enmo",
            "anglez",
            "light",
            "battery_voltage",
        ]

        self.masked_columns = [
            "masked_X",
            "masked_Y",
            "masked_Z",
            "masked_enmo",
            "masked_anglez",
            "masked_light",
        ]

        self.original_columns = ["battery_voltage", "non-wear_flag"]

    def __len__(self):
        return len(self.valid_ids)

    def __getitem__(self, idx):
        # テーブルデータの抽出
        id_ = self.valid_ids[idx]

        save_dir = f"/home/tatsuya/code/projects/kaggle/ChildMindInstitute2024/precreated_dataset/{self.save_filename}/"
        save_path = os.path.join(save_dir, id_)

        table = self.table_df.loc[self.table_df["id"] == self.valid_ids[idx], :]
        table_feature = table.drop(columns=["id", "sii"]).values
        sii = table["sii"].values

        # 時系列データの抽出
        use_cols = self.masked_columns + self.original_columns + self.scale_columns
        p = read_parquet(self.base_dir, self.valid_ids[idx])

        if p is not None:
            p["non-wear_flag"] = 1 - p["non-wear_flag"]
            scaler_features = p[scale_columns].values
            scaler = StandardScaler()
            p[scale_columns] = scaler.fit_transform(scaler_features)

            for mask_col in masked_columns:
                p[mask_col] = p[mask_col.replace("masked_", "")] * p["non-wear_flag"]

            p = p.fillna(0.0)

            groups = p.groupby("relative_date_PCIAT")
            # グループごとにデータフレームのリストに分割
            chunks = [group.reset_index(drop=True) for _, group in groups]

            use_cols = masked_columns + original_columns + scale_columns
            watch_day = len(chunks)
            active_logs = np.zeros((31, 17280, len(use_cols)), dtype=np.float32)
            active_mask = np.zeros((31), dtype=np.int32)

            for i, chunk in enumerate(chunks):
                if i == 0:  #
                    active_logs[i, -len(chunk) :, :] = chunk[use_cols].values
                elif i == watch_day:
                    active_logs[i, : len(chunk), :] = chunk[use_cols].values
                else:
                    array = chunk[use_cols].values
                    active_logs[i, : len(array), :] = array

                active_mask[i] = 1

                if i == 30:
                    break

            active_logs = active_logs.reshape(31, 24, 60, 12, 15)  # 12は1時間の分割数
            active_logs_mean = active_logs.mean(
                axis=3
            )  # 1時間の分割数で平均を取る # 31, 1440, 15
            # active_logs_var = active_logs.var(axis=3)  # 1時間の分割数で分散を取る # 31, 1440, 15
            active_logs = np.concatenate([active_logs_mean], axis=-1)  # (31, 24, 30)
            # print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)

            # print(active_logs_mean.shape, active_logs.shape)

            active_logs_mean = active_logs.mean(
                axis=2
            )  # 1時間の分割数で平均を取る # 31, 1440, 15
            active_logs_var = active_logs.var(
                axis=2
            )  # 1時間の分割数で分散を取る # 31, 1440, 15
            active_logs = np.concatenate(
                [active_logs_mean, active_logs_var], axis=-1
            )  # (31, 24, 30)
            # print(active_logs_mean.shape, active_logs_var.shape, active_logs.shape)
            active_logs = active_logs.reshape(-1, 30)

        else:
            active_logs = np.zeros((744, 30), dtype=np.float32)
            active_mask = np.zeros((744), dtype=np.int32)

        dataset_ = {
            "id": id_,
            "table_input": torch.tensor(table_feature, dtype=torch.float32),
            "time_input": torch.tensor(active_logs, dtype=torch.float32),
            "mask": torch.tensor(active_mask, dtype=torch.int32),
            "output": torch.tensor(sii, dtype=torch.float32),
        }

        return dataset_


def read_parquet(base_dir, id_):
    path = os.path.join(base_dir, f"id={id_}", "part-0.parquet")
    if not os.path.exists(path):
        return None
    return pd.read_parquet(path)


dataset = CMIDataset(
    table_df=train,
    valid_ids=get_valid_ids(train_series_dir),
    base_dir=train_series_dir,
    save_filename="train",
)

# AutoEncoderのモデルのインスタンス化
# transformer_model = TransformerAutoEncoder().to("cuda")
# transformer_model.load_state_dict(torch.load("./assets/transformer_autoencoder.pth"))
lstm_model = LSTMAutoEncoder().to("cuda")
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.0001)
# データセットからデータを取り出す

from tqdm import tqdm

best_model = None
minimum_loss = 1000000

for epoch in range(10):
    print(f"Epoch {epoch}")
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    epoch_loss = []
    tq = tqdm(dataloader)
    for data in dataloader:
        optimizer.zero_grad()
        table_input = data["table_input"]
        time_input = data["time_input"].to("cuda")
        mask = data["mask"]

        # モデルにデータを入力し、出力を取得
        lstm_output, embedding = lstm_model(time_input)
        # 損失の計算
        loss = criterion(lstm_output, time_input)
        loss.backward()

        optimizer.step()

        epoch_loss.append(loss.item())

        tq.set_postfix(loss=np.mean(epoch_loss))
        tq.update()

    if np.mean(epoch_loss) < minimum_loss:
        minimum_loss = np.mean(epoch_loss)
        best_model = lstm_model
        lstm_model.eval()
        torch.save(lstm_model.state_dict(), "./assets/lstm_autoencoder.pth")
        lstm_model.train()

    print(f"Epoch {epoch} Loss: {np.mean(epoch_loss)}")
    tq.close()

Epoch 0


  0%|          | 0/996 [00:00<?, ?it/s]Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /usr/local/cuda-11.3/lib64/libcublas.so.11: undefined symbol: cublasGetSmCountTarget
100%|██████████| 996/996 [01:50<00:00,  8.99it/s, loss=0.921]


Epoch 0 Loss: 0.9213714597327821
Epoch 1


100%|██████████| 996/996 [01:52<00:00,  8.82it/s, loss=0.918]


Epoch 1 Loss: 0.9182404741424185
Epoch 2


100%|██████████| 996/996 [01:53<00:00,  8.76it/s, loss=0.918]


Epoch 2 Loss: 0.9182000718543298
Epoch 3


100%|██████████| 996/996 [01:52<00:00,  8.86it/s, loss=0.918]


Epoch 3 Loss: 0.918125432966479
Epoch 4


 85%|████████▌ | 847/996 [01:35<00:17,  8.42it/s, loss=0.957]

KeyboardInterrupt: 

In [16]:
dataset = CMIDataset(
    table_df=train,
    valid_ids=get_valid_ids(train_series_dir),
    base_dir=train_series_dir,
    save_filename="train",
)

# AutoEncoderのモデルのインスタンス化
# transformer_model = TransformerAutoEncoder().to("cuda")
# transformer_model.load_state_dict(torch.load("./assets/transformer_autoencoder.pth"))
lstm_model = LSTMAutoEncoder().to("cuda")
lstm_model.load_state_dict(torch.load("./assets/lstm_autoencoder.pth"))
# データセットからデータを取り出す

from tqdm import tqdm

best_model = None
minimum_loss = 1000000

print(f"Create Embedding")
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
epoch_loss = []
tq = tqdm(dataloader)

embedding_result = []

for data in dataloader:
    id_ = data["id"][0]
    table_input = data["table_input"]
    time_input = data["time_input"].to("cuda")
    mask = data["mask"]

    # モデルにデータを入力し、出力を取得
    lstm_output, embedding = lstm_model(time_input)
    # 損失の計算

    # mean_embedding = transformer_output.squeeze(0).mean(axis=0).cpu().detach().numpy()
    mean_embedding = embedding.cpu().detach().numpy()

    embedding_result.append({"id": id_, "embedding": mean_embedding})

    tq.update()

tq.close()

Create Embedding


 11%|█         | 107/996 [00:18<02:37,  5.65it/s]
100%|██████████| 996/996 [01:45<00:00,  9.47it/s]


In [19]:
embedding_df_all = None

for row in embedding_result:
    id_ = row["id"]
    embedding = row["embedding"]
    embedding_cols = [f"embedding_{i}" for i in range(embedding.shape[-1])]
    embedding_df = pd.DataFrame(embedding.reshape(1, -1), columns=embedding_cols)
    embedding_df["id"] = id_

    if embedding_df_all is None:
        embedding_df_all = embedding_df
    else:
        embedding_df_all = pd.concat([embedding_df_all, embedding_df], axis=0)

embedding_df_all = embedding_df_all[["id"] + embedding_cols]
embedding_df_all

Unnamed: 0,id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63
0,23dafdab,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
0,e4614ec6,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
0,56ef356c,-0.762479,-0.732674,-0.558686,0.272567,-0.133033,0.456527,0.014443,0.803861,-0.518356,...,0.544268,-0.496551,0.510508,0.658227,0.603788,-0.453413,-0.780339,0.588516,-0.362753,0.768235
0,dcfcd574,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
0,338146bd,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2a9e0dee,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
0,0eddd8e5,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
0,a49eda7f,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942
0,fa34f945,-0.528813,-0.620962,-0.310135,0.170380,-0.074117,0.282184,-0.002533,0.716247,-0.323445,...,0.242430,-0.411412,0.194835,0.513866,0.444615,-0.311440,-0.503001,0.379872,-0.301026,0.496942


## Metric

In [20]:
from sklearn.metrics import *


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")


def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(
        oof_non_rounded < thresholds[0],
        0,
        np.where(
            oof_non_rounded < thresholds[1],
            1,
            np.where(oof_non_rounded < thresholds[2], 2, 3),
        ),
    )


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

## Model, Dataset

In [21]:
train = train[train["sii"] != -1].reset_index(drop=True)
train = train.merge(embedding_df_all, on="id", how="left")
train.fillna(0.0, inplace=True)

In [22]:
train.fillna(0.0, inplace=True)

In [23]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# from keras.models import Model
# from keras.layers import Input, Dense
# from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    VotingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

SEED = 42


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")


def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(
        oof_non_rounded < thresholds[0],
        0,
        np.where(
            oof_non_rounded < thresholds[1],
            1,
            np.where(oof_non_rounded < thresholds[2], 2, 3),
        ),
    )


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)


def TrainML(model_class, test_data):
    X = train.drop(["sii", "id"], axis=1)
    y = train["sii"]

    n_splits = 4
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(
        tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)
    ):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(
            y_train, y_train_pred.round(0).astype(int)
        )
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # test_preds[:, fold] = model.predict(test_data.drop(columns=["id"]))

        print(
            f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}"
        )
        clear_output(wait=True)

        voting_model = model
        # modelの保存
        with open(f"./assets/voting_model_{fold}.pkl", "wb") as f:
            pickle.dump(voting_model, f)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"CV: {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(y, oof_non_rounded),
        method="Nelder-Mead",
    )
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"tuned Kappa: {tKappa:.3f}")

    # tpm = test_preds.mean(axis=1)
    # tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)


# Model parameters for LightGBM
Params = {
    "learning_rate": 0.046,
    "max_depth": 12,
    "num_leaves": 478,
    "min_data_in_leaf": 13,
    "feature_fraction": 0.893,
    "bagging_fraction": 0.784,
    "bagging_freq": 4,
    "lambda_l1": 10,  # Increased from 6.59
    "lambda_l2": 0.01,  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1,  # Increased from 0.1
    "reg_lambda": 5,  # Increased from 1
    "random_state": SEED,
}


CatBoost_Params = {
    "learning_rate": 0.05,
    "depth": 6,
    "iterations": 200,
    "random_seed": SEED,
    "verbose": 0,
    "l2_leaf_reg": 10,  # Increase this value
}

# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(
    estimators=[
        ("lightgbm", Light),
        ("xgboost", XGB_Model),
        ("catboost", CatBoost_Model),
    ]
)

# Train the ensemble model
TrainML(voting_model, test)

Training Folds: 100%|██████████| 4/4 [00:03<00:00,  1.09it/s]

Mean Train QWK --> 0.7412
CV: 0.3770
tuned Kappa: 0.440



