In [15]:
import numpy as np
import pandas as pd
from src.dataloader_ import *
from src.network_ import *
from src.utils import *

import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd

# Directories holding the per-participant time-series parquet partitions.
train_series_dir = "../../inputs/series_train.parquet/"
test_series_dir = "../../inputs/series_test.parquet/"

# Locations of the tabular CSV inputs.
data_dic_path = "../../inputs/data_dictionary.csv"
sample_submission_path = "../../inputs/sample_submission.csv"
train_path = "../../inputs/train.csv"
test_path = "../../inputs/test.csv"

# Load every tabular input once; later cells rebind `train` and `test`.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)
data_dic = pd.read_csv(data_dic_path)

import os
import random

import numpy as np
import torch


def seed_torch(seed=1029):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN
    to deterministic kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current device; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it is switched off explicitly.
    torch.backends.cudnn.benchmark = False


# Basename of the current working directory; later cells pass it to CMIDataset
# as `save_filename`.
nb_name = os.path.basename(os.getcwd())  # directory name, not the .ipynb file name
seed_torch(seed=42)

In [16]:
from sklearn.impute import SimpleImputer, KNNImputer


def feature_engineering(df):
    """Return a copy of ``df`` extended with 15 interaction and ratio features.

    The input frame is left untouched, so the function can be called repeatedly
    on the same raw table. Ratio features are plain divisions: a zero
    denominator yields ``inf``/``-inf`` (or ``NaN`` for 0/0) and a missing
    operand yields ``NaN``.

    Args:
        df: DataFrame containing the ``Physical-*``, ``Basic_Demos-Age``,
            ``PreInt_EduHx-computerinternet_hoursday`` and ``BIA-BIA_*``
            columns referenced below.

    Returns:
        A new DataFrame with the original columns followed by the engineered ones.

    Raises:
        KeyError: If a required source column is missing.
    """
    # Copy first: adding columns to the caller's frame would silently change
    # the raw table that other cells may still rely on.
    df = df.copy()
    # season_cols = [col for col in df.columns if "Season" in col]
    # df = df.drop(season_cols, axis=1)
    df["BMI_Age"] = df["Physical-BMI"] * df["Basic_Demos-Age"]
    df["Internet_Hours_Age"] = (
        df["PreInt_EduHx-computerinternet_hoursday"] * df["Basic_Demos-Age"]
    )
    df["BMI_Internet_Hours"] = (
        df["Physical-BMI"] * df["PreInt_EduHx-computerinternet_hoursday"]
    )
    df["BFP_BMI"] = df["BIA-BIA_Fat"] / df["BIA-BIA_BMI"]
    df["FFMI_BFP"] = df["BIA-BIA_FFMI"] / df["BIA-BIA_Fat"]
    df["FMI_BFP"] = df["BIA-BIA_FMI"] / df["BIA-BIA_Fat"]
    df["LST_TBW"] = df["BIA-BIA_LST"] / df["BIA-BIA_TBW"]
    df["BFP_BMR"] = df["BIA-BIA_Fat"] * df["BIA-BIA_BMR"]
    df["BFP_DEE"] = df["BIA-BIA_Fat"] * df["BIA-BIA_DEE"]
    df["BMR_Weight"] = df["BIA-BIA_BMR"] / df["Physical-Weight"]
    df["DEE_Weight"] = df["BIA-BIA_DEE"] / df["Physical-Weight"]
    df["SMM_Height"] = df["BIA-BIA_SMM"] / df["Physical-Height"]
    df["Muscle_to_Fat"] = df["BIA-BIA_SMM"] / df["BIA-BIA_FMI"]
    df["Hydration_Status"] = df["BIA-BIA_TBW"] / df["Physical-Weight"]
    df["ICW_TBW"] = df["BIA-BIA_ICW"] / df["BIA-BIA_TBW"]

    return df


# Fill missing numeric values in the training table with a 5-neighbour KNN imputer.
imputer = KNNImputer(n_neighbors=5)
# The column selection comes from `test`, so only numeric columns that exist in
# the test table are imputed.
numeric_cols = test.select_dtypes(include=["float64", "int64"]).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
# The rebuilt frame gets a fresh default index.
# NOTE(review): the re-attachment below aligns on index, so this assumes `train`
# still has its default index — confirm.
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# train_imputed["sii"] = train_imputed["sii"].round().astype(int)
# Re-attach every column that was not imputed, unchanged.
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]

train = train_imputed

train = feature_engineering(train)
# Keep only rows that have at least 10 non-missing values.
train = train.dropna(thresh=10, axis=0)
# NOTE(review): `test` receives the engineered features but is not imputed in this cell.
test = feature_engineering(test)

# Keep the ids aside so they can be re-attached after the feature columns are selected.
train_id_df = train["id"]
test_id_df = test["id"]

train = train.drop("id", axis=1)
test = test.drop("id", axis=1)


# Model input columns, in the order used for the feature_i renaming below.
# The list is defined once; the target column "sii" is appended only where it
# is needed, instead of maintaining a second, almost identical copy of the list.
featuresCols = [
    "Basic_Demos-Age",
    "Basic_Demos-Sex",
    "CGAS-CGAS_Score",
    "Physical-BMI",
    "Physical-Height",
    "Physical-Weight",
    "Physical-Waist_Circumference",
    "Physical-Diastolic_BP",
    "Physical-HeartRate",
    "Physical-Systolic_BP",
    "Fitness_Endurance-Max_Stage",
    "Fitness_Endurance-Time_Mins",
    "Fitness_Endurance-Time_Sec",
    "FGC-FGC_CU",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_GSND",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD",
    "FGC-FGC_GSD_Zone",
    "FGC-FGC_PU",
    "FGC-FGC_PU_Zone",
    "FGC-FGC_SRL",
    "FGC-FGC_SRL_Zone",
    "FGC-FGC_SRR",
    "FGC-FGC_SRR_Zone",
    "FGC-FGC_TL",
    "FGC-FGC_TL_Zone",
    "BIA-BIA_Activity_Level_num",
    "BIA-BIA_BMC",
    "BIA-BIA_BMI",
    "BIA-BIA_BMR",
    "BIA-BIA_DEE",
    "BIA-BIA_ECW",
    "BIA-BIA_FFM",
    "BIA-BIA_FFMI",
    "BIA-BIA_FMI",
    "BIA-BIA_Fat",
    "BIA-BIA_Frame_num",
    "BIA-BIA_ICW",
    "BIA-BIA_LDM",
    "BIA-BIA_LST",
    "BIA-BIA_SMM",
    "BIA-BIA_TBW",
    "PAQ_A-PAQ_A_Total",
    "PAQ_C-PAQ_C_Total",
    "SDS-SDS_Total_Raw",
    "SDS-SDS_Total_T",
    "PreInt_EduHx-computerinternet_hoursday",
    "BMI_Age",
    "Internet_Hours_Age",
    "BMI_Internet_Hours",
    "BFP_BMI",
    "FFMI_BFP",
    "FMI_BFP",
    "LST_TBW",
    "BFP_BMR",
    "BFP_DEE",
    "BMR_Weight",
    "DEE_Weight",
    "SMM_Height",
    "Muscle_to_Fat",
    "Hydration_Status",
    "ICW_TBW",
]

# featuresCols += time_series_cols

# Keep the target next to the features just long enough to drop unlabeled rows,
# then split it off so `train` holds the feature columns only.
train = train[featuresCols + ["sii"]]
train = train.dropna(subset=["sii"])
train_sii_df = train["sii"]
train = train.drop("sii", axis=1)

# .copy() gives `test` its own data, so adding "id" cannot raise
# SettingWithCopyWarning or write into a temporary selection.
test = test[featuresCols].copy()
test["id"] = test_id_df

# Rename the training feature columns to feature_i (positional names).
# NOTE(review): `test` keeps the original column names here — confirm this is
# what the inference code expects.
feature_cols = [f"feature_{i}" for i in range(train.shape[1])]
train.columns = feature_cols
# Both assignments align on the index, so rows dropped above are handled correctly.
train["id"] = train_id_df
train["sii"] = train_sii_df

In [None]:
# Display the prepared training table (feature_i columns plus id and sii).
train

In [18]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
# scaler = MinMaxScaler()

# Replace +inf / -inf in every feature column with that column's largest /
# smallest finite value. The previous code turned both infinities into NaN and
# filled them with the column maximum, so -inf ended up at the maximum too.
finite_features = train[feature_cols].replace([np.inf, -np.inf], np.nan)
col_max = finite_features.max()
col_min = finite_features.min()
# clip() aligns the per-column bounds along axis=1 and leaves NaN untouched.
clipped_features = train[feature_cols].clip(lower=col_min, upper=col_max, axis=1)
# Remaining NaNs keep the earlier behaviour: they are filled with the column maximum.
train[feature_cols] = clipped_features.fillna(col_max)

train[feature_cols] = scaler.fit_transform(train[feature_cols].values)

# Persist the fitted scaler for later reuse; create the folder on a fresh checkout.
os.makedirs("./assets", exist_ok=True)
with open("./assets/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [19]:
# Alias, not a copy: `train_df` and `train` refer to the same DataFrame object.
train_df = train

### テーブルデータセット

In [20]:
# # onehotEncoderの作成
# from sklearn.preprocessing import OneHotEncoder

# categorical_columns = [
#     "Basic_Demos-Enroll_Season",
#     "CGAS-Season",
#     "Physical-Season",
#     "PAQ_C-Season",
#     "FGC-Season",
#     "Fitness_Endurance-Season",
#     "PAQ_A-Season",
#     "BIA-Season",
#     "SDS-Season",
#     "PreInt_EduHx-Season",
# ]

# double_columns = [
#     "FGC-FGC_SRR_Zone",
#     "BIA-BIA_SMM",
#     "Physical-Waist_Circumference",
#     "BIA-BIA_FFMI",
#     "FGC-FGC_CU",
#     "PreInt_EduHx-computerinternet_hoursday",
#     "BIA-BIA_ECW",
#     "FGC-FGC_CU_Zone",
#     "FGC-FGC_SRL_Zone",
#     "BIA-BIA_DEE",
#     "Physical-Weight",
#     "Fitness_Endurance-Time_Mins",
#     "FGC-FGC_SRR",
#     "SDS-SDS_Total_T",
#     "FGC-FGC_PU",
#     "BIA-BIA_FFM",
#     "FGC-FGC_TL_Zone",
#     "Physical-BMI",
#     "Physical-Systolic_BP",
#     "Physical-HeartRate",
#     "BIA-BIA_ICW",
#     "Physical-Height",
#     "FGC-FGC_SRL",
#     "BIA-BIA_BMC",
#     "Fitness_Endurance-Time_Sec",
#     "BIA-BIA_Frame_num",
#     "Basic_Demos-Age",
#     "FGC-FGC_GSND_Zone",
#     "Basic_Demos-Sex",
#     "FGC-FGC_GSND",
#     "BIA-BIA_LST",
#     "FGC-FGC_TL",
#     "BIA-BIA_BMI",
#     "BIA-BIA_FMI",
#     "PAQ_C-PAQ_C_Total",
#     "BIA-BIA_Activity_Level_num",
#     "FGC-FGC_GSD",
#     "BIA-BIA_BMR",
#     "BIA-BIA_Fat",
#     "SDS-SDS_Total_Raw",
#     "CGAS-CGAS_Score",
#     "FGC-FGC_PU_Zone",
#     "BIA-BIA_LDM",
#     "Fitness_Endurance-Max_Stage",
#     "PAQ_A-PAQ_A_Total",
#     "BIA-BIA_TBW",
#     "FGC-FGC_GSD_Zone",
#     "Physical-Diastolic_BP",
# ]

# ###################### categorical columns ######################
# # trainのtargetをonehot化
# onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
# onehot_encoder.fit(train[categorical_columns])

# with open("./assets/onehot_encoder.pkl", "wb") as f:
#     pickle.dump(onehot_encoder, f)

# categorical_feature = onehot_encoder.transform(train[categorical_columns])

# ###################### double columns ######################
# # trainのtargetを標準化
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(train[double_columns + add_features])

# with open("./assets/scaler.pkl", "wb") as f:
#     pickle.dump(scaler, f)

# double_feature = scaler.transform(train[double_columns + add_features])
# # double_feature = train[double_columns].values

# # 欠損値の補完
# double_feature = np.nan_to_num(double_feature)

# ###################### inputの作成 ######################

# ids = train["id"].values.reshape(-1, 1)
# X = np.concatenate([categorical_feature, double_feature], axis=1)
# y = train["sii"].fillna(-1).values.reshape(-1, 1)

# # DataFrameの作成
# ids_df = pd.DataFrame(ids, columns=["id"])
# X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
# y_df = pd.DataFrame(y, columns=["sii"])

# train_df = pd.concat([ids_df, X_df, y_df], axis=1)
# train_df

In [21]:
def read_parquet(base_dir, id_):
    """Load the parquet part stored for a single id.

    The file is read from ``<base_dir>/id=<id_>/part-0.parquet``.

    Args:
        base_dir: Directory that contains the ``id=...`` partition folders.
        id_: Identifier used to build the partition folder name.

    Returns:
        Whatever ``pd.read_parquet`` returns for that file.
    """
    partition_dir = os.path.join(base_dir, "id={}".format(id_))
    part_file = os.path.join(partition_dir, "part-0.parquet")
    return pd.read_parquet(part_file)


def get_valid_ids(base_dir):
    """List the ids that have an ``id=<value>`` entry inside ``base_dir``.

    Entries without an ``=`` (for example ``.DS_Store`` or other stray files)
    are skipped instead of raising ``IndexError``. The result is sorted so it
    does not depend on the arbitrary order returned by ``os.listdir``.

    Args:
        base_dir: Directory whose entries are named ``id=<value>`` (an optional
            extension after the value is stripped).

    Returns:
        Sorted list of id strings; empty when no entry matches.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist.
    """
    ids = []
    for entry in os.listdir(base_dir):
        if "=" not in entry:
            # Not a partition entry; nothing to extract.
            continue
        ids.append(entry.split("=")[1].split(".")[0])
    return sorted(ids)


# Load one participant's series as a quick check that the parquet layout is readable.
p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="ffcd4dbd")
# p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="10e46254")
# p

In [None]:
from glob import glob

# Count the entries directly under the training series directory.
# len(glob("../../normalized/*"))
len(glob("../../inputs/series_train.parquet/*"))

## Metric

In [23]:
from sklearn.metrics import *


def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between the two label sequences, using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return kappa


def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0-3 using three cut points.

    A value receives the index of the first threshold it is strictly below
    (checked in the order given); values below none of them, including NaN,
    receive 3.

    Args:
        oof_non_rounded: Array-like of continuous predictions supporting ``<``.
        thresholds: Indexable holding at least three cut points.

    Returns:
        NumPy integer array of class labels with the same shape as the input.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    below = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    # np.select takes the first matching condition, like the nested np.where chain.
    return np.select(below, [0, 1, 2], default=3)


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: negated quadratic weighted kappa.

    The continuous predictions are bucketed with ``thresholds`` and scored
    against ``y_true``; the sign is flipped so that a minimizer maximizes kappa.
    """
    bucketed = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, bucketed)
    return -kappa

## Model, Dataset

In [None]:
# Preview the first rows of the scaled training table.
train_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Unique ids of the rows whose target is not the -1 sentinel.
# NOTE(review): no active cell assigns -1 to "sii" (rows with a missing target
# are dropped earlier), so this filter may keep every row — confirm.
use_ids = list(
    train_df[train_df["sii"] != -1]["id"].unique()
)  # get_valid_ids(base_dir="../../normalized/")

len(use_ids)

## Training

In [26]:
from tqdm import tqdm
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
use_ids = np.array(use_ids)
for train_index, valid_index in kf.split(use_ids):
    # NOTE(review): train_ids / valid_ids are computed but never used below.
    train_ids = [use_ids[i] for i in train_index]
    valid_ids = [use_ids[i] for i in valid_index]

    # NOTE(review): the dataset is built from all of use_ids on every fold, so
    # each iteration constructs the same object; presumably this cell only
    # warms a cache keyed by save_filename — confirm against CMIDataset.
    train_dataset = CMIDataset(
        table_df=train_df,
        valid_ids=use_ids,
        base_dir="../../inputs/series_train.parquet/",
        save_filename=nb_name,
    )

In [None]:
# Inspect the shape of the "time_input" entry of the first dataset sample.
train_dataset[0]["time_input"].shape

In [None]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

# 4-fold cross-validation stratified on the target, 5 epochs per fold.
# For every fold the best-scoring epoch's weights are saved to
# ./assets/model_<fold>.pth and its validation predictions are kept as
# out-of-fold (OOF) predictions.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

CV = []

# use_ids = np.array(use_ids[:30]) # debug
use_ids = np.array(use_ids)

extract_df = train[train["id"].isin(use_ids)].reset_index(drop=True)
# skf.split() returns row positions of extract_df, so ids are looked up in the
# id column of that same frame rather than in use_ids (which only matches when
# both happen to share the same order).
fold_ids = extract_df["id"].to_numpy()

# Ground-truth table (id, sii) that the OOF predictions are merged with later.
test_df = train[["id", "sii"]].copy()
# test_df["pred_sii"] = 0
oof_preds = []

for fold, (train_index, valid_index) in enumerate(
    skf.split(extract_df["id"], extract_df["sii"])
):
    print(f"################### fold:{fold} ###################")
    best_valid_score = -100

    train_ids = fold_ids[train_index]
    valid_ids = fold_ids[valid_index]

    train_dataset = CMIDataset(
        table_df=train_df,
        valid_ids=train_ids,
        base_dir="../../inputs/series_train.parquet/",
        save_filename=nb_name,
    )
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=30)

    valid_dataset = CMIDataset(
        table_df=train_df,
        valid_ids=valid_ids,
        base_dir="../../inputs/series_train.parquet/",
        save_filename=nb_name,
    )

    # shuffle=False keeps the validation order stable for the OOF bookkeeping below.
    valid_loader = DataLoader(
        valid_dataset, batch_size=1, shuffle=False, num_workers=30
    )
    # data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

    # model = TimeEncoder(input_size=26, hidden_size=13, num_layers=2).to("cuda")
    model = CMIModel(input_size=26, hidden_size=13, num_layers=2).to("cuda")

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    spot_oof_preds = []

    for epoch in range(5):
        total_train_loss = 0
        total_valid_loss = 0

        train_pred = []
        valid_pred = []
        train_gt = []
        valid_gt = []

        # ---- training pass ----
        model.train()
        # Iterate the progress bar itself so it advances with the loop.
        tq = tqdm(train_loader)
        for i, data in enumerate(tq):
            table_input = data["table_input"].to("cuda")
            time_input = data["time_input"].to("cuda")
            mask = data["mask"].to("cuda").to(torch.float32)
            target_ = data["output"].to("cuda")
            optimizer.zero_grad()
            output, attention_weight = model(table_input, time_input, active_mask=mask)
            loss = criterion(output, target_)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

            train_pred.append(output.detach().cpu().numpy())
            train_gt.append(target_.detach().cpu().numpy())

            tq.set_postfix(loss=total_train_loss / (i + 1))

        # ---- validation pass ----
        # eval() switches layers such as dropout / batch norm to inference
        # behaviour; no_grad() avoids building autograd graphs that are never used.
        model.eval()
        with torch.no_grad():
            tq = tqdm(valid_loader)
            for i, data in enumerate(tq):
                table_input = data["table_input"].to("cuda")
                time_input = data["time_input"].to("cuda")
                mask = data["mask"].to("cuda").to(torch.float32)
                target_ = data["output"].to("cuda")
                output, attention_weight = model(
                    table_input, time_input, active_mask=mask
                )
                loss = criterion(output, target_)
                total_valid_loss += loss.item()

                valid_pred.append(output.detach().cpu().numpy())
                valid_gt.append(target_.detach().cpu().numpy())

                tq.set_postfix(loss=total_valid_loss / (i + 1))

        metric_train_pred = np.concatenate(train_pred)
        metric_valid_pred = np.concatenate(valid_pred)
        metric_train_gt = np.concatenate(train_gt)
        metric_valid_gt = np.concatenate(valid_gt)

        # Per-epoch score uses plain rounding; tuned thresholds are fitted later on the OOF set.
        train_score = quadratic_weighted_kappa(
            metric_train_gt, metric_train_pred.round(0).astype(int)
        )

        valid_score = quadratic_weighted_kappa(
            metric_valid_gt, metric_valid_pred.round(0).astype(int)
        )

        print(
            f"epoch: {epoch}, loss: {total_train_loss / len(train_loader)}, valid_loss: {total_valid_loss / len(valid_loader)}, train_score: {train_score}, valid_score: {valid_score}"
        )

        if valid_score > best_valid_score:
            best_valid_score = valid_score
            torch.save(model.state_dict(), f"./assets/model_{fold}.pth")

            # Keep the predictions of the best epoch only.
            # NOTE(review): assumes CMIDataset yields samples in the order of
            # valid_ids and that each output has shape (1, 1) — confirm.
            spot_oof_preds = []
            for i, id_ in enumerate(valid_ids):
                spot_oof_preds.append({"id": id_, "pred_sii": valid_pred[i][0][0]})

    oof_preds.append(spot_oof_preds)
    CV.append(best_valid_score)

print(f"CV: {np.mean(CV)}")

In [None]:
# Display the training table that was used for cross-validation.
train_df

In [None]:
# Stack the per-fold lists of {"id", "pred_sii"} records into one frame.
oof_preds_df = pd.concat([pd.DataFrame(p) for p in oof_preds], axis=0).reset_index(
    drop=True
)
oof_preds_df.head()

In [None]:
# Join the out-of-fold predictions with the true targets; despite its name,
# test_df holds the (id, sii) pairs of the training table.
test_pred_df = test_df.merge(oof_preds_df, on="id", how="inner")
test_pred_df.head()

In [None]:
from scipy.optimize import minimize

# Search for the three cut points that maximize quadratic weighted kappa on the
# out-of-fold predictions (evaluate_predictions returns the negated kappa).
# Nelder-Mead is derivative-free, starting from the plain rounding boundaries.
KappaOPtimizer = minimize(
    evaluate_predictions,
    x0=[0.5, 1.5, 2.5],
    args=(test_pred_df["sii"], test_pred_df["pred_sii"]),
    method="Nelder-Mead",
)
assert KappaOPtimizer.success, "Optimization did not converge."

# Score the out-of-fold predictions again with the tuned thresholds.
oof_tuned = threshold_Rounder(test_pred_df["pred_sii"], KappaOPtimizer.x)
tKappa = quadratic_weighted_kappa(test_pred_df["sii"], oof_tuned)
print(f"tuned Kappa: {tKappa}")

In [None]:
# Show the tuned thresholds found by the optimizer.
print(KappaOPtimizer.x)

In [None]:
import torch
import torch.nn as nn


class Decoder(nn.Module):
    """Upsample encoded daily sequences back to a long multi-channel series.

    Three transposed convolutions each double the time axis; a final linear
    interpolation then resizes it to exactly ``out_length`` steps.

    Args:
        in_channels: Number of channels per time step in the encoded input.
        out_channels: Number of channels per time step in the output.
        out_length: Length of the output time axis.
    """

    def __init__(self, in_channels=32, out_channels=15, out_length=17280):
        super(Decoder, self).__init__()
        self.out_channels = out_channels
        self.out_length = out_length
        self.deconv1 = nn.ConvTranspose1d(
            in_channels, 64, kernel_size=4, stride=2, padding=1
        )  # expands along the time axis
        self.deconv2 = nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2, padding=1)
        self.deconv3 = nn.ConvTranspose1d(
            32, out_channels, kernel_size=4, stride=2, padding=1
        )

        # Adjusts the sequence to its final length.
        self.upsample = nn.Upsample(size=out_length, mode="linear")

    def forward(self, x):
        """Decode ``x`` of shape (batch, days, time, channels).

        Returns:
            Tensor of shape (batch, days, out_length, out_channels).
        """
        batch, days, time, channels = x.shape
        # Conv layers expect (N, C, L). The channel axis must be moved with
        # permute(); a bare view() only reinterprets memory and would mix
        # time steps with channels.
        x = x.reshape(batch * days, time, channels).permute(0, 2, 1)

        x = self.deconv1(x)
        x = self.deconv2(x)
        x = self.deconv3(x)

        x = self.upsample(x)  # (batch*days, out_channels, out_length)
        # Move channels back to the last axis before restoring (batch, days).
        x = x.permute(0, 2, 1).reshape(batch, days, self.out_length, self.out_channels)

        return x


# Smoke test: push a random encoder output through the decoder and check the shape.
encoder_output = torch.randn(1, 31, 10, 32)
decoder = Decoder()
output = decoder(encoder_output)
print(output.shape)  # (1, 31, 17280, 15)