In [1]:
import numpy as np
import pandas as pd
from src.dataloader_ import *
from src.network_ import *
from src.utils import *

import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd

# Directories holding the per-participant time-series parquet datasets.
train_series_dir, test_series_dir = (
    "../../inputs/series_train.parquet/",
    "../../inputs/series_test.parquet/",
)

# Tabular CSV inputs.
train_path = "../../inputs/train.csv"
test_path = "../../inputs/test.csv"
sample_submission_path = "../../inputs/sample_submission.csv"
data_dic_path = "../../inputs/data_dictionary.csv"

# Read the four CSV files in the same order as the individual statements did.
train, test, sample_submission, data_dic = [
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path, data_dic_path)
]

import os
import random

import numpy as np
import torch


def seed_torch(seed=1029):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices), exports ``PYTHONHASHSEED`` and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # Python processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run; pin it
    # off so that deterministic mode is not undermined.
    torch.backends.cudnn.benchmark = False


seed_torch(seed=42)

In [2]:
from sklearn.impute import SimpleImputer, KNNImputer


def feature_engineering(df):
    """Return a copy of ``df`` extended with interaction and ratio features.

    Each derived column is either the product or the quotient of two existing
    columns (see the table below). The input frame is left untouched; callers
    must use the returned frame.

    Division by zero follows pandas semantics and yields ``inf``/``NaN``;
    nothing is cleaned up here.

    Args:
        df: DataFrame containing the ``Physical-*``, ``Basic_Demos-Age``,
            ``PreInt_EduHx-computerinternet_hoursday`` and ``BIA-BIA_*``
            columns referenced in the table below.

    Returns:
        A new DataFrame with the 15 derived columns appended in table order.

    Raises:
        KeyError: If any referenced source column is missing.
    """
    # season_cols = [col for col in df.columns if "Season" in col]
    # df = df.drop(season_cols, axis=1)

    # (new column, left operand, operator, right operand)
    derived_features = [
        ("BMI_Age", "Physical-BMI", "*", "Basic_Demos-Age"),
        (
            "Internet_Hours_Age",
            "PreInt_EduHx-computerinternet_hoursday",
            "*",
            "Basic_Demos-Age",
        ),
        (
            "BMI_Internet_Hours",
            "Physical-BMI",
            "*",
            "PreInt_EduHx-computerinternet_hoursday",
        ),
        ("BFP_BMI", "BIA-BIA_Fat", "/", "BIA-BIA_BMI"),
        ("FFMI_BFP", "BIA-BIA_FFMI", "/", "BIA-BIA_Fat"),
        ("FMI_BFP", "BIA-BIA_FMI", "/", "BIA-BIA_Fat"),
        ("LST_TBW", "BIA-BIA_LST", "/", "BIA-BIA_TBW"),
        ("BFP_BMR", "BIA-BIA_Fat", "*", "BIA-BIA_BMR"),
        ("BFP_DEE", "BIA-BIA_Fat", "*", "BIA-BIA_DEE"),
        ("BMR_Weight", "BIA-BIA_BMR", "/", "Physical-Weight"),
        ("DEE_Weight", "BIA-BIA_DEE", "/", "Physical-Weight"),
        ("SMM_Height", "BIA-BIA_SMM", "/", "Physical-Height"),
        ("Muscle_to_Fat", "BIA-BIA_SMM", "/", "BIA-BIA_FMI"),
        ("Hydration_Status", "BIA-BIA_TBW", "/", "Physical-Weight"),
        ("ICW_TBW", "BIA-BIA_ICW", "/", "BIA-BIA_TBW"),
    ]

    # Work on a copy so the caller's frame is not modified as a side effect.
    out = df.copy()
    for new_col, left, operator, right in derived_features:
        if operator == "*":
            out[new_col] = out[left] * out[right]
        else:
            out[new_col] = out[left] / out[right]

    return out


# Fill missing values in the numeric columns of `train` with a 5-nearest-neighbour
# imputer, then rebuild the frame with the non-numeric columns copied back.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=["float64", "int64"]).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# NOTE(review): "sii" (the training target) is among the imputed columns, so rows
# without a label receive a KNN-imputed one, rounded to an integer here. Those
# imputed labels are later used both for training and for scoring — confirm
# this is intended.
train_imputed["sii"] = train_imputed["sii"].round().astype(int)
# Copy the columns that were excluded from imputation (non float64/int64) back.
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]

train = train_imputed

# Add the engineered features, then drop rows with fewer than 10 non-NaN values.
train = feature_engineering(train)
train = train.dropna(thresh=10, axis=0)
# NOTE(review): `test` gets the engineered features but is not imputed here.
test = feature_engineering(test)

# Keep the ids aside; they are re-attached after feature selection.
train_id_df = train["id"]
test_id_df = test["id"]

train = train.drop("id", axis=1)
test = test.drop("id", axis=1)


# Feature columns fed to the model: raw measurements followed by the engineered
# features produced by feature_engineering(). The target "sii" is deliberately
# NOT part of this list so that the same list serves both train and test.
featuresCols = [
    "Basic_Demos-Age",
    "Basic_Demos-Sex",
    "CGAS-CGAS_Score",
    "Physical-BMI",
    "Physical-Height",
    "Physical-Weight",
    "Physical-Waist_Circumference",
    "Physical-Diastolic_BP",
    "Physical-HeartRate",
    "Physical-Systolic_BP",
    "Fitness_Endurance-Max_Stage",
    "Fitness_Endurance-Time_Mins",
    "Fitness_Endurance-Time_Sec",
    "FGC-FGC_CU",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_GSND",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD",
    "FGC-FGC_GSD_Zone",
    "FGC-FGC_PU",
    "FGC-FGC_PU_Zone",
    "FGC-FGC_SRL",
    "FGC-FGC_SRL_Zone",
    "FGC-FGC_SRR",
    "FGC-FGC_SRR_Zone",
    "FGC-FGC_TL",
    "FGC-FGC_TL_Zone",
    "BIA-BIA_Activity_Level_num",
    "BIA-BIA_BMC",
    "BIA-BIA_BMI",
    "BIA-BIA_BMR",
    "BIA-BIA_DEE",
    "BIA-BIA_ECW",
    "BIA-BIA_FFM",
    "BIA-BIA_FFMI",
    "BIA-BIA_FMI",
    "BIA-BIA_Fat",
    "BIA-BIA_Frame_num",
    "BIA-BIA_ICW",
    "BIA-BIA_LDM",
    "BIA-BIA_LST",
    "BIA-BIA_SMM",
    "BIA-BIA_TBW",
    "PAQ_A-PAQ_A_Total",
    "PAQ_C-PAQ_C_Total",
    "SDS-SDS_Total_Raw",
    "SDS-SDS_Total_T",
    "PreInt_EduHx-computerinternet_hoursday",
    "BMI_Age",
    "Internet_Hours_Age",
    "BMI_Internet_Hours",
    "BFP_BMI",
    "FFMI_BFP",
    "FMI_BFP",
    "LST_TBW",
    "BFP_BMR",
    "BFP_DEE",
    "BMR_Weight",
    "DEE_Weight",
    "SMM_Height",
    "Muscle_to_Fat",
    "Hydration_Status",
    "ICW_TBW",
]

# featuresCols += time_series_cols

# Train: keep features + target, drop rows without a target, then split the
# target off so that `train` holds the feature columns only (in list order).
train = train[featuresCols + ["sii"]]
train = train.dropna(subset=["sii"])
train_sii_df = train["sii"]
train = train.drop("sii", axis=1)

# Test: same feature columns. The explicit copy makes the "id" assignment
# below write to an independent frame instead of a slice of the original.
test = test[featuresCols].copy()
test["id"] = test_id_df

# Rename the train feature columns to feature_0 ... feature_{n-1}, then
# re-attach the id and target columns (aligned on the row index).
feature_cols = [f"feature_{i}" for i in range(train.shape[1])]
train.columns = feature_cols
train["id"] = train_id_df
train["sii"] = train_sii_df

In [3]:
# Preview the prepared training frame (feature_i columns plus "id" and "sii").
train

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,id,sii
0,5.0,0.0,51.0,16.877316,46.00,50.8,23.0,61.2,86.4,110.6,...,8591.822097,13746.944840,18.356260,29.370079,0.424811,6.383063,0.643522,0.747453,00008ff9,2
1,9.0,0.0,70.0,14.035590,48.00,46.0,22.0,75.0,70.0,122.0,...,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,000fd460,0
2,10.0,1.0,71.0,16.648696,56.50,75.6,24.8,65.0,94.0,117.0,...,19528.656877,33346.609152,14.634960,24.990265,0.488767,6.473938,0.608640,0.626200,00105258,0
3,9.0,0.0,71.0,18.292347,56.00,81.6,25.4,60.0,97.0,117.0,...,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,00115b9f,1
4,18.0,1.0,69.4,26.713639,64.14,125.0,33.6,70.6,74.2,125.8,...,79057.011034,121546.184578,11.159904,17.157792,0.741197,4.731007,0.520795,0.543432,0016bb22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,13.0,0.0,60.0,16.362460,59.50,82.4,25.0,71.0,70.0,104.0,...,13985.687504,23775.715110,14.646602,24.899272,0.500487,12.939628,0.641165,0.631642,ff8a2de4,1
3956,10.0,0.0,58.6,18.764678,53.50,76.4,27.0,60.0,78.0,118.0,...,15914.063643,28114.560289,13.225976,23.365654,0.399926,5.075547,0.496013,0.665237,ffa9794a,0
3957,11.0,0.0,68.0,21.441500,60.00,109.8,28.6,79.0,99.0,116.0,...,42623.022658,68197.040233,11.418397,18.269490,0.479653,4.334530,0.496020,0.623919,ffcd4dbd,1
3958,13.0,0.0,70.0,12.235895,70.70,87.0,27.6,59.0,61.0,113.0,...,-8357.575498,-17550.944000,16.256782,34.139310,0.642631,-54.662704,0.780503,0.609266,ffed1dd5,0


In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
# scaler = MinMaxScaler()

# Replace +inf / -inf in each feature column with that column's largest /
# smallest finite value. (Previously both signs were replaced by the column
# maximum, contradicting the stated intent for -inf.)
finite_features = train[feature_cols].replace([np.inf, -np.inf], np.nan)
col_max = finite_features.max()
col_min = finite_features.min()

train = train.copy()
train[feature_cols] = (
    train[feature_cols]
    # Per-column clipping: +inf -> column max, -inf -> column min, NaN untouched.
    .clip(lower=col_min, upper=col_max, axis=1)
    # Remaining NaNs are filled with the column maximum, as before.
    .fillna(col_max)
)

# Standardise the feature columns; "id" and "sii" are left as they are.
train[feature_cols] = scaler.fit_transform(train[feature_cols].values)

# Persist the fitted scaler so the same transform can be applied at inference.
os.makedirs("./assets", exist_ok=True)
with open("./assets/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [5]:
# Alias, not a copy: `train_df` and `train` refer to the same DataFrame object.
train_df = train

### テーブルデータセット

In [6]:
# # onehotEncoderの作成
# from sklearn.preprocessing import OneHotEncoder

# categorical_columns = [
#     "Basic_Demos-Enroll_Season",
#     "CGAS-Season",
#     "Physical-Season",
#     "PAQ_C-Season",
#     "FGC-Season",
#     "Fitness_Endurance-Season",
#     "PAQ_A-Season",
#     "BIA-Season",
#     "SDS-Season",
#     "PreInt_EduHx-Season",
# ]

# double_columns = [
#     "FGC-FGC_SRR_Zone",
#     "BIA-BIA_SMM",
#     "Physical-Waist_Circumference",
#     "BIA-BIA_FFMI",
#     "FGC-FGC_CU",
#     "PreInt_EduHx-computerinternet_hoursday",
#     "BIA-BIA_ECW",
#     "FGC-FGC_CU_Zone",
#     "FGC-FGC_SRL_Zone",
#     "BIA-BIA_DEE",
#     "Physical-Weight",
#     "Fitness_Endurance-Time_Mins",
#     "FGC-FGC_SRR",
#     "SDS-SDS_Total_T",
#     "FGC-FGC_PU",
#     "BIA-BIA_FFM",
#     "FGC-FGC_TL_Zone",
#     "Physical-BMI",
#     "Physical-Systolic_BP",
#     "Physical-HeartRate",
#     "BIA-BIA_ICW",
#     "Physical-Height",
#     "FGC-FGC_SRL",
#     "BIA-BIA_BMC",
#     "Fitness_Endurance-Time_Sec",
#     "BIA-BIA_Frame_num",
#     "Basic_Demos-Age",
#     "FGC-FGC_GSND_Zone",
#     "Basic_Demos-Sex",
#     "FGC-FGC_GSND",
#     "BIA-BIA_LST",
#     "FGC-FGC_TL",
#     "BIA-BIA_BMI",
#     "BIA-BIA_FMI",
#     "PAQ_C-PAQ_C_Total",
#     "BIA-BIA_Activity_Level_num",
#     "FGC-FGC_GSD",
#     "BIA-BIA_BMR",
#     "BIA-BIA_Fat",
#     "SDS-SDS_Total_Raw",
#     "CGAS-CGAS_Score",
#     "FGC-FGC_PU_Zone",
#     "BIA-BIA_LDM",
#     "Fitness_Endurance-Max_Stage",
#     "PAQ_A-PAQ_A_Total",
#     "BIA-BIA_TBW",
#     "FGC-FGC_GSD_Zone",
#     "Physical-Diastolic_BP",
# ]

# ###################### categorical columns ######################
# # trainのtargetをonehot化
# onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
# onehot_encoder.fit(train[categorical_columns])

# with open("./assets/onehot_encoder.pkl", "wb") as f:
#     pickle.dump(onehot_encoder, f)

# categorical_feature = onehot_encoder.transform(train[categorical_columns])

# ###################### double columns ######################
# # trainのtargetを標準化
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(train[double_columns + add_features])

# with open("./assets/scaler.pkl", "wb") as f:
#     pickle.dump(scaler, f)

# double_feature = scaler.transform(train[double_columns + add_features])
# # double_feature = train[double_columns].values

# # 欠損値の補完
# double_feature = np.nan_to_num(double_feature)

# ###################### inputの作成 ######################

# ids = train["id"].values.reshape(-1, 1)
# X = np.concatenate([categorical_feature, double_feature], axis=1)
# y = train["sii"].fillna(-1).values.reshape(-1, 1)

# # DataFrameの作成
# ids_df = pd.DataFrame(ids, columns=["id"])
# X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
# y_df = pd.DataFrame(y, columns=["sii"])

# train_df = pd.concat([ids_df, X_df, y_df], axis=1)
# train_df

In [7]:
def read_parquet(base_dir, id_):
    """Load ``part-0.parquet`` from the ``id=<id_>`` sub-directory of ``base_dir``.

    Args:
        base_dir: Directory containing one ``id=<id_>`` sub-directory per id.
        id_: Identifier used to build the sub-directory name.

    Returns:
        Whatever ``pd.read_parquet`` returns for that file.
    """
    partition_dir = f"id={id_}"
    parquet_file = os.path.join(base_dir, partition_dir, "part-0.parquet")
    frame = pd.read_parquet(parquet_file)
    return frame


def get_valid_ids(base_dir):
    """List the ids encoded in the ``<key>=<id>`` entry names of ``base_dir``.

    For every directory entry containing ``=``, the id is the text between the
    first ``=`` and the next ``=`` or ``.`` (so ``id=abc`` and
    ``id=abc.parquet`` both give ``abc``). Entries without ``=`` (for example
    hidden files) are skipped instead of raising ``IndexError``.

    Args:
        base_dir: Directory to scan (not recursive).

    Returns:
        List of id strings in ``os.listdir`` order.
    """
    ids = []
    for entry in os.listdir(base_dir):
        # Not a "key=value" entry: nothing to extract.
        if "=" not in entry:
            continue
        ids.append(entry.split("=")[1].split(".")[0])
    return ids


# Load the series of a single id from the train series directory as a sample.
p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="ffcd4dbd")
# p = read_parquet(base_dir="../../inputs/series_train.parquet/", id_="10e46254")
# p

In [8]:
from glob import glob

# len(glob("../../normalized/*"))
# Number of entries directly under the train series directory.
len(glob("../../inputs/series_train.parquet/*"))

996

## Metric

In [9]:
from sklearn.metrics import *


def quadratic_weighted_kappa(y_true, y_pred):
    """Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels.

    Returns:
        The score returned by ``cohen_kappa_score(..., weights="quadratic")``.
    """
    kappa = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return kappa


def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0..3 using three cut points.

    The conditions are tested in order and the first one that holds wins:
    ``< thresholds[0]`` -> 0, ``< thresholds[1]`` -> 1, ``< thresholds[2]`` -> 2,
    otherwise 3 (this includes NaN, for which every comparison is false).

    Args:
        oof_non_rounded: Array-like of continuous predictions.
        thresholds: Sequence whose first three items are the cut points.

    Returns:
        NumPy integer array of class labels with the shape of the input.
    """
    below_first = oof_non_rounded < thresholds[0]
    below_second = oof_non_rounded < thresholds[1]
    below_third = oof_non_rounded < thresholds[2]
    # np.select picks the choice of the first true condition, like the nested
    # np.where chain it replaces.
    return np.select([below_first, below_second, below_third], [0, 1, 2], default=3)


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: negative QWK of thresholded predictions.

    Args:
        thresholds: Cut points passed to ``threshold_Rounder``.
        y_true: Reference labels.
        oof_non_rounded: Continuous predictions to discretise.

    Returns:
        ``-quadratic_weighted_kappa``, so that minimising this value maximises
        the kappa score.
    """
    discrete_pred = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discrete_pred)
    return -kappa

## Model, Dataset

In [10]:
# First rows of the training frame after scaling of the feature columns.
train_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,id,sii
0,-1.520226,-0.770846,-0.589492,-0.451793,-1.376877,-0.906968,-0.93639,-0.688396,0.41299,-0.406752,...,0.015158,0.01493,-0.105272,-0.108014,-0.13801,-0.015858,-0.125355,1.713182,00008ff9,2
1,-0.401093,-0.770846,0.070969,-1.000316,-1.104518,-1.018138,-1.155609,0.449795,-0.9176,0.334534,...,0.014737,0.014481,-0.091848,-0.093678,-0.230897,0.197596,-0.130743,2.189205,000fd460,0
2,-0.12131,1.297277,0.10573,-0.495922,0.053007,-0.332587,-0.541797,-0.374982,1.029605,0.009409,...,0.016104,0.01606,-0.130176,-0.12758,-0.080752,-0.012796,-0.128749,-0.208267,00105258,0
3,-0.401093,-0.770846,0.10573,-0.178657,-0.015082,-0.193624,-0.410266,-0.78737,1.273005,0.009409,...,0.016257,0.016225,-0.135325,-0.133917,-0.094999,-0.019522,-0.133124,0.343316,00115b9f,1
4,2.116955,1.297277,0.050112,1.44686,1.093418,0.811541,1.387325,0.086893,-0.576839,0.58163,...,0.021251,0.021144,-0.153431,-0.162569,0.145237,-0.071524,-0.137298,-1.519868,0016bb22,1


In [11]:
from sklearn.model_selection import train_test_split

# Unique ids of the rows whose target is not -1, in order of first appearance.
# NOTE(review): no visible active code sets "sii" to -1 (only the commented-out
# cell above does), so this filter presumably keeps every row — confirm.
use_ids = list(
    train_df[train_df["sii"] != -1]["id"].unique()
)  # get_valid_ids(base_dir="../../normalized/")

# Number of ids that will be used.
len(use_ids)

3960

## Training

In [12]:
from tqdm import tqdm
from sklearn.model_selection import KFold

# Builds a CMIDataset inside a 5-fold loop; `train_dataset` from the last
# iteration is what remains available afterwards.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
use_ids = np.array(use_ids)
for train_index, valid_index in kf.split(use_ids):
    # Per-fold id lists. They are not used anywhere below in this loop.
    train_ids = [use_ids[i] for i in train_index]
    valid_ids = [use_ids[i] for i in valid_index]

    # NOTE(review): `valid_ids=use_ids` passes ALL ids on every iteration, so
    # the same dataset is constructed five times. Either the loop is
    # unnecessary or `train_ids` was meant here — confirm the intent.
    train_dataset = CMIDataset(
        table_df=train_df,
        valid_ids=use_ids,
        base_dir="../../inputs/series_train.parquet/",
    )

In [13]:
# Shape of the "time_input" tensor of the first dataset sample.
train_dataset[0]["time_input"].shape

torch.Size([31, 17280, 15])

In [14]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

# Stratified 4-fold cross-validation. For every fold a fresh CMIModel is trained
# for 5 epochs; the weights and the out-of-fold (OOF) predictions of the epoch
# with the best validation QWK are kept.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

CV = []

# use_ids = np.array(use_ids[:30]) # debug
use_ids = np.array(use_ids)

extract_df = train[train["id"].isin(use_ids)].reset_index(drop=True)
# skf.split() yields row positions of extract_df, which are then used to index
# use_ids. That is only valid when both hold the same ids in the same order.
assert np.array_equal(
    extract_df["id"].to_numpy(), use_ids
), "use_ids is not aligned with extract_df['id']; fold indices would select wrong ids."

# Ground-truth labels per id; merged with the OOF predictions afterwards.
test_df = train[["id", "sii"]].copy()
# test_df["pred_sii"] = 0
oof_preds = []


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader``, training if ``optimizer`` is given.

    With an optimizer the model is put in train mode and updated after every
    batch. Without one the model is put in eval mode and gradient tracking is
    disabled, so the pass is a pure evaluation.

    Args:
        model: Module called as ``model(table_input, time_input, active_mask=mask)``
            and returning ``(output, attention_weight)``.
        loader: Iterable of batches with the keys "table_input", "time_input",
            "mask" and "output".
        criterion: Loss callable taking ``(output, target)``.
        optimizer: Optimizer to step, or ``None`` to evaluate only.

    Returns:
        Tuple ``(mean_loss, predictions, targets)`` where the last two are
        lists of per-batch NumPy arrays in loader order.
    """
    is_training = optimizer is not None
    model.train(is_training)

    running_loss = 0.0
    predictions = []
    targets = []

    progress = tqdm(loader)
    with torch.set_grad_enabled(is_training):
        for i, data in enumerate(progress):
            table_input = data["table_input"].to("cuda")
            time_input = data["time_input"].to("cuda")
            mask = data["mask"].to("cuda").to(torch.float32)
            target_ = data["output"].to("cuda")

            if is_training:
                optimizer.zero_grad()
            output, _attention_weight = model(
                table_input, time_input, active_mask=mask
            )
            loss = criterion(output, target_)
            if is_training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            predictions.append(output.detach().cpu().numpy())
            targets.append(target_.detach().cpu().numpy())

            # Running mean of the loss over the batches seen so far.
            progress.set_postfix(loss=running_loss / (i + 1))

    return running_loss / len(loader), predictions, targets


for fold, (train_index, valid_index) in enumerate(
    skf.split(extract_df["id"], extract_df["sii"])
):
    print(f"################### fold:{fold} ###################")
    best_valid_score = -100

    train_ids = use_ids[train_index]
    valid_ids = use_ids[valid_index]

    train_dataset = CMIDataset(
        table_df=train_df,
        valid_ids=train_ids,
        base_dir="../../inputs/series_train.parquet/",
    )
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=30)

    valid_dataset = CMIDataset(
        table_df=train_df,
        valid_ids=valid_ids,
        base_dir="../../inputs/series_train.parquet/",
    )
    # shuffle=False: the OOF bookkeeping below relies on a fixed sample order.
    valid_loader = DataLoader(
        valid_dataset, batch_size=1, shuffle=False, num_workers=30
    )

    model = CMIModel(input_size=26, hidden_size=13, num_layers=2).to("cuda")

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    spot_oof_preds = []

    for epoch in range(5):
        train_loss, train_pred, train_gt = _run_epoch(
            model, train_loader, criterion, optimizer
        )
        valid_loss, valid_pred, valid_gt = _run_epoch(model, valid_loader, criterion)

        metric_train_pred = np.concatenate(train_pred)
        metric_valid_pred = np.concatenate(valid_pred)
        metric_train_gt = np.concatenate(train_gt)
        metric_valid_gt = np.concatenate(valid_gt)

        # QWK needs discrete labels: round the regression output to integers.
        train_score = quadratic_weighted_kappa(
            metric_train_gt, metric_train_pred.round(0).astype(int)
        )
        valid_score = quadratic_weighted_kappa(
            metric_valid_gt, metric_valid_pred.round(0).astype(int)
        )

        print(
            f"epoch: {epoch}, loss: {train_loss}, valid_loss: {valid_loss}, train_score: {train_score}, valid_score: {valid_score}"
        )

        if valid_score > best_valid_score:
            best_valid_score = valid_score
            torch.save(model.state_dict(), f"./assets/model_{fold}.pth")

            # NOTE(review): predictions are paired with ids by position. This
            # assumes CMIDataset yields exactly one sample per entry of
            # valid_ids, in the same order — confirm against the dataset class.
            assert len(valid_pred) == len(
                valid_ids
            ), "Number of validation predictions does not match the number of validation ids."
            spot_oof_preds = [
                {"id": id_, "pred_sii": pred[0][0]}
                for id_, pred in zip(valid_ids, valid_pred)
            ]

    oof_preds.append(spot_oof_preds)
    CV.append(best_valid_score)

print(f"CV: {np.mean(CV)}")

################### fold:0 ###################


  0%|          | 0/2970 [00:00<?, ?it/s]Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /usr/local/cuda-11.3/lib64/libcublas.so.11: undefined symbol: cublasGetSmCountTarget
100%|██████████| 2970/2970 [02:46<00:00, 17.83it/s, loss=0.441]  
100%|██████████| 990/990 [01:05<00:00, 15.11it/s, loss=0.434]


epoch: 0, loss: 0.44084649783607516, valid_loss: 0.43379595518242514, train_score: 0.34326047953845296, valid_score: 0.33652644731361925


100%|██████████| 2970/2970 [02:27<00:00, 20.19it/s, loss=0.395]
100%|██████████| 990/990 [00:23<00:00, 42.65it/s, loss=0.386]


epoch: 1, loss: 0.3950285530407722, valid_loss: 0.3860223429509552, train_score: 0.40636613804646937, valid_score: 0.4077766699900299


100%|██████████| 2970/2970 [01:59<00:00, 24.93it/s, loss=0.375]
100%|██████████| 990/990 [00:29<00:00, 33.76it/s, loss=0.395]


epoch: 2, loss: 0.37472700890828764, valid_loss: 0.3947395301245306, train_score: 0.43603250927862036, valid_score: 0.4091395655086906


100%|██████████| 2970/2970 [02:04<00:00, 23.92it/s, loss=0.362]
100%|██████████| 990/990 [00:23<00:00, 42.25it/s, loss=0.383]


epoch: 3, loss: 0.36239774840671407, valid_loss: 0.3825439754109812, train_score: 0.4737812386676432, valid_score: 0.457930279248038


100%|██████████| 2970/2970 [02:11<00:00, 22.57it/s, loss=0.348]
100%|██████████| 990/990 [00:26<00:00, 36.77it/s, loss=0.396]


epoch: 4, loss: 0.3475780213330133, valid_loss: 0.39601033965870297, train_score: 0.4857032756969507, valid_score: 0.4630178673585412
################### fold:1 ###################


100%|██████████| 2970/2970 [01:52<00:00, 26.34it/s, loss=0.43] 
100%|██████████| 990/990 [00:49<00:00, 19.97it/s, loss=0.615]


epoch: 0, loss: 0.42963629119592367, valid_loss: 0.614956059206068, train_score: 0.3618310703070805, valid_score: 0.3877257241566834


100%|██████████| 2970/2970 [02:15<00:00, 21.89it/s, loss=0.39] 
100%|██████████| 990/990 [00:28<00:00, 34.37it/s, loss=0.4]  


epoch: 1, loss: 0.3903443461913345, valid_loss: 0.3999830988978076, train_score: 0.43224290139135146, valid_score: 0.36426684280052835


100%|██████████| 2970/2970 [02:21<00:00, 20.96it/s, loss=0.37] 
100%|██████████| 990/990 [00:29<00:00, 33.22it/s, loss=0.459]


epoch: 2, loss: 0.3701532671979392, valid_loss: 0.4585671595101905, train_score: 0.4629410761282776, valid_score: 0.39564717331681865


100%|██████████| 2970/2970 [02:33<00:00, 19.37it/s, loss=0.359]
100%|██████████| 990/990 [00:25<00:00, 38.81it/s, loss=0.401]


epoch: 3, loss: 0.35880165580931106, valid_loss: 0.4010274596902862, train_score: 0.4914133945506062, valid_score: 0.3657478640333143


100%|██████████| 2970/2970 [02:17<00:00, 21.59it/s, loss=0.334]
100%|██████████| 990/990 [00:26<00:00, 37.39it/s, loss=0.38] 


epoch: 4, loss: 0.3342860061855714, valid_loss: 0.37996606638131775, train_score: 0.51755339115317, valid_score: 0.4419306184012066
################### fold:2 ###################


100%|██████████| 2970/2970 [01:59<00:00, 24.83it/s, loss=0.434]
100%|██████████| 990/990 [00:30<00:00, 32.74it/s, loss=0.392]


epoch: 0, loss: 0.43442165944185435, valid_loss: 0.3923503103148096, train_score: 0.36204773884865127, valid_score: 0.41551812149055645


100%|██████████| 2970/2970 [02:10<00:00, 22.78it/s, loss=0.426]
100%|██████████| 990/990 [00:26<00:00, 36.93it/s, loss=0.387]


epoch: 1, loss: 0.42635058907422474, valid_loss: 0.38702482964424284, train_score: 0.41724215532819653, valid_score: 0.4526415992384578


100%|██████████| 2970/2970 [02:10<00:00, 22.74it/s, loss=0.372]  
100%|██████████| 990/990 [00:26<00:00, 37.37it/s, loss=0.409]


epoch: 2, loss: 0.3722149031194066, valid_loss: 0.40945858888835857, train_score: 0.46188434910056897, valid_score: 0.3882912184138104


100%|██████████| 2970/2970 [02:18<00:00, 21.45it/s, loss=0.365]  
100%|██████████| 990/990 [00:26<00:00, 37.34it/s, loss=0.46] 


epoch: 3, loss: 0.3645767631913973, valid_loss: 0.4595784872187547, train_score: 0.4964220632028091, valid_score: 0.47314890154597233


100%|██████████| 2970/2970 [02:21<00:00, 20.97it/s, loss=0.336]  
100%|██████████| 990/990 [00:26<00:00, 36.96it/s, loss=0.433]


epoch: 4, loss: 0.33615433579812853, valid_loss: 0.43289725311733246, train_score: 0.5291996195552481, valid_score: 0.47413061916878707
################### fold:3 ###################


100%|██████████| 2970/2970 [01:58<00:00, 24.97it/s, loss=0.448] 
100%|██████████| 990/990 [00:29<00:00, 33.10it/s, loss=0.391]


epoch: 0, loss: 0.44798399985824083, valid_loss: 0.39096465636167876, train_score: 0.3462748391803264, valid_score: 0.3984646434364577


100%|██████████| 2970/2970 [02:05<00:00, 23.62it/s, loss=0.39] 
100%|██████████| 990/990 [00:27<00:00, 36.31it/s, loss=0.399]


epoch: 1, loss: 0.39031818462887824, valid_loss: 0.39937501861249436, train_score: 0.43260837110808537, valid_score: 0.4036237092345638


100%|██████████| 2970/2970 [02:24<00:00, 20.59it/s, loss=0.427] 
100%|██████████| 990/990 [00:27<00:00, 36.02it/s, loss=0.379]


epoch: 2, loss: 0.42653917906658295, valid_loss: 0.37931590058994014, train_score: 0.4256683913963555, valid_score: 0.4256580185227772


100%|██████████| 2970/2970 [02:06<00:00, 23.53it/s, loss=0.36] 
100%|██████████| 990/990 [00:26<00:00, 37.30it/s, loss=0.384]


epoch: 3, loss: 0.36010806253694405, valid_loss: 0.38356759508854205, train_score: 0.4749781728489052, valid_score: 0.43892570960784616


100%|██████████| 2970/2970 [02:32<00:00, 19.45it/s, loss=0.35] 
100%|██████████| 990/990 [00:28<00:00, 34.25it/s, loss=0.377]

epoch: 4, loss: 0.3504352685988859, valid_loss: 0.37666753617412485, train_score: 0.49198983273756414, valid_score: 0.4365565835797066
CV: 0.4545012036340953





In [15]:
# Show the scaled training frame.
train_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,id,sii
0,-1.520226,-0.770846,-0.589492,-0.451793,-1.376877,-0.906968,-0.936390,-0.688396,0.412990,-0.406752,...,0.015158,0.014930,-0.105272,-0.108014,-0.138010,-0.015858,-0.125355,1.713182,00008ff9,2
1,-0.401093,-0.770846,0.070969,-1.000316,-1.104518,-1.018138,-1.155609,0.449795,-0.917600,0.334534,...,0.014737,0.014481,-0.091848,-0.093678,-0.230897,0.197596,-0.130743,2.189205,000fd460,0
2,-0.121310,1.297277,0.105730,-0.495922,0.053007,-0.332587,-0.541797,-0.374982,1.029605,0.009409,...,0.016104,0.016060,-0.130176,-0.127580,-0.080752,-0.012796,-0.128749,-0.208267,00105258,0
3,-0.401093,-0.770846,0.105730,-0.178657,-0.015082,-0.193624,-0.410266,-0.787370,1.273005,0.009409,...,0.016257,0.016225,-0.135325,-0.133917,-0.094999,-0.019522,-0.133124,0.343316,00115b9f,1
4,2.116955,1.297277,0.050112,1.446860,1.093418,0.811541,1.387325,0.086893,-0.576839,0.581630,...,0.021251,0.021144,-0.153431,-0.162569,0.145237,-0.071524,-0.137298,-1.519868,0016bb22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,0.718039,-0.770846,-0.276642,-0.551172,0.461546,-0.175096,-0.497954,0.119884,-0.917600,-0.835917,...,0.015624,0.015508,-0.130098,-0.127986,-0.070260,0.205062,-0.125584,-0.122033,ff8a2de4,1
3956,-0.121310,-0.770846,-0.325307,-0.087485,-0.355531,-0.314059,-0.059517,-0.787370,-0.268531,0.074434,...,0.015791,0.015758,-0.139605,-0.134837,-0.160289,-0.059915,-0.139710,0.410338,ffa9794a,0
3957,0.158473,-0.770846,0.001447,0.429207,0.529635,0.459502,0.291233,0.779705,1.435272,-0.055616,...,0.018100,0.018069,-0.151702,-0.157602,-0.088912,-0.084883,-0.139709,-0.244414,ffcd4dbd,1
3958,0.718039,-0.770846,0.070969,-1.347701,1.986756,-0.068558,0.072014,-0.869847,-1.647801,-0.250691,...,0.013693,0.013126,-0.119322,-0.086709,0.056995,-2.072768,-0.112025,-0.476609,ffed1dd5,0


In [16]:
# Turn each fold's list of {"id", "pred_sii"} records into a frame, stack the
# folds row-wise and renumber the rows from 0.
fold_frames = list(map(pd.DataFrame, oof_preds))
oof_preds_df = pd.concat(fold_frames, axis=0)
oof_preds_df = oof_preds_df.reset_index(drop=True)
oof_preds_df.head()

Unnamed: 0,id,pred_sii
0,000fd460,0.329042
1,0068a485,0.494195
2,00abe655,0.926678
3,00bd4359,1.028653
4,01182ce3,1.70901


In [17]:
# Attach the out-of-fold prediction to each (id, sii) row; the inner join keeps
# only ids present in both frames.
test_pred_df = test_df.merge(oof_preds_df, on="id", how="inner")
test_pred_df.head()

Unnamed: 0,id,sii,pred_sii
0,00008ff9,2,0.601703
1,000fd460,0,0.329042
2,00105258,0,0.597664
3,00115b9f,1,0.551932
4,0016bb22,1,0.74888


In [18]:
from scipy.optimize import minimize

# Search the three class cut points that maximise QWK on the out-of-fold
# predictions. evaluate_predictions returns the negative kappa, so minimising
# it maximises the score; the search starts from the plain rounding boundaries.
KappaOPtimizer = minimize(
    evaluate_predictions,
    x0=[0.5, 1.5, 2.5],
    args=(test_pred_df["sii"], test_pred_df["pred_sii"]),
    method="Nelder-Mead",
)
assert KappaOPtimizer.success, "Optimization did not converge."

# Apply the tuned cut points to the same predictions and report the score.
# NOTE(review): the thresholds are tuned and evaluated on the same OOF
# predictions, so this score is an optimistic estimate.
oof_tuned = threshold_Rounder(test_pred_df["pred_sii"], KappaOPtimizer.x)
tKappa = quadratic_weighted_kappa(test_pred_df["sii"], oof_tuned)
print(f"tuned Kappa: {tKappa}")

tuned Kappa: 0.4889356244432699


In [19]:
# Tuned cut points found by the optimiser (boundaries 0|1, 1|2, 2|3).
print(KappaOPtimizer.x)

[0.56304075 1.12921852 2.70040294]
