In [1]:
import numpy as np
import pandas as pd
from src.dataloader_ import *
from src.network_ import *
from src.utils import *

import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd

train_series_dir = "../../inputs/series_train.parquet/"
test_series_dir = "../../inputs/series_test.parquet/"

data_dic_path = "../../inputs/data_dictionary.csv"
sample_submission_path = "../../inputs/sample_submission.csv"
train_path = "../../inputs/train.csv"
test_path = "../../inputs/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)
data_dic = pd.read_csv(data_dic_path)

In [2]:
train.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [3]:
# onehotEncoderの作成
from sklearn.preprocessing import OneHotEncoder

categorical_columns = [
    "Basic_Demos-Enroll_Season",
    "CGAS-Season",
    "Physical-Season",
    "PAQ_C-Season",
    "FGC-Season",
    "Fitness_Endurance-Season",
    "PAQ_A-Season",
    "BIA-Season",
    "SDS-Season",
    "PreInt_EduHx-Season",
]

double_columns = [
    "FGC-FGC_SRR_Zone",
    "BIA-BIA_SMM",
    "Physical-Waist_Circumference",
    "BIA-BIA_FFMI",
    "FGC-FGC_CU",
    "PreInt_EduHx-computerinternet_hoursday",
    "BIA-BIA_ECW",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_SRL_Zone",
    "BIA-BIA_DEE",
    "Physical-Weight",
    "Fitness_Endurance-Time_Mins",
    "FGC-FGC_SRR",
    "SDS-SDS_Total_T",
    "FGC-FGC_PU",
    "BIA-BIA_FFM",
    "FGC-FGC_TL_Zone",
    "Physical-BMI",
    "Physical-Systolic_BP",
    "Physical-HeartRate",
    "BIA-BIA_ICW",
    "Physical-Height",
    "FGC-FGC_SRL",
    "BIA-BIA_BMC",
    "Fitness_Endurance-Time_Sec",
    "BIA-BIA_Frame_num",
    "Basic_Demos-Age",
    "FGC-FGC_GSND_Zone",
    "Basic_Demos-Sex",
    "FGC-FGC_GSND",
    "BIA-BIA_LST",
    "FGC-FGC_TL",
    "BIA-BIA_BMI",
    "BIA-BIA_FMI",
    "PAQ_C-PAQ_C_Total",
    "BIA-BIA_Activity_Level_num",
    "FGC-FGC_GSD",
    "BIA-BIA_BMR",
    "BIA-BIA_Fat",
    "SDS-SDS_Total_Raw",
    "CGAS-CGAS_Score",
    "FGC-FGC_PU_Zone",
    "BIA-BIA_LDM",
    "Fitness_Endurance-Max_Stage",
    "PAQ_A-PAQ_A_Total",
    "BIA-BIA_TBW",
    "FGC-FGC_GSD_Zone",
    "Physical-Diastolic_BP",
]

###################### categorical columns ######################
# trainのtargetをonehot化
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
onehot_encoder.fit(train[categorical_columns])

with open("./assets/onehot_encoder.pkl", "wb") as f:
    pickle.dump(onehot_encoder, f)

categorical_feature = onehot_encoder.transform(train[categorical_columns])

###################### double columns ######################
# trainのtargetを標準化
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train[double_columns])
scaler.transform(train[double_columns])

with open("./assets/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

double_feature = scaler.transform(train[double_columns])
# double_feature = train[double_columns].values

# 欠損値の補完
double_feature = np.nan_to_num(double_feature)

###################### inputの作成 ######################

X = np.concatenate([categorical_feature, double_feature], axis=1)
y = train["sii"].fillna(-1).values



In [4]:
# AutoEncoderの学習
from torch.utils.data import Dataset, DataLoader


criterion = nn.MSELoss()
model = TableAutoEncoder(num_items=X.shape[1], embedding_dim=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_dataset = TableAutoEncoderDataset(X)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [5]:
for epoch in range(100):
    model.train()
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.float()
        output = model(data, mode="encode-decode")
        loss = criterion(output, data)
        loss.backward()
        optimizer.step()
    print(f"epoch: {epoch}, loss: {loss.item()}")


def create_embedding(model, feature):
    model.eval()
    with torch.no_grad():
        feature = torch.tensor(feature).float()
        embedding = model(feature, mode="encode")
    # numpyに変換
    embedding = embedding.numpy()
    return embedding

epoch: 0, loss: 0.2092577964067459
epoch: 1, loss: 0.21601088345050812
epoch: 2, loss: 0.18041060864925385
epoch: 3, loss: 0.13794930279254913
epoch: 4, loss: 0.11821821331977844
epoch: 5, loss: 0.11648336052894592
epoch: 6, loss: 0.1272527575492859
epoch: 7, loss: 0.09799154102802277
epoch: 8, loss: 0.1133531704545021
epoch: 9, loss: 0.09383285790681839
epoch: 10, loss: 0.09178228676319122
epoch: 11, loss: 0.1045646145939827
epoch: 12, loss: 0.09052885323762894
epoch: 13, loss: 0.10242121666669846
epoch: 14, loss: 0.09037310630083084
epoch: 15, loss: 0.08754543215036392
epoch: 16, loss: 0.0799224004149437
epoch: 17, loss: 0.0736144408583641
epoch: 18, loss: 0.07157250493764877
epoch: 19, loss: 0.08299994468688965
epoch: 20, loss: 0.07940196245908737
epoch: 21, loss: 0.07140784710645676
epoch: 22, loss: 0.08187949657440186
epoch: 23, loss: 0.0886145681142807
epoch: 24, loss: 0.07061388343572617
epoch: 25, loss: 0.0779246836900711
epoch: 26, loss: 0.07802388817071915
epoch: 27, loss: 0.

In [6]:
from sklearn.metrics import *


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")


def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(
        oof_non_rounded < thresholds[0],
        0,
        np.where(
            oof_non_rounded < thresholds[1],
            1,
            np.where(oof_non_rounded < thresholds[2], 2, 3),
        ),
    )


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

In [7]:
embedding_features = create_embedding(model, X)
# embedding_features = X
embedding_features.shape

# yとconcat
dataset = np.concatenate([embedding_features, y.reshape(-1, 1)], axis=1)
# dataset[:, -1]が-1以外のものだけを取得

dataset = dataset[dataset[:, -1] != -1]

X, y = dataset[:, :-1], dataset[:, -1]

# train_test_split
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# モデルの学習
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)

Params7 = {
    "learning_rate": 0.03884249148676395,
    "max_depth": 12,
    "num_leaves": 413,
    "min_data_in_leaf": 14,
    "feature_fraction": 0.7987976913702801,
    "bagging_fraction": 0.7602261703576205,
    "bagging_freq": 2,
    "lambda_l1": 4.735462555910575,
    "lambda_l2": 4.735028557007343e-06,
}


# lgb_model = lgb.LGBMClassifier(**Params7)
lgb_model = lgb.LGBMRegressor(**Params7, verbose=-1, n_estimators=200)

lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=10,
    verbose=10,
)

Training until validation scores don't improve for 10 rounds
[10]	valid_0's l2: 0.557193
[20]	valid_0's l2: 0.523545
[30]	valid_0's l2: 0.503658
[40]	valid_0's l2: 0.493892
[50]	valid_0's l2: 0.485666
[60]	valid_0's l2: 0.480531
[70]	valid_0's l2: 0.47754
[80]	valid_0's l2: 0.476645
Early stopping, best iteration is:
[75]	valid_0's l2: 0.476453


In [8]:
train_score = quadratic_weighted_kappa(
    y_train, lgb_model.predict(X_train).round(0).astype(int)
)
valid_score = quadratic_weighted_kappa(
    y_valid, lgb_model.predict(X_valid).round(0).astype(int)
)

print(f"train_score: {train_score}, valid_score: {valid_score}")

# original-feature : train_score: 0.5867837902742401, valid_score: 0.36020517531771556
# embedding-feature : train_score: 0.7461982276735281, valid_score: 0.32452859350850083

train_score: 0.6224396472882263, valid_score: 0.34173037364526726


In [9]:
feature_importance_df = pd.DataFrame(
    {
        "Feature": lgb_model.booster_.feature_name(),
        "Importance": lgb_model.booster_.feature_importance(importance_type="gain"),
    }
)

feature_importance_df

Unnamed: 0,Feature,Importance
0,Column_0,115.55489
1,Column_1,88.176112
2,Column_2,101.4838
3,Column_3,79.177335
4,Column_4,121.659096
5,Column_5,153.446836
6,Column_6,97.578291
7,Column_7,192.837301
8,Column_8,176.062377
9,Column_9,166.507213


In [10]:
"""
	Feature	Importance
0	Column_0	0.000000
1	Column_1	380.774703
2	Column_2	93.412222
3	Column_3	59.987604
4	Column_4	1022.835918
...	...	...
59	Column_59	244.442884
60	Column_60	62.446166
61	Column_61	76.901441
62	Column_62	0.000000
63	Column_63	88.173296
"""

'\n\tFeature\tImportance\n0\tColumn_0\t0.000000\n1\tColumn_1\t380.774703\n2\tColumn_2\t93.412222\n3\tColumn_3\t59.987604\n4\tColumn_4\t1022.835918\n...\t...\t...\n59\tColumn_59\t244.442884\n60\tColumn_60\t62.446166\n61\tColumn_61\t76.901441\n62\tColumn_62\t0.000000\n63\tColumn_63\t88.173296\n'