## Second improvement (Parquet file integration, and features engineering)

# PARQUET file processing

In [1]:
import polars as pl
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import torch.optim as optim
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from sklearn.base import clone

In [2]:
def process_file(filename, dirname):
    
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:

    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [3]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [01:10<00:00, 14.10it/s]
100%|██████████| 2/2 [00:00<00:00, 11.93it/s]


#### fill parquet with autoencoder

In [4]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.GELU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.GELU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.GELU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.GELU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.GELU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [5]:
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

In [6]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)
df_train

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,...,1.738203,5.314874,89.422226,0.0,2626.199951,4187.0,8.639500e+13,7.0,2.0,57.0
1,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,...,2.475326,3.966906,89.080330,1.0,2628.199951,4146.0,8.639500e+13,7.0,2.0,243.0
2,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,...,1.746797,5.066334,86.987267,0.0,2618.199951,4183.0,8.636500e+13,7.0,3.0,134.0
3,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,...,1.269051,6.134459,89.976074,0.0,2502.000000,6000.0,8.639500e+13,7.0,4.0,72.0
4,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,...,1.071875,2.774382,89.300034,0.0,1046.800049,4199.0,8.601500e+13,7.0,4.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,...,2.099614,3.669502,89.025551,1.0,2576.399902,4191.0,8.639500e+13,7.0,4.0,161.0
992,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,...,0.996484,1.786410,81.665283,0.0,1526.599976,4194.0,8.514000e+13,7.0,2.0,130.0
993,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,...,1.547813,3.692727,89.333710,1.0,2592.199951,4178.0,8.639500e+13,7.0,1.0,79.0
994,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,...,0.999219,1.673958,88.629547,0.0,1875.199951,4183.0,8.639500e+13,7.0,1.0,155.0


In [7]:
train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

Epoch [10/100], Loss: 1.4428]
Epoch [20/100], Loss: 1.4151]
Epoch [30/100], Loss: 1.3924]
Epoch [40/100], Loss: 1.3862]
Epoch [50/100], Loss: 1.3879]
Epoch [60/100], Loss: 1.3818]
Epoch [70/100], Loss: 1.3585]
Epoch [80/100], Loss: 1.3560]
Epoch [90/100], Loss: 1.3535]
Epoch [100/100], Loss: 1.3559]
Epoch [10/100], Loss: 1.0197]
Epoch [20/100], Loss: 0.4461]
Epoch [30/100], Loss: 0.4271]
Epoch [40/100], Loss: 0.4271]
Epoch [50/100], Loss: 0.4271]
Epoch [60/100], Loss: 0.4271]
Epoch [70/100], Loss: 0.4271]
Epoch [80/100], Loss: 0.4271]
Epoch [90/100], Loss: 0.4271]
Epoch [100/100], Loss: 0.4271]


In [8]:
time_series_cols = train_ts_encoded.columns.tolist() # lưu trữ các cột
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

## TABULAR NORMAL DATA PROCESSING

#### Drop any samplers which only missed any values in PCIAT test

In [9]:
train_data = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv")
test_data = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv")

In [10]:
columns_not_in_test = ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'PCIAT-Season', 'sii']
train_data = train_data.dropna(subset=columns_not_in_test)

#### Drop season data

In [11]:
train_seasonal_columns = [col for col in train_data.columns if 'Season' in col]
test_seasonal_columns = [col for col in test_data.columns if 'Season' in col]

In [12]:
train_data_wo_season = train_data.drop(train_seasonal_columns, axis = 1)
test_data_wo_season = test_data.drop(test_seasonal_columns, axis = 1)

In [13]:
label_related_features = ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'sii']
label = ['PCIAT-PCIAT_Total']
X = train_data_wo_season.drop(label_related_features, axis = 1)
new_y = train_data_wo_season[label]

In [14]:
new_X = X.drop(['id'], axis = 1)
new_test = test_data_wo_season.drop(['id'], axis = 1)

# Feature Engineering

In [15]:
def feature_engineering(df):
    #Age
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['Physical-Waist_Age'] = df['Basic_Demos-Age'] * df['Physical-Waist_Circumference']
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Physical-Height_Age'] = df['Basic_Demos-Age'] * df['Physical-Height']
    df['SDS_InternetHours'] = df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']

    #SDS
    df['SDS_BMI'] = df['BIA-BIA_BMI'] * df['SDS-SDS_Total_T']
    df['CGAS_SDS'] = df['CGAS-CGAS_Score'] * df['SDS-SDS_Total_T']
    df['CGAS_Endurance_Mins'] = df['CGAS-CGAS_Score'] * df['Fitness_Endurance-Time_Mins']
    df['SDS_Activity'] = df['BIA-BIA_Activity_Level_num'] * df['SDS-SDS_Total_T']

    df['BMI_Systolic_BP'] = df['BIA-BIA_BMI'] * df['Physical-Systolic_BP']
    df['Age_Systolic_BP'] = df['Basic_Demos-Age'] * df['Physical-Systolic_BP']
    df['PreInt_Systolic_BP'] = df['Physical-Systolic_BP'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['PAQ_A_Activity'] = df['BIA-BIA_Activity_Level_num'] * df['PAQ_A-PAQ_A_Total']
    df['Activity_CU_PU'] = df['BIA-BIA_Activity_Level_num'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']

    #FGC
    df['FGC_CU_PU'] = df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['FGC_CU_PU_Age'] = df['FGC-FGC_CU'] * df['FGC-FGC_PU'] * df['Basic_Demos-Age']
    df['FGC_GSND_GSD'] = df['FGC-FGC_GSND'] * df['FGC-FGC_GSD']
    df['FGC_GSND_GSD_Age'] = df['FGC-FGC_GSND'] * df['FGC-FGC_GSD'] * df['Basic_Demos-Age']
    df['CGAS_CU_PU'] = df['CGAS-CGAS_Score'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['PreInt_FGC_CU_PU'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['Endurance_CU_PU'] = df['Fitness_Endurance-Time_Mins'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']

    return df

In [16]:
new_X = feature_engineering(new_X)
new_test = feature_engineering(new_test)

In [18]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

new_X = pd.DataFrame(standard_scaler.fit_transform(new_X), columns=new_X.columns)
new_test = pd.DataFrame(standard_scaler.fit_transform(new_test), columns=new_test.columns)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


## IMPUTATION IN NORMAL TABULAR DATA

In [20]:
def fill_na_with_MICE(df):
    df_copy = df.copy()
    missing_mask = df_copy.isna()
    original_columns = df_copy.columns.tolist()
    imputer = IterativeImputer(max_iter=50, random_state=0)
    imputed_values = imputer.fit_transform(df_copy)
    imputed_df = pd.DataFrame(
        imputed_values,
        columns=original_columns,
        index=df_copy.index
    )
    df_copy[missing_mask] = imputed_df[missing_mask]
    return df_copy
    

## MERGE PARQUET INTO TABULAR DATA

In [None]:
train_ts_encoded["id"] = train_ts["id"]
new_X['id'] =  train_data['id']

In [None]:
test_ts_encoded['id'] = test_ts['id']
new_test['id'] = test_data['id']

In [None]:
merged_train = pd.merge(new_X, train_ts_encoded, how='left', on = 'id')
merged_test = pd.merge(new_test, test_ts_encoded, how='left', on = 'id')

In [None]:
merged_train_not_fill = merged_train.drop(['id'], axis = 1)

## AFTER MERGED, FILL NULL WITH MERGED DATA

In [None]:
merged_train_wo_id = merged_train.drop('id', axis = 1)
merged_test_wo_id = merged_test.drop('id', axis = 1)

In [None]:
float64_cols = merged_train_wo_id.select_dtypes(include=['float64']).columns
merged_train_wo_id[float64_cols] = merged_train_wo_id[float64_cols].astype('float32')
merged_test_wo_id[float64_cols] = merged_test_wo_id[float64_cols].astype('float32')

In [None]:
imputer = KNNImputer(n_neighbors=10)

numeric_cols = merged_train_wo_id.select_dtypes(include=['float64', 'int64', 'float32']).columns
print(numeric_cols)
imputed_data = imputer.fit_transform(merged_train_wo_id[numeric_cols])

train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
for col in merged_train_wo_id.columns:
    if col not in numeric_cols:
        train_imputed[col] = merged_train_wo_id[col]

merged_train_wo_id = train_imputed

imputed_data = imputer.fit_transform(merged_test_wo_id[numeric_cols])

test_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
for col in merged_test_wo_id.columns:
    if col not in numeric_cols:
        test_imputed[col] = merged_test_wo_id[col]

merged_test_wo_id = test_imputed

In [None]:
new_X = merged_train_wo_id
new_test = merged_test_wo_id

### Training

In [None]:
def get_info_for_ftt(df):
    number_of_cat = 0
    cat_ranges = []
    all_features = df.columns.tolist()
    cat_idx = []
    for i, feature in enumerate(all_features):
        if (df[feature].nunique() <= 2):
            number_of_cat = number_of_cat + 1
            cat_ranges.append(df[feature].nunique())
            cat_idx.append(i)
    
    num_continuous = df.shape[-1] - number_of_cat
    return cat_ranges, num_continuous, cat_idx


In [None]:
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, samples, labels, cat_idx):
        if isinstance(labels, pd.DataFrame) or isinstance(labels, pd.Series):
            labels = labels.to_numpy()
        self.samples = samples
        self.labels = labels
        self.cat_idx = cat_idx
        
    def __len__(self):
        return len(self.labels)
        
    def divide_cat_num(self, row_data):
        mask = np.zeros(len(row_data), dtype=bool)
        mask[self.cat_idx] = True
        cat_elements = row_data[mask]
        remaining_elements = row_data[~mask]
        return cat_elements, remaining_elements

    def __getitem__(self, idx):
        row_value = self.samples[idx]
        tensor_row_value = torch.tensor(row_value)
        cat_values, num_values = self.divide_cat_num(row_value)
        cat_values = torch.tensor(cat_values, dtype=torch.int32)
        num_values = torch.tensor(num_values, dtype=torch.float32)
        label = self.labels[idx]
        tensor_label = torch.tensor(label, dtype=torch.float32) 
        return cat_values, num_values, tensor_label

In [None]:
from torch.utils.data import DataLoader

In [None]:
!pip install /kaggle/input/fttransformer/einops-0.8.0-py3-none-any.whl

In [None]:
import torch.nn as nn
import torch.optim as optim

In [None]:
cat_ranges, num_continuous, cat_idx = get_info_for_ftt(new_X)

## Making wrapper for FTTransformer

In [None]:
import torch
import torch.nn.functional as F
from torch import nn, einsum

from einops import rearrange, repeat

# feedforward and attention

class GEGLU(nn.Module):
    def forward(self, x):
        x, gates = x.chunk(2, dim = -1)
        return x * F.gelu(gates)

def FeedForward(dim, mult = 4, dropout = 0.):
    return nn.Sequential(
        nn.LayerNorm(dim),
        nn.Linear(dim, dim * mult * 2),
        GEGLU(),
        nn.Dropout(dropout),
        nn.Linear(dim * mult, dim)
    )

class Attention(nn.Module):
    def __init__(
        self,
        dim,
        heads = 8,
        dim_head = 64,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.norm = nn.LayerNorm(dim)
        
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Linear(inner_dim, dim, bias = False)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads

        x = self.norm(x)

        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
        q = q * self.scale

        sim = einsum('b h i d, b h j d -> b h i j', q, k)

        attn = sim.softmax(dim = -1)
        dropped_attn = self.dropout(attn)

        out = einsum('b h i j, b h j d -> b h i d', dropped_attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h = h)
        out = self.to_out(out)

        return out, attn

# transformer

class Transformer(nn.Module):
    def __init__(
        self,
        dim,
        depth,
        heads,
        dim_head,
        attn_dropout,
        ff_dropout
    ):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout),
                FeedForward(dim, dropout = ff_dropout),
            ]))

    def forward(self, x, return_attn = False):
        post_softmax_attns = []

        for attn, ff in self.layers:
            attn_out, post_softmax_attn = attn(x)
            post_softmax_attns.append(post_softmax_attn)

            x = attn_out + x
            x = ff(x) + x

        if not return_attn:
            return x

        return x, torch.stack(post_softmax_attns)
        
# batch norm
class BatchNormSequence(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.bn = nn.BatchNorm1d(dim)
        
    def forward(self, x):
        # x: (batch, sequence, features)
        x = x.transpose(1, 2)
        x = self.bn(x)
        x = x.transpose(1, 2) 
        return x
        
# numerical embedder
class NumericalEmbedder(nn.Module):
    def __init__(self, dim, num_numerical_types):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(num_numerical_types, dim))
        self.biases = nn.Parameter(torch.randn(num_numerical_types, dim))

    def forward(self, x):
        x = rearrange(x, 'b n -> b n 1')
        return x * self.weights + self.biases

# main class

class FTTransformer(nn.Module):
    def __init__(
        self,
        *,
        categories,
        num_continuous,
        dim,
        depth,
        heads,
        dim_head = 16,
        dim_out = 1,
        num_special_tokens = 2,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__()
        assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive'
        assert len(categories) + num_continuous > 0, 'input shape must not be null'

       
        self.num_categories = len(categories)
        self.num_unique_categories = sum(categories)

       

        self.num_special_tokens = num_special_tokens
        total_tokens = self.num_unique_categories + num_special_tokens

        

        if self.num_unique_categories > 0:
            categories_offset = F.pad(torch.tensor(list(categories)), (1, 0), value = num_special_tokens)
            categories_offset = categories_offset.cumsum(dim = -1)[:-1]
            self.register_buffer('categories_offset', categories_offset)

            

            self.categorical_embeds = nn.Embedding(total_tokens, dim)
            self.categ_bn = BatchNormSequence(dim)


        

        self.num_continuous = num_continuous

        if self.num_continuous > 0:
            self.numerical_embedder = NumericalEmbedder(dim, self.num_continuous)
            self.numer_bn = BatchNormSequence(dim)


        # cls token

        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.pre_transformer_bn = BatchNormSequence(dim)

        # transformer

        self.transformer = Transformer(
            dim = dim,
            depth = depth,
            heads = heads,
            dim_head = dim_head,
            attn_dropout = attn_dropout,
            ff_dropout = ff_dropout
        )

        # to logits

        self.to_logits = nn.Sequential(
            nn.LayerNorm(dim),
            nn.ReLU(),
            nn.Linear(dim, dim_out)
        )

    def forward(self, x_categ, x_numer, return_attn = False):
        assert x_categ.shape[-1] == self.num_categories, f'you must pass in {self.num_categories} values for your categories input'

        xs = []
        if self.num_unique_categories > 0:
            x_categ = x_categ + self.categories_offset

            x_categ = self.categorical_embeds(x_categ)
            x_categ = self.categ_bn(x_categ)
            xs.append(x_categ)

        # add numerically embedded tokens
        if self.num_continuous > 0:
            x_numer = self.numerical_embedder(x_numer)
            x_numer = self.numer_bn(x_numer)
            xs.append(x_numer)

        # concat categorical and numerical

        x = torch.cat(xs, dim = 1)
        x = self.pre_transformer_bn(x)


        # append cls tokens
        b = x.shape[0]
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim = 1)
        
        # attend

        x, attns = self.transformer(x, return_attn = True)

        # get cls token

        x = x[:, 0]

        

        logits = self.to_logits(x)

        if not return_attn:
            return logits

        return logits, attns


In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import torch

In [None]:
from sklearn.metrics import cohen_kappa_score

def qwk(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [None]:
class FTTransformerWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, categories, num_continuous, dim, dim_out, depth, heads, attn_dropout, 
                 ff_dropout, batch_size, num_epochs, learning_rate, cat_ranges, cat_idx):
        self.categories = categories
        self.num_continuous = num_continuous
        self.dim = dim
        self.dim_out = dim_out
        self.depth = depth
        self.heads = heads
        self.attn_dropout = attn_dropout
        self.ff_dropout = ff_dropout
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.cat_ranges = cat_ranges
        self.cat_idx = cat_idx
        self.num_epochs = num_epochs
        
    def _init_model(self):
        self.model = FTTransformer(
            categories=self.categories,
            num_continuous=self.num_continuous,
            dim=self.dim,
            dim_out=self.dim_out,
            depth=self.depth,
            heads=self.heads,
            attn_dropout=self.attn_dropout,
            ff_dropout=self.ff_dropout
        ).to(self.device)
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def fit(self, X, y):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        print("--------------------------------type of X_train", type(X_train))
        print("---------------------------------------shape of X", X_train.shape)
        from sklearn.preprocessing import StandardScaler
        self.scaler = StandardScaler()
        scaled_X_train = self.scaler.fit_transform(X_train)
        scaled_X_val = self.scaler.transform(X_val)
        
        train_dataset = MyDataset(scaled_X_train, y_train, self.cat_idx)
        val_dataset = MyDataset(scaled_X_val, y_val, self.cat_idx)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

        self._init_model()
        best_val_loss = float('inf')

        save_path='model_checkpoints'
        os.makedirs(save_path, exist_ok=True)

        for epoch in range(self.num_epochs):
            self.model.train()
            train_loss = 0
            for x_cat, x_num, y in train_loader:
                x_cat = x_cat.to(self.device) if x_cat is not None else None
                x_num = x_num.to(self.device)
                y = y.to(self.device)

                self.optimizer.zero_grad()
                output = self.model(x_cat, x_num).squeeze(1)
                
                loss = self.criterion(output, y)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
            
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for x_cat, x_num, y in val_loader:
                    x_cat = x_cat.to(self.device) if x_cat is not None else None
                    x_num = x_num.to(self.device)
                    y = y.to(self.device)
                    
                    output = self.model(x_cat, x_num).squeeze(1)

                    val_loss += self.criterion(output, y).item()

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(val_loader)
            print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'train_loss': avg_train_loss,
                    'val_loss': avg_val_loss,
                    'best_val_loss': best_val_loss
                }
                torch.save(checkpoint, os.path.join(save_path, f'best_model.pth'))
                print(f'Saved best model with validation loss: {best_val_loss:.4f}')

    def load_model(self, model, checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        return model

    def load_model_checkpoint(self, checkpoint_path):
        try:
            checkpoint = torch.load(checkpoint_path, map_location=self.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.model.to(self.device)
            self.model.eval()
            print(f"Model loaded from {checkpoint_path}")
        except Exception as e:
            print(f"Error loading the model: {e}")
    
    def predict(self, X):
        self.model = self.load_model(self.model, "/kaggle/working/model_checkpoints/best_model.pth")

        def divide_cat_num(data, cat_idx):
            mask = np.zeros(data.shape[1], dtype=bool)
            mask[cat_idx] = True
            cat_elements = data[:, mask]
            remaining_elements = data[:, ~mask]
            return cat_elements, remaining_elements

        self.model.eval()
        new_test_values = X.values
        new_test_values = self.scaler.transform(new_test_values)
        num_samples = len(new_test_values)
        predictions = []

        for i in range(0, num_samples, self.batch_size):
            batch_data = new_test_values[i:i + self.batch_size]
            cat, num = divide_cat_num(batch_data, self.cat_idx)
            cat = torch.tensor(cat, dtype=torch.int32).to(self.device)
            num = torch.tensor(num, dtype=torch.float32).to(self.device)

            with torch.no_grad():
                output = self.model(cat, num)
                output = output.squeeze(1)
                predictions.extend(output.cpu().numpy())

            del cat
            del num
            del output
            torch.cuda.empty_cache()

        return np.array(predictions)

        for epoch in range(self.num_epochs):
            self.model.train()
            train_loss = 0
            for x_cat, x_num, y in train_loader:
                x_cat = x_cat.to(self.device) if x_cat is not None else None
                x_num = x_num.to(self.device)
                y = y.to(self.device)

                self.optimizer.zero_grad()
                output = self.model(x_cat, x_num).squeeze(1)
                
                loss = self.criterion(output, y)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
            
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for x_cat, x_num, y in val_loader:
                    x_cat = x_cat.to(self.device) if x_cat is not None else None
                    x_num = x_num.to(self.device)
                    y = y.to(self.device)
                    
                    output = self.model(x_cat, x_num).squeeze(1)

                    val_loss += self.criterion(output, y).item()

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(val_loader)
            print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'train_loss': avg_train_loss,
                    'val_loss': avg_val_loss,
                    'best_val_loss': best_val_loss
                }
                torch.save(checkpoint, os.path.join(save_path, f'best_model.pth'))
                print(f'Saved best model with validation loss: {best_val_loss:.4f}')

## QWK THRESHOLD OPTIMIZER

In [None]:
n_splits = 5

In [None]:
model_params = {
    'categories': tuple(cat_ranges), 
    'num_continuous': num_continuous,    
    'dim': 10,             
    'dim_out': 1,          
    'depth': 2,            
    'heads': 3,            
    'attn_dropout': 0.1,   
    'ff_dropout': 0.1,
    'batch_size': 32,
    'num_epochs': 100,
    'learning_rate': 1e-3,
    'cat_ranges': cat_ranges,
    'cat_idx': cat_idx 
}
model = FTTransformerWrapper(**model_params)
model.fit(new_X, y)

Epoch 1, Train Loss: 1137.0349, Val Loss: 1061.5314
Saved best model with validation loss: 1061.5314
Epoch 2, Train Loss: 1100.1245, Val Loss: 1033.1532
Saved best model with validation loss: 1033.1532
Epoch 3, Train Loss: 1065.5473, Val Loss: 994.4426
Saved best model with validation loss: 994.4426
Epoch 4, Train Loss: 1027.9088, Val Loss: 950.4918
Saved best model with validation loss: 950.4918
Epoch 5, Train Loss: 981.9695, Val Loss: 907.5274
Saved best model with validation loss: 907.5274
Epoch 6, Train Loss: 931.1604, Val Loss: 859.9179
Saved best model with validation loss: 859.9179
Epoch 7, Train Loss: 880.3323, Val Loss: 800.4043
Saved best model with validation loss: 800.4043
Epoch 8, Train Loss: 829.5012, Val Loss: 758.3185
Saved best model with validation loss: 758.3185
Epoch 9, Train Loss: 775.5452, Val Loss: 714.2839
Saved best model with validation loss: 714.2839
Epoch 10, Train Loss: 724.2818, Val Loss: 665.0346
Saved best model with validation loss: 665.0346
Epoch 11, T

In [None]:
predictions = model.predict(new_test)

  checkpoint = torch.load(checkpoint_path)


In [None]:
checkpoint_path = "/kaggle/working/model_checkpoints/best_model.pth"
# checkpoint_path = "/kaggle/input/best_v1/pytorch/default/1/best_model_val_loss_291.7448.pth"
model.load_model_checkpoint(checkpoint_path)

Model loaded from /kaggle/working/model_checkpoints/best_model.pth


  checkpoint = torch.load(checkpoint_path, map_location=self.device)


In [None]:
def get_prediction(new_test, cat_idx, model, batch_size=16, track_memory=False):
    def divide_cat_num(data, cat_idx):
        mask = np.zeros(data.shape[1], dtype=bool)
        mask[cat_idx] = True
        cat_elements = data[:, mask]
        remaining_elements = data[:, ~mask]
        return cat_elements, remaining_elements
    
    device = 'cuda'
    new_test_values = new_test.values
    num_samples = len(new_test_values)
    predictions = []

    
    # Process data in batches
    for i in range(0, num_samples, batch_size):
        if track_memory:
            print(f'GPU Memory before batch {i}: {torch.cuda.memory_allocated()/1024**2:.2f} MB')
            
        batch_data = new_test_values[i:i + batch_size]
        cat, num = divide_cat_num(batch_data, cat_idx)
        
        cat = torch.tensor(cat, dtype=torch.int32).to(device)
        num = torch.tensor(num, dtype=torch.float32).to(device)
        
        with torch.no_grad():
            output = model.model(cat, num)
            output = output.squeeze(1)
            predictions.extend(output.cpu().numpy())
        
        # Clear GPU memory
        del cat
        del num
        del output
        torch.cuda.empty_cache()
        
        if track_memory:
            print(f'GPU Memory after batch {i}: {torch.cuda.memory_allocated()/1024**2:.2f} MB')
    
    return predictions

In [None]:
prediction = get_prediction(new_test, cat_idx, model)
prediction

[19.95567,
 15.092413,
 42.51784,
 18.462057,
 27.865429,
 27.32825,
 23.246046,
 24.63583,
 33.182278,
 26.369068,
 35.208183,
 22.191006,
 36.317226,
 42.061874,
 31.93947,
 28.113201,
 10.839161,
 14.951689,
 26.358326,
 35.54263]

In [None]:
test_id = test_data['id']

In [None]:
def handle_prediction(predictions, test_id):
    # return submission.csv
    sii = []
    for i in range(len(predictions)):
        predict = predictions[i]
        if (predict >=  0 and predict <= 30):
            sii.append(0)
        elif(predict < 50):
            sii.append(1)
        elif(predict < 80):
            sii.append(2)
        else:
            sii.append(3)
    sii = pd.DataFrame(sii)
    submission = pd.concat([test_id, sii], axis = 1)
    submission = submission.rename(columns={0: 'sii'})
    return submission
submission = handle_prediction(prediction, test_id)

In [None]:
submission.to_csv("submission.csv", index=False)

In [None]:
submission

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,0
5,001f3379,0
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0
