## PARQUET PROCESSING

In [1]:
import polars as pl
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import torch.optim as optim

In [2]:
def process_file(filename, dirname):
    """
    return describe value and people's id.
    """
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    """
    ids: chuỗi các file trong parquet
    results: kết quả áp dụng process_file cho từng file trong paquet
    
    """
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [3]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [01:09<00:00, 14.35it/s]
100%|██████████| 2/2 [00:00<00:00, 11.62it/s]


#### Fill parquet with autoencoder

In [4]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.GELU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.GELU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.GELU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.GELU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.GELU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [5]:
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

In [6]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)
df_train

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,...,1.738203,5.314874,89.422226,0.0,2626.199951,4187.0,8.639500e+13,7.0,2.0,57.0
1,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,...,2.475326,3.966906,89.080330,1.0,2628.199951,4146.0,8.639500e+13,7.0,2.0,243.0
2,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,...,1.746797,5.066334,86.987267,0.0,2618.199951,4183.0,8.636500e+13,7.0,3.0,134.0
3,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,...,1.269051,6.134459,89.976074,0.0,2502.000000,6000.0,8.639500e+13,7.0,4.0,72.0
4,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,...,1.071875,2.774382,89.300034,0.0,1046.800049,4199.0,8.601500e+13,7.0,4.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,394128.0,...,2.099614,3.669502,89.025551,1.0,2576.399902,4191.0,8.639500e+13,7.0,4.0,161.0
992,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,1195.0,...,0.996484,1.786410,81.665283,0.0,1526.599976,4194.0,8.514000e+13,7.0,2.0,130.0
993,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,...,1.547813,3.692727,89.333710,1.0,2592.199951,4178.0,8.639500e+13,7.0,1.0,79.0
994,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,...,0.999219,1.673958,88.629547,0.0,1875.199951,4183.0,8.639500e+13,7.0,1.0,155.0


In [7]:
train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

Epoch [10/100], Loss: 1.5313]
Epoch [20/100], Loss: 1.4248]
Epoch [30/100], Loss: 1.3880]
Epoch [40/100], Loss: 1.3860]
Epoch [50/100], Loss: 1.3811]
Epoch [60/100], Loss: 1.3788]
Epoch [70/100], Loss: 1.3776]
Epoch [80/100], Loss: 1.3760]
Epoch [90/100], Loss: 1.3751]
Epoch [100/100], Loss: 1.3693]
Epoch [10/100], Loss: 1.0188]
Epoch [20/100], Loss: 0.4822]
Epoch [30/100], Loss: 0.4271]
Epoch [40/100], Loss: 0.4271]
Epoch [50/100], Loss: 0.4271]
Epoch [60/100], Loss: 0.4271]
Epoch [70/100], Loss: 0.4271]
Epoch [80/100], Loss: 0.4271]
Epoch [90/100], Loss: 0.4271]
Epoch [100/100], Loss: 0.4271]


In [8]:
time_series_cols = train_ts_encoded.columns.tolist() # lưu trữ các cột
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

## TABULAR NORMAL DATA PROCESSING

#### Drop any samplers which only missed any values in PCIAT test

In [9]:
train_data = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv")
test_data = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv")

In [10]:
columns_not_in_test = ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'PCIAT-Season', 'sii']
train_data = train_data.dropna(subset=columns_not_in_test)

#### Drop season data

In [11]:
train_seasonal_columns = [col for col in train_data.columns if 'Season' in col]
test_seasonal_columns = [col for col in test_data.columns if 'Season' in col]

In [12]:
train_data_wo_season = train_data.drop(train_seasonal_columns, axis = 1)
test_data_wo_season = test_data.drop(test_seasonal_columns, axis = 1)

#### So, we got a quire reliable labels here, the next step would be create X and y

In [13]:
label_related_features = ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'sii']
label = ['PCIAT-PCIAT_Total']
X = train_data_wo_season.drop(label_related_features, axis = 1)
new_y = train_data_wo_season[label]

In [14]:
new_X = X.drop(['id'], axis = 1)
new_test = test_data_wo_season.drop(['id'], axis = 1)

In [15]:
def feature_engineering(df):
    #Age
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['Physical-Waist_Age'] = df['Basic_Demos-Age'] * df['Physical-Waist_Circumference']
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Physical-Height_Age'] = df['Basic_Demos-Age'] * df['Physical-Height']
    df['SDS_InternetHours'] = df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']

    #SDS
    df['SDS_BMI'] = df['BIA-BIA_BMI'] * df['SDS-SDS_Total_T']
    df['CGAS_SDS'] = df['CGAS-CGAS_Score'] * df['SDS-SDS_Total_T']
    df['CGAS_Endurance_Mins'] = df['CGAS-CGAS_Score'] * df['Fitness_Endurance-Time_Mins']
    df['SDS_Activity'] = df['BIA-BIA_Activity_Level_num'] * df['SDS-SDS_Total_T']

    df['BMI_Systolic_BP'] = df['BIA-BIA_BMI'] * df['Physical-Systolic_BP']
    df['Age_Systolic_BP'] = df['Basic_Demos-Age'] * df['Physical-Systolic_BP']
    df['PreInt_Systolic_BP'] = df['Physical-Systolic_BP'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['PAQ_A_Activity'] = df['BIA-BIA_Activity_Level_num'] * df['PAQ_A-PAQ_A_Total']
    df['Activity_CU_PU'] = df['BIA-BIA_Activity_Level_num'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']

    #FGC
    df['FGC_CU_PU'] = df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['FGC_CU_PU_Age'] = df['FGC-FGC_CU'] * df['FGC-FGC_PU'] * df['Basic_Demos-Age']
    df['FGC_GSND_GSD'] = df['FGC-FGC_GSND'] * df['FGC-FGC_GSD']
    df['FGC_GSND_GSD_Age'] = df['FGC-FGC_GSND'] * df['FGC-FGC_GSD'] * df['Basic_Demos-Age']
    df['CGAS_CU_PU'] = df['CGAS-CGAS_Score'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['PreInt_FGC_CU_PU'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['Endurance_CU_PU'] = df['Fitness_Endurance-Time_Mins'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']

    return df

In [16]:
new_X = feature_engineering(new_X)
new_test = feature_engineering(new_test)

In [19]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

new_X = pd.DataFrame(standard_scaler.fit_transform(new_X), columns=new_X.columns)
new_test = pd.DataFrame(standard_scaler.fit_transform(new_test), columns=new_test.columns)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [20]:
new_X = new_X.drop('PAQ_A_Activity', axis=1)
new_test = new_test.drop('PAQ_A_Activity', axis=1)

## IMPUTATION IN NORMAL TABULAR DATA

In [21]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np

In [22]:
def fill_na_with_MICE(df):
    df_copy = df.copy()
    missing_mask = df_copy.isna()
    original_columns = df_copy.columns.tolist()
    imputer = IterativeImputer(max_iter=50, random_state=0)
    imputed_values = imputer.fit_transform(df_copy)
    imputed_df = pd.DataFrame(
        imputed_values,
        columns=original_columns,
        index=df_copy.index
    )
    df_copy[missing_mask] = imputed_df[missing_mask]
    return df_copy
    

In [23]:
new_X = fill_na_with_MICE(new_X)
new_test = fill_na_with_MICE(new_test)



## MERGE PARQUET INTO TABULAR DATA

In [24]:
train_ts_encoded["id"] = train_ts["id"]
new_X['id'] =  train_data['id']

In [25]:
test_ts_encoded['id'] = test_ts['id']
new_test['id'] = test_data['id']

In [26]:
merged_train = pd.merge(new_X, train_ts_encoded, how='left', on = 'id')
merged_test = pd.merge(new_test, test_ts_encoded, how='left', on = 'id')

In [27]:
merged_train_not_fill = merged_train.drop(['id'], axis = 1)

## AFTER MERGED, FILL NULL WITH MERGED DATA

In [28]:
merged_train_wo_id = merged_train.drop('id', axis = 1)
merged_test_wo_id = merged_test.drop('id', axis = 1)

In [29]:
float64_cols = merged_train_wo_id.select_dtypes(include=['float64']).columns
merged_train_wo_id[float64_cols] = merged_train_wo_id[float64_cols].astype('float32')
merged_test_wo_id[float64_cols] = merged_test_wo_id[float64_cols].astype('float32')

In [31]:
from sklearn.impute import KNNImputer

In [32]:
imputer = KNNImputer(n_neighbors=10)

numeric_cols = merged_train_wo_id.select_dtypes(include=['float64', 'int64', 'float32']).columns
print(numeric_cols)
imputed_data = imputer.fit_transform(merged_train_wo_id[numeric_cols])

train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
for col in merged_train_wo_id.columns:
    if col not in numeric_cols:
        train_imputed[col] = merged_train_wo_id[col]

merged_train_wo_id = train_imputed

imputed_data = imputer.fit_transform(merged_test_wo_id[numeric_cols])

test_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
for col in merged_test_wo_id.columns:
    if col not in numeric_cols:
        test_imputed[col] = merged_test_wo_id[col]

merged_test_wo_id = test_imputed

Index(['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       ...
       'Enc_51', 'Enc_52', 'Enc_53', 'Enc_54', 'Enc_55', 'Enc_56', 'Enc_57',
       'Enc_58', 'Enc_59', 'Enc_60'],
      dtype='object', length=128)


In [33]:
new_X = merged_train_wo_id
new_test = merged_test_wo_id

### Preparing for FTT

In [34]:
def get_info_for_ftt(df):
    number_of_cat = 0
    cat_ranges = []
    all_features = df.columns.tolist()
    cat_idx = []
    for i, feature in enumerate(all_features):
        if (df[feature].nunique() <= 2):
            number_of_cat = number_of_cat + 1
            cat_ranges.append(df[feature].nunique())
            cat_idx.append(i)
    
    num_continuous = df.shape[-1] - number_of_cat
    return cat_ranges, num_continuous, cat_idx


In [35]:
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, samples, labels, cat_idx):
        if isinstance(labels, pd.DataFrame) or isinstance(labels, pd.Series):
            labels = labels.to_numpy()
        self.samples = samples
        self.labels = labels
        self.cat_idx = cat_idx
        
    def __len__(self):
        return len(self.labels)
        
    def divide_cat_num(self, row_data):
        mask = np.zeros(len(row_data), dtype=bool)
        mask[self.cat_idx] = True
        cat_elements = row_data[mask]
        remaining_elements = row_data[~mask]
        return cat_elements, remaining_elements

    def __getitem__(self, idx):
        row_value = self.samples[idx]
        tensor_row_value = torch.tensor(row_value)
        cat_values, num_values = self.divide_cat_num(row_value)
        cat_values = torch.tensor(cat_values, dtype=torch.int32)
        num_values = torch.tensor(num_values, dtype=torch.float32)
        label = self.labels[idx]
        tensor_label = torch.tensor(label, dtype=torch.float32) 
        return cat_values, num_values, tensor_label

In [36]:
from torch.utils.data import DataLoader

In [37]:
!pip install /kaggle/input/fttransformer/einops-0.8.0-py3-none-any.whl

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid, fd = os.forkpty()


Processing /kaggle/input/fttransformer/einops-0.8.0-py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.8.0


In [38]:
import torch.nn as nn
import torch.optim as optim

In [39]:
cat_ranges, num_continuous, cat_idx = get_info_for_ftt(new_X)

## Making wrapper for FTTransformer

In [40]:
import torch
import torch.nn.functional as F
from torch import nn, einsum

from einops import rearrange, repeat


class GEGLU(nn.Module):
    def forward(self, x):
        x, gates = x.chunk(2, dim = -1)
        return x * F.gelu(gates)

def FeedForward(dim, mult = 4, dropout = 0.):
    return nn.Sequential(
        nn.LayerNorm(dim),
        nn.Linear(dim, dim * mult * 2),
        GEGLU(),
        nn.Dropout(dropout),
        nn.Linear(dim * mult, dim)
    )

class Attention(nn.Module):
    def __init__(
        self,
        dim,
        heads = 8,
        dim_head = 64,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.norm = nn.LayerNorm(dim)
        
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Linear(inner_dim, dim, bias = False)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads

        x = self.norm(x)

        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
        q = q * self.scale

        sim = einsum('b h i d, b h j d -> b h i j', q, k)

        attn = sim.softmax(dim = -1)
        dropped_attn = self.dropout(attn)

        out = einsum('b h i j, b h j d -> b h i d', dropped_attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h = h)
        out = self.to_out(out)

        return out, attn

# transformer

class Transformer(nn.Module):
    def __init__(
        self,
        dim,
        depth,
        heads,
        dim_head,
        attn_dropout,
        ff_dropout
    ):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout),
                FeedForward(dim, dropout = ff_dropout),
            ]))

    def forward(self, x, return_attn = False):
        post_softmax_attns = []

        for attn, ff in self.layers:
            attn_out, post_softmax_attn = attn(x)
            post_softmax_attns.append(post_softmax_attn)

            x = attn_out + x
            x = ff(x) + x

        if not return_attn:
            return x

        return x, torch.stack(post_softmax_attns)
        
# batch norm
class BatchNormSequence(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.bn = nn.BatchNorm1d(dim)
        
    def forward(self, x):
        # x: (batch, sequence, features)
        x = x.transpose(1, 2)
        x = self.bn(x)
        x = x.transpose(1, 2) 
        return x
        
# numerical embedder
class NumericalEmbedder(nn.Module):
    def __init__(self, dim, num_numerical_types):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(num_numerical_types, dim))
        self.biases = nn.Parameter(torch.randn(num_numerical_types, dim))

    def forward(self, x):
        x = rearrange(x, 'b n -> b n 1')
        return x * self.weights + self.biases

# main class

class FTTransformer(nn.Module):
    def __init__(
        self,
        *,
        categories,
        num_continuous,
        dim,
        depth,
        heads,
        dim_head = 16,
        dim_out = 1,
        num_special_tokens = 2,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__()
        assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive'
        assert len(categories) + num_continuous > 0, 'input shape must not be null'

        # categories related calculations

        self.num_categories = len(categories)
        self.num_unique_categories = sum(categories)

        # create category embeddings table

        self.num_special_tokens = num_special_tokens
        total_tokens = self.num_unique_categories + num_special_tokens

        # for automatically offsetting unique category ids to the correct position in the categories embedding table

        if self.num_unique_categories > 0:
            categories_offset = F.pad(torch.tensor(list(categories)), (1, 0), value = num_special_tokens)
            categories_offset = categories_offset.cumsum(dim = -1)[:-1]
            self.register_buffer('categories_offset', categories_offset)

            # categorical embedding

            self.categorical_embeds = nn.Embedding(total_tokens, dim)
            self.categ_bn = BatchNormSequence(dim)


        # continuous

        self.num_continuous = num_continuous

        if self.num_continuous > 0:
            self.numerical_embedder = NumericalEmbedder(dim, self.num_continuous)
            self.numer_bn = BatchNormSequence(dim)


        # cls token

        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.pre_transformer_bn = BatchNormSequence(dim)

        # transformer

        self.transformer = Transformer(
            dim = dim,
            depth = depth,
            heads = heads,
            dim_head = dim_head,
            attn_dropout = attn_dropout,
            ff_dropout = ff_dropout
        )

        # to logits

        self.to_logits = nn.Sequential(
            nn.LayerNorm(dim),
            nn.ReLU(),
            nn.Linear(dim, dim_out)
        )

    def forward(self, x_categ, x_numer, return_attn = False):
        assert x_categ.shape[-1] == self.num_categories, f'you must pass in {self.num_categories} values for your categories input'

        xs = []
        if self.num_unique_categories > 0:
            x_categ = x_categ + self.categories_offset

            x_categ = self.categorical_embeds(x_categ)
            x_categ = self.categ_bn(x_categ)
            xs.append(x_categ)

        # add numerically embedded tokens
        if self.num_continuous > 0:
            x_numer = self.numerical_embedder(x_numer)
            x_numer = self.numer_bn(x_numer)
            xs.append(x_numer)

        # concat categorical and numerical

        x = torch.cat(xs, dim = 1)
        x = self.pre_transformer_bn(x)


        # append cls tokens
        b = x.shape[0]
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim = 1)
        
        # attend

        x, attns = self.transformer(x, return_attn = True)

        # get cls token

        x = x[:, 0]

        # out in the paper is linear(relu(ln(cls)))

        logits = self.to_logits(x)

        if not return_attn:
            return logits

        return logits, attns


In [41]:
from sklearn.base import BaseEstimator, RegressorMixin
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import torch

In [42]:
from sklearn.metrics import cohen_kappa_score

def qwk(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [43]:
class FTTransformerWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, categories, num_continuous, dim, dim_out, depth, heads, attn_dropout, 
                 ff_dropout, batch_size, num_epochs, learning_rate, cat_ranges, cat_idx):
        self.categories = categories
        self.num_continuous = num_continuous
        self.dim = dim
        self.dim_out = dim_out
        self.depth = depth
        self.heads = heads
        self.attn_dropout = attn_dropout
        self.ff_dropout = ff_dropout
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.cat_ranges = cat_ranges
        self.cat_idx = cat_idx
        self.num_epochs = num_epochs
        
    def _init_model(self):
        self.model = FTTransformer(
            categories=self.categories,
            num_continuous=self.num_continuous,
            dim=self.dim,
            dim_out=self.dim_out,
            depth=self.depth,
            heads=self.heads,
            attn_dropout=self.attn_dropout,
            ff_dropout=self.ff_dropout
        ).to(self.device)
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def fit(self, X, y):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        from sklearn.preprocessing import StandardScaler
        self.scaler = StandardScaler()
        scaled_X_train = self.scaler.fit_transform(X_train)
        scaled_X_val = self.scaler.transform(X_val)
        
        train_dataset = MyDataset(scaled_X_train, y_train, self.cat_idx)
        val_dataset = MyDataset(scaled_X_val, y_val, self.cat_idx)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

        self._init_model()
        best_val_loss = float('inf')

        save_path='model_checkpoints'
        os.makedirs(save_path, exist_ok=True)

        for epoch in range(self.num_epochs):
            self.model.train()
            train_loss = 0
            for x_cat, x_num, y in train_loader:
                x_cat = x_cat.to(self.device) if x_cat is not None else None
                x_num = x_num.to(self.device)
                y = y.to(self.device)

                self.optimizer.zero_grad()
                output = self.model(x_cat, x_num).squeeze(1)
                
                loss = self.criterion(output, y)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
            
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for x_cat, x_num, y in val_loader:
                    x_cat = x_cat.to(self.device) if x_cat is not None else None
                    x_num = x_num.to(self.device)
                    y = y.to(self.device)
                    
                    output = self.model(x_cat, x_num).squeeze(1)

                    val_loss += self.criterion(output, y).item()

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(val_loader)
            print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'train_loss': avg_train_loss,
                    'val_loss': avg_val_loss,
                    'best_val_loss': best_val_loss
                }
                torch.save(checkpoint, os.path.join(save_path, f'best_model.pth'))
                print(f'Saved best model with validation loss: {best_val_loss:.4f}')

    def load_model(self, model, checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        return model

    def load_model_checkpoint(self, checkpoint_path):
        try:
            checkpoint = torch.load(checkpoint_path, map_location=self.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.model.to(self.device)
            self.model.eval()
            print(f"Model loaded from {checkpoint_path}")
        except Exception as e:
            print(f"Error loading the model: {e}")
    
    def predict(self, X):
        self.model = self.load_model(self.model, "/kaggle/working/model_checkpoints/best_model.pth")

        def divide_cat_num(data, cat_idx):
            mask = np.zeros(data.shape[1], dtype=bool)
            mask[cat_idx] = True
            cat_elements = data[:, mask]
            remaining_elements = data[:, ~mask]
            return cat_elements, remaining_elements

        self.model.eval()
        new_test_values = X.values
        new_test_values = self.scaler.transform(new_test_values)
        num_samples = len(new_test_values)
        predictions = []

        for i in range(0, num_samples, self.batch_size):
            batch_data = new_test_values[i:i + self.batch_size]
            cat, num = divide_cat_num(batch_data, self.cat_idx)
            cat = torch.tensor(cat, dtype=torch.int32).to(self.device)
            num = torch.tensor(num, dtype=torch.float32).to(self.device)

            with torch.no_grad():
                output = self.model(cat, num)
                output = output.squeeze(1)
                predictions.extend(output.cpu().numpy())

            del cat
            del num
            del output
            torch.cuda.empty_cache()

        return np.array(predictions)

## Submission preparation

In [45]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from sklearn.base import clone

In [46]:
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

In [47]:
n_splits = 5
SEED = 42

In [48]:
def thresh_rounder(oof_non_rounded):
    return np.where(oof_non_rounded <= 30, 0,
                    np.where(oof_non_rounded <= 49, 1,
                             np.where(oof_non_rounded <= 79, 2, 3)))

In [49]:
def prepare_submission(model_class, test_data):
    X = new_X
    y = new_y['PCIAT-PCIAT_Total']
    
    KF = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(KF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)
        
        test_preds[:, fold] = model.predict(test_data)

    tpm = test_preds.mean(axis=1)

    sii_tpm = thresh_rounder(tpm)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': sii_tpm
    })

    return submission

## Ensemble using bagging techiniques

In [50]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

### Submission 1

In [51]:
lgbm_params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 0.01,
    'device': 'cpu'
}

xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': 42,
    'device': "cuda"
}


catboost_params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': 42,
    'verbose': 0,
    'l2_leaf_reg': 10,
    'task_type': 'GPU',
}

ftt_params = {
    'categories': tuple(cat_ranges), 
    'num_continuous': num_continuous,    
    'dim': 16,             
    'dim_out': 1,          
    'depth': 2,            
    'heads': 3,            
    'attn_dropout': 0.1,   
    'ff_dropout': 0.2,
    'batch_size': 32,
    'num_epochs': 70,
    'learning_rate': 0.001,
    'cat_ranges': cat_ranges,
    'cat_idx': cat_idx 
}

In [52]:
FTT_Model = FTTransformerWrapper(**ftt_params)
LGBM_Model = LGBMRegressor(**lgbm_params, random_state=42, n_estimators=300, verbose=1)
XGB_Model = XGBRegressor(**xgb_params)
CatBoost_Model = CatBoostRegressor(**catboost_params)

In [None]:
n_splits = 5
SEED = 42
X = new_X
y = new_y['PCIAT-PCIAT_Total']

KF = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(tqdm(KF.split(X, y), desc="Training Folds", total=n_splits)):
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
    
    # Train LightGBM
    LGBM_Model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mse',
    )
    
    # Train XGBoost
    XGB_Model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        early_stopping_rounds=30,
        verbose=True
    )
    
    # Train CatBoost
    CatBoost_Model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=10
    )
    
    # Train FTT_Model
    FTT_Model.fit(
        X_train, y_train,
    )

In [54]:
voting_model = VotingRegressor(estimators=[
    ('lightgbm', LGBM_Model),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('ftt', FTT_Model),
],weights=[4.0,4.0,5.0, 3.0])

In [56]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
submission1 = prepare_submission(voting_model, new_test)

In [58]:
submission1

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1


#### Submission 2

In [59]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid, fd = os.forkpty()


In [60]:
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.callbacks import Callback
import pytorch_tabnet
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

In [61]:
class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = KNNImputer(n_neighbors=5)
        #self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'

    def fit(self, X, y):
        X_imputed = self.imputer.fit_transform(X)

        if hasattr(y, 'values'):
            y = y.values

        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed,
            y,
            test_size=0.2,
            random_state=42
        )

        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse', 'mae', 'rmse'],
            max_epochs=500,
            patience=50,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )

        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file

        return self

    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result
        
TabNet_Params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 1e-4,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}


class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min',
                 save_best_only=True, verbose=1):
        super().__init__()
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')

    def on_train_begin(self, logs=None):
        self.model = self.trainer

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)

In [62]:
lgbm_params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 0.01,
    'device': 'cpu'
}
CatBoost_Params = {
        'learning_rate': 0.05,
        'depth': 6,
        'iterations': 200,
        'random_seed': SEED,
        'verbose': 0,
        'l2_leaf_reg': 10,
        'task_type': 'CPU',
        'use_best_model': True
}

XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  
    'reg_lambda': 5,  
    'random_state': SEED,
    'tree_method': 'gpu_hist',
}

ftt_params = {
    'categories': tuple(cat_ranges), 
    'num_continuous': num_continuous,    
    'dim': 16,             
    'dim_out': 1,          
    'depth': 2,            
    'heads': 3,            
    'attn_dropout': 0.1,   
    'ff_dropout': 0.3,
    'batch_size': 64 ,
    'num_epochs': 70,
    'learning_rate': 0.001,
    'cat_ranges': cat_ranges,
    'cat_idx': cat_idx 
}

In [63]:
TabNet_Model = TabNetWrapper(**TabNet_Params) 
FTT_Model = FTTransformerWrapper(**ftt_params)
XGB_Model = XGBRegressor(**xgb_params)
CatBoost_Model = CatBoostRegressor(**catboost_params)
LGBM_Model = LGBMRegressor(**lgbm_params, random_state=42, n_estimators=300)



In [None]:
n_splits = 5
SEED = 42
X = new_X
y = new_y['PCIAT-PCIAT_Total']

KF = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(tqdm(KF.split(X, y), desc="Training Folds", total=n_splits)):
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
    
    # Train LightGBM
    LGBM_Model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mse',
    )
    
    # Train XGBoost
    XGB_Model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        early_stopping_rounds=30,
        verbose=True
    )
    
    # Train CatBoost
    CatBoost_Model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=10
    )
    
    # Train FTT_Model
    FTT_Model.fit(
        X_train, y_train,
    )

In [65]:
voting_model = VotingRegressor(estimators=[
    ('lightgbm', LGBM_Model),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('ftt', FTT_Model),
    ('tabnet', TabNet_Model)
],weights=[4.0,4.0,4.0, 3.0,5.0])

In [None]:
submission2= prepare_submission(voting_model, new_test)

In [67]:
submission2

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1


### Submission 3

In [68]:
TabNet_Model = TabNetWrapper(**TabNet_Params) 
XGB_Model = XGBRegressor(**xgb_params)
CatBoost_Model = CatBoostRegressor(**catboost_params)
LGBM_Model = LGBMRegressor(**lgbm_params, random_state=42, n_estimators=300)



In [None]:
n_splits = 5
SEED = 42
X = new_X
y = new_y['PCIAT-PCIAT_Total']

KF = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(tqdm(KF.split(X, y), desc="Training Folds", total=n_splits)):
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
    
    # Train LightGBM
    LGBM_Model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mse',
    )
    
    # Train XGBoost
    XGB_Model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        early_stopping_rounds=30,
        verbose=True
    )
    
    # Train CatBoost
    CatBoost_Model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=10
    )
    
    # Train FTT_Model
    FTT_Model.fit(
        X_train, y_train,
    )

In [70]:
voting_model = VotingRegressor(estimators=[
    ('lightgbm', LGBM_Model),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
],weights=[4.0,3.0,5.0, 4.0])

In [None]:
submission3= prepare_submission(voting_model, new_test)

In [72]:
submission3

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1


## Final submision

In [73]:
sub1 = submission1
sub2 = submission2
sub3 = submission3

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii']
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

sum_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})
sum_submission.to_csv('submission.csv', index=False)

In [74]:
sum_submission

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1
