# My solution

Train three models and combine the predictions of the three models as the final prediction:

1. Model 1

    + Extracting features from time series

        + skew, kurtosis and statistical information (max, min, std, mean, ...)

        + Calculation of statistical information by time period 
    
    + Encode the features from time series

    + Process outliers in train.csv and test.csv

    + Use KNN and MICE imputation to fill the NaN values in every column except `sii` (the target `sii` is never imputed)

    + Train an ensemble model (LGBM, XGB, HistGradientBoosting, CatBoost, TabNet) with `RepeatedKFold` (training results are more robust)

2. Model 2 

    + Extracting features from time series

        + skew, kurtosis and statistical information (max, min, std, mean, ...)

    + Use `SimpleImputer` to fill the NaN values

    + Train an ensemble model (LGBM, XGB, GradientBoosting, CatBoost, RandomForest) with `RepeatedKFold` (training results are more robust)

3. Model 3

    + Semi-supervised: first train on the rows whose label is not missing, use that model to assign pseudo-labels to the unlabeled rows, and then train again on the pseudo-labeled data with an ensemble model (LGBM, XGB, CatBoost) using `RepeatedKFold` (training results are more robust)


Using `RepeatedKFold` can get more stable training results.

In [None]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import os
import re

from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks

from sklearn.decomposition import PCA
from sklearn.base import clone,BaseEstimator, RegressorMixin
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score
from sklearn.semi_supervised import LabelPropagation,LabelSpreading
from sklearn.model_selection import train_test_split,StratifiedKFold,RepeatedKFold
from sklearn.preprocessing import LabelEncoder,StandardScaler,RobustScaler,MinMaxScaler
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
from sklearn.ensemble import HistGradientBoostingRegressor,VotingRegressor,AdaBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


from pytorch_tabnet.callbacks import Callback
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


import matplotlib.pyplot as plt

from colorama import Fore, Style
from IPython.display import clear_output
import warnings


warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
import optuna


from keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from tensorflow.keras.regularizers import l1



In [None]:
import random


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, TensorFlow and PyTorch (CPU and
    CUDA) and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # NOTE: hash randomisation of the running interpreter is decided at
    # start-up, so this only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick convolution algorithms by timing them,
    # which can select a different algorithm from run to run; it has to be
    # disabled for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False


seed_everything(2024)

# Process time series file

## Extract daily feature

In [None]:
def process_file(filename, dirname):
    """Turn one participant's accelerometer recording into a single feature row.

    Reads ``<dirname>/<filename>/part-0.parquet``, derives a few helper signals
    and returns a one-row DataFrame containing:

    * ``stat_<i>`` columns: the ``describe()`` statistics (without the count
      row), followed by skewness and kurtosis of selected columns;
    * per-day aggregates, split into workday/weekend and daytime/night-time
      segments, averaged over all recorded days;
    * ``id``: the part of ``filename`` after the ``=`` sign.

    Args:
        filename: Name of the participant directory; it must contain ``=``
            (presumably of the form ``id=<participant id>`` - confirm against
            the dataset layout).
        dirname: Directory that contains the participant directories.

    Returns:
        pandas.DataFrame with exactly one row.
    """
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    df['battery_voltage_percent'] = df['battery_voltage'] / 5000
    # presumably time_of_day is stored in nanoseconds since midnight, so this
    # converts it to seconds - TODO confirm against the data description
    df['time_of_day'] = df['time_of_day'] / 1000000000
    df['hour'] = pd.to_datetime(df['time_of_day'], unit='s').dt.hour
    
    # Magnitude of the acceleration vector and absolute arm angle.
    df['total_accel'] = np.sqrt(df['X']**2 + df['Y']**2 + df['Z']**2)
    df['abs_anglez'] = np.abs(df['anglez'])
    
    # A "spike" is a sample-to-sample change larger than the mean change of
    # the whole recording.
    df['diff_total_accel'] = df['total_accel'].diff().abs()
    df['diff_light'] = df['light'].diff().abs()
    df['light_spike'] = df['diff_light'] > df['diff_light'].mean()
    df['acc_spike'] = df['diff_total_accel'] > df['diff_total_accel'].mean()
    
    # Columns summarised with describe() ...
    select_ts_col = ['X','Y','Z','enmo','anglez','non-wear_flag','light','battery_voltage_percent',
     'weekday','quarter','relative_date_PCIAT','hour','total_accel','abs_anglez']
    # ... and the subset for which skewness/kurtosis are computed.
    select_ts_col_0 = ['X','Y','Z','enmo','anglez','light','battery_voltage_percent',
     'weekday','hour','total_accel','abs_anglez']
    
    skew_info = skew(df[select_ts_col_0])
    kurtosis_info = kurtosis(df[select_ts_col_0])
    # describe() is flattened row by row; skipping the first
    # len(select_ts_col) values removes its first row (the counts).
    stats_info = df[select_ts_col].describe().values.reshape(-1)[len(select_ts_col):]
    extract_info =  np.concatenate((stats_info, skew_info, kurtosis_info))
    
    # The position in extract_info defines the stat_<i> column name, so the
    # order of the concatenation above must not change.
    stats_feature = pd.DataFrame(extract_info.reshape(1, -1), columns=[f"stat_{i}" for i in range(len(extract_info))])
    
    # Aggregations applied to each day of the daytime segment.
    awake_agg_functions = {
            'enmo': ['mean', 'std', 'skew', ('kurtosis', lambda x: kurtosis(x, nan_policy='omit'))],
            'total_accel': ['mean', 'std',  'skew', ('kurtosis', lambda x: kurtosis(x, nan_policy='omit'))],
            'abs_anglez': ['mean', 'std', 'max', 'min', 'skew', ('kurtosis', lambda x: kurtosis(x, nan_policy='omit'))],
            'light': ['mean', 'std'],
            'battery_voltage_percent': ['mean', 'std'],
            'acc_spike': ['sum'],
        }
    
    # Aggregations applied to each day of the night-time segment.
    sleep_agg_functions = {
            'enmo': ['mean', 'std', 'skew', ('kurtosis', lambda x: kurtosis(x, nan_policy='omit'))],
            'total_accel': ['mean', 'std',  'skew', ('kurtosis', lambda x: kurtosis(x, nan_policy='omit'))],
            'abs_anglez': ['mean', 'std', 'max', 'min', 'skew', ('kurtosis', lambda x: kurtosis(x, nan_policy='omit'))],
            'light': ['mean', 'std','max'],
            'light_spike': ['sum'],
            'acc_spike': ['sum'],
        }
    # Daytime segment: hours 7..22 (both inclusive).
    act_df = df[df['hour'].between(7, 22)]
    
    # Night-time segment: hours 23 and 0..6.
    sleep_df = df[(df['hour'].between(0, 6)) | (df['hour']==23)]
    
    # weekday 1..5 is treated as workday, 6..7 as weekend.
    workday_act = act_df[act_df['weekday'].between(1, 5)]
    weekend_act = act_df[act_df['weekday'].between(6, 7)]
    workday_sleep = sleep_df[sleep_df['weekday'].between(1, 5)]
    weekend_sleep = sleep_df[sleep_df['weekday'].between(6, 7)]
    
    # One row per recorded day (relative_date_PCIAT) and segment.
    workday_act_features = workday_act.groupby('relative_date_PCIAT').agg(awake_agg_functions)
    weekend_act_features = weekend_act.groupby('relative_date_PCIAT').agg(awake_agg_functions)
    workday_sleep_features = workday_sleep.groupby('relative_date_PCIAT').agg(sleep_agg_functions)
    weekend_sleep_features = weekend_sleep.groupby('relative_date_PCIAT').agg(sleep_agg_functions)
    
    
    # rename columns: flatten the (column, aggregation) pairs and prefix them
    # with the segment name
    workday_act_features.columns = ['work_daily_' + '_'.join(col) for col in workday_act_features.columns.values]
    weekend_act_features.columns = ['wekd_daily_' + '_'.join(col) for col in weekend_act_features.columns.values]
    workday_sleep_features.columns = ['work_sleep_' + '_'.join(col) for col in workday_sleep_features.columns.values]
    weekend_sleep_features.columns = ['wekd_sleep_' + '_'.join(col) for col in weekend_sleep_features.columns.values]
    
    
    
    # NOTE(review): peaks are searched in time_of_day itself, not in a motion
    # signal; presumably this locates the day roll-overs so the intervals are
    # the number of samples between them - confirm this is intended.
    peaks, _ = find_peaks(df['time_of_day'], height=0)
    peak_intervals = np.diff(peaks)
    
    # The same scalar is written to every row of the workday/daytime frame, so
    # it is carried through the mean() below.
    workday_act_features['peak_interval_mean'] = np.mean(peak_intervals) if len(peak_intervals) > 0 else 0
    workday_act_features['peak_interval_std'] = np.std(peak_intervals) if len(peak_intervals) > 0 else 0
    
    
    # Average the per-day rows of each segment and lay them out as one row.
    daily_feature = pd.concat([workday_act_features.mean(), weekend_act_features.mean(),workday_sleep_features.mean(),weekend_sleep_features.mean()]).to_frame().T
    
    result = stats_feature.join(daily_feature)
    
    result['id'] = filename.split('=')[1]
    return result


def load_time_series(dirname) -> pd.DataFrame:
    """Build the time-series feature table for every entry of ``dirname``.

    Each directory entry is handed to ``process_file`` on a thread pool; the
    resulting one-row frames are stacked in directory-listing order.

    Args:
        dirname: Directory whose entries are the per-participant folders.

    Returns:
        DataFrame with one row per entry and a fresh 0..n-1 index.
    """
    entries = os.listdir(dirname)

    def _extract(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        progress = tqdm(pool.map(_extract, entries), total=len(entries))
        frames = [frame for frame in progress]

    return pd.concat(frames, ignore_index=True)

In [None]:
%%time
# Global settings used by the cross-validation code further down.
SEED = 42
n_splits = 5
n_repeats = 5
# Load datasets
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# One feature row per participant directory (see load_time_series).
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")
# Fill missing feature values with 0
train_ts = train_ts.fillna(0)
test_ts = test_ts.fillna(0)

# Cache the extracted features in the working directory (the index is
# written as the first CSV column).
train_ts.to_csv('train_ts.csv')
test_ts.to_csv('test_ts.csv')
# Keep a copy of the loaded time series restricted to the stat_* columns
# and the id column
train_ts_copy = train_ts.filter(regex='^stat_|^id')
test_ts_copy = test_ts.filter(regex='^stat_|^id')

In [None]:
print(f'train.shape:{train.shape}')

print(f'train_ts.shape:{train_ts.shape}')

print(f'test.shape:{test.shape}')

print(f'test_ts.shape:{test_ts.shape}')

## Encode daily feature 

In [None]:
# Feature selection based on variance threshold: drop every time-series
# feature whose variance on the training set is not above 0.05.
feature_names = train_ts.drop(['id'], axis=1).columns
selector = VarianceThreshold(threshold=0.05)
train_ts_filtered = selector.fit_transform(train_ts.drop(['id'], axis=1))
train_ts_filtered = pd.DataFrame(train_ts_filtered, columns=feature_names[selector.get_support()])

# Intermediate selection only; it is recomputed after the correlation filter.
test_ts_filtered = test_ts[train_ts_filtered.columns]

# calculate correlation matrix (training data only)
correlation_matrix = train_ts_filtered.corr()
correlation_threshold = 0.85  


to_drop = set() 

# Walk the lower triangle: whenever column i is strongly correlated with an
# earlier column j, column i (the later one) is marked for removal.
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > correlation_threshold:
            colname = correlation_matrix.columns[i]
            to_drop.add(colname)

# Apply the same column selection to train and test.
train_ts_filtered = train_ts_filtered.drop(to_drop, axis=1)
test_ts_filtered = test_ts[train_ts_filtered.columns]

In [None]:
train_ts_filtered

In [None]:
non_stat_columns = [col for col in train_ts_filtered.columns if not col.startswith("stat_")]
stat_columns = [col for col in train_ts_filtered.columns if col.startswith("stat_")]
train_ts_filtered[stat_columns]

In [None]:
import keras.backend as K

In [None]:
def build_autoencoder(input_dim, encoding_dim, output_activation='sigmoid'):
    """Create a single-hidden-layer autoencoder and its encoder half.

    Args:
        input_dim: Number of input (and reconstructed) features.
        encoding_dim: Width of the bottleneck layer.
        output_activation: Activation of the reconstruction layer. The default
            ``'sigmoid'`` keeps the previous behaviour, but it limits the
            reconstruction to (0, 1); pass ``'linear'`` when the inputs are
            standardized and therefore unbounded.

    Returns:
        Tuple ``(autoencoder, encoder)``. Both models share the same layers,
        so fitting ``autoencoder`` also trains ``encoder``. ``autoencoder`` is
        compiled with Adam and mean squared error.
    """
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation=output_activation)(encoded)

    autoencoder = Model(inputs=input_layer, outputs=decoded)
    encoder = Model(inputs=input_layer, outputs=encoded)

    autoencoder.compile(optimizer=Adam(), loss='mse')

    return autoencoder, encoder

def perform_autoencoder(df_train, df_test=None, encoding_dim=50, epochs=50, batch_size=32):
    """Compress features with an autoencoder fitted on the training data.

    The training features are standardized, an autoencoder is trained to
    reconstruct them, and the bottleneck activations are returned as columns
    ``Enc_1`` .. ``Enc_<encoding_dim>``. When ``df_test`` is given it is
    scaled with the training statistics and encoded with the same encoder.

    Args:
        df_train: Training features.
        df_test: Optional test features with the same columns.
        encoding_dim: Size of the encoded representation.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.

    Returns:
        The encoded training frame, or ``(encoded_train, encoded_test)`` when
        ``df_test`` is provided.
    """
    def _as_frame(codes):
        names = [f'Enc_{k+1}' for k in range(codes.shape[1])]
        return pd.DataFrame(codes, columns=names)

    standardizer = StandardScaler()
    train_matrix = standardizer.fit_transform(df_train)

    autoencoder, encoder = build_autoencoder(train_matrix.shape[1], encoding_dim)
    autoencoder.fit(train_matrix, train_matrix, epochs=epochs, batch_size=batch_size, shuffle=True, verbose=1)

    train_codes = _as_frame(encoder.predict(train_matrix))
    if df_test is None:
        return train_codes

    test_matrix = standardizer.transform(df_test)
    test_codes = _as_frame(encoder.predict(test_matrix))
    return train_codes, test_codes


# encoded_train_ts, encoded_test_ts = perform_autoencoder(train_ts.drop(['id'], axis=1), test_ts.drop(['id'], axis=1), encoding_dim=60, epochs=20, batch_size=32)
encoded_train_ts, encoded_test_ts = perform_autoencoder(train_ts_filtered[stat_columns], test_ts_filtered[stat_columns], encoding_dim=20, epochs=81, batch_size=32)


In [None]:
encoded_train_ts = pd.concat([encoded_train_ts, train_ts_filtered[non_stat_columns]], axis=1)

encoded_test_ts = pd.concat([encoded_test_ts, test_ts_filtered[non_stat_columns]], axis=1)

In [None]:
encoded_train_ts['id'] = train_ts['id']
encoded_test_ts['id'] = test_ts['id']

# Process train/test

## Process outliers in train

In [None]:
for col in train.select_dtypes(include=[np.number]).columns:  
    negative_count = (train[col] < 0).sum() 
    if negative_count > 0:
        print(f"Column name: {col}, Number of negative numbers: {negative_count}")

In [None]:
def plot_boxplots(df, columns):
    """Draw one box plot per column, side by side.

    Args:
        df: DataFrame holding the data.
        columns: Sequence of column names to plot; missing values are dropped
            per column. Nothing is drawn for an empty sequence.
    """
    if len(columns) == 0:
        return

    # squeeze=False always yields a 2-D array of axes, so a single column
    # works the same way as several.
    fig, axes = plt.subplots(1, len(columns), figsize=(15, 5), squeeze=False)

    for ax, col in zip(axes[0], columns):
        ax.boxplot(df[col].dropna())
        ax.set_title(col)
        ax.set_ylabel('Value')

    plt.tight_layout()
    plt.show()

plot_boxplots(train, ['CGAS-CGAS_Score', 'BIA-BIA_BMR', 'BIA-BIA_TBW'])

The columns ‘CGAS-CGAS_Score’, ‘BIA-BIA_BMR’, ‘BIA-BIA_TBW’ are used as examples only, outliers exist in other columns as well.

## Process Outliers

In [None]:
processCols = ['id','Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

cat_int_col = [
    'Basic_Demos-Sex','FGC-FGC_CU_Zone','FGC-FGC_GSND_Zone','FGC-FGC_GSD_Zone','FGC-FGC_PU_Zone','FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR_Zone','FGC-FGC_TL_Zone','BIA-BIA_Activity_Level_num','BIA-BIA_Frame_num','PreInt_EduHx-computerinternet_hoursday'
]

In [None]:
def process_outliers(train_df, test_df, multiplier=1.5):
    """Replace implausible numeric values with NaN in train and test.

    For every numeric, non-categorical column (categorical integer columns
    are listed in the module-level ``cat_int_col``):

    * negative values are treated as invalid and become NaN;
    * values outside ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` become
      NaN, where the quartiles are computed on the training data only.

    The same rules are applied to the test data so that both sets are
    cleaned consistently.

    Args:
        train_df: Training frame; must contain the target column ``sii``.
        test_df: Test frame; must contain every processed numeric column.
        multiplier: IQR multiplier that controls how far from the quartiles a
            value may lie before it is discarded.

    Returns:
        Tuple ``(train_cleaned, test_cleaned)``. ``train_cleaned`` holds the
        processed numeric columns first, then the remaining training columns
        and finally ``sii``. ``test_cleaned`` is a copy of ``test_df`` with
        the processed columns cleaned. The inputs are not modified.
    """
    y = train_df['sii']
    train_df = train_df.drop(['sii'], axis=1)
    numeric_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
    # Categorical codes are not measurements; leave them untouched.
    numeric_cols = [col for col in numeric_cols if col not in cat_int_col]

    train_cleaned = train_df[numeric_cols].copy()
    # Negative measurements are invalid.
    train_cleaned[train_cleaned < 0] = np.nan

    bounds = {}

    for col in numeric_cols:
        # Quartiles are taken after the negatives were removed.
        Q1 = train_cleaned[col].quantile(0.25)
        Q3 = train_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR
        bounds[col] = (lower_bound, upper_bound)

        # replace outliers with NaN
        train_cleaned.loc[(train_cleaned[col] < lower_bound) | (train_cleaned[col] > upper_bound), col] = np.nan

    # Apply the training rules to the test data: the negative check is needed
    # explicitly because the lower bound can itself be negative.
    test_cleaned = test_df.copy()

    for col in numeric_cols:
        lower_bound, upper_bound = bounds[col]
        values = test_cleaned[col]
        invalid = (values < 0) | (values < lower_bound) | (values > upper_bound)
        test_cleaned[col] = values.mask(invalid)

    # Re-attach the columns that were not processed.
    for col in train_df.columns:
        if col not in numeric_cols:
            train_cleaned[col] = train_df[col]

    train_cleaned['sii'] = y
    return train_cleaned, test_cleaned


train, test = process_outliers(train[processCols], test, multiplier=15)

A large multiplier (15) is used so that only the extreme outliers are removed.

In [None]:
plot_boxplots(train, ['CGAS-CGAS_Score', 'BIA-BIA_BMR', 'BIA-BIA_TBW'])

## Fill the missing values in train

### KNN and MICE (No imputer on the value of sii)

In [None]:
%%time

# Every numeric column of the cleaned training frame is imputed.
# NOTE(review): this selection includes the target 'sii', so both imputers are
# fitted with the target as an input feature - confirm this is intended.
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns

# KNN
imputer_knn = KNNImputer(n_neighbors=5)
imputed_data_knn = imputer_knn.fit_transform(train[numeric_cols])
train_imputed_knn = pd.DataFrame(imputed_data_knn, columns=numeric_cols)
# Restore the original target so that missing labels stay missing.
train_imputed_knn['sii'] = train['sii']

# MICE
imputer_mice = IterativeImputer(min_value=0)  
imputed_data_mice = imputer_mice.fit_transform(train[numeric_cols])
train_imputed_mice = pd.DataFrame(imputed_data_mice, columns=numeric_cols)
# Restore the original target so that missing labels stay missing.
train_imputed_mice['sii'] = train['sii']

# Columns combined with the mode instead of the mean: the categorical
# integer columns plus the target.
int_col = cat_int_col.copy()
int_col.append('sii')
int_col

### Taking the mean for numerical data and the mode for categorical data

In [None]:
def combine_imputation_results(train_imputed_knn, train_imputed_mice, int_cols, train):
    """Merge the KNN and MICE imputations into one frame.

    Columns named in ``int_cols`` take the row-wise mode of the two
    imputations (the first mode when several exist); all other columns take
    their mean. Columns of ``train`` that are absent from the imputed frames
    are appended unchanged.

    Args:
        train_imputed_knn: Frame produced by the KNN imputer.
        train_imputed_mice: Frame produced by the iterative imputer; it
            defines the index and column order of the result.
        int_cols: Names of the columns combined by mode.
        train: Original frame supplying the non-imputed columns.

    Returns:
        DataFrame with the combined columns followed by the extra columns.
    """
    knn_values = train_imputed_knn.values
    mice_values = train_imputed_mice.values
    combined = np.empty_like(knn_values)

    for pos, name in enumerate(train_imputed_mice.columns):
        pair = [knn_values[:, pos], mice_values[:, pos]]
        if name in int_cols:
            row_modes = pd.DataFrame(np.stack(pair, axis=1)).mode(axis=1)
            combined[:, pos] = row_modes[0].values
        else:
            combined[:, pos] = np.mean(pair, axis=0)

    result = pd.DataFrame(combined,
                          index=train_imputed_mice.index,
                          columns=train_imputed_mice.columns)

    for name in train.columns:
        if name in result.columns:
            continue
        result[name] = train[name]

    return result


In [None]:
train_imputed = combine_imputation_results(train_imputed_knn, train_imputed_mice, int_col,train)
train_imputed

In [None]:
train['sii'].value_counts()

In [None]:
train_imputed['sii'].value_counts()

In [None]:
train.shape

# Fill the missing values in test

In [None]:
# The imputers were fitted on numeric_cols, which contains 'sii'; the test
# set has no target, so a placeholder column is added to match that layout.
# NOTE(review): the constant 0 is treated by the imputers as an observed
# value - confirm that this does not bias the imputed features.
test['sii'] = 0
test_imputed_knn = imputer_knn.transform(test[numeric_cols])
test_imputed_knn = pd.DataFrame(test_imputed_knn, columns=numeric_cols)
test_imputed_knn['sii'] = test_imputed_knn['sii'].round().astype(int)

test_imputed_mice = imputer_mice.transform(test[numeric_cols])
test_imputed_mice = pd.DataFrame(test_imputed_mice, columns=numeric_cols)
test_imputed_mice['sii'] = test_imputed_mice['sii'].round().astype(int)


In [None]:
test_imputed = combine_imputation_results(test_imputed_knn, test_imputed_mice, int_col,test)
print(test_imputed.shape)

In [None]:
test_imputed = test_imputed.drop(['sii'],axis=1)
test_imputed

In [None]:
train_imputed = train_imputed.dropna(subset=['sii'])

In [None]:
print(train_imputed.shape)
print('==========')
print(test_imputed.shape)

## Feature Engineering on train/test

### Classification of blood pressure

according to the AAP 2017 pediatric hypertension definitions

1. https://academic.oup.com/view-large/441516841

2. https://publications.aap.org/pediatrics/article/140/3/e20171904/38358/Clinical-Practice-Guideline-for-Screening-and

In [None]:
def classify_bp(train, test=None):
    """Add a ``BP_Category`` column with a blood-pressure class per row.

    Children younger than 13 are classified with the 90th/95th percentile of
    the systolic pressure of the under-13 rows of ``train``; everyone else is
    classified with fixed thresholds. The most severe class ("Hypertensive
    urgency") is tested first so that it is not shadowed by "Stage 2".

    Categories: "Normal", "Elevated", "Stage 1", "Stage 2",
    "Hypertensive urgency", or NaN when no rule applies (for example when the
    pressure values are missing).

    Args:
        train: Frame with the columns ``Basic_Demos-Age``,
            ``Physical-Systolic_BP`` and ``Physical-Diastolic_BP``. It is
            modified in place.
        test: Optional frame with the same columns, classified with the
            thresholds derived from ``train``. Modified in place.

    Returns:
        Tuple ``(train, test)``; ``test`` is ``None`` when none was given.
    """
    under_13 = train['Basic_Demos-Age'] < 13
    if under_13.any():
        sbp_p90, sbp_p95 = np.percentile(
            train.loc[under_13, 'Physical-Systolic_BP'], [90, 95]
        )
    else:
        # No reference population: the percentile rules cannot fire.
        sbp_p90 = sbp_p95 = np.nan

    def _classify_child(sbp, dbp):
        if sbp > sbp_p95 + 30:
            return "Hypertensive urgency"
        if sbp < sbp_p90:
            return "Normal"
        if sbp < sbp_p95:
            return "Elevated"
        if sbp < sbp_p95 + 12 or (130 <= sbp < 140 and 80 <= dbp < 90):
            return "Stage 1"
        if sbp >= sbp_p95 + 12 or sbp >= 140 or dbp >= 90:
            return "Stage 2"
        return np.nan

    def _classify_adolescent(sbp, dbp):
        if sbp > 180 or dbp > 120:
            return "Hypertensive urgency"
        # Both readings must be below the limit to stay in a lower class.
        if sbp < 120 and dbp < 80:
            return "Normal"
        if sbp < 130 and dbp < 80:
            return "Elevated"
        if sbp < 140 and dbp < 90:
            return "Stage 1"
        if sbp >= 140 or dbp >= 90:
            return "Stage 2"
        return np.nan

    def _classify_frame(df):
        rows = zip(
            df['Basic_Demos-Age'],
            df['Physical-Systolic_BP'],
            df['Physical-Diastolic_BP'],
        )
        df['BP_Category'] = [
            _classify_child(sbp, dbp) if age < 13 else _classify_adolescent(sbp, dbp)
            for age, sbp, dbp in rows
        ]
        return df

    train = _classify_frame(train)

    if test is not None:
        test = _classify_frame(test)

    return train, test

In [None]:
train_imputed,test_imputed = classify_bp(train_imputed,test=test_imputed)

In [None]:
print(train_imputed['BP_Category'].value_counts())

print("==========")

print(test_imputed['BP_Category'].value_counts())

### FeatureEngineering

In [None]:
def FeatureEngineering(df):
    """Drop the season columns and add derived physiological features.

    Args:
        df: Frame with the physical, BIA, FGC, fitness, SDS, PAQ_C and
            internet-usage columns referenced below. It is not modified.

    Returns:
        Tuple ``(df, add_list)``: the new frame and the names of the added
        columns in creation order. The order is deterministic so that feature
        lists built from it are reproducible between runs.
    """
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1)
    original_columns = set(df.columns)

    df['Waist-to-Height_Ratio'] = df['Physical-Waist_Circumference'] / df['Physical-Height']
    df['Fat_Mass'] = df['Physical-Weight'] * df['BIA-BIA_Fat']
    df['Muscle_Fat_Ratio'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Lean_Body_Mass'] = df['Physical-Weight'] - df['Fat_Mass']
    df['BP-HR_Ratio'] = (df['Physical-Systolic_BP'] + df['Physical-Diastolic_BP']) / df['Physical-HeartRate']
    df['Pulse_Pressure'] = df['Physical-Systolic_BP'] - df['Physical-Diastolic_BP']
    df['Mean_Arterial_Pressure'] = (2 * df['Physical-Diastolic_BP'] + df['Physical-Systolic_BP']) / 3
    df['GS_diff'] = df['FGC-FGC_GSD'] - df['FGC-FGC_GSND']
    df['GS-SMM_Ratio'] = (df['FGC-FGC_GSD'] + df['FGC-FGC_GSND'])/2 / df['BIA-BIA_SMM']
    df['Upper_Limb_Strength'] = df['FGC-FGC_PU'] * (df['FGC-FGC_GSD'] + df['FGC-FGC_GSND']) * df['FGC-FGC_TL']
    df['Core_Strength'] = df['FGC-FGC_CU'] * df['FGC-FGC_PU'] * df['FGC-FGC_TL']
    df['Daily_Active'] = df['BIA-BIA_DEE'] / df['BIA-BIA_BMR']
    df['ECW_ICW'] = df['BIA-BIA_ECW'] / df['BIA-BIA_ICW']
    df['Endurance'] = df['Fitness_Endurance-Max_Stage'] * df['FGC-FGC_CU'] * df['FGC-FGC_PU']
    df['Fitness_Endurance-Total_Time(sec)'] = df['Fitness_Endurance-Time_Mins'] * 60 + df['Fitness_Endurance-Time_Sec']
    df['Sleep-Internet'] = df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['Sleep_Active'] = df['SDS-SDS_Total_T'] / df['PAQ_C-PAQ_C_Total']
    df['FitnessGram_Add'] = df['FGC-FGC_CU_Zone'] + df['FGC-FGC_PU_Zone'] + df['FGC-FGC_SRL_Zone'] + df['FGC-FGC_SRR_Zone'] + df['FGC-FGC_TL_Zone']

    # A set difference would return the names in hash order, which differs
    # between interpreter runs; keep the column order instead.
    add_list = [col for col in df.columns if col not in original_columns]
    return df, add_list

In [None]:
train_imputed, FE_cols = FeatureEngineering(train_imputed)
test_imputed, _ = FeatureEngineering(test_imputed)

In [None]:
train_imputed.sum().tail(60)

In [None]:
print(train_imputed.shape)
print('==========')
print(test_imputed.shape)
print('==========')
print(FE_cols)

In [None]:
print(encoded_train_ts.shape)
print('==========')
print(encoded_test_ts.shape)

In [None]:
# Left joins keep exactly the rows of the tabular frames, in their original
# order. An outer join would sort the rows by id and add rows for ids that
# only exist in the time-series data: in train that re-introduces rows
# without a label, in test it breaks the row alignment with sample['id'].
train = pd.merge(train_imputed, encoded_train_ts, how="left", on='id')
test = pd.merge(test_imputed, encoded_test_ts, how="left", on='id')
time_series_cols = encoded_train_ts.columns.tolist()
time_series_cols.remove("id")

# The id is only needed for joining; it is not a feature.
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [None]:
print(train.shape)
print('==========')
print(test.shape)

In [None]:
featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'sii','BP_Category'
               ]

featuresCols += FE_cols
featuresCols += time_series_cols

train = train[featuresCols]
y = train['sii']
cat_cols=['BP_Category']

# LabelEncode

In [None]:
%%time
import category_encoders as ce

def preprocess_cat_data(df_train, cat_cols, df_test=None):
    """Ordinal-encode the categorical columns.

    Missing categories are replaced by the string ``'MissingValue'`` (this
    modifies the passed frames in place), then an ordinal encoder is fitted
    on the training frame and applied to both frames.

    Args:
        df_train: Training frame.
        cat_cols: Names of the categorical columns to encode.
        df_test: Optional test frame with the same categorical columns.

    Returns:
        Tuple ``(df_train, df_test)`` with encoded columns; ``df_test`` is
        ``None`` when no test frame was given.
    """
    for col in cat_cols:
        df_train[col] = df_train[col].fillna('MissingValue')
        if df_test is not None:
            df_test[col] = df_test[col].fillna('MissingValue')

    enc = ce.OrdinalEncoder(cols=cat_cols)

    df_train = enc.fit_transform(df_train)

    if df_test is not None:
        df_test = enc.transform(df_test)

    return df_train, df_test


In [None]:
train,test = preprocess_cat_data(train.drop(['sii'],axis=1), cat_cols, df_test=test)
train['sii'] = y

In [None]:
print(train.shape)
print(test.shape)

In [None]:

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between the two labelings with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0..3 using three cut points.

    A value below ``thresholds[0]`` becomes 0, otherwise below
    ``thresholds[1]`` becomes 1, otherwise below ``thresholds[2]`` becomes 2,
    and everything else becomes 3.
    """
    labels = 3
    # Apply the cuts from the highest to the lowest so that a lower class
    # overrides a higher one, exactly like a nested if/elif chain.
    for cls in (2, 1, 0):
        labels = np.where(oof_non_rounded < thresholds[cls], cls, labels)
    return labels

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: negative QWK of the thresholded predictions."""
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, labels)
    return -score

def TrainML(model_class, test_data):
    """Cross-validate a regressor, tune the class thresholds and predict test.

    A fresh clone of ``model_class`` is fitted on every fold of a repeated
    K-fold split of the module-level ``train`` frame. Out-of-fold predictions
    are averaged over the repeats and used to tune the three thresholds that
    map the continuous output to the classes 0..3; the tuned thresholds are
    then applied to the test predictions averaged over all fold models.

    Relies on the module-level ``train``, ``sample``, ``n_splits``,
    ``n_repeats`` and ``SEED``.

    Args:
        model_class: Unfitted estimator following the scikit-learn API.
        test_data: Test features with the same columns as ``train`` without
            ``sii``.

    Returns:
        Tuple ``(model, submission)``: the model fitted on the last fold and a
        DataFrame with the columns ``id`` and ``sii``.

    Raises:
        AssertionError: If the threshold optimisation does not converge.
    """
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    n_folds = n_splits * n_repeats
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_folds))

    for fold, (train_idx, test_idx) in enumerate(tqdm(cv.split(X, y), desc="Training Folds", total=n_folds)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Every sample is validated once per repeat; average the repeats
        # instead of keeping only the prediction of the last one.
        oof_non_rounded[test_idx] += y_val_pred / n_repeats
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return model, submission

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor


class TabNetWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn compatible wrapper around ``TabNetRegressor``.

    Adds median imputation of missing values and an internal 80/20
    train/validation split that drives early stopping.

    All keyword arguments are forwarded to ``TabNetRegressor``. Because
    ``__init__`` only takes ``**kwargs``, the default parameter introspection
    of ``BaseEstimator`` finds no parameters; ``get_params``/``set_params``
    are overridden so that ``sklearn.base.clone`` (used by cross-validation
    loops and ensemble estimators) keeps the hyperparameters.
    """

    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'

    def get_params(self, deep=True):
        """Return the keyword arguments this wrapper was constructed with."""
        # Shallow copy: clone() checks that the values are the very objects
        # that were passed to the constructor.
        return dict(self.kwargs)

    def set_params(self, **params):
        """Update the TabNet keyword arguments and rebuild the model."""
        if not params:
            return self
        self.kwargs.update(params)
        self.model = TabNetRegressor(**self.kwargs)
        return self

    def fit(self, X, y):
        """Impute ``X``, then train TabNet with early stopping.

        Args:
            X: Feature matrix; missing values are allowed.
            y: Target values (array-like or pandas object).

        Returns:
            self
        """
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)

        if hasattr(y, 'values'):
            y = y.values

        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed,
            y,
            test_size=0.2,
            random_state=42
        )

        # Train TabNet model
        self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )

        # Load the best model
        # NOTE(review): save_model() may append an archive extension to the
        # given path, in which case this check never succeeds - confirm
        # against the installed pytorch_tabnet version.
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file

        return self

    def predict(self, X):
        """Impute ``X`` with the fitted medians and return 1-D predictions."""
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()

    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        from copy import deepcopy

        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result


class TabNetPretrainedModelCheckpoint(Callback):
    """Training callback that checkpoints the trainer through ``save_model``.

    Parameters
    ----------
    filepath : str
        Target handed to the trainer's ``save_model``.
    monitor : str
        Key looked up in the per-epoch ``logs`` dict.
    mode : {'min', 'max'}
        Whether a lower or a higher monitored value counts as better.
    save_best_only : bool
        If true, save only when the monitored value improves. If false, save
        after every epoch in which the monitored value is present.
    verbose : bool or int
        Truthy values print a line on every improvement.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'min'`` nor ``'max'``.
    """

    def __init__(self, filepath, monitor='val_loss', mode='min',
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        # An unknown mode used to disable the callback silently, because
        # neither comparison branch could ever fire.
        if mode not in ('min', 'max'):
            raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')

    def on_train_begin(self, logs=None):
        # NOTE(review): ``self.trainer`` is not set in this class; presumably the
        # training framework attaches it before training starts - confirm.
        self.model = self.trainer  # Use trainer itself as model

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        # Check if current metric is better than best
        if self.mode == 'min':
            improved = current < self.best
        else:
            improved = current > self.best

        if improved:
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current

        # With save_best_only=False the original never saved anything; now a
        # checkpoint is written every epoch in that case.
        if improved or not self.save_best_only:
            self.model.save_model(self.filepath)  # Save the entire model

In [None]:
# LightGBM hyperparameters (Model 1)
LGBM_Params = {
    'learning_rate': 0.035,
    'max_depth': 10,
    'num_leaves': 400,
    'min_data_in_leaf': 10,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'lambda_l1': 5,
    'lambda_l2': 5,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
    'random_state': SEED,
}

# XGBoost hyperparameters (Model 1)
XGB_Params = {
    'learning_rate': 0.04,
    'max_depth': 12,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': SEED,
    # This dict used to list 'tree_method' twice. Only the last entry of a
    # duplicated key survives, so 'exact' was always the effective value; the
    # dead first entry (whose 'cpu' fallback is not a valid tree method) is gone.
    'tree_method': 'exact',
}

# HistGradientBoosting hyperparameters (Model 1)
HGB_Params = {
    'learning_rate': 0.025,
    'max_iter': 300,
    'max_leaf_nodes': 15,
    'min_samples_leaf': 10,
    'l2_regularization': 5,
    'random_state': SEED,
}

# CatBoost hyperparameters (Model 1)
CatBoost_Params = {
    'learning_rate': 0.025,
    'depth': 7,
    'iterations': 600,
    'l2_leaf_reg': 7,
    'random_seed': SEED,
    'cat_features': cat_cols,
    'subsample': 0.7,
    'random_strength': 1.7,
    'bagging_temperature': 0.03,
    'border_count': 12
}

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.35,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-5,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 10,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# Instantiate the five base learners of the Model 1 ensemble.
LGBM_Model = LGBMRegressor(**LGBM_Params, verbose=-1, n_estimators=250)

# XGBoost has no 'verbose' constructor option (that is LightGBM's spelling);
# its logging level is controlled by 'verbosity', where 0 means silent.
XGB_Model = XGBRegressor(**XGB_Params, verbosity=0)

HGB_Model = HistGradientBoostingRegressor(**HGB_Params)

CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

TabNet_Model = TabNetWrapper(**TabNet_Params)

In [None]:
# Give test the same column layout as train, then discard the target column
test = test.reindex(columns=list(train.columns)).drop(columns=['sii'])
test

In [None]:
%%time
# Blend the five Model 1 regressors with equal weights
model1_estimators = [
    ('lightgbm', LGBM_Model),
    ('xgboost', XGB_Model),
    ('HGB', HGB_Model),
    ('catboost', CatBoost_Model),
    ('TabNet_Model', TabNet_Model),
]
voting_model = VotingRegressor(
    estimators=model1_estimators,
    weights=[1.0] * len(model1_estimators),
)

vote_model, vote_Submission = TrainML(voting_model, test)

In [None]:
# Show the Model 1 submission frame as the cell output
vote_Submission

# Model 2

In [None]:
def extract_statistic(filename, dirname):
    """Summarise one time-series partition.

    Reads ``<dirname>/<filename>/part-0.parquet``, drops the 'step' column and
    returns the flattened ``describe()`` table together with the second
    '='-separated field of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    frame = pd.read_parquet(parquet_path).drop(columns='step')
    summary = frame.describe().values.reshape(-1)
    series_id = filename.split('=')[1]
    return summary, series_id

def load_ts(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry found in ``dirname``.

    Each entry is processed by ``extract_statistic`` on a thread pool. The
    result has columns ``stat_0 .. stat_{k-1}`` plus an 'id' column.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return extract_statistic(entry, dirname)

    with ThreadPoolExecutor() as pool:
        collected = list(tqdm(pool.map(_summarise, entries), total=len(entries)))

    stat_rows, id_values = zip(*collected)

    column_names = [f"stat_{pos}" for pos in range(len(stat_rows[0]))]
    table = pd.DataFrame(stat_rows, columns=column_names)
    table['id'] = id_values
    return table
# Tabular inputs for Model 2
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Per-series summary statistics
train_ts = load_ts("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_ts("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# Every time-series column except the join key
time_series_cols = list(train_ts.columns)
time_series_cols.remove("id")

# Attach the statistics by id, then discard the key
train = pd.merge(train, train_ts, how="left", on='id').drop(columns='id')
test = pd.merge(test, test_ts, how="left", on='id').drop(columns='id')

# Columns kept for Model 2 (order matters: it becomes the column order of train)
featuresCols = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
    'CGAS-Season', 'CGAS-CGAS_Score',
    'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
    'Physical-HeartRate', 'Physical-Systolic_BP',
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU', 'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW',
    'PAQ_A-Season', 'PAQ_A-PAQ_A_Total',
    'PAQ_C-Season', 'PAQ_C-PAQ_C_Total',
    'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday',
    'sii',
]
featuresCols.extend(time_series_cols)

# Keep the selected columns and only the rows that have a target
train = train[featuresCols].dropna(subset='sii')

# Season columns treated as categorical in Model 2
cat_c = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]


def update(df):
    """Fill gaps in every ``cat_c`` column with 'Missing' and cast it to category.

    The frame is modified in place and also returned.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df


train = update(train)
test = update(test)

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to its order of first appearance."""
    distinct = dataset[column].unique()
    return {value: code for code, value in enumerate(distinct)}

# Integer-encode each categorical column with ONE mapping shared by train and
# test. Previously test got its own mapping built from its own order of
# appearance, so the same category could get different codes in the two frames.
for col in cat_c:
    mapping = create_mapping(col, train)
    # Categories that occur only in test get fresh codes after the train ones;
    # the codes assigned to train values are unchanged.
    for value in test[col].unique():
        mapping.setdefault(value, len(mapping))

    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mapping).astype(int)

# Model 2 ensemble: each regressor sits behind a median imputer in its own pipeline
imputer = SimpleImputer(strategy='median')

base_regressors = [
    ('lgb', LGBMRegressor(random_state=SEED)),
    ('xgb', XGBRegressor(random_state=SEED)),
    ('cat', CatBoostRegressor(random_state=SEED, silent=True)),
    ('gb', GradientBoostingRegressor(random_state=SEED)),
    ('rf', RandomForestRegressor(random_state=SEED)),
]

voting_model = VotingRegressor(estimators=[
    (label, Pipeline(steps=[('imputer', imputer), ('regressor', regressor)]))
    for label, regressor in base_regressors
])

# Train the ensemble and keep its submission frame
vote_model_1, vote_Submission_1 = TrainML(voting_model, test)

# Show the Model 2 submission as the cell output
vote_Submission_1

# Model 3

In [None]:
def FeatureEngineeringPlus(df):
    """Drop season columns and append interaction/ratio features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Physical-*', 'Basic_Demos-Age', 'BIA-BIA_*' and
        'PreInt_EduHx-computerinternet_hoursday' columns used below. The
        caller's frame is not modified, because ``drop`` returns a new frame.

    Returns
    -------
    tuple
        ``(frame, add_list)``: the new frame, and the names of the columns
        added here in the order they were created.
    """
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1)
    old_cols = set(df.columns)

    # Interaction terms
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    # Ratios; a zero denominator yields inf/NaN, which this function does not clean up
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']

    # Preserve creation order. A set difference returns the names in
    # hash-dependent order, which made the feature order (and anything that
    # depends on column position) vary from run to run.
    add_list = [col for col in df.columns if col not in old_cols]
    return df, add_list

In [None]:
# Fresh copies of the tabular inputs for Model 3
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Use the previously stored time-series frames rather than calling load_ts again
# (train_ts_copy / test_ts_copy are defined outside this cell).
train_ts, test_ts = train_ts_copy, test_ts_copy

# Every time-series column except the join key
time_series_cols = list(train_ts.columns)
time_series_cols.remove("id")

# Attach the time-series features by id, then discard the key
train = pd.merge(train, train_ts, how="left", on='id').drop(columns='id')
test = pd.merge(test, test_ts, how="left", on='id').drop(columns='id')

# Derived features; the list of new column names is taken from the train pass
train, FE_cols = FeatureEngineeringPlus(train)
test, _ = FeatureEngineeringPlus(test)

# Turn infinities into NaN
if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)
if np.any(np.isinf(test)):
    test = test.replace([np.inf, -np.inf], np.nan)


# Columns kept for Model 3 (no season columns; order becomes train's column order)
featuresCols = [
    'Basic_Demos-Age', 'Basic_Demos-Sex',
    'CGAS-CGAS_Score',
    'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
    'Physical-HeartRate', 'Physical-Systolic_BP',
    'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU', 'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW',
    'PAQ_A-PAQ_A_Total',
    'PAQ_C-PAQ_C_Total',
    'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    'PreInt_EduHx-computerinternet_hoursday',
    'sii',
]

# Engineered features first, then the time-series statistics
featuresCols.extend(FE_cols)
featuresCols.extend(time_series_cols)

# Keep the selected columns and only the rows that have a target
train = train[featuresCols].dropna(subset='sii')

# No categorical columns in Model 3, so update() below leaves both frames as they are
cat_c = []


def update(df):
    """Fill gaps in every ``cat_c`` column with 'Missing' and cast it to category.

    The frame is modified in place and also returned.
    """
    for cat_col in cat_c:
        df[cat_col] = df[cat_col].fillna('Missing').astype('category')
    return df


train = update(train)
test = update(test)

def create_mapping(column, dataset):
    """Return ``{value: index}`` for the distinct values of ``dataset[column]``.

    Indices follow the order in which the values first appear.
    """
    codes = {}
    for position, value in enumerate(dataset[column].unique()):
        codes[value] = position
    return codes

# Integer-encode each categorical column with ONE mapping shared by train and
# test, so a category gets the same code in both frames. (Building a second,
# independent mapping from test could assign different codes to the same value.)
for col in cat_c:
    mapping = create_mapping(col, train)
    # Categories seen only in test get fresh codes after the train ones.
    for value in test[col].unique():
        mapping.setdefault(value, len(mapping))

    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mapping).astype(int)
# train,test = preprocess_cat_data(train, cat_c, df_test=test)

# Give test the same column layout as train, then discard the target column
test = test.reindex(columns=list(train.columns)).drop(columns=['sii'])
test

In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between the two label vectors with quadratic weights."""
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous predictions into classes 0-3 using three cut points.

    The first cut point a value falls strictly below decides its class; values
    below none of them (including NaN) become 3.
    """
    below_cut = [oof_non_rounded < thresholds[level] for level in range(3)]
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: the negated QWK of the thresholded predictions."""
    discretised = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discretised)
    return -kappa

def SemiTrain(model_class, test_data):
    """Cross-validate ``model_class`` with LabelSpreading pseudo-labels and build a submission.

    Reads the module-level ``train`` frame (features plus 'sii'), ``sample``,
    ``SEED``, ``n_splits`` and ``n_repeats``.

    Parameters
    ----------
    model_class : estimator
        Unfitted regressor; it is passed to ``clone`` once per fold.
    test_data : pandas.DataFrame
        Test features; transformed with the imputer fitted on the train features.

    Returns
    -------
    tuple
        ``(final_model, submission)``: the estimator fitted in the last fold, and
        a DataFrame holding ``sample['id']`` and the thresholded mean test prediction.

    Raises
    ------
    AssertionError
        If the Nelder-Mead threshold search does not report success.
    """

    
    X = train.drop(['sii'], axis=1)
    y = train['sii']
    # Dealing with missing values (NaN), replacing NaN with -1 for LabelSpreading
    # NOTE(review): the visible preparation code drops rows with missing 'sii'
    # before this is called; in that case no -1 labels exist and the
    # pseudo-label subset below is empty - confirm this is intended.
    y_filled = y.fillna(-1).astype(int)

    # Feature imputation is fitted once on all of X, before the folds are formed.
    imputer = IterativeImputer(estimator=ElasticNet(), min_value=0, # TODO:different estimator
                               max_iter=10, random_state=SEED, 
                               initial_strategy='median')
    # imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)  # fit_transform 
    # Wrap the imputed arrays back into frames so .iloc and index-aligned masks work below.
    X_imputed = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
    test_data_imputed = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns, index=test_data.index)
    
    SKF = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=SEED)

    # Per-fold QWK scores on the training and validation parts.
    train_S = []
    test_S = []
    
    # Out-of-fold predictions are written by position; a later fold that
    # validates the same rows overwrites the earlier values.
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    # One column of test predictions per fold.
    test_preds = np.zeros((len(test_data), n_splits * n_repeats))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X_imputed, y_filled), desc="Training Folds", total=n_splits* n_repeats)):
        X_train, X_val = X_imputed.iloc[train_idx], X_imputed.iloc[test_idx]
        y_train, y_val = y_filled.iloc[train_idx], y_filled.iloc[test_idx]

        # Fit LabelSpreading on the fold and take its transduced labels for the rows marked -1.
        model = LabelSpreading(kernel='rbf', gamma=12, alpha=0.2) # TODO: different kernel, gamma and alpha
        model.fit(X_train, y_train)
        pseudo_labels = model.transduction_[y_train == -1]
        X_train_pseudo = X_train[y_train == -1]
        y_train_pseudo = pd.Series(pseudo_labels, index=X_train_pseudo.index)
        
        # Merge labelled and pseudo-labelled data
        X_train_combined = pd.concat([X_train[y_train != -1], X_train_pseudo])
        y_train_combined = pd.concat([y_train[y_train != -1], y_train_pseudo])
        
        # Train a fresh copy of the supplied estimator on the combined data.
        final_model = clone(model_class)
        final_model.fit(X_train_combined, y_train_combined)                                                              
                                                                      
        y_train_pred = final_model.predict(X_train[y_train != -1])  # Evaluate training performance on labelled data only
        y_val_pred = final_model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        # QWK is computed only on rows whose label is not the -1 placeholder.
        train_kappa = quadratic_weighted_kappa(y_train[y_train != -1], y_train_pred.round(0).astype(int)) 
        val_kappa = quadratic_weighted_kappa(y_val[y_val != -1], y_val_pred_rounded[y_val != -1]) 

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = final_model.predict(test_data_imputed)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Search for the three cut points that maximise QWK on the out-of-fold
    # predictions (evaluate_predictions returns the negated score).
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y[y.notna()], oof_non_rounded[y.notna()]), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y[y.notna()], oof_tuned[y.notna()]) 

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned cut points.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    # final_model is the estimator from the last loop iteration.
    return final_model, submission

In [None]:
from sklearn.linear_model import BayesianRidge,ElasticNet

# LightGBM settings for Model 3
LGBM_Params1 = dict(
    learning_rate=0.045,
    max_depth=7,
    num_leaves=350,
    min_data_in_leaf=13,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=4,
    lambda_l1=10,  # Increased from 6.59
    lambda_l2=0.01,  # Increased from 2.68e-06
    random_state=SEED,
)

# XGBoost settings for Model 3
XGB_Params1 = dict(
    learning_rate=0.045,
    max_depth=7,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,  # Increased from 0.1
    reg_lambda=5,  # Increased from 1
    random_state=SEED,
)

# CatBoost settings for Model 3
CatBoost_Params1 = dict(
    learning_rate=0.045,
    depth=7,
    iterations=200,
    random_seed=SEED,
    cat_features=cat_c,
    verbose=0,
    l2_leaf_reg=10,  # Increase this value
)

# Base learners (XGB_Model and CatBoost_Model are rebound here)
Light = LGBMRegressor(**LGBM_Params1, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params1)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params1)

# Voting ensemble over the three learners
model3_estimators = [
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
]
ensemble = VotingRegressor(estimators=model3_estimators)

vote_model_2, vote_Submission_2 = SemiTrain(ensemble, test)

In [None]:
# Put the three submissions in the same id order with a fresh 0..n-1 index
vote_Submission, vote_Submission_1, vote_Submission_2 = (
    frame.sort_values(by='id').reset_index(drop=True)
    for frame in (vote_Submission, vote_Submission_1, vote_Submission_2)
)

# One row per id with the prediction of each model side by side
per_model_votes = {
    'id': vote_Submission['id'],
    'sii_1': vote_Submission['sii'],
    'sii_2': vote_Submission_1['sii'],
    'sii_3': vote_Submission_2['sii'],
}
combined = pd.DataFrame(per_model_votes)

def majority_vote(row):
    """Return the most frequent value in ``row``; ties go to the smallest value."""
    most_common = row.mode()
    return most_common.iloc[0]

# Row-wise majority vote across the three model predictions
vote_columns = ['sii_1', 'sii_2', 'sii_3']
combined['final_sii'] = combined[vote_columns].apply(majority_vote, axis=1)

# Keep id plus the voted label, under the name 'sii'
final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)
final_submission

In [None]:
# Show the per-model predictions next to the voted label
combined

In [None]:
# Last cell: prints a marker once everything above has run
print("finish")