In [1]:
import numpy as np
import pandas as pd
import os
from colorama import Fore, Style

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import minimize
from sklearn.ensemble import VotingRegressor

# from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm



### Mapping - Dropping features not in test sets

In [2]:
df = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv")
df = df.dropna(subset=['sii']).reset_index() # keeping labeled values only
test = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv")
season_mapping = {
    'Winter': -1,
    'Spring': -0.5,
    'Summer': 0.5,
    'Fall': 1
}
# mapping non-string values
df = df.replace(season_mapping)
test = test.replace(season_mapping)

# dropping questions not in test dataset
test_missing_columns = set(df.columns) - set(test.columns)
for col in test_missing_columns:
    if col != 'sii':  # Retain the target column for training
        df.drop(columns=col, inplace=True)
# for later use
train_ids = df['id']
test_ids = test['id']
train_labels = df['sii']



  df = df.replace(season_mapping)
  test = test.replace(season_mapping)


In [3]:
df['sii'].value_counts()

sii
0.0    1594
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

# K-Nearest neighboors missing data imputation

In [4]:
featureCols = sorted(list(set(df.columns) - set(['sii', 'id'])))
# featureCols

In [5]:
dropCols = []
for column in featureCols:
    if (df[column].isnull().sum() > 1300):
        dropCols.append(column)
dropCols
df = df.drop(dropCols, axis=1)

In [6]:
featureCols = sorted(list(set(df.columns) - set(['sii', 'id'])))
train = pd.DataFrame(df, columns=featureCols)

In [7]:
!pip install /kaggle/input/cmi-wheel/cloudpickle-3.1.0-py3-none-any.whl
!pip install /kaggle/input/cmi-wheel/geomloss-0.2.4-py3-none-any.whl
!pip install /kaggle/input/cmi-wheel/loguru-0.6.0-py3-none-any.whl
!pip install /kaggle/input/cmi-wheel/miracle_imputation-0.1.6-py3-none-any.whl
!pip install /kaggle/input/cmi-wheel/pydantic-2.10.3-py3-none-any.whl
# !pip install /kaggle/input/cmi-wheel/redis_wheel-6.2.5.1-202410081222-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/cmi-wheel/cloudpickle-3.1.0-py3-none-any.whl
cloudpickle is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/cmi-wheel/geomloss-0.2.4-py3-none-any.whl
Installing collected packages: geomloss
Successfully installed geomloss-0.2.4
Processing /kaggle/input/cmi-wheel/loguru-0.6.0-py3-none-any.whl
Installing collected packages: loguru
Successfully installed loguru-0.6.0
Processing /kaggle/input/cmi-wheel/miracle_imputation-0.1.6-py3-none-any.whl
Installing collected packages: miracle-imputation
Successfully installed miracle-imputation-0.1.6
Processing /kaggle/input/cmi-wheel/pydantic-2.10.3-py3-none-any.whl
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.10.1
    Uninstalling pydantic-2.10.1:
      Successfully uninstalled pydantic-2.10.1
[31mERROR: pip's dependency resolver does n

In [8]:
!pip install /kaggle/input/cmi-wheel-fix/pydantic-1.10.0-py3-none-any.whl 
!pip install /kaggle/input/cmi-wheel-fix/redis-5.2.1-py3-none-any.whl

Processing /kaggle/input/cmi-wheel-fix/pydantic-1.10.0-py3-none-any.whl
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.10.2
    Uninstalling pydantic-2.10.2:
      Successfully uninstalled pydantic-2.10.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 1.4.21 requires pydantic>=2.7.0, but you have pydantic 1.10.0 which is incompatible.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
ydata-profiling 4.12.0 requires pydantic>=2, but you have pydantic 1.10.0 which is incompatible.
ydata-profiling 4.12.0 requires scipy<1.14,>=1.4.1, but you have scipy 1.14.1 which is incompatible.[0m[31m
[0mSuccessfully installed pydantic-1.10.0
Processing /kaggle/input/cmi-wheel-fix/redis-5.2.1-py3-none-any.whl

In [9]:
pip install /kaggle/input/jupyter-lsp-wheel/jupyter_lsp-2.2.5-py3-none-any.whl

Processing /kaggle/input/jupyter-lsp-wheel/jupyter_lsp-2.2.5-py3-none-any.whl
Installing collected packages: jupyter-lsp
  Attempting uninstall: jupyter-lsp
    Found existing installation: jupyter-lsp 1.5.1
    Uninstalling jupyter-lsp-1.5.1:
      Successfully uninstalled jupyter-lsp-1.5.1
Successfully installed jupyter-lsp-2.2.5
Note: you may need to restart the kernel to use updated packages.


In [10]:
!pip  install /kaggle/input/cmi-wheel-addition/jupyter-1.1.1-py2.py3-none-any.whl


Processing /kaggle/input/cmi-wheel-addition/jupyter-1.1.1-py2.py3-none-any.whl
Installing collected packages: jupyter
Successfully installed jupyter-1.1.1


In [11]:
pip install /kaggle/input/jupyterlab-wheel/jupyterlab-4.3.3-py3-none-any.whl

Processing /kaggle/input/jupyterlab-wheel/jupyterlab-4.3.3-py3-none-any.whl
Installing collected packages: jupyterlab
  Attempting uninstall: jupyterlab
    Found existing installation: jupyterlab 4.3.1
    Uninstalling jupyterlab-4.3.1:
      Successfully uninstalled jupyterlab-4.3.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 2024.66.154055 requires jupyterlab~=3.6.0, but you have jupyterlab 4.3.3 which is incompatible.[0m[31m
[0mSuccessfully installed jupyterlab-4.3.3
Note: you may need to restart the kernel to use updated packages.


In [12]:
!pip  install /kaggle/input/another-hyperimputer/hyperimpute-0.1.17-py3-none-any.whl

Processing /kaggle/input/another-hyperimputer/hyperimpute-0.1.17-py3-none-any.whl
Installing collected packages: hyperimpute
Successfully installed hyperimpute-0.1.17


In [13]:
from hyperimpute.plugins.imputers import Imputers

imputers = Imputers()

imputers.list()

['miwae',
 'miracle',
 'sinkhorn',
 'hyperimpute',
 'most_frequent',
 'softimpute',
 'EM',
 'median',
 'mean',
 'mice',
 'sklearn_missforest',
 'missforest',
 'ice',
 'nop',
 'gain',
 'sklearn_ice']

In [14]:
plugin = Imputers().get(
    "hyperimpute",
    optimizer="hyperband",
    classifier_seed=["logistic_regression"],
    regression_seed=["linear_regression"],
 
)

# method = "gain"

# plugin = Imputers().get("gian", potimizer=)
data = pd.concat([train, test], axis=0, ignore_index=True)
imputed_data = plugin.fit_transform(data[featureCols])
data = pd.DataFrame(imputed_data, columns=featureCols)
data

  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_st

Unnamed: 0,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2.000000,2.668550,16.879200,932.498000,1492.000000,8.255980,41.586200,13.817700,3.061430,9.213770,...,85.453346,46.000000,1.000000,110.701934,50.800000,1.0,3.0,40.904650,57.557086,0.885058
1,2.000000,2.579490,14.037100,936.656000,1498.650000,6.019930,42.029100,12.825400,1.211720,3.970850,...,70.000000,48.000000,1.000000,122.000000,46.000000,0.5,0.0,46.000000,64.000000,1.000000
2,2.606120,6.088803,17.622621,1202.105684,2008.496903,19.656142,70.281893,14.019216,3.754320,8.202029,...,94.000000,56.500000,1.000000,117.000000,75.600000,0.5,2.0,38.000000,54.000000,1.000000
3,3.000000,3.841910,18.294300,1131.430000,1923.440000,15.592500,62.775700,14.074000,4.220330,18.824300,...,97.000000,56.000000,0.500000,117.000000,81.600000,-1.0,0.0,31.000000,45.000000,0.500000
4,2.000000,4.330360,30.186500,1330.970000,1996.450000,30.212400,84.028500,16.687700,13.498800,67.971500,...,73.000000,59.500000,0.500000,102.000000,112.200000,-0.5,0.0,40.000000,56.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,2.830187,4.485740,25.085592,1239.342400,2105.562790,20.962593,74.329481,18.466593,5.379082,41.600407,...,75.000000,54.000000,-0.500000,99.000000,121.600000,-1.0,2.0,35.000000,50.000000,-0.500000
2752,2.351483,7.993372,19.499778,1190.361196,1916.021305,18.712847,69.043553,17.071568,2.747794,-15.641453,...,76.000000,44.000000,-0.500000,109.000000,47.600000,-0.5,0.0,37.000000,53.000000,-0.500000
2753,2.378035,4.991404,19.619916,1173.370171,1917.298477,18.073940,67.248831,15.366165,4.512180,18.958704,...,81.000000,55.000000,1.000000,116.389866,85.600000,1.0,1.0,40.661338,57.336254,0.930612
2754,2.000000,2.750350,17.273800,1003.070000,1504.610000,15.145600,49.103400,14.089800,3.184070,11.096600,...,91.000000,37.500000,-1.000000,95.000000,60.200000,-1.0,3.0,39.000000,55.000000,-1.000000


In [15]:
# normal
# imputer = KNNImputer(n_neighbors=5)  # k=4
# # # test 1 0.439
# # imputed_data = imputer.fit_transform(df[featureCols])
# # train = pd.DataFrame(imputed_data, columns=featureCols)
# # # train['sii'] = df['sii']
# # test_imputed = imputer.fit_transform(test[featureCols])
# # test = pd.DataFrame(test_imputed, columns=featureCols)
# # test
# # test 2
# data = pd.concat([train, test], axis=0, ignore_index=True)
# imputed_data = imputer.fit_transform(data[featureCols])
# data = pd.DataFrame(imputed_data, columns=featureCols)
# data


In [16]:
train.shape[0]

2736

In [17]:
train = data.head(train.shape[0])
test = data.drop(train.index, axis=0).reset_index()

In [18]:
train

Unnamed: 0,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2.00000,2.668550,16.879200,932.498000,1492.000000,8.255980,41.586200,13.817700,3.061430,9.213770,...,85.453346,46.0,1.0,110.701934,50.8,1.0,3.0,40.90465,57.557086,0.885058
1,2.00000,2.579490,14.037100,936.656000,1498.650000,6.019930,42.029100,12.825400,1.211720,3.970850,...,70.000000,48.0,1.0,122.000000,46.0,0.5,0.0,46.00000,64.000000,1.000000
2,2.60612,6.088803,17.622621,1202.105684,2008.496903,19.656142,70.281893,14.019216,3.754320,8.202029,...,94.000000,56.5,1.0,117.000000,75.6,0.5,2.0,38.00000,54.000000,1.000000
3,3.00000,3.841910,18.294300,1131.430000,1923.440000,15.592500,62.775700,14.074000,4.220330,18.824300,...,97.000000,56.0,0.5,117.000000,81.6,-1.0,0.0,31.00000,45.000000,0.500000
4,2.00000,4.330360,30.186500,1330.970000,1996.450000,30.212400,84.028500,16.687700,13.498800,67.971500,...,73.000000,59.5,0.5,102.000000,112.2,-0.5,0.0,40.00000,56.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,3.00000,3.203030,17.141700,1035.270000,1759.960000,11.006300,52.533100,13.400400,3.741300,14.666900,...,65.000000,52.5,1.0,112.000000,67.2,1.0,2.0,41.00000,58.000000,1.000000
2732,1.00000,2.366800,13.645700,966.287000,1256.170000,9.988020,45.185300,13.231500,0.414263,1.414700,...,75.000000,48.5,0.5,105.000000,46.6,0.5,0.0,48.00000,67.000000,0.500000
2733,3.00000,4.522770,16.364200,1206.880000,2051.700000,19.461100,70.811700,14.062900,2.301380,11.588300,...,70.000000,59.5,1.0,104.000000,82.4,1.0,1.0,35.00000,50.000000,-1.000000
2734,2.00000,4.413050,21.443800,1253.740000,2005.990000,20.482500,75.803300,14.804300,6.639520,33.996700,...,99.000000,60.0,-1.0,116.000000,109.8,1.0,0.0,56.00000,77.000000,-1.000000


In [19]:
test

Unnamed: 0,index,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2736,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,...,85.453346,46.0,1.0,110.701934,50.8,1.0,3.0,40.90465,57.557086,0.885058
1,2737,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,...,70.0,48.0,1.0,122.0,46.0,0.5,0.0,46.0,64.0,1.0
2,2738,2.60612,6.088803,17.622621,1202.105684,2008.496903,19.656142,70.281893,14.019216,3.75432,...,94.0,56.5,1.0,117.0,75.6,0.5,2.0,38.0,54.0,1.0
3,2739,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,...,97.0,56.0,0.5,117.0,81.6,-1.0,0.0,31.0,45.0,0.5
4,2740,2.419751,4.441743,18.315752,1176.293936,1847.561733,20.981585,67.544003,13.974458,3.97717,...,78.460529,58.995661,-0.157421,117.557447,87.411126,-0.303973,1.577706,40.821754,57.546288,-0.325199
5,2741,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,...,73.0,59.5,0.5,102.0,112.2,-0.5,0.0,40.0,56.0,0.5
6,2742,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,...,83.0,55.0,1.0,163.0,84.6,1.0,3.0,27.0,40.0,-1.0
7,2743,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,...,90.0,59.25,1.0,116.0,84.2,1.0,2.0,40.614353,57.327029,0.955986
8,2744,2.524616,7.744335,18.565199,1245.760827,2023.297164,22.09175,74.944295,14.246957,4.13208,...,80.268234,57.664071,-0.5,117.356804,87.821071,0.5,2.0,40.938433,57.644582,-0.344926
9,2745,2.426176,4.560248,18.218754,1177.864458,1846.536258,21.334367,67.710099,13.875174,3.942237,...,77.592425,59.457225,0.081915,117.595601,87.40196,0.2723,1.642425,40.823658,57.552609,-0.070444


In [20]:
train['sii'] = df['sii']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['sii'] = df['sii']


In [21]:
test

Unnamed: 0,index,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2736,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,...,85.453346,46.0,1.0,110.701934,50.8,1.0,3.0,40.90465,57.557086,0.885058
1,2737,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,...,70.0,48.0,1.0,122.0,46.0,0.5,0.0,46.0,64.0,1.0
2,2738,2.60612,6.088803,17.622621,1202.105684,2008.496903,19.656142,70.281893,14.019216,3.75432,...,94.0,56.5,1.0,117.0,75.6,0.5,2.0,38.0,54.0,1.0
3,2739,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,...,97.0,56.0,0.5,117.0,81.6,-1.0,0.0,31.0,45.0,0.5
4,2740,2.419751,4.441743,18.315752,1176.293936,1847.561733,20.981585,67.544003,13.974458,3.97717,...,78.460529,58.995661,-0.157421,117.557447,87.411126,-0.303973,1.577706,40.821754,57.546288,-0.325199
5,2741,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,...,73.0,59.5,0.5,102.0,112.2,-0.5,0.0,40.0,56.0,0.5
6,2742,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,...,83.0,55.0,1.0,163.0,84.6,1.0,3.0,27.0,40.0,-1.0
7,2743,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,...,90.0,59.25,1.0,116.0,84.2,1.0,2.0,40.614353,57.327029,0.955986
8,2744,2.524616,7.744335,18.565199,1245.760827,2023.297164,22.09175,74.944295,14.246957,4.13208,...,80.268234,57.664071,-0.5,117.356804,87.821071,0.5,2.0,40.938433,57.644582,-0.344926
9,2745,2.426176,4.560248,18.218754,1177.864458,1846.536258,21.334367,67.710099,13.875174,3.942237,...,77.592425,59.457225,0.081915,117.595601,87.40196,0.2723,1.642425,40.823658,57.552609,-0.070444


# Training function using QWK metric and threshold optimization

In [22]:
def extract_features(df):
    # df["Feat_0"] = df["Physical-BMI"] * df["Basic_Demos-Age"]
    # df["Feat_1"] = df["PreInt_EduHx-computerinternet_hoursday"] * df["Basic_Demos-Age"]
    # df["Feat_2"] = df["Physical-BMI"] * df["PreInt_EduHx-computerinternet_hoursday"]
    # df["Feat_3"] = df["BIA-BIA_Fat"] / df["BIA-BIA_BMI"]
    # df["Feat_4"] = df["BIA-BIA_FFMI"] / df["BIA-BIA_Fat"]
    # df["Feat_5"] = df["BIA-BIA_FMI"] / df["BIA-BIA_Fat"]
    # df["Feat_6"] = df["BIA-BIA_LST"] / df["BIA-BIA_TBW"]
    # df["Feat_7"] = df["BIA-BIA_Fat"] * df["BIA-BIA_BMR"]
    # df["Feat_8"] = df["BIA-BIA_Fat"] * df["BIA-BIA_DEE"]
    # df["Feat_9"] = df["BIA-BIA_BMR"] / df["Physical-Weight"]
    # df["Feat_10"] = df["BIA-BIA_DEE"] / df["Physical-Weight"]
    # df["Feat_11"] = df["BIA-BIA_SMM"] / df["Physical-Height"]
    # df["Feat_12"] = df["BIA-BIA_SMM"] / df["BIA-BIA_FMI"]
    # df["Feat_13"] = df["BIA-BIA_TBW"] / df["Physical-Weight"]
    # df["Feat_14"] = df["BIA-BIA_ICW"] / df["BIA-BIA_TBW"]
    # df["Feat_15"] = df["Physical-BMI"] * df["Physical-HeartRate"]
    df["Feat_0"] = df["Physical-Height"] * df["PAQ_C-PAQ_C_Total"]
    df["Feat_1"] = df["FGC-FGC_TL_Zone"] * df["Physical-Height"]
    df["Feat_2"] = df["PreInt_EduHx-computerinternet_hoursday"] * df["BIA-BIA_Activity_Level_num"]
    # df["Feat_3"] = df["Fitness_Endurance-Time_Sec"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_4"] = df["CGAS-CGAS_Score"] / df["FGC-FGC_CU_Zone"]
    df["Feat_5"] = df["Basic_Demos-Age"] / df["FGC-FGC_SRR_Zone"]
    df["Feat_7"] = df["PAQ_C-PAQ_C_Total"] * df["BIA-BIA_Frame_num"]
    # df["Feat_9"] = df["FGC-FGC_GSD"] / df["SDS-SDS_Total_Raw"]
    # df["Feat_10"] = df["PAQ_A-PAQ_A_Total"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_11"] = df["BIA-BIA_LDM"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_14"] = df["BIA-BIA_BMI"] / df["SDS-SDS_Total_Raw"]
    df["Feat_15"] = df["Physical-Height"] * df["SDS-SDS_Total_T"]
    df["Feat_16"] = df["Physical-Height"] * df["Physical-Height"]
    df["Feat_17"] = df["FGC-FGC_SRL_Zone"] / df["Physical-Weight"]
    df["Feat_18"] = df["Basic_Demos-Sex"] * df["Basic_Demos-Sex"]
    # df["Feat_19"] = df["FGC-FGC_GSND_Zone"] / df["BIA-BIA_Fat"]

    return df

train = extract_features(train)
test = extract_features(test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Feat_0"] = df["Physical-Height"] * df["PAQ_C-PAQ_C_Total"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Feat_1"] = df["FGC-FGC_TL_Zone"] * df["Physical-Height"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Feat_2"] = df["PreInt_EduHx-computerinternet_hoursday"] * df["BIA-BIA_Activit

In [23]:
# qwk score
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# threshold rounder
def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))
# prediction evaluation using qwk function
def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, train, test_data, featureCols) -> list[float]:
    X = train[featureCols]
    y = train['sii']

    # Identify rows with NaN values in X

    # Fill NaN and infinite values in X and test_data
    X = X.fillna(0)
    X = X.replace([np.inf, -np.inf], 0)
    # test_data = test_data.fillna(0)
    test_data = test_data.replace([np.inf, -np.inf], 0)

    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), columns=X.columns)
    test_data = test_data[featureCols]
    test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
    # nan_mask = test_data.isnull().any(axis=1)
    # nan_indices = nan_mask[nan_mask].index
    # print(test_data)
    # print(nan_indices)
    n_splits = 5
    random_state = 42
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        test_preds[:, fold] = model.predict(test_data)

    KappaOptimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOptimizer.success, "Optimization did not converge."
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)

    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOptimizer.x)

    # Inject NaN predictions for rows with NaN in the original train
    predictions = np.array(tp_rounded.tolist())
    print(predictions)
    # for idx in nan_indices:
    #     predictions[idx] = np.nan

    return predictions.tolist()


# Parameters

In [24]:
LGBM_params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  
    'lambda_l2': 0.01
}

XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 400,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  
    'reg_lambda': 5,  
    'random_state': 42,
    'tree_method': 'exact'
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 400,
    'random_seed': 42,
    'verbose': 0,
    'l2_leaf_reg': 10  
}




In [25]:
# !pip install --no-index --no-deps /kaggle/input/pytorchtransformer/tab_transformer_pytorch-0.3.0-py3-none-any.whl


In [26]:
# from pytorch_tabnet.tab_model import TabNetRegressor
# import torch
# from sklearn.base import BaseEstimator, RegressorMixin
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import train_test_split
# from pytorch_tabnet.callbacks import Callback
# import os
# import torch
# from pytorch_tabnet.callbacks import Callback

# class TabNetWrapper(BaseEstimator, RegressorMixin):
#     def __init__(self, **kwargs):
#         self.model = TabNetRegressor(**kwargs)
#         self.kwargs = kwargs
#         self.imputer = SimpleImputer(strategy='median')
#         self.best_model_path = 'best_tabnet_model.pt'
        
#     def fit(self, X, y):
#         # Handle missing values
#         X_imputed = self.imputer.fit_transform(X)
        
#         if hasattr(y, 'values'):
#             y = y.values
            
#         # Create internal validation set
#         X_train, X_valid, y_train, y_valid = train_test_split(
#             X_imputed, 
#             y, 
#             test_size=0.2,
#             random_state=42
#         )
                
#         # Train TabNet model
#         history = self.model.fit(
#             X_train=X_train,
#             y_train=y_train.reshape(-1, 1),
#             eval_set=[(X_valid, y_valid.reshape(-1, 1))],
#             eval_name=['valid'],
#             eval_metric=['mse'],
#             max_epochs=500,
#             patience=50,
#             batch_size=1024,
#             virtual_batch_size=128,
#             num_workers=0,
#             drop_last=False,
#             callbacks=[
#                 TabNetPretrainedModelCheckpoint(
#                     filepath=self.best_model_path,
#                     monitor='valid_mse',
#                     mode='min',
#                     save_best_only=True,
#                     verbose=True
#                 )
#             ]
#         )
#                 # Load the best model
#         if os.path.exists(self.best_model_path):
#             self.model.load_model(self.best_model_path)
#             os.remove(self.best_model_path)  # Remove temporary file
        
#         return self
#         def predict(self, X):
#             X_imputed = self.imputer.transform(X)
#             return self.model.predict(X_imputed).flatten()
    
#     def __deepcopy__(self, memo):
#         # Add deepcopy support for scikit-learn
#         cls = self.__class__
#         result = cls.__new__(cls)
#         memo[id(self)] = result
#         for k, v in self.__dict__.items():
#             setattr(result, k, deepcopy(v, memo))
#         return result

# TabNet_Params = {
#     'n_d': 64,              # Width of the decision prediction layer
#     'n_a': 64,              # Width of the attention embedding for each step
#     'n_steps': 5,           # Number of steps in the architecture
#     'gamma': 1.5,           # Coefficient for feature selection regularization
#     'n_independent': 2,     # Number of independent GLU layer in each GLU block
#     'n_shared': 2,          # Number of shared GLU layer in each GLU block
#     'lambda_sparse': 1e-4,  # Sparsity regularization
#     'optimizer_fn': torch.optim.Adam,
#     'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
#     'mask_type': 'entmax',
#     'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
#     'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
#     'verbose': 1,
#     'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
# }

# class TabNetPretrainedModelCheckpoint(Callback):
#     def __init__(self, filepath, monitor='val_loss', mode='min', 
#                  save_best_only=True, verbose=1):
#         super().__init__()  # Initialize parent class
#         self.filepath = filepath
#         self.monitor = monitor
#         self.mode = mode
#         self.save_best_only = save_best_only
#         self.verbose = verbose
#         self.best = float('inf') if mode == 'min' else -float('inf')
        
#     def on_train_begin(self, logs=None):
#         self.model = self.trainer  # Use trainer itself as model
        
#     def on_epoch_end(self, epoch, logs=None):
#         logs = logs or {}
#         current = logs.get(self.monitor)
#         if current is None:
#             return

#     # Check if current metric is better than best
#     if (self.mode == 'min' and current < self.best) or \
#        (self.mode == 'max' and current > self.best):
#         if self.verbose:
#             print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
#         self.best = current
#         if self.save_best_only:
#             self.model.save_model(self.filepath)  # Save the entire model

# Model decleration

In [27]:
Light = LGBMRegressor(**LGBM_params, random_state=42, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)],
     weights=[0.3, 0.5, 0.2]
)


# XGboost model

In [28]:
# xgb_preds = TrainML(model_class=XGB_Model, test_data=test)
# sub = pd.DataFrame({
    
#     'id'   : test_ids,
    
#     'sii': xgb_preds
# })
# xgb_preds
# sub

# Light Gradient Boosting Machine model

In [29]:
# lgbm_preds = TrainML(model_class=Light, test_data=test)
# sub = pd.DataFrame({
    
#     'id'   : test_ids,
    
#     'sii': lgbm_preds
# })

# sub

# Catboost model

In [30]:
# cat_preds = TrainML(model_class=CatBoost_Model, test_data=test)
# sub = pd.DataFrame({
    
#     'id'   : test_ids,
    
#     'sii': cat_preds
# })

# sub

<h1>Handle Time Series Data</h1>

In [31]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df




In [32]:
# train_ts = load_time_series('/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet')
# test_ts = load_time_series('/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet')
# train_ts

In [33]:
# train

In [34]:
# train_with_ts = pd.concat([train, train_ts], axis=1, join='inner')
# test_with_ts = pd.concat([test, test_ts], axis=1).reindex(test.index)

# train_with_ts

# Voting regressor model - ensembler

In [35]:
# test_with_ts

In [36]:
# # featureCols
# normal_preds = TrainML(voting_model, train, test, featureCols)

In [37]:
# normal_preds

In [38]:
# def TrainML(model_class, train, test_data, featureCols) -> list[int]:
nots_featureCols = sorted(list(set(train.columns) - set(['sii', 'id'])))
# featureCols
nots_preds = TrainML(voting_model, train, test, nots_featureCols)

Training Folds: 100%|██████████| 5/5 [00:27<00:00,  5.44s/it]


[2 0 0 0 1 1 0 0 2 1 1 0 1 1 2 2 0 0 0 2]


In [39]:
nots_preds

[2, 0, 0, 0, 1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 2, 2, 0, 0, 0, 2]

In [40]:
# # def TrainML(model_class, train, test_data, featureCols) -> list[int]:
# ts_featureCols = sorted(list(set(train_with_ts.columns) - set(['sii', 'id'])))
# # featureCols
# ts_preds = TrainML(voting_model, train_with_ts, test_with_ts, ts_featureCols)

In [41]:
# ts_preds

In [42]:
# nots_preds

In [43]:
# vote_preds = TrainML(model_class=voting_model, test_data=test)
final_sub = pd.DataFrame({
    
    'id'   : test_ids,
    'sii': nots_preds
})


final_sub.to_csv('submission.csv', index=False)
final_sub

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,1
