# Imports

In [1]:
import os
import random
from tqdm import tqdm
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer

from scipy.optimize import minimize
import optuna

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

#new
from sklearn.base import clone


SEED = 42

KAPPA_SCORER = make_scorer(
    cohen_kappa_score, 
    greater_is_better=True, 
    weights='quadratic',
)

# Load data

In [2]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    
    return df.describe().values.reshape(-1), filename.split('=')[1]

In [3]:
def load_time_series(dirname):
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    
    return df

In [4]:
root = Path('/kaggle/input/child-mind-institute-problematic-internet-use')

## Tabular Data

In [5]:
df_train = pd.read_csv(root / 'train.csv')
df_test = pd.read_csv(root / 'test.csv')
df_subm = pd.read_csv(root / 'sample_submission.csv', index_col='id')

## Time Series Data

In [6]:
ts_train = load_time_series(root / "series_train.parquet")
ts_test = load_time_series(root / "series_test.parquet")

time_series_cols = ts_train.columns.tolist()
time_series_cols.remove("id")

100%|██████████| 996/996 [01:27<00:00, 11.39it/s]
100%|██████████| 2/2 [00:00<00:00,  9.27it/s]


## Merge Operation

In [7]:
df_train = pd.merge(df_train, ts_train, how="left", on='id')
df_test = pd.merge(df_test, ts_test, how="left", on='id')

df_train = df_train.set_index('id')
df_test = df_test.set_index('id')

# Features

## Global Variables

In [8]:
cat_cols = [
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
    'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
    'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 
    'PreInt_EduHx-Season'
]
num_cols = [
    'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 
    'Physical-BMI', 'Physical-Height', 'Physical-Weight', 
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 
    'Physical-HeartRate', 'Physical-Systolic_BP', 
    'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 
    'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 
    'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 
    'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 
    'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 
    'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 
    'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 
    'PreInt_EduHx-computerinternet_hoursday'
]
tabular_cols = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 
    'CGAS-Season', 'CGAS-CGAS_Score', 
    'Physical-Season', 'Physical-BMI', 'Physical-Height', 
    'Physical-Weight', 'Physical-Waist_Circumference', 
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 
    'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 
    'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 
    'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total',
    'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 
    'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday'
]
target_col = 'sii'

feature_cols = tabular_cols + time_series_cols
num_cols = num_cols + time_series_cols

## Drop Rows with Missing Targets

In [9]:
df_train = df_train.dropna(subset=[target_col])

## Numeric Value Imputing

In [10]:
imputer = SimpleImputer(
    strategy='mean',
)

df_train[num_cols] = imputer.fit_transform(df_train[num_cols])
df_test[num_cols] = imputer.transform(df_test[num_cols])

## Category Encoding

In [11]:
encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    encoded_missing_value=-2,
)

df_train[cat_cols] = encoder.fit_transform(df_train[cat_cols])
df_train[cat_cols] = df_train[cat_cols].astype('category')

df_test[cat_cols] = encoder.transform(df_test[cat_cols])
df_test[cat_cols] = df_test[cat_cols].astype('category')

In [12]:
df_train

Unnamed: 0_level_0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008ff9,0,5.0,0.0,3,51.000000,0,16.877316,46.0,50.8,26.625259,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,8.628980e+13,6.933735,2.615462,78.834337
000fd460,2,9.0,0.0,-2,65.159266,0,14.035590,48.0,46.0,22.000000,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,8.628980e+13,6.933735,2.615462,78.834337
00105258,2,10.0,1.0,0,71.000000,0,16.648696,56.5,75.6,26.625259,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,8.628980e+13,6.933735,2.615462,78.834337
00115b9f,3,9.0,0.0,0,71.000000,2,18.292347,56.0,81.6,26.625259,...,1.546979,4.004276,89.751656,0.000000,2633.250000,4188.500000,8.611000e+13,7.000000,3.000000,85.000000
001f3379,1,13.0,1.0,3,50.000000,2,22.279952,59.5,112.2,26.625259,...,1.146284,2.952888,89.476036,1.000000,2597.800049,4175.000000,8.639500e+13,7.000000,3.000000,91.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff6c2bb8,0,8.0,0.0,-2,65.159266,0,17.139810,52.5,67.2,25.000000,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,8.628980e+13,6.933735,2.615462,78.834337
ff759544,2,7.0,1.0,-2,65.159266,2,13.927006,48.5,46.6,23.000000,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,8.628980e+13,6.933735,2.615462,78.834337
ff8a2de4,0,13.0,0.0,1,60.000000,0,16.362460,59.5,82.4,26.625259,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,8.628980e+13,6.933735,2.615462,78.834337
ffcd4dbd,0,11.0,0.0,1,68.000000,3,21.441500,60.0,109.8,26.625259,...,1.468296,3.078876,89.693832,1.000000,2605.750000,4185.000000,8.639500e+13,7.000000,1.000000,72.000000


# Train

## QWK

In [13]:
def quadratic_weighted_kappa(estimator, X, y_true):
    y_pred = estimator.predict(X).round()
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [14]:
def threshold_rounder(y_pred, thresholds):
    return np.where(y_pred < thresholds[0], 0,
                    np.where(y_pred < thresholds[1], 1,
                             np.where(y_pred < thresholds[2], 2, 3)))

In [15]:
def eval_preds(thresholds, y_true, y_pred):
    y_pred = threshold_rounder(y_pred, thresholds)
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return -score

## LightGBM

In [16]:
class CustomLGBMRegressor(lgb.LGBMRegressor):
    '''
    Custom LightGBM Regressor
    
    It optimizes threshold values during fitting.
    Main goal is preventing overfit on validation data.
    '''
    def fit(self, X, y, **kwargs):
        super().fit(X, y, **kwargs)
        y_pred = super().predict(X, **kwargs)
        
        self.optimizer = minimize(
            eval_preds, 
            x0=[0.5, 1.5, 2.5], 
            args=(y, y_pred), 
            method='Nelder-Mead',
        )
        
    def predict(self, X, **kwargs):
        y_pred = super().predict(X, **kwargs)
        y_pred = threshold_rounder(y_pred, self.optimizer.x)
        return y_pred

## Optuna - Hyperparameter Tuning

In [17]:
def lgb_objective(trial):
    params = {
        'objective':         'l2',
        'verbosity':         -1,
        'n_iter':            200,
        'random_state':      SEED,
        'boosting_type':     'gbdt',
        'lambda_l1':         trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
        'lambda_l2':         trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
        'learning_rate':     trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth':         trial.suggest_int('max_depth', 4, 8),
        'num_leaves':        trial.suggest_int('num_leaves', 16, 256),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode':  trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq':      trial.suggest_int('bagging_freq', 1, 7),
        'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 5, 100),
    }
    
    X = df_train[feature_cols]
    y = df_train[target_col]
    cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
    estimator = CustomLGBMRegressor(**params)

    val_scores = cross_val_score(
        estimator=estimator, 
        X=X, y=y, 
        cv=cv, 
        scoring=KAPPA_SCORER,
    )

    return np.mean(val_scores)

In [18]:
# study = optuna.create_study(direction='maximize', study_name='Regressor')
# study.optimize(lgb_objective, n_trials=30, show_progress_bar=True)

## Tuned Hyperparameters

In [19]:
params = {
    'objective'       : 'l2',
    'verbosity'       : -1,
    'n_iter'          : 200,
    'lambda_l1'       : 0.005116829730239727,
    'lambda_l2'       : 0.0011520776712645852,
    'learning_rate'   : 0.02376367323636638,
    'max_depth'       : 5,
    'num_leaves'      : 207,
    'colsample_bytree': 0.7759862336963801,
    'colsample_bynode': 0.5110355095943208,
    'bagging_fraction': 0.5485770314992224,
    'bagging_freq'    : 7,
    'min_data_in_leaf': 78,
}

model = CustomLGBMRegressor(**params, random_state=SEED)

## Cross Validation

In [20]:
X = df_train[feature_cols]
y = df_train[target_col]

# cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
# val_scores = cross_val_score(
#     model, X, y, cv=cv, 
#     scoring=KAPPA_SCORER,
# )
# print(f'kappa score: {np.mean(val_scores):.4f}')

oof_preds = np.zeros(len(y), dtype=float)

cv = StratifiedKFold(5, shuffle=True, random_state=SEED)

val_scores = []
train_scores = []

for train_idx, val_idx in cv.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model_fold = clone(model)
    model_fold.fit(X_train, y_train)

    val_preds = model_fold.predict(X_val)
    oof_preds[val_idx] = val_preds  
    
    train_kappa = quadratic_weighted_kappa(model_fold, X_train, y_train)
    val_kappa = quadratic_weighted_kappa(model_fold, X_val, y_val)
    
    train_scores.append(train_kappa)
    val_scores.append(val_kappa)
    
    print(f"Fold {len(train_scores)}: Train QWK = {train_kappa:.4f}, Validation QWK = {val_kappa:.4f}")

mean_train_kappa = np.mean(train_scores)
mean_val_kappa = np.mean(val_scores)

print(f"\nMean Train QWK ---> {mean_train_kappa:.4f}")
print(f"Mean Validation QWK ---> {mean_val_kappa:.4f}")

KappaOptimizer = minimize(
    eval_preds, 
    x0=[0.5, 1.5, 2.5], 
    args=(y, oof_preds),
    method='Nelder-Mead'
)
assert KappaOptimizer.success, "Optimization did not converge."

optimized_thresholds = KappaOptimizer.x
oof_tuned = threshold_rounder(oof_preds, optimized_thresholds)
optimized_qwk = cohen_kappa_score(y, oof_tuned, weights='quadratic')

print(f"Optimized QWK SCORE ---> {optimized_qwk:.4f}")

Fold 1: Train QWK = 0.5834, Validation QWK = 0.4828
Fold 2: Train QWK = 0.5835, Validation QWK = 0.5143
Fold 3: Train QWK = 0.5782, Validation QWK = 0.4697
Fold 4: Train QWK = 0.5991, Validation QWK = 0.4347
Fold 5: Train QWK = 0.5945, Validation QWK = 0.4565

Mean Train QWK ---> 0.5878
Mean Validation QWK ---> 0.4716
Optimized QWK SCORE ---> 0.4714


## Seed Ensembling

In [21]:
ensemble_model = VotingRegressor([
    ('lgb_0', CustomLGBMRegressor(**params, random_state=12)),
    ('lgb_1', CustomLGBMRegressor(**params, random_state=22)),
    ('lgb_2', CustomLGBMRegressor(**params, random_state=32)),
    ('lgb_3', CustomLGBMRegressor(**params, random_state=42)),
    ('lgb_4', CustomLGBMRegressor(**params, random_state=52)),
    ('lgb_5', CustomLGBMRegressor(**params, random_state=62)),
    ('lgb_6', CustomLGBMRegressor(**params, random_state=72)),
    ('lgb_7', CustomLGBMRegressor(**params, random_state=82)),
    ('lgb_8', CustomLGBMRegressor(**params, random_state=92)),
    ('lgb_9', CustomLGBMRegressor(**params, random_state=102)),
])

## Training

In [22]:
X = df_train[feature_cols]
y = df_train[target_col]

ensemble_model.fit(X, y)

## Prediction

In [23]:
df_subm[target_col] = ensemble_model.predict(df_test[feature_cols])
df_subm[target_col] = df_subm[target_col].round()

df_subm.to_csv('submission.csv')

In [24]:
df_subm

Unnamed: 0_level_0,sii
id,Unnamed: 1_level_1
00008ff9,0.0
000fd460,0.0
00105258,1.0
00115b9f,0.0
0016bb22,1.0
001f3379,1.0
0038ba98,1.0
0068a485,0.0
0069fbed,1.0
0083e397,1.0
