# Import libraries

In [1]:
import os
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
from  lightgbm import LGBMClassifier,log_evaluation,early_stopping, LGBMRegressor
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import StackingRegressor
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer

from sklearn.metrics import make_scorer, cohen_kappa_score
import optuna

#config
class Config():
    seed=2024
    num_folds=10
    TARGET_NAME ='sii'
import random

def seed_everything(seed):
    np.random.seed(seed)
    random.seed(seed)
seed_everything(Config.seed)

# read data

In [2]:
# define helper functions to read parquet files
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname):
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    
    return df

root = Path('/kaggle/input/child-mind-institute-problematic-internet-use')

# read data
train=pd.read_csv(root / "train.csv")
train = train.dropna(subset=['sii'])    # drop record has sii = NaN
test=pd.read_csv(root / "test.csv")

ts_train = load_time_series(root / "series_train.parquet")
ts_test = load_time_series(root / "series_test.parquet")
time_series_cols = ts_train.columns.tolist()
time_series_cols.remove("id")
            
# merge data
train = pd.merge(train, ts_train, how="left", on='id')
test = pd.merge(test, ts_test, how="left", on='id')
train = train.set_index('id')
test = test.set_index('id')

# group common columns
cat_cols = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
num_cols = ['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday']
tabular_cols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday']
# target_col = 'PCIAT-PCIAT_Total'
target_col = 'sii'
feature_cols = tabular_cols + time_series_cols
num_cols = num_cols + time_series_cols


100%|██████████| 996/996 [01:10<00:00, 14.04it/s]
100%|██████████| 2/2 [00:00<00:00, 11.00it/s]


# Imputation for numeric columns

In [None]:
# perform mean imputation for numerical values
# categorical values shouldn't be imputed since they are all "season of participation" 

imputer = SimpleImputer(
    strategy='mean',
)
train[num_cols] = imputer.fit_transform(train[num_cols])
test[num_cols] = imputer.transform(test[num_cols])

# train.head()

# Feature engineer

In [3]:
# convert categorical values to number

def FE(df):
    
    print("Basic_Demos-Enroll_Season feature")
    col2count={'Spring': 734, 'Fall': 676, 'Winter': 676, 'Summer': 650}
    for u,v in col2count.items():
        df[f'Basic_Demos-Enroll_Season_{u}']=(df['Basic_Demos-Enroll_Season']==u).astype(np.int8)
    df['Basic_Demos-Enroll_Season']=df['Basic_Demos-Enroll_Season'].apply(lambda x:col2count.get(x,0))
    
    print("CGAS-Season feature")
    col2count={'Spring': 665, 'Fall': 583, 'Summer': 559, 'Winter': 535}
    for u,v in col2count.items():
        df[f'CGAS-Season_{u}']=(df['CGAS-Season']==u).astype(np.int8)
    df['CGAS-Season']=df['CGAS-Season'].apply(lambda x:col2count.get(x,0))
    
    print("Physical-Season feature")
    col2count={'Spring': 709, 'Fall': 650, 'Winter': 634, 'Summer': 602}
    for u,v in col2count.items():
        df[f'Physical-Season_{u}']=(df['Physical-Season']==u).astype(np.int8)
    df['Physical-Season']=df['Physical-Season'].apply(lambda x:col2count.get(x,0))
    
    print("Fitness_Endurance-Season feature")
    col2count={'Spring': 377, 'Winter': 331, 'Fall': 316, 'Summer': 236}
    for u,v in col2count.items():
        df[f'Fitness_Endurance-Season_{u}']=(df['Fitness_Endurance-Season']==u).astype(np.int8)
    df['Fitness_Endurance-Season']=df['Fitness_Endurance-Season'].apply(lambda x:col2count.get(x,0))
    
    print("FGC-Season feature")
    col2count={'Spring': 771, 'Summer': 659, 'Fall': 633, 'Winter': 584}
    for u,v in col2count.items():
        df[f'FGC-Season_{u}']=(df['FGC-Season']==u).astype(np.int8)
    df['FGC-Season']=df['FGC-Season'].apply(lambda x:col2count.get(x,0))
    
    print("BIA-Season feature")
    col2count={'Summer': 585, 'Fall': 511, 'Spring': 405, 'Winter': 343}
    for u,v in col2count.items():
        df[f'BIA-Season_{u}']=(df['BIA-Season']==u).astype(np.int8)
    df['BIA-Season']=df['BIA-Season'].apply(lambda x:col2count.get(x,0))
    
    print("PAQ_A-Season feature")
    col2count={'Winter': 98, 'Summer': 97, 'Spring': 90, 'Fall': 78}   
    for u,v in col2count.items():
        df[f'PAQ_A-Season_{u}']=(df['PAQ_A-Season']==u).astype(np.int8)
    df['PAQ_A-Season']=df['PAQ_A-Season'].apply(lambda x:col2count.get(x,0))
    
    print("PAQ_C-Season feature")
    col2count={'Spring': 405, 'Winter': 385, 'Summer': 342, 'Fall': 308}
    for u,v in col2count.items():
        df[f'PAQ_C-Season_{u}']=(df['PAQ_C-Season']==u).astype(np.int8)
    df['PAQ_C-Season']=df['PAQ_C-Season'].apply(lambda x:col2count.get(x,0))
    
    print("SDS-Season feature")
    col2count={'Spring': 692, 'Winter': 629, 'Fall': 605, 'Summer': 601}
    for u,v in col2count.items():
        df[f'SDS-Season_{u}']=(df['SDS-Season']==u).astype(np.int8)
    df['SDS-Season']=df['SDS-Season'].apply(lambda x:col2count.get(x,0))
    
    print("PreInt_EduHx-Season feature")
    col2count={'Spring': 728, 'Fall': 684, 'Winter': 655, 'Summer': 652}
    for u,v in col2count.items():
        df[f'PreInt_EduHx-Season_{u}']=(df['PreInt_EduHx-Season']==u).astype(np.int8)
    df['PreInt_EduHx-Season']=df['PreInt_EduHx-Season'].apply(lambda x:col2count.get(x,0)) 

    # test_lack_cols=['PCIAT-Season', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total']
    # df.drop(['id',]+test_lack_cols,axis=1,inplace=True,errors='ignore')
    # print("-"*30)
    return df

    
train=FE(train)
test=FE(test)
# train.head()

Basic_Demos-Enroll_Season feature
CGAS-Season feature
Physical-Season feature
Fitness_Endurance-Season feature
FGC-Season feature
BIA-Season feature
PAQ_A-Season feature
PAQ_C-Season feature
SDS-Season feature
PreInt_EduHx-Season feature
Basic_Demos-Enroll_Season feature
CGAS-Season feature
Physical-Season feature
Fitness_Endurance-Season feature
FGC-Season feature
BIA-Season feature
PAQ_A-Season feature
PAQ_C-Season feature
SDS-Season feature
PreInt_EduHx-Season feature


# Objective function

In [7]:
## kappa scorer perform worse than default scoring metric in tuning hyperparameters in the current set up
# KAPPA_SCORER = make_scorer(
#     cohen_kappa_score, 
#     greater_is_better=True, 
#     weights='quadratic',
# )

def lgb_objective(trial):
    params = {
        "boosting_type": "gbdt",
        'random_state': 2024, 
        'objective': 'multiclass',
        # 'num_class': 4,
        "verbosity": -1,
        'n_estimators': trial.suggest_int('n_estimators', 32, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        # 'feature_fraction': trial.suggest_int('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0), 
        'subsample': trial.suggest_int('subsample', 0.5, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
    }

    X = train[feature_cols]
    y = train[target_col]
    cv = StratifiedKFold(5, shuffle=True, random_state=2024)

    val_scores = cross_val_score(
        estimator=LGBMClassifier(**params), 
        X=X, y=y, 
        cv=cv, 
        # scoring=KAPPA_SCORER,
    )

    return np.mean(val_scores)

# Optuna - Hyperparameters optimization

In [9]:
# study = optuna.create_study(direction='maximize', study_name='Classifier')
# study.optimize(lgb_objective, n_trials=100, show_progress_bar=True)
# study.best_params

[I 2025-01-01 08:52:07,705] A new study created in memory with name: Classifier


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-01-01 08:52:14,607] Trial 0 finished with value: 0.5964918133415178 and parameters: {'n_estimators': 361, 'learning_rate': 0.03257067253183902, 'max_depth': 7, 'num_leaves': 198, 'min_data_in_leaf': 76, 'bagging_fraction': 0.4706540787503134, 'bagging_freq': 5, 'lambda_l1': 4.547273194004619, 'lambda_l2': 0.017137984233384366, 'reg_alpha': 0.9079205204030059, 'reg_lambda': 0.8531662582934991, 'colsample_bytree': 0.5720705889236004, 'subsample': 1, 'min_child_samples': 31}. Best is trial 0 with value: 0.5964918133415178.
[I 2025-01-01 08:52:32,181] Trial 1 finished with value: 0.5687172233416512 and parameters: {'n_estimators': 1014, 'learning_rate': 0.09924168236476032, 'max_depth': 7, 'num_leaves': 16, 'min_data_in_leaf': 98, 'bagging_fraction': 0.5255857596611084, 'bagging_freq': 1, 'lambda_l1': 0.0265490041637646, 'lambda_l2': 0.012294612062670988, 'reg_alpha': 0.573547578556282, 'reg_lambda': 0.6271451119630835, 'colsample_bytree': 0.8335258036782047, 'subsample': 0, 'min_c

{'n_estimators': 256,
 'learning_rate': 0.013579256659883413,
 'max_depth': 4,
 'num_leaves': 61,
 'min_data_in_leaf': 15,
 'bagging_fraction': 0.4724962272858123,
 'bagging_freq': 1,
 'lambda_l1': 0.5610890263591135,
 'lambda_l2': 5.471842497266317,
 'reg_alpha': 0.8138178726121249,
 'reg_lambda': 0.7707264717911311,
 'colsample_bytree': 0.9024157827642343,
 'subsample': 0,
 'min_child_samples': 45}

# Model training

In [10]:
def metric(y_true,y_pred):
    N=int(np.max(y_true)+1)
    O=np.zeros((N,N))
    for i in range(len(y_true)):
        O[y_true[i]][y_pred[i]]+=1
    W=np.zeros((N,N))
    for i in range(N):
        for j in range(N):
            W[i][j]=(i-j)**2/(N-1)**2
    #E
    true_count=np.zeros(N)
    pred_count=np.zeros(N)
    for i in range(len(y_true)):
        true_count[y_true[i]]+=1
        pred_count[y_pred[i]]+=1
    
    E=np.zeros((N,N))
    for i in range(len(true_count)):
        for j in range(len(pred_count)):
            E[i][j]=true_count[i]*pred_count[j]
            
    O=O/O.sum()
    E=E/E.sum()
    
    weighted_kappa=1-np.sum(W*O)/np.sum(W*E)
    
    return weighted_kappa

choose_cols=[col for col in test.columns if train[col].dtype!=object]
print(f"len(choose_cols):{len(choose_cols)}")

def fit_and_predict(train_feats=train,test_feats=test,model=None,num_folds=10,name='lgb'):
    X=train_feats[choose_cols].copy()
    y=train_feats[Config.TARGET_NAME].copy()
    oof_pred=np.zeros((len(X)))
    test_X=test_feats[choose_cols].copy()
    test_pred_pro=np.zeros((num_folds,len(test_X)))
    
    skf = StratifiedKFold(n_splits=num_folds,shuffle=True,random_state=Config.seed)
    for fold, (train_index, valid_index) in (enumerate(skf.split(X,y))):
        print(f"name {name},fold:{fold}")

        X_train, X_valid = X.iloc[train_index].reset_index(drop=True), X.iloc[valid_index].reset_index(drop=True)
        y_train, y_valid = y.iloc[train_index].reset_index(drop=True), y.iloc[valid_index].reset_index(drop=True)
        
        y_train=(y_train!=0).astype(np.int8)
        y_valid=(y_valid!=0).astype(np.int8)
        
        model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],
                          callbacks=[log_evaluation(100)],
                         )
        
        oof_pred[valid_index]=model.predict_proba(X_valid)[:,1]
        test_pred_pro[fold]=model.predict_proba(test_X)[:,1]
        
        importances=model.feature_importances_
        columns=X.columns
        drop_cols=[]
        for i in range(len(columns)):
            if importances[i]<10:
                drop_cols.append(columns[i])
        print(f"drop_cols={drop_cols}")

    test_pred_pro=test_pred_pro.mean(axis=0)
    
    oof_pred_sorted=sorted(oof_pred)
    margin=[0,oof_pred_sorted[int(len(oof_pred_sorted)*0.58)],
     oof_pred_sorted[int(len(oof_pred_sorted)*0.85)],oof_pred_sorted[int(len(oof_pred_sorted)*0.99)],1]
    for i in range(len(margin)-1):
        oof_index= np.where((oof_pred >= margin[i]) & (oof_pred < margin[i+1]))[0]
        oof_pred[oof_index]=i
        test_index= np.where((test_pred_pro >= margin[i]) & (test_pred_pro < margin[i+1]))[0]
        test_pred_pro[test_index]=i
    
    print(f"weighted kappa:{metric(y.values.reshape(-1).astype(np.int8),oof_pred.astype(np.int8))}")
     
    return oof_pred,test_pred_pro

# lgb_params={
#         "boosting_type": "gbdt",'objective':'binary',"metric": "auc",
#         'random_state': 2024, 'n_estimators': 512, 
#         'learning_rate': 0.07975474666326936, 'max_depth': 10, 'num_leaves': 207,
#         'min_data_in_leaf': 41,'feature_fraction': 0.6385678848225935, 
#         'bagging_fraction': 0.9042038292349021, 'bagging_freq': 6, 
#         'lambda_l1': 9.920617415343463, 'lambda_l2': 4.351491475117983,
#         'reg_alpha': 0.006329813118558037, 'reg_lambda': 0.22366541275310856,
#         'colsample_bytree': 0.9045121369263609, 'subsample': 0.6560250299728694,
#         'min_child_samples': 16,"verbose": -1,
# }


lgb_params = {
    "boosting_type": "gbdt",
    'random_state': 2024,
    'objective': 'multiclass',
    'num_class': 4,
    "verbose": -1,
    'feature_fraction': 0.6385678848225935,
    
    'n_estimators': 256,
    'learning_rate': 0.013579256659883413,
    'max_depth': 4,
    'num_leaves': 61,
    'min_data_in_leaf': 15,
    'bagging_fraction': 0.4724962272858123,
    'bagging_freq': 1,
    'lambda_l1': 0.5610890263591135,
    'lambda_l2': 5.471842497266317,
    'reg_alpha': 0.8138178726121249,
    'reg_lambda': 0.7707264717911311,
    'colsample_bytree': 0.9024157827642343,
    'subsample': 0,
    'min_child_samples': 45

}



lgb_oof_pred_pro,lgb_test_pro=fit_and_predict(model=LGBMClassifier(**lgb_params),num_folds=Config.num_folds,name='lgb')
print(f"lgb_test_pro[:10]:{lgb_test_pro[:10]}")

len(choose_cols):194
name lgb,fold:0
[100]	valid_0's multi_logloss: 0.565416
[200]	valid_0's multi_logloss: 0.544526
drop_cols=['FGC-FGC_GSND_Zone', 'FGC-FGC_GSD_Zone', 'PAQ_A-Season', 'stat_1', 'stat_2', 'stat_3', 'stat_4', 'stat_5', 'stat_6', 'stat_7', 'stat_8', 'stat_9', 'stat_10', 'stat_11', 'stat_39', 'stat_41', 'stat_42', 'stat_43', 'stat_44', 'stat_45', 'stat_46', 'stat_53', 'stat_57', 'stat_58', 'stat_64', 'stat_65', 'stat_67', 'stat_69', 'stat_70', 'stat_77', 'stat_81', 'stat_82', 'stat_89', 'stat_92', 'stat_93', 'stat_94', 'Basic_Demos-Enroll_Season_Spring', 'Basic_Demos-Enroll_Season_Fall', 'Basic_Demos-Enroll_Season_Winter', 'Basic_Demos-Enroll_Season_Summer', 'CGAS-Season_Spring', 'CGAS-Season_Summer', 'CGAS-Season_Winter', 'Physical-Season_Spring', 'Physical-Season_Fall', 'Physical-Season_Summer', 'Fitness_Endurance-Season_Spring', 'Fitness_Endurance-Season_Winter', 'Fitness_Endurance-Season_Fall', 'FGC-Season_Spring', 'FGC-Season_Summer', 'FGC-Season_Fall', 'FGC-Season_W

# submission

In [12]:
submission=pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv")
submission[Config.TARGET_NAME]=lgb_test_pro.astype(int)
submission.to_csv("/kaggle/working/submission.csv",index=None)
# print(submission)