### Reference
- {Notebook} [CMI | Best Single Model](https://www.kaggle.com/code/abdmental01/cmi-best-single-model)
- {Notebook} [CMI | Best Single Model (explained)](https://www.kaggle.com/code/sunghoshim/cmi-best-single-model-explained)

### TODO
- TimeSeries 에서 여러 feature 만들기
- sii 없는 애들 만들기
- threshold 값 찾기

In [2]:
from pathlib import Path

import numpy as np
import polars as pl
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, cohen_kappa_score

import lightgbm as lgb

from tqdm.notebook import tqdm

## 0. Config

In [3]:
# Notebook-wide settings: the random seed and the LightGBM hyper-parameters
# shared by every model trained below.
CFG = dict(
    SEED=42,
    LGB_PARAMS=dict(  # LGBM default
        num_leaves=31,
        max_depth=-1,
        n_estimators=100,
        min_child_samples=20,
    ),
)

## 1. Utils for Time Series Data
- TODO: TimeSeries 에서 여러 feature 만들기

In [4]:
def get_ts_feature(id_path):
    """Summarise one participant's time series as a flat feature vector.

    Reads ``part-0.parquet`` inside ``id_path``, takes the ``describe()``
    summary of the frame, discards the ``count`` and ``null_count`` rows and
    the ``statistic`` and ``step`` columns, and flattens what remains.

    Parameters
    ----------
    id_path : pathlib.Path
        Participant directory named like ``id=00115b9f``.

    Returns
    -------
    tuple
        ``(ts_feature, patient_id)``: the 1-D numpy array of summary values
        and the text that follows ``=`` in the directory name.
    """
    series = pl.read_parquet(id_path / 'part-0.parquet')

    summary = series.describe()
    wanted_rows = ~pl.col("statistic").is_in(["count", "null_count"])
    wanted_cols = pl.all().exclude(["statistic", "step"])
    ts_feature = summary.filter(wanted_rows).select(wanted_cols).to_numpy().reshape(-1)

    # Directory names look like "id=<patient id>".
    patient_id = id_path.name.split("=")[1]

    return ts_feature, patient_id

In [5]:
def get_all_ts_feature(parquet_dir) -> pd.DataFrame:
    """Build one row of time-series summary features per participant directory.

    Parameters
    ----------
    parquet_dir : str or path-like
        Directory whose entries are per-participant folders (named like
        ``id=00115b9f``), each of which is passed to ``get_ts_feature``.

    Returns
    -------
    pd.DataFrame
        Indexed by participant id, with columns ``stat_0 .. stat_{k-1}``
        holding the flattened feature vector. An empty frame is returned when
        ``parquet_dir`` contains no entries.
    """
    # Sort so the row order does not depend on filesystem listing order.
    items = sorted(Path(parquet_dir).iterdir())
    if not items:
        # Nothing to summarise; avoids indexing features[0] on an empty list.
        return pd.DataFrame()

    features = []
    ids = []
    for id_path in tqdm(items):  # ex) "id=00115b9f"
        feature, patient_id = get_ts_feature(id_path)
        features.append(feature)
        ids.append(patient_id)

    # Column names are positional; the width is taken from the first vector.
    columns = [f"stat_{i}" for i in range(len(features[0]))]
    return pd.DataFrame(features, columns=columns, index=ids)

## 2. Load Datasets

In [None]:
# Read the three competition CSVs: train table, test table, and the sample
# submission file.
df_train, df_test, df_sample = (
    pd.read_csv(f'/kaggle/input/child-mind-institute-problematic-internet-use/{name}.csv')
    for name in ('train', 'test', 'sample_submission')
)
df_train.shape, df_test.shape, df_sample.shape

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'

: 

In [None]:
# Time-series summary features for every participant folder in the train set.
df_train_ts = get_all_ts_feature(
    parquet_dir="/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"
)
df_train_ts.shape

  0%|          | 0/996 [00:00<?, ?it/s]

(996, 84)

In [None]:
# Time-series summary features for every participant folder in the test set.
df_test_ts = get_all_ts_feature(
    parquet_dir="/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"
)
df_test_ts.shape

  0%|          | 0/2 [00:00<?, ?it/s]

(2, 84)

In [None]:
# Left-join the series features (indexed by participant id) onto the tabular
# rows via their 'id' column; rows without a match get NaN in the new columns.
df_train = df_train.merge(df_train_ts, how='left', left_on='id', right_index=True)
df_test = df_test.merge(df_test_ts, how='left', left_on='id', right_index=True)
df_train.shape, df_test.shape

((3960, 166), (20, 143))

In [None]:
set(df_train.columns) - set(df_test.columns)

{'PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20',
 'PCIAT-PCIAT_Total',
 'PCIAT-Season',
 'sii'}

In [None]:
# Model inputs: every column available in the test frame except the identifier.
COLS = df_test.drop(columns='id').columns
len(COLS)

142

## 3. Preprocessing

## .. 3.1. Drop non-label rows
- TODO: Generate sii from other features

In [None]:
df_train['sii'].value_counts(dropna=False)

sii
0.0    1594
NaN    1224
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [None]:
# Keep only the rows that have a target label; copy so later column
# assignments act on an independent frame.
df_train = df_train.loc[df_train['sii'].notna()].copy()
df_train.shape

(2736, 166)

## .. 3.2. Set Category Columns

In [None]:
# Feature columns whose name contains 'Season'.
COLS_season = [col for col in COLS if 'Season' in col]
COLS_season

['Basic_Demos-Enroll_Season',
 'CGAS-Season',
 'Physical-Season',
 'Fitness_Endurance-Season',
 'FGC-Season',
 'BIA-Season',
 'PAQ_A-Season',
 'PAQ_C-Season',
 'SDS-Season',
 'PreInt_EduHx-Season']

In [None]:
df_train[COLS_season].isna().sum()

Basic_Demos-Enroll_Season       0
CGAS-Season                   394
Physical-Season               141
Fitness_Endurance-Season     1476
FGC-Season                     89
BIA-Season                    892
PAQ_A-Season                 2373
PAQ_C-Season                 1296
SDS-Season                    209
PreInt_EduHx-Season            17
dtype: int64

In [None]:
df_test[COLS_season].isna().sum()

Basic_Demos-Enroll_Season     0
CGAS-Season                  10
Physical-Season               6
Fitness_Endurance-Season     16
FGC-Season                    3
BIA-Season                   12
PAQ_A-Season                 19
PAQ_C-Season                 11
SDS-Season                   10
PreInt_EduHx-Season           2
dtype: int64

In [None]:
for col in COLS_season:
    # Take the categories from the training data, then reuse that exact dtype
    # for the test data. Casting each frame independently would give them
    # different category -> code mappings whenever a season is missing from
    # one of them. Test values never seen in training become NaN.
    df_train[col] = df_train[col].astype('category')
    df_test[col] = df_test[col].astype(df_train[col].dtype)

## 4. cross_validate - LGBM

## .. 4.1. Prepare Scorer and CV

In [None]:
def digitize_and_cohen_kappa(y_true, y_pred, bins):
    """Score continuous predictions with quadratic-weighted Cohen's kappa.

    ``y_pred`` is first mapped to integer bin indices with
    ``np.digitize(y_pred, bins)``; those indices are then compared against
    ``y_true`` using ``cohen_kappa_score(..., weights='quadratic')``.
    """
    return cohen_kappa_score(
        y_true,
        np.digitize(y_pred, bins),
        weights='quadratic',
    )

In [None]:
# Cut points used to turn a continuous prediction into a bin index (0-3).
bins = [0.5, 1.5, 2.5]
kappa_scorer = make_scorer(
    digitize_and_cohen_kappa,
    greater_is_better=True,
    bins=bins,
)

In [None]:
# Five stratified folds, reused by every cross_validate call below.
cv = StratifiedKFold(n_splits=5)

## .. 4.2. cross_validate()

In [None]:
# Baseline run: cross-validate LightGBM on every feature in COLS, keeping the
# fitted fold estimators and the train-side scores.
cv_results = cross_validate(
    estimator=lgb.LGBMRegressor(
        force_col_wise=True,
        n_jobs=1,
        random_state=CFG['SEED'],
        **CFG['LGB_PARAMS'],
    ),
    X=df_train[COLS],
    y=df_train['sii'],
    scoring={'cohen_kappa': kappa_scorer},
    cv=cv,
    return_estimator=True,
    return_train_score=True,
)

[LightGBM] [Info] Total Bins 20659
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 142
[LightGBM] [Info] Start training from score 0.579982
[LightGBM] [Info] Total Bins 20609
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.581087
[LightGBM] [Info] Total Bins 20651
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.579717
[LightGBM] [Info] Total Bins 20587
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.580630
[LightGBM] [Info] Total Bins 20632
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.580630


In [None]:
cv_results

{'fit_time': array([1.9620533 , 1.98706508, 1.62191224, 1.61389017, 1.63838005]),
 'score_time': array([0.0248847 , 0.01878643, 0.01848316, 0.01845002, 0.01870632]),
 'estimator': [LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42)],
 'test_cohen_kappa': array([0.39680902, 0.46016689, 0.39038916, 0.39233512, 0.35076928]),
 'train_cohen_kappa': array([0.93460028, 0.92723858, 0.9254701 , 0.92897209, 0.92883482])}

In [None]:
np.mean(cv_results['test_cohen_kappa'])

0.3980938927182535

## .. 4.3. Feature Importance

In [None]:
# Inspect the first fold's estimator and have it report gain-based importances.
model = cv_results['estimator'][0]
model.importance_type = 'gain'

# Sort once, most important feature first.
ser_imp = pd.Series(model.feature_importances_, index=model.feature_name_).sort_values(ascending=False)
ser_imp

Basic_Demos-Age                           942.004484
SDS-SDS_Total_Raw                         551.800183
PreInt_EduHx-computerinternet_hoursday    334.957506
Physical-Systolic_BP                      218.030220
CGAS-CGAS_Score                           187.051085
                                             ...    
stat_57                                     0.000000
stat_70                                     0.000000
stat_69                                     0.000000
stat_27                                     0.000000
stat_29                                     0.000000
Length: 142, dtype: float64

## .. 4.4. cross_validate with reduced columns

In [None]:
# Names of the features whose importance in ser_imp is exactly zero.
zero_importance = (ser_imp == 0).to_numpy()
COLS_remove = ser_imp.index[zero_importance]
COLS_remove

Index(['stat_33', 'stat_82', 'stat_81', 'stat_41', 'stat_45', 'stat_77',
       'stat_34', 'stat_64', 'stat_65', 'stat_53', 'stat_30', 'stat_57',
       'stat_70', 'stat_69', 'stat_27', 'stat_29'],
      dtype='object')

In [None]:
# Feature list with the COLS_remove columns dropped.
COLS_REDUCED = COLS.drop(COLS_remove)
COLS_REDUCED

Index(['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
       'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       ...
       'stat_71', 'stat_72', 'stat_73', 'stat_74', 'stat_75', 'stat_76',
       'stat_78', 'stat_79', 'stat_80', 'stat_83'],
      dtype='object', length=126)

In [None]:
# Same cross-validation as the baseline, restricted to COLS_REDUCED.
cv_results_reduced = cross_validate(
    estimator=lgb.LGBMRegressor(
        force_col_wise=True,
        n_jobs=1,
        random_state=CFG['SEED'],
        **CFG['LGB_PARAMS'],
    ),
    X=df_train[COLS_REDUCED],
    y=df_train['sii'],
    scoring={'cohen_kappa': kappa_scorer},
    cv=cv,
    return_estimator=True,
    return_train_score=True,
)
cv_results_reduced

[LightGBM] [Info] Total Bins 20320
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 126
[LightGBM] [Info] Start training from score 0.579982
[LightGBM] [Info] Total Bins 20271
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 126
[LightGBM] [Info] Start training from score 0.581087
[LightGBM] [Info] Total Bins 20313
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 126
[LightGBM] [Info] Start training from score 0.579717
[LightGBM] [Info] Total Bins 20247
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 126
[LightGBM] [Info] Start training from score 0.580630
[LightGBM] [Info] Total Bins 20291
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 126
[LightGBM] [Info] Start training from score 0.580630


{'fit_time': array([1.54086423, 1.55226159, 1.54637122, 1.54562974, 1.55814743]),
 'score_time': array([0.01903391, 0.01837277, 0.01910019, 0.01878929, 0.01873732]),
 'estimator': [LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42),
  LGBMRegressor(force_col_wise=True, n_jobs=1, random_state=42)],
 'test_cohen_kappa': array([0.39680902, 0.47121462, 0.40863188, 0.36166625, 0.35639534]),
 'train_cohen_kappa': array([0.93460028, 0.92543451, 0.91711178, 0.93428992, 0.93088863])}

In [None]:
np.mean(cv_results_reduced['test_cohen_kappa'])

0.3989434218445285

## .. 4.5. different bins

In [None]:
# Re-score the full-feature model with the cut points shifted up by 0.1.
bins_temp = [0.6, 1.6, 2.6]
kappa_scorer_6 = make_scorer(
    digitize_and_cohen_kappa,
    greater_is_better=True,
    bins=bins_temp,
)

cv_results_bin = cross_validate(
    estimator=lgb.LGBMRegressor(
        force_col_wise=True,
        n_jobs=1,
        random_state=CFG['SEED'],
        **CFG['LGB_PARAMS'],
    ),
    X=df_train[COLS],
    y=df_train['sii'],
    scoring={'cohen_kappa': kappa_scorer_6},
    cv=cv,
    return_estimator=False,
    return_train_score=True,
)
cv_results_bin

[LightGBM] [Info] Total Bins 20659
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 142
[LightGBM] [Info] Start training from score 0.579982
[LightGBM] [Info] Total Bins 20609
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.581087
[LightGBM] [Info] Total Bins 20651
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.579717
[LightGBM] [Info] Total Bins 20587
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.580630
[LightGBM] [Info] Total Bins 20632
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 142
[LightGBM] [Info] Start training from score 0.580630


{'fit_time': array([1.62330651, 1.64360619, 1.66183615, 1.63719821, 1.72890592]),
 'score_time': array([0.01901269, 0.01898718, 0.01874948, 0.01865101, 0.03907037]),
 'test_cohen_kappa': array([0.38279349, 0.42449611, 0.41008203, 0.38439925, 0.38231908]),
 'train_cohen_kappa': array([0.90303812, 0.9024584 , 0.90765513, 0.9064895 , 0.90515808])}

In [None]:
np.mean(cv_results_bin['test_cohen_kappa'])

0.3968179898755836

## 5. Final Models

In [None]:
# Copy the list so that adding models later does not mutate
# cv_results_reduced['estimator'] in place.
models = list(cv_results_reduced['estimator'])

In [None]:
# Final model: hyper-parameters come from CFG, exactly as for the
# cross-validated estimators, so the two cannot drift apart.
lgbm = lgb.LGBMRegressor(
    **CFG['LGB_PARAMS'],
    random_state=CFG['SEED'],
    n_jobs=1,
    force_col_wise=True,
)

In [None]:
# Use whole data
lgbm.fit(df_train[COLS_REDUCED], df_train['sii'])
# Rebuild the list instead of appending: re-running this cell then never adds
# the full-data model twice, and cv_results_reduced['estimator'] is untouched.
models = [*cv_results_reduced['estimator'], lgbm]

[LightGBM] [Info] Total Bins 20938
[LightGBM] [Info] Number of data points in the train set: 2736, number of used features: 126
[LightGBM] [Info] Start training from score 0.580409


## 6. Voting

In [None]:
# Ensemble weights, one per entry of `models`: the five fold estimators count
# once each and the last entry (the model fit on all labelled rows) counts twice.
WEIGHTS = [1] * 5 + [2]

In [None]:
# One column of raw test predictions per model, in the order of `models`.
df_predict = pd.DataFrame({
    f'model_{i}': model.predict(df_test[COLS_REDUCED])
    for i, model in enumerate(models)
})
df_predict

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,model_5
0,0.518019,1.026785,1.127445,1.205338,1.172414,1.105683
1,0.224894,0.005895,0.226981,0.145753,0.102807,0.145151
2,0.292421,0.250543,0.265107,0.219911,0.380465,0.292121
3,-0.136582,0.73374,0.842335,0.830676,0.638753,0.789632
4,0.576781,0.637962,0.824462,0.592486,0.96197,0.789066
5,0.77119,0.8825,0.913016,0.897361,0.855107,0.915921
6,0.646647,0.332937,0.349142,0.15212,0.012782,0.301595
7,-0.020131,0.209108,-0.319175,0.160359,-0.102938,0.009824
8,0.889741,1.056569,0.856648,0.509349,0.847974,0.717442
9,0.413804,0.483148,0.888342,0.536092,0.774809,0.754539


In [None]:
# Weighted mean across the model columns, one value per test row.
pred = np.average(df_predict.to_numpy(), weights=WEIGHTS, axis=1)
pred

array([ 1.0373381 ,  0.14237572,  0.28466967,  0.64116956,  0.73882761,
        0.87871674,  0.29954529, -0.00758997,  0.79930927,  0.65789625,
        0.5013937 ,  0.43859814,  1.05419951,  0.84430313,  0.71005773,
        1.2738406 ,  0.07630003,  0.16787628,  0.26690115,  0.80918311])

In [None]:
bins

[0.5, 1.5, 2.5]

In [None]:
# Recompute the weighted average from df_predict rather than reading `pred`,
# so re-running this cell never digitizes values that are already class labels.
pred = np.digitize(np.average(df_predict, axis=1, weights=WEIGHTS), bins)
pred

array([1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1])

## 7. Submit

In [None]:
# Submission table: the test ids alongside the predicted sii class.
df_submission = df_test[['id']].assign(sii=pred)
df_submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1


In [None]:
df_submission['sii'].value_counts()

sii
1    12
0     8
Name: count, dtype: int64

In [None]:
df_submission.to_csv('submission.csv', index=False)