# AutoML H2O - CMI Problematic Internet Use

![](https://i.postimg.cc/Jz5DS0dq/pexels-james-frid-81279-9823161.jpg)

# Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd #
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import h2o
from h2o.automl import H2OAutoML
from sklearn.base import clone
# from sklearn.metrics import *

# Random seed for the StratifiedKFold / LightGBM baseline that is currently commented out.
# NOTE(review): the H2OAutoML call later in this notebook passes seed=5 and does not read SEED.
SEED = 42
# Number of cross-validation folds; only referenced by the commented-out baseline.
n_splits = 5

# Load CSV Files

In [2]:
# Raw training table as shipped with the competition.
train_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

# PCIAT columns removed from the training features:
# the season, the twenty numbered items (01..20) and the total.
TARGET_COLS = (
    ["PCIAT-Season"]
    + [f"PCIAT-PCIAT_{item:02d}" for item in range(1, 21)]
    + ["PCIAT-PCIAT_Total"]
)

train_data = train_df.drop(columns=TARGET_COLS)

test_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
# Kept separately so the identifiers can be attached to the submission later.
ids = test_df['id']

# Load Parquet Files

In [3]:
def process_file(filename, dirname):
    """Summarise one time-series directory entry.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the ``step``
    column and flattens the ``describe()`` table of the remaining columns
    into a one-dimensional array.

    Args:
        filename: Directory entry name containing ``=``.
        dirname: Directory that contains ``filename``.

    Returns:
        tuple: ``(stats, key)`` where ``stats`` is the flattened summary array
        and ``key`` is the second ``=``-separated field of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().values.reshape(-1)
    key = filename.split('=')[1]
    return summary, key

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry of ``dirname``.

    Every directory entry is passed to ``process_file`` on a thread pool; the
    flattened statistics become columns ``Stat_0 .. Stat_{k-1}`` and the key
    returned by ``process_file`` is stored in an ``id`` column.

    Args:
        dirname: Directory whose entries are handed to ``process_file``.

    Returns:
        DataFrame with the ``Stat_*`` columns plus ``id``. When ``dirname`` has
        no entries, an empty frame that only has the ``id`` column is returned
        so that a later merge on ``id`` still works.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, and a
    # fixed order keeps the resulting frame reproducible between runs.
    entries = sorted(os.listdir(dirname))

    # Without this guard an empty directory fails at ``zip(*results)`` below.
    if not entries:
        return pd.DataFrame(columns=['id'])

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), entries), total=len(entries)))

    stats, indexes = zip(*results)

    # Column count is taken from the first entry's statistics vector.
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes

    return df

# Per-id summary statistics of the parquet series for each split.
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")
time_series_cols = list(train_ts.columns)
# time_series_cols.remove("id")

# Left-join the statistics onto the tabular data, discard the join key and,
# for the training split only, discard rows whose 'sii' label is missing.
train = (
    pd.merge(train_data, train_ts, how="left", on='id')
    .drop(columns='id')
    .dropna(subset=['sii'])
)
test = pd.merge(test_df, test_ts, how="left", on='id').drop(columns='id')

100%|██████████| 996/996 [01:20<00:00, 12.42it/s]
100%|██████████| 2/2 [00:00<00:00,  9.49it/s]


# Preprocess

In [4]:
def preprocess_data(df,train_data=False):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their own median and object-dtype columns
    with their most frequent value. All statistics are computed from ``df``
    itself.

    Args:
        df: Frame to impute. It is copied first, so the caller's frame is left
            untouched.
        train_data: Unused; kept so existing calls keep working.

    Returns:
        A new DataFrame with the same columns as ``df``. A column that has no
        non-missing value at all cannot be imputed and is returned unchanged.
    """
    # Work on a copy so that the caller's frame is never modified in place.
    df = df.copy()

    # Handle numerical columns
    num_cols = df.select_dtypes(include=np.number).columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Handle categorical columns
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        modes = df[col].mode()
        # mode() is empty when every value is missing; indexing it would raise.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])  # Fill missing with the most frequent value

    return df

# Impute each split independently.
# NOTE(review): `test` is filled with medians/modes computed from `test` itself,
# not from `train` — confirm this is intended.
train = preprocess_data(train)
test = preprocess_data(test)

# Re-wrap the results as DataFrames (preprocess_data already returns the frame it was given).
train = pd.DataFrame(train)
test = pd.DataFrame(test)

# train = train.fillna('null')
# test = test.fillna('null')

# View Data

In [5]:
# Preview the first rows of the imputed training frame.
train.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,Stat_86,Stat_87,Stat_88,Stat_89,Stat_90,Stat_91,Stat_92,Stat_93,Stat_94,Stat_95
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,26.0,...,1.740934,3.806256,89.377281,1.0,2613.625,4180.0,86395000000000.0,7.0,3.0,51.0
1,Summer,9,0,Spring,65.0,Fall,14.03559,48.0,46.0,22.0,...,1.740934,3.806256,89.377281,1.0,2613.625,4180.0,86395000000000.0,7.0,3.0,51.0
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,26.0,...,1.740934,3.806256,89.377281,1.0,2613.625,4180.0,86395000000000.0,7.0,3.0,51.0
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,26.0,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
5,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,26.0,...,1.146284,2.952888,89.476036,1.0,2597.800049,4175.0,86395000000000.0,7.0,3.0,91.0


In [6]:
# Preview the first rows of the imputed test frame.
test.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,Stat_86,Stat_87,Stat_88,Stat_89,Stat_90,Stat_91,Stat_92,Stat_93,Stat_94,Stat_95
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,24.0,...,1.346631,3.478582,89.613846,0.5,2615.525024,4181.75,86252500000000.0,7.0,3.0,88.0
1,Summer,9,0,Summer,63.0,Fall,14.03559,48.0,46.0,22.0,...,1.346631,3.478582,89.613846,0.5,2615.525024,4181.75,86252500000000.0,7.0,3.0,88.0
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,24.0,...,1.346631,3.478582,89.613846,0.5,2615.525024,4181.75,86252500000000.0,7.0,3.0,88.0
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,24.0,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
4,Spring,18,1,Summer,63.0,Fall,18.292347,55.0,81.6,24.0,...,1.346631,3.478582,89.613846,0.5,2615.525024,4181.75,86252500000000.0,7.0,3.0,88.0


# Quadratic Weighted Kappa

In [7]:
# def quadratic_weighted_kappa(y_true, y_pred):
#     return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# def threshold_Rounder(oof_non_rounded, thresholds):
#     return np.where(oof_non_rounded < thresholds[0], 0,
#                     np.where(oof_non_rounded < thresholds[1], 1,
#                              np.where(oof_non_rounded < thresholds[2], 2, 3)))

# def evaluate_predictions(thresholds, y_true, oof_non_rounded):
#     rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
#     return -quadratic_weighted_kappa(y_true, rounded_p)

# def TrainML(model_class, test_data):
    
#     X = train.drop(['sii'], axis=1)
#     y = train['sii']

#     SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
#     train_S = []
#     test_S = []
    
#     oof_non_rounded = np.zeros(len(y), dtype=float) 
#     oof_rounded = np.zeros(len(y), dtype=int) 
#     test_preds = np.zeros((len(test_data), n_splits))

#     for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
#         X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

#         model = clone(model_class)
#         model.fit(X_train, y_train)

#         y_train_pred = model.predict(X_train)
#         y_val_pred = model.predict(X_val)

#         oof_non_rounded[test_idx] = y_val_pred
#         y_val_pred_rounded = y_val_pred.round(0).astype(int)
#         oof_rounded[test_idx] = y_val_pred_rounded

#         train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
#         val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

#         train_S.append(train_kappa)
#         test_S.append(val_kappa)
        
#         test_preds[:, fold] = model.predict(test_data)
        
#         print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
#         clear_output(wait=True)

#     print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
#     print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

#     KappaOPtimizer = minimize(evaluate_predictions,
#                               x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
#                               method='Nelder-Mead') # Nelder-Mead | # Powell
#     assert KappaOPtimizer.success, "Optimization did not converge."
    
#     oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
#     tKappa = quadratic_weighted_kappa(y, oof_tuned)

#     print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

#     tpm = test_preds.mean(axis=1)
#     tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
#     submission = pd.DataFrame({
#         'id': sample['id'],
#         'sii': tpTuned
#     })

#     return submission,KappaOPtimizer

# Params7 = {'learning_rate': 0.03884249148676395, 'max_depth': 12, 'num_leaves': 413, 'min_data_in_leaf': 14,
#            'feature_fraction': 0.7987976913702801, 'bagging_fraction': 0.7602261703576205, 'bagging_freq': 2, 
#            'lambda_l1': 4.735462555910575, 'lambda_l2': 4.735028557007343e-06} # CV : 0.4094 | LB : 0.471

# Light = lgb.LGBMRegressor(**Params7,random_state=SEED, verbose=-1,n_estimators=200)
# Submission,KappaOPtimizer = TrainML(Light,test)
# print(KappaOPtimizer.x)
# 0.55764565,0.98394812,2.720513

# AutoML H2O

In [8]:
# Connect to an H2O cluster (a local one is started in the recorded run)
# and upload both pandas frames.
h2o.init()
train_data = h2o.H2OFrame(train)
test_data = h2o.H2OFrame(test)

# Convert the CGAS score to a numeric column in both frames.
for h2o_frame in (train_data, test_data):
    h2o_frame['CGAS-CGAS_Score'] = h2o_frame['CGAS-CGAS_Score'].asnumeric()

# Search for models predicting 'sii'; the run is capped at 10800 s (3 hours).
aml = H2OAutoML(max_runtime_secs=10800, seed=5)
aml.train(y='sii', training_frame=train_data)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp6999f_ao
  JVM stdout: /tmp/tmp6999f_ao/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp6999f_ao/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_unknownUser_7renje
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.250 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
06:33:58.291: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

█
06:34:05.722: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

█
06:34:09.705: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

██
06:34:17.784: _train param, Dropping unused columns: [Stat_41, Stat_42]
06:34:18.32: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

█
06:34:21.753: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

██
06:34:41.590: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

█
06:34:47.359: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

█
06:34:53.355: _train param, Dropping bad and constant columns: [Stat_41, Stat_42]

█
06:35:01.301: _train param, Dropping u

key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,1122.107,49.827694,1139.357,1126.19,1193.9022,1083.4794,1067.6063
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.529488,0.0138391,0.5367256,0.5213609,0.550196,0.5162291,0.5229285
mean_residual_deviance,0.4470222,0.033618,0.4513268,0.4296945,0.5039251,0.4242961,0.4258684
mse,0.4470222,0.033618,0.4513268,0.4296945,0.5039251,0.4242961,0.4258684
null_deviance,325.3673,16.822369,325.1222,311.5609,354.05756,320.3949,315.70102
r2,0.247965,0.0312728,0.2332294,0.2273205,0.2161637,0.2821104,0.281001
residual_deviance,244.69044,20.206512,249.13237,240.62894,277.1588,229.54422,226.98784
rmse,0.6682325,0.0246865,0.6718086,0.6555109,0.7098768,0.6513802,0.6525859
rmsle,0.3909544,0.0081702,0.3951812,0.3891108,0.4028996,0.3833361,0.3842446


# Leaderboard

In [9]:
# Table of the models produced by the AutoML run.
leaderboard = aml.leaderboard
# Display every row (head() shows only 10 by default) so this cell actually
# renders the leaderboard instead of only assigning it.
leaderboard.head(rows=leaderboard.nrows)

# Predict with the Best Model

In [10]:
# Leader model of the AutoML run (a stacked ensemble in the recorded output below).
best_model = aml.leader

# Score the test frame on the H2O side, then bring the predictions back into pandas.
predictions = best_model.predict(test_data)
predictions_df = predictions.as_data_frame()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





# Prepare Submission CSV

In [11]:
# Cut points between sii classes 0|1, 1|2 and 2|3. These are the values recorded
# in the commented-out LightGBM threshold search earlier in this notebook.
# NOTE(review): they were not re-tuned on the H2O predictions — confirm they still apply.
SII_THRESHOLDS = [0.55764565, 0.98394812, 2.720513]

raw_predictions = predictions_df['predict'].to_numpy(dtype=float)

# np.digitize would silently place NaN in the last class, so fail loudly instead.
if np.isnan(raw_predictions).any():
    raise ValueError("predictions contain NaN; cannot assign sii classes")

# With the default right=False, np.digitize assigns class k when
# thresholds[k-1] <= x < thresholds[k]. That is the same strict "<" rule used by
# the commented-out threshold_Rounder; pd.cut's default right-closed bins would
# place a value equal to a cut point in the lower class. The result is a plain
# integer column rather than a categorical.
sample_submission = pd.DataFrame({
    'id': ids.to_numpy(),
    'sii': np.digitize(raw_predictions, SII_THRESHOLDS),
})

# Submit and View CSV

In [12]:
# Write the submission file without the pandas index column.
sample_submission.to_csv('submission.csv',index=False)

# Preview the first rows of the submission.
sample_submission.head(20)

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,0
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1
