<a href="https://www.kaggle.com/code/taimour/generative-adversarial-networks-lgb-piu-cmi?scriptVersionId=202326498" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# <div style="text-align:center"><span style="background-color:#15a15b;padding:15px;border-radius:40px;">🕸️Generative Adversarial Networks - 🧑‍💻Problametic Internet Usage</span></div>

![](https://i.postimg.cc/zGCq3CMz/pexels-marta-wave-6437642.jpg)

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🎒Import Libraries</span>

In [1]:
import numpy as np
import polars as pl
import pandas as pd
from sklearn.base import clone
from copy import deepcopy
import optuna
from scipy.optimize import minimize
import os
import matplotlib.pyplot as plt
import seaborn as sns

import re
from colorama import Fore, Style

from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import lightgbm as lgb
# from catboost import CatBoostRegressor, CatBoostClassifier
import xgboost as xgb
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import *
from sklearn.metrics import *

SEED = 42
n_splits = 5

#For GANs
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import random
import tensorflow as tf

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">✨Preprocessing</span>

In [2]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    
    return df

train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")
time_series_cols = test_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 
          'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
        
train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping_train = create_mapping(col, train)
    mapping_test = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping_train).astype(int)
    test[col] = test[col].replace(mapping_test).astype(int)

print(f'Train Shape : {train.shape} || Test Shape : {test.shape}')

100%|██████████| 996/996 [01:18<00:00, 12.74it/s]
100%|██████████| 2/2 [00:00<00:00,  9.72it/s]

Train Shape : (2736, 155) || Test Shape : (20, 154)





# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🕸️Generative Adversarial Networks</span>

In [3]:
import time
start_time = time.time()

# Set random seeds for reproducibility
GAN_SEED=42
random.seed(GAN_SEED)
np.random.seed(GAN_SEED)
tf.random.set_seed(GAN_SEED)

# Load and preprocess data
train_data = train
X = train_data.drop(columns=['sii'])
y = train_data['sii']
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
train_data = train_data.values #convert pd dataframe to np array

# Generator model
def build_generator(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(output_dim, activation='linear'))
    return model

# Discriminator model
def build_discriminator(input_dim):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    return model

# GAN model
def build_gan(generator, discriminator):
    model = Sequential()
    model.add(generator)
    discriminator.trainable = False
    model.add(discriminator)
    return model

# Compile models
input_dim = train_data.shape[1]
output_dim = train_data.shape[1]
discriminator = build_discriminator(output_dim)
discriminator.compile(optimizer=Adam(learning_rate=0.0002, beta_1=0.5), loss='binary_crossentropy', metrics=['accuracy'])
generator = build_generator(input_dim=input_dim, output_dim=output_dim)
gan = build_gan(generator, discriminator)
gan.compile(optimizer=Adam(learning_rate=0.0002, beta_1=0.5), loss='binary_crossentropy')

# Training function
def train_gan(generator, discriminator, gan, epochs, batch_size, noise_dim, patience=500):
    real_label = np.ones((batch_size, 1))
    fake_label = np.zeros((batch_size, 1))
    best_g_loss = np.inf
    patience_counter = 0
    
    for epoch in range(epochs):
        # Train discriminator on real data
        idx = np.random.randint(0, train_data.shape[0], batch_size)
        real_data = train_data[idx]
        d_loss_real = discriminator.train_on_batch(real_data, real_label)
        
        # Train discriminator on fake data
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        fake_data = generator.predict(noise)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_label)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # Train generator via GAN
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        g_loss = gan.train_on_batch(noise, real_label)

        # Extract the generator loss value (if it's a list, take the first element)
        g_loss_value = g_loss[0] if isinstance(g_loss, list) else g_loss
        
        # Check for early stopping
        if g_loss_value < best_g_loss:
            best_g_loss = g_loss_value
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter > patience:
            print(f"Early stopping at epoch {epoch} due to no improvement in generator loss")
            break
        
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Discriminator Loss: {d_loss[0]}, Generator Loss: {g_loss_value}")

# Train GAN
epochs = 10000
batch_size = 64
noise_dim = input_dim
train_gan(generator, discriminator, gan, epochs, batch_size, noise_dim)

# Generate synthetic data
noise = np.random.normal(0, 1, (10000, noise_dim))
synthetic_data = generator.predict(noise)
synthetic_X = synthetic_data[:, :-1]
synthetic_y = synthetic_data[:, -1]
synthetic_X = scaler_X.inverse_transform(synthetic_X)
synthetic_y = scaler_y.inverse_transform(synthetic_y.reshape(-1, 1))

# Clip and round the generated sii values to ensure they are in the set {0, 1, 2, 3}
synthetic_y = np.clip(np.round(synthetic_y), 0, 3)

# Combine synthetic features and target into a single DataFrame
synthetic_data = np.hstack((synthetic_X, synthetic_y))

columns_names = list(X.columns) + ['sii']  # Assuming 'sii' is the target column
synthetic_df = pd.DataFrame(synthetic_data, columns=columns_names)

# Save the synthetic data to a CSV file

# Combine train scaled features and target into a single DataFrame
y_scaled = np.clip(np.round(y_scaled), 0, 3)
train_org_data = pd.DataFrame(np.hstack((X_scaled, y_scaled)),columns=columns_names)

# Concatenate train_data and syn_data along the rows (axis=0)
train = pd.concat([train_org_data, synthetic_df], axis=0, ignore_index=True)
# combined_data.to_csv('cmi_train_and_synthetic_data.csv', index=False)

print("Synthetic data saved to synthetic_data.csv")
print(train['sii'].value_counts())
print(train.shape)

end_time = time.time()
elapsed_time = end_time - start_time
print("Time elapsed:", elapsed_time, "seconds")

I0000 00:00:1729439527.947662      97 service.cc:145] XLA service 0x7ebf380079a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1729439527.947717      97 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1729439527.947722      97 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1729439530.775500      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  
Epoch 0, Discriminator Loss: nan, Generator Loss: nan
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🗃️Modeling</span>

In [4]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
#         clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead') # Nelder-Mead | # Powell
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission,model

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🛡️LGBMRegressor</span>

In [5]:
Params7 = {
           'learning_rate': 0.03884249148676395, 
           'max_depth': 12, 
           'num_leaves': 413, 
           'min_data_in_leaf': 14,
           'feature_fraction': 0.7987976913702801, 
           'bagging_fraction': 0.7602261703576205, 
           'bagging_freq': 2, 
           'lambda_l1': 4.735462555910575, 
           'lambda_l2': 4.735028557007343e-06
          } 

Light = lgb.LGBMRegressor(**Params7,random_state=SEED, verbose=-1,n_estimators=250, device='gpu')
Submission,model = TrainML(Light,test)

Training Folds:  20%|██        | 1/5 [00:13<00:53, 13.25s/it]

Fold 1 - Train QWK: 0.9900, Validation QWK: 0.9459


Training Folds:  40%|████      | 2/5 [00:20<00:29,  9.99s/it]

Fold 2 - Train QWK: 0.9901, Validation QWK: 0.9434


Training Folds:  60%|██████    | 3/5 [00:28<00:17,  8.92s/it]

Fold 3 - Train QWK: 0.9902, Validation QWK: 0.9458


Training Folds:  80%|████████  | 4/5 [00:36<00:08,  8.63s/it]

Fold 4 - Train QWK: 0.9893, Validation QWK: 0.9435


Training Folds: 100%|██████████| 5/5 [00:44<00:00,  8.91s/it]

Fold 5 - Train QWK: 0.9904, Validation QWK: 0.9405
Mean Train QWK --> 0.9900
Mean Validation QWK ---> 0.9438





----> || Optimized QWK SCORE :: [36m[1m 0.946[0m


# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">📁Submission</span>

In [6]:
Submission.to_csv('submission.csv', index=False)
print(Submission['sii'].value_counts())

sii
0    13
1     7
Name: count, dtype: int64
