# Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd #
from xgboost import XGBClassifier
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Load and Prepare Data

**CSV Files**

In [2]:
# Questionnaire (PCIAT) columns that are removed from the training features below.
TARGET_COLS = [
    "PCIAT-Season",
    "PCIAT-PCIAT_01", "PCIAT-PCIAT_02", "PCIAT-PCIAT_03", "PCIAT-PCIAT_04",
    "PCIAT-PCIAT_05", "PCIAT-PCIAT_06", "PCIAT-PCIAT_07", "PCIAT-PCIAT_08",
    "PCIAT-PCIAT_09", "PCIAT-PCIAT_10", "PCIAT-PCIAT_11", "PCIAT-PCIAT_12",
    "PCIAT-PCIAT_13", "PCIAT-PCIAT_14", "PCIAT-PCIAT_15", "PCIAT-PCIAT_16",
    "PCIAT-PCIAT_17", "PCIAT-PCIAT_18", "PCIAT-PCIAT_19", "PCIAT-PCIAT_20",
    "PCIAT-PCIAT_Total",
]

# Raw tabular data for both splits.
train_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

# Training features without the questionnaire columns.
train_data = train_df.drop(columns=TARGET_COLS)

# Test identifiers, kept aside for the submission file.
ids = test_df['id']



**Parquet Files**

In [3]:
def process_file(filename, dirname):
    """Summarise the parquet file stored under one sub-directory.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step``
    column and flattens the ``describe()`` table of the remaining columns
    into a one-dimensional array.

    Parameters
    ----------
    filename : str
        Name of the sub-directory; it is expected to contain an ``=``.
    dirname : str
        Directory that contains ``filename``.

    Returns
    -------
    tuple
        ``(stats, key)`` where ``stats`` is the flattened summary array and
        ``key`` is the second ``=``-separated field of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    readings = pd.read_parquet(parquet_path).drop(columns='step')
    summary = readings.describe().to_numpy().ravel()
    key = filename.split('=')[1]
    return summary, key

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry of ``dirname``.

    Every entry of ``dirname`` is passed to ``process_file`` on a thread
    pool, with a progress bar over the results.

    Parameters
    ----------
    dirname : str
        Directory whose entries are handed to ``process_file``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stat_0 .. Stat_{k-1}`` holding the flattened statistics,
        plus an ``id`` column with the key returned by ``process_file``.
        If ``dirname`` has no entries, an empty frame with only the ``id``
        column is returned.
    """
    # Sorted so the row order does not depend on the file system's listing
    # order. The local is not called `ids`, to avoid shadowing the
    # notebook-level `ids`.
    entries = sorted(os.listdir(dirname))

    if not entries:
        # With no results, `zip(*results)` below cannot be unpacked into two
        # names; return an empty frame that can still be merged on `id`.
        return pd.DataFrame({'id': pd.Series(dtype=object)})

    with ThreadPoolExecutor() as executor:
        results = list(
            tqdm(
                executor.map(lambda fname: process_file(fname, dirname), entries),
                total=len(entries),
            )
        )

    stats, indexes = zip(*results)

    # Column count is taken from the first result.
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes

    return df

# Summary statistics of the series files, one row per entry of each directory.
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# All columns of the summary frame; note that this list still contains "id".
time_series_cols = list(train_ts.columns)

# Left joins keep every tabular row; the join key is dropped once the
# statistics have been attached.
train = train_data.merge(train_ts, how="left", on='id').drop(columns='id')
test = test_df.merge(test_ts, how="left", on='id').drop(columns='id')

100%|██████████| 996/996 [01:33<00:00, 10.64it/s]
100%|██████████| 2/2 [00:00<00:00,  8.31it/s]


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(df, train_data=False, fitted=None):
    """Impute, label-encode and standardise a feature frame.

    Numeric columns are median-imputed, ``object`` columns are filled with
    their most frequent value and label-encoded, and the result is
    standardised with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; all work happens on a copy.
        When ``train_data`` is true it must contain the target column ``sii``.
    train_data : bool, default False
        If true, ``sii`` is split off and ``(X, y)`` is returned. Rows whose
        ``sii`` is missing are dropped, and the target is never imputed.
    fitted : dict or None, default None
        Optional dict used to share the fitted preprocessing state (column
        lists, medians, category codes, scaler) between calls. A training
        call, or any call given an empty dict, fits on ``df`` and stores the
        state in the dict. A non-training call given a populated dict reuses
        that state instead of re-fitting; categories that were not present
        at fit time are encoded as ``-1``. With ``None`` every call fits on
        its own frame.

    Returns
    -------
    numpy.ndarray or tuple
        The scaled feature matrix, or ``(X, y)`` with ``y`` a list of target
        values when ``train_data`` is true.
    """
    features = df
    y = None
    if train_data:
        # Imputing the target would invent labels, so unlabelled rows are
        # dropped instead of being median-filled with the other numerics.
        labelled = df['sii'].notna()
        y = list(df.loc[labelled, 'sii'])
        features = df.loc[labelled].drop(['sii'], axis=1)
    # Private copy: the caller's frame must stay untouched.
    features = features.copy()

    reuse = bool(fitted) and not train_data
    if reuse:
        state = fitted
    else:
        state = {
            'num_cols': list(features.select_dtypes(include=np.number).columns),
            'cat_cols': list(features.select_dtypes(include='object').columns),
            'categories': {},
        }
        state['medians'] = features[state['num_cols']].median()

    # Handle numerical columns
    num_cols = state['num_cols']
    if num_cols:
        features[num_cols] = features[num_cols].fillna(state['medians'])

    # Handle categorical columns
    for col in state['cat_cols']:
        if reuse:
            fill_value, mapping = state['categories'][col]
        else:
            modes = features[col].mode()
            # An all-missing column has no mode; its missing values are then
            # simply stringified below instead of raising on `mode()[0]`.
            fill_value = modes.iloc[0] if len(modes) else None

        values = features[col]
        if fill_value is not None:
            values = values.fillna(fill_value)
        values = values.astype(str)

        if not reuse:
            classes = LabelEncoder().fit(values).classes_
            mapping = {str(label): code for code, label in enumerate(classes)}
            state['categories'][col] = (fill_value, mapping)

        # Labels unknown to the mapping become -1 rather than raising.
        features[col] = values.map(mapping).fillna(-1).astype(int)

    if reuse:
        # Same columns, in the same order, as when the scaler was fitted.
        X = state['scaler'].transform(features[state['columns']])
    else:
        state['columns'] = list(features.columns)
        state['scaler'] = StandardScaler()
        X = state['scaler'].fit_transform(features)
        if fitted is not None:
            fitted.clear()
            fitted.update(state)

    if train_data:
        return X, y
    return X


# Fit the preprocessing on the training split only and apply that same
# fitted state to the test split, so both live in the same feature space.
preprocessing_state = {}
X, y = preprocess_data(train, train_data=True, fitted=preprocessing_state)
test_data = preprocess_data(test, fitted=preprocessing_state)

my_model = XGBClassifier()
# verbose=False suppresses progress output during fitting.
my_model.fit(X, y, verbose=False)

test_result = my_model.predict(test_data).astype(np.int32)
sample_submission = pd.DataFrame({'id': ids, 'sii': test_result})

sample_submission.to_csv('submission.csv', index=False)