<a href="https://www.kaggle.com/code/taimour/xgb-gridsearchcv-problematic-internet-usage?scriptVersionId=202328045" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# <div style="text-align:center"><span style="background-color:#15a15b;padding:15px;border-radius:40px;">Child Mind Institute - Problametic Internet Usage</span></div>

![](https://i.postimg.cc/4dm5Wrvn/pexels-ron-lach-9783813.jpg)

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🎒Import Libraries</span>

In [1]:
import numpy as np # linear algebra
import pandas as pd #
import xgboost as xgb
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import GridSearchCV

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">📚Load and Prepare Data</span>

**CSV Files**

In [2]:
train_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

TARGET_COLS = [
    "PCIAT-Season",
    "PCIAT-PCIAT_01",
    "PCIAT-PCIAT_02",
    "PCIAT-PCIAT_03",
    "PCIAT-PCIAT_04",
    "PCIAT-PCIAT_05",
    "PCIAT-PCIAT_06",
    "PCIAT-PCIAT_07",
    "PCIAT-PCIAT_08",
    "PCIAT-PCIAT_09",
    "PCIAT-PCIAT_10",
    "PCIAT-PCIAT_11",
    "PCIAT-PCIAT_12",
    "PCIAT-PCIAT_13",
    "PCIAT-PCIAT_14",
    "PCIAT-PCIAT_15",
    "PCIAT-PCIAT_16",    
    "PCIAT-PCIAT_17",
    "PCIAT-PCIAT_18",
    "PCIAT-PCIAT_19",
    "PCIAT-PCIAT_20",
    "PCIAT-PCIAT_Total"
]

train_data = train_data.drop(TARGET_COLS,axis=1)

test_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
ids = test_df['id']

**Parquet Files**

In [3]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    
    return df

train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")
time_series_cols = train_ts.columns.tolist()

train = pd.merge(train_data, train_ts, how="left", on='id')
test = pd.merge(test_df, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
train = train.dropna(subset=['sii'])

100%|██████████| 996/996 [01:42<00:00,  9.75it/s]
100%|██████████| 2/2 [00:00<00:00,  5.30it/s]


In [4]:
train.shape

(2736, 155)

In [5]:
test.shape

(20, 154)

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">✨Preprocessing</span>

In [6]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(df,train_data=False):
    # Handle numerical columns
    scaler = StandardScaler()
    num_cols = df.select_dtypes(include=np.number).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    
    # Handle categorical columns
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        df[col] = df[col].fillna(df[col].mode()[0])  # Fill missing with the most frequent value
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    
    if train_data:
        y = list(df['sii'])
        X = scaler.fit_transform(df.drop(['sii'],axis=1))
        return X,y
    scaled_df = scaler.fit_transform(df)
    
    
    return scaled_df

X,y = preprocess_data(train,train_data=True)
test_data = preprocess_data(test)

# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🔎GridSearchCV</span>

In [7]:
# Define parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.1, 0.05, 0.01],
#     'max_depth': [3, 5, 7],
#     'subsample': [0.8, 0.9, 1.0],
# }

# # Create XGBoost classifier
# xgb = XGBClassifier()

# # Create grid search object
# grid_search = GridSearchCV(xgb, param_grid, cv=5)

# # Fit grid search to your data
# grid_search.fit(X, y)

# # Best parameters
# best_params = grid_search.best_params_

best_params = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
print(best_params)

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


# <span style="background-color:#b27eed;padding:15px;border-radius:40px;">🛡️XGBClassifier</span>

In [8]:
xgb_best = xgb.XGBClassifier(**best_params)
xgb_best.fit(X, y, verbose=False)

test_result = xgb_best.predict(test_data).astype(np.int32)
csv_submission = pd.DataFrame(columns=['id','sii'])
csv_submission['id'] = ids
csv_submission['sii'] = test_result
csv_submission.to_csv('submission.csv',index=False)

csv_submission.head(20)

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,0
5,001f3379,0
6,0038ba98,0
7,0068a485,0
8,0069fbed,0
9,0083e397,0
