## Child Mind Institute — Problematic Internet Use


For this I entered the 50,000 kaggle competition to determine unhealthy internet use.

In [21]:
##import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from fastai.tabular.all import *
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [45]:
## paths for my different data
INPUT = '/kaggle/input/child-mind-institute-problematic-internet-use'
train = pd.read_csv(INPUT + '/train.csv')
test = pd.read_csv(INPUT + '/test.csv')

In [49]:
# Preprocessing steps and identifying columns
categorical_cols = ['Basic_Demos-Enroll_Season', 'Physical-Season', 'CGAS-Season']  # Add relevant categorical columns
continuous_cols = ['Basic_Demos-Age', 'Physical-BMI', 'Physical-Weight']  # Add relevant continuous columns
dep_var = 'sii'

In [53]:
test = pd.read_csv(INPUT + '/test.csv')

# Ensure Age_Group is created based on Basic_Demos-Age
def age_group(age):
    if age <= 12:
        return 'Child'
    elif 13 <= age <= 17:
        return 'Adolescent'
    else:
        return 'Young Adult'

# Apply age group categorization
train['Age_Group'] = train['Basic_Demos-Age'].apply(age_group)
test['Age_Group'] = test['Basic_Demos-Age'].apply(age_group)

# Feature engineering for BMI categories
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Normal'
    elif 25.0 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obese'

# Apply BMI categorization
train['BMI_Category'] = train['Physical-BMI'].apply(categorize_bmi)
test['BMI_Category'] = test['Physical-BMI'].apply(categorize_bmi)

In [51]:
print(type(test))  # Should return <class 'pandas.core.frame.DataFrame'>


<class 'function'>


I was having issues with correctly aligning dataset between training and test, so I used some print functions to better understand what was going on.

In [54]:
# Step 1: Ensure missing values are handled before loading into FastAI
for col in train.columns:
    if train[col].dtype == 'object':  # Categorical columns
        train[col] = train[col].fillna(train[col].mode()[0])
    else:  # Numerical columns
        train[col] = train[col].fillna(train[col].mean())

# Repeat the same for the test set
for col in test.columns:
    if test[col].dtype == 'object':
        test[col] = test[col].fillna(test[col].mode()[0])
    else:
        test[col] = test[col].fillna(test[col].mean())

# Step 2: Pass the preprocessed data to FastAI, excluding FillMissing
from fastai.tabular.all import *

# Define path for saving/loading (if required)
path = Path('.')

# Continuous and categorical columns
categorical_cols = [
    'Basic_Demos-Enroll_Season', 'Physical-Season', 'Age_Group', 'BMI_Category',
    'CGAS-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
    'PAQ_A-Season', 'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season', 'PreInt_EduHx-Season']
    
# Continuous columns should include only numeric values
continuous_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()
continuous_cols = list(set(train.columns) - set(categorical_cols) - {'id', 'sii'})  # Remove 'id' and 'sii'

#Ensure no categorical columns are in the continuous columns list
print("Continuous Columns: ", continuous_cols)
print("Categorical Columns: ", categorical_cols)

# Define dependent variable
dep_var = 'sii'


Continuous Columns:  ['BIA-BIA_DEE', 'Fitness_Endurance-Time_Sec', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_02', 'SDS-SDS_Total_Raw', 'FGC-FGC_TL_Zone', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_08', 'BIA-BIA_FMI', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_Total', 'FGC-FGC_SRR_Zone', 'Physical-Weight', 'PAQ_A-PAQ_A_Total', 'FGC-FGC_SRL', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_15', 'BIA-BIA_FFM', 'Basic_Demos-Sex', 'FGC-FGC_CU_Zone', 'PAQ_C-PAQ_C_Total', 'BIA-BIA_Fat', 'Physical-Systolic_BP', 'Physical-Height', 'Physical-Waist_Circumference', 'Fitness_Endurance-Max_Stage', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_18', 'FGC-FGC_PU', 'FGC-FGC_GSD_Zone', 'BIA-BIA_BMR', 'BIA-BIA_SMM', 'PCIAT-PCIAT_06', 'PreInt_EduHx-computerinternet_hoursday', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_05', 'Fitness_Endurance-Time_Mins', 'CGAS-CGAS_Score', 'FGC-FGC_GSND', 'BIA-BIA_LST', 'BIA-BIA_ICW', 'Physical-HeartRate', 'BIA-BIA_ECW', 'SDS-SDS_Total_T', 'PCIAT-PCIAT_09', 'FGC-FGC_SRL_Zone', 'BIA-BIA_LDM', 'FGC-F

In [56]:

# Create FastAI dataloaders, apply Normalize only to continuous variables
dls = TabularDataLoaders.from_df(
    df=train, 
    path='.', 
    procs=[Categorify, Normalize],  # Normalize is applied only to continuous features
    cat_names=categorical_cols, 
    cont_names=continuous_cols, 
    y_names=dep_var, 
    valid_pct=0.2, 
    seed=42
)


In [57]:
# Train model
learn = tabular_learner(dls, metrics=accuracy)

# Train the model
learn.fit_one_cycle(2)


epoch,train_loss,valid_loss,accuracy,time
0,0.495069,7.771564,0.407828,00:00
1,0.302016,1.717238,0.407828,00:00


In [62]:
# Step 1: Print shapes of train and test sets before alignment
print(f"Training set shape: {train.shape}")
print(f"Test set shape: {test.shape}")

# Step 2: Check if there are missing columns in the test set
missing_cols = set(train.columns) - set(test.columns)
print(f"Missing columns in test set: {missing_cols}")

# Step 3: Add missing columns to the test set (excluding 'sii')
for col in missing_cols:
    if col != 'sii':  # Skip 'sii' because it's the target variable
        if train[col].dtype == 'object':  # If the column is categorical
            test[col] = train[col].mode()[0]  # Fill with the most frequent category
        else:  # If the column is numerical
            test[col] = 0  # Fill numerical columns with 0

# Step 4: Print test set columns after adding missing columns
print(f"Test set columns after adding missing: {test.columns}")

# Step 5: Reorder the test set columns to match the training set (drop 'sii' from train set)
test = test.reindex(columns=train.columns.drop('sii'), fill_value=0)

# Step 6: Print column alignment
print(f"Train set columns: {train.columns}")
print(f"Test set columns: {test.columns}")

# Step 7: Print shapes after reordering
print(f"Training set shape after alignment: {train.shape}")
print(f"Test set shape after alignment: {test.shape}")


Training set shape: (3960, 84)
Test set shape: (20, 83)
Missing columns in test set: {'sii'}
Test set columns after adding missing: Index(['id', 'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
       'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
       'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
       'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
       'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BI

In [77]:
def get_predictions(learn, test_df, dls):
    """
    Function to predict target values for test data using a trained FastAI model.
    
    Parameters:
    - learn: Trained FastAI learner object
    - test_df: Preprocessed test DataFrame
    - dls: DataLoaders object used to process the data
    
    Returns:
    - preds: The predicted values for the test data
    """
    
    # Ensure the test data is aligned with the training data columns
    test_dl = dls.test_dl(test_df)
    
    # Get predictions
    preds, _ = learn.get_preds(dl=test_dl)
    
    # Convert predictions to actual values (if it's a classification, take argmax)
    if learn.dls.c == 1:  # For regression problems
        final_preds = preds.numpy()
    else:  # For classification problems
        final_preds = preds.argmax(dim=1).numpy()
    
    return final_preds

# Call the function with the required inputs
predictions = get_predictions(learn, test, dls)

# Print the predictions
print(predictions)

[[-0.33473745]
 [-0.5295082 ]
 [ 0.02046593]
 [-0.00364726]
 [-0.14308275]
 [-0.36770022]
 [-0.32066864]
 [-0.38246098]
 [-0.18887849]
 [-0.14243495]
 [-0.14141166]
 [-0.26225752]
 [ 0.01007659]
 [ 0.06750283]
 [-0.18137968]
 [ 0.05114394]
 [-0.42389846]
 [-0.2118071 ]
 [-0.15892577]
 [-0.16369575]]


In [80]:
def create_submission_file(learn, test_df, dls, submission_filename='/kaggle/working/submission.csv'):
    """
    Function to predict the 'sii' values for the test set and create a submission file.
    
    Parameters:
    - learn: Trained FastAI learner object
    - test_df: Preprocessed test DataFrame
    - dls: DataLoaders object used to process the data
    - submission_filename: Path to the CSV file for submission (default: '/kaggle/working/submission.csv')
    """
    
    # Ensure the test data is aligned with the training data columns
    test_dl = dls.test_dl(test_df)
    
    # Get predictions
    preds, _ = learn.get_preds(dl=test_dl)
    
    # Get the predicted labels by taking the argmax of the predictions
    predicted_sii = preds.argmax(dim=1).numpy()
    
    # Create a DataFrame for submission
    submission_df = pd.DataFrame({
        'id': test_df['id'],  # Use the 'id' column from the test set
        'sii': predicted_sii  # Add the predicted 'sii' values
    })
    
    # Save to CSV file in /kaggle/working directory
    submission_df.to_csv(submission_filename, index=False)
    print(f"Submission file created: {submission_filename}")

# Call the function to create the submission file in /kaggle/working
create_submission_file(learn, test, dls)

Submission file created: /kaggle/working/submission.csv
