## Load Data

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

%cd ../child-mind-institute-problematic-internet-use/

/Users/sujenkancherla/Documents/child-mind-institute-problematic-internet-use


In [2]:
data = pd.read_csv("train.csv")
phys_df = pd.read_csv('../Project_225A/phy_other_overall.csv')

# Discard no label data
data = data.dropna(subset=["sii"])

print(f'Phyisological Data Size: {len(phys_df)}\nOriginal Train data size: {len(data)}')

Phyisological Data Size: 972
Original Train data size: 2736


## Feature Selection

In [3]:
interested_item = ["Basic_Demos-Age", "Basic_Demos-Sex",
                   "Physical-BMI", "Physical-Height", "Physical-Weight",
                   "Fitness_Endurance-Max_Stage", "Fitness_Endurance-Time_Mins", "Fitness_Endurance-Time_Sec",
                   "FGC-FGC_CU", "FGC-FGC_GSND", "FGC-FGC_GSD", "FGC-FGC_PU", "FGC-FGC_SRL", "FGC-FGC_SRR", "FGC-FGC_TL",
                   "BIA-BIA_BMC", "BIA-BIA_BMI", "BIA-BIA_BMR", "BIA-BIA_DEE", "BIA-BIA_Fat", "BIA-BIA_SMM", "BIA-BIA_TBW",
                   "SDS-SDS_Total_Raw", "SDS-SDS_Total_T", 
                   "PreInt_EduHx-computerinternet_hoursday", "sii"]

corr_matrix = data[interested_item].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)


sii                                       1.000000
Physical-Height                           0.373530
Basic_Demos-Age                           0.365990
PreInt_EduHx-computerinternet_hoursday    0.336526
Physical-Weight                           0.325938
FGC-FGC_CU                                0.247364
SDS-SDS_Total_T                           0.232982
SDS-SDS_Total_Raw                         0.229692
BIA-BIA_BMI                               0.227818
Physical-BMI                              0.221125
FGC-FGC_PU                                0.177125
FGC-FGC_GSD                               0.163448
FGC-FGC_GSND                              0.149495
FGC-FGC_TL                                0.107623
BIA-BIA_SMM                               0.046692
BIA-BIA_DEE                               0.045239
BIA-BIA_TBW                               0.037513
BIA-BIA_Fat                               0.037357
BIA-BIA_BMR                               0.031965
Fitness_Endurance-Time_Sec     

In [4]:
phys_df = phys_df.loc[:, ~phys_df.columns.str.startswith('acc')]

In [5]:
interested_item = ["Basic_Demos-Age", "Physical-Height", "Physical-Weight", "FGC-FGC_CU", 
                   "BIA-BIA_BMR", "BIA-BIA_DEE", "BIA-BIA_Fat", "BIA-BIA_SMM", "BIA-BIA_TBW", 
                   "PreInt_EduHx-computerinternet_hoursday", "sii"]
train_selected_features = data[['id'] + interested_item].dropna()


## Pre process

### Body Fat Values
The boday fat values should have a range of 0 to 50 which is valid for an individual body fat. The dataset has many weird values with some being negative even. To address this we replace the out of range values with the median imputed.

In [6]:
valid_min = 0
valid_max = 50

# Step 2: Replace outliers with NaN
train_selected_features['BIA-BIA_Fat'] = train_selected_features['BIA-BIA_Fat'].apply(lambda x: x if valid_min <= x <= valid_max else np.nan)

# Step 3: Impute missing values
# Option 1: Replace with median
median_fat = train_selected_features['BIA-BIA_Fat'].median()
train_selected_features.fillna({'BIA-BIA_Fat': median_fat}, inplace=True)

### Merge Physiological with train data

In [7]:
merged = pd.merge(train_selected_features, phys_df, on=['id', 'sii'], how='right')

merged = merged.dropna()
display(merged)
print(f'Merged Data length: {len(merged)}')

Unnamed: 0,id,Basic_Demos-Age,Physical-Height,Physical-Weight,FGC-FGC_CU,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_Fat,BIA-BIA_SMM,BIA-BIA_TBW,PreInt_EduHx-computerinternet_hoursday,sii,non_wear_ratio,enmo_avg,enmo_wear,enmo_night,enmo_day,enmo_high
0,00115b9f,9.0,56.00,81.6,18.0,1131.43,1923.44,18.82430,26.4798,45.9966,0.0,1.0,0.000000,0.047388,0.047388,0.026886,0.047848,0.000231
1,001f3379,13.0,59.50,112.2,12.0,1330.97,1996.45,14.47300,35.3804,63.1265,0.0,1.0,0.655708,0.011926,0.016461,0.005247,0.025072,0.000008
2,00f332d1,14.0,66.50,108.0,16.0,1414.24,2969.90,15.10200,42.1074,68.6822,2.0,1.0,0.171246,0.030255,0.036441,0.005242,0.065173,0.000317
3,01085eb3,12.0,60.50,178.0,8.0,1551.20,2016.56,14.47300,44.5863,80.7024,0.0,0.0,0.035210,0.032946,0.033677,0.005423,0.052328,0.000059
7,02cebf33,12.0,62.00,108.0,30.0,1300.75,2081.19,27.19050,38.5561,60.5743,1.0,1.0,0.711983,0.014279,0.034756,0.006837,0.049584,0.000039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,fdc11b96,13.0,66.00,185.0,18.0,1662.35,2659.76,14.47300,54.8182,88.2803,2.0,0.0,0.126315,0.028964,0.033266,0.012621,0.043773,0.000293
966,fdf4691f,9.0,58.00,94.6,10.0,1184.61,1895.37,26.16060,29.2277,49.2268,2.0,0.0,0.175629,0.045411,0.054394,0.006048,0.086978,0.001387
967,fe9c71d8,9.0,56.75,72.8,8.0,1116.86,2345.40,11.57690,25.8365,44.6590,0.0,0.0,0.137257,0.043686,0.045263,0.010077,0.072284,0.000381
968,fecc07d6,7.0,49.00,54.6,4.0,1002.34,1603.74,7.37455,21.0046,39.3149,0.0,0.0,0.939101,0.002962,0.015427,0.001789,0.050686,0.000000


Merged Data length: 572


## Run Models

### Only Training Data - Logisitic Regression

In [8]:
X = train_selected_features.drop(columns=['id', 'sii'])  
y = train_selected_features['sii']  

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train the Logistic Regression model
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=200, random_state=42))
model.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 60.40%


### Merged Data - Logistic Regression

In [10]:
X = merged.drop(columns=['id', 'sii'])  
X = X.drop(columns=["non_wear_ratio"])
y = merged['sii']  

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train the Logistic Regression model
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=200, random_state=42, class_weight={0: 1, 1: 1, 2: 2, 3: 4}))
model.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 59.13%


### Only Physiological Data

In [11]:
X = phys_df.drop(columns=['id', 'sii'])  
y = phys_df['sii']  

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train the Logistic Regression model
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=200, random_state=42))
model.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 55.38%


## More Feature Selection

In [12]:
feat_lst = list(data.columns)
feat_lst.remove('id')
rm_lst = []
for feat in feat_lst:
    if feat.endswith("Season"):
        rm_lst.append(feat)
    elif feat.startswith("PCIAT"):
        rm_lst.append(feat)
for feat in rm_lst:
    feat_lst.remove(feat)
        
corr_matrix = data[feat_lst].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)


sii                                       1.000000
Physical-Height                           0.373530
Basic_Demos-Age                           0.365990
PreInt_EduHx-computerinternet_hoursday    0.336526
Physical-Weight                           0.325938
Physical-Waist_Circumference              0.272885
FGC-FGC_CU                                0.247364
SDS-SDS_Total_T                           0.232982
SDS-SDS_Total_Raw                         0.229692
BIA-BIA_BMI                               0.227818
Physical-BMI                              0.221125
FGC-FGC_PU                                0.177125
BIA-BIA_Frame_num                         0.165936
FGC-FGC_GSD                               0.163448
FGC-FGC_GSND                              0.149495
Physical-Systolic_BP                      0.135336
FGC-FGC_TL                                0.107623
BIA-BIA_FFMI                              0.102702
BIA-BIA_FMI                               0.077054
BIA-BIA_LST                    

In [13]:
feat_lst = list(phys_df.columns)
feat_lst.remove('id')
rm_lst = []
for feat in feat_lst:
    if feat.endswith("Season"):
        rm_lst.append(feat)
    elif feat.startswith("PCIAT"):
        rm_lst.append(feat)
for feat in rm_lst:
    feat_lst.remove(feat)
        
corr_matrix = phys_df[feat_lst].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)

sii               1.000000
non_wear_ratio    0.121539
enmo_night       -0.035987
enmo_high        -0.108048
enmo_avg         -0.210345
enmo_wear        -0.236801
enmo_day         -0.262213
Name: sii, dtype: float64


In [14]:
feat_lst = list(merged.columns)
feat_lst.remove('id')
rm_lst = []
for feat in feat_lst:
    if feat.endswith("Season"):
        rm_lst.append(feat)
    elif feat.startswith("PCIAT"):
        rm_lst.append(feat)
for feat in rm_lst:
    feat_lst.remove(feat)
        
corr_matrix = merged[feat_lst].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)

sii                                       1.000000
Basic_Demos-Age                           0.433762
Physical-Height                           0.415499
PreInt_EduHx-computerinternet_hoursday    0.400344
Physical-Weight                           0.344015
BIA-BIA_BMR                               0.309847
BIA-BIA_DEE                               0.302919
FGC-FGC_CU                                0.298773
BIA-BIA_TBW                               0.297511
BIA-BIA_SMM                               0.271701
BIA-BIA_Fat                               0.202018
non_wear_ratio                            0.144768
enmo_night                               -0.033235
enmo_high                                -0.070758
enmo_avg                                 -0.206280
enmo_wear                                -0.237759
enmo_day                                 -0.273086
Name: sii, dtype: float64


### Merged Data with feature selection - Logistic Regression

In [16]:
X = merged.drop(columns=['id', 'sii'])  
X = X.drop(columns=["enmo_night", "BIA-BIA_Fat", "enmo_high", "non_wear_ratio"])
y = merged['sii']  

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Step 3: Initialize and train the Logistic Regression model
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=300, random_state=42))
model.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 64.35%


### Merged Data feature selection - Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier, export_text
X = merged.drop(columns=['id', 'sii'])  
#X = X.drop(columns=["enmo_night", "acc_cnt_night", "BIA-BIA_Fat", "enmo_high", "acc_cnt_night", "non_wear_ratio"])
y = merged['sii']  

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

classW = {0: 1, 1: 1, 2: 2, 3: 4}
# Step 3: Initialize and train the Logistic Regression model
t = DecisionTreeClassifier(max_depth=3)
t = DecisionTreeClassifier(max_depth=3, class_weight=classW)
model = make_pipeline(StandardScaler(), t)
model.fit(X_train, y_train)
print(export_text(t, feature_names=X.columns)),

# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


|--- Physical-Height <= -0.43
|   |--- enmo_high <= 2.59
|   |   |--- BIA-BIA_SMM <= -0.34
|   |   |   |--- class: 0.0
|   |   |--- BIA-BIA_SMM >  -0.34
|   |   |   |--- class: 0.0
|   |--- enmo_high >  2.59
|   |   |--- enmo_high <= 3.83
|   |   |   |--- class: 1.0
|   |   |--- enmo_high >  3.83
|   |   |   |--- class: 0.0
|--- Physical-Height >  -0.43
|   |--- PreInt_EduHx-computerinternet_hoursday <= -0.42
|   |   |--- enmo_wear <= -0.84
|   |   |   |--- class: 2.0
|   |   |--- enmo_wear >  -0.84
|   |   |   |--- class: 0.0
|   |--- PreInt_EduHx-computerinternet_hoursday >  -0.42
|   |   |--- enmo_day <= -0.86
|   |   |   |--- class: 2.0
|   |   |--- enmo_day >  -0.86
|   |   |   |--- class: 1.0

Accuracy: 61.74%


### Merged feature selected - SVM

In [None]:
from sklearn import svm
X = merged.drop(columns=['id', 'sii'])  
X = X.drop(columns=["enmo_night", "BIA-BIA_Fat", "enmo_high", "non_wear_ratio"])
y = merged['sii']  

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train the Logistic Regression model
classW = {0: 1, 1: 1, 2: 2, 3: 4}
clf = svm.SVC()
# clf = DecisionTreeClassifier(max_depth=3, )
model = make_pipeline(StandardScaler(), clf)
model.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")



KeyError: "['acc_cnt_night', 'acc_cnt_night'] not found in axis"