## Load features

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

%cd ../child-mind-institute-problematic-internet-use/

/Users/sujenkancherla/Documents/child-mind-institute-problematic-internet-use


In [2]:
# Main table: rows without an `sii` label cannot be used for supervised
# training or for the correlations below, so they are dropped on load.
data = pd.read_csv("train.csv").dropna(subset=["sii"])

# Second feature table, stored in the sibling project folder
# (presumably activity summaries, judging by its column names -- TODO confirm).
phys_df = pd.read_csv('../Project_225A/phy_other_overall.csv')

# Disabled range filter on the body-fat column, kept for reference:
# data = data[data.loc[:, "BIA-BIA_Fat"].between(0, 200, inclusive="both")]

In [3]:
# Number of rows in the second feature table.
phys_df.shape[0]

972

In [4]:
# Hand-picked numeric columns, grouped by column-name prefix, with the `sii`
# target as the last entry.
interested_item = [
    # Basic_Demos
    "Basic_Demos-Age", "Basic_Demos-Sex",
    # Physical
    "Physical-BMI", "Physical-Height", "Physical-Weight",
    # Fitness_Endurance
    "Fitness_Endurance-Max_Stage", "Fitness_Endurance-Time_Mins", "Fitness_Endurance-Time_Sec",
    # FGC
    "FGC-FGC_CU", "FGC-FGC_GSND", "FGC-FGC_GSD", "FGC-FGC_PU",
    "FGC-FGC_SRL", "FGC-FGC_SRR", "FGC-FGC_TL",
    # BIA
    "BIA-BIA_BMC", "BIA-BIA_BMI", "BIA-BIA_BMR", "BIA-BIA_DEE",
    "BIA-BIA_Fat", "BIA-BIA_SMM", "BIA-BIA_TBW",
    # SDS
    "SDS-SDS_Total_Raw", "SDS-SDS_Total_T",
    # PreInt_EduHx and the target
    "PreInt_EduHx-computerinternet_hoursday", "sii",
]

# Pairwise Pearson correlations between the selected columns; only the `sii`
# column is ranked, strongest positive correlation first.
corr_matrix = data.loc[:, interested_item].corr(method='pearson')
sii_corr = corr_matrix.loc[:, 'sii'].sort_values(ascending=False)
print(sii_corr)


sii                                       1.000000
Physical-Height                           0.373530
Basic_Demos-Age                           0.365990
PreInt_EduHx-computerinternet_hoursday    0.336526
Physical-Weight                           0.325938
FGC-FGC_CU                                0.247364
SDS-SDS_Total_T                           0.232982
SDS-SDS_Total_Raw                         0.229692
BIA-BIA_BMI                               0.227818
Physical-BMI                              0.221125
FGC-FGC_PU                                0.177125
FGC-FGC_GSD                               0.163448
FGC-FGC_GSND                              0.149495
FGC-FGC_TL                                0.107623
BIA-BIA_SMM                               0.046692
BIA-BIA_DEE                               0.045239
BIA-BIA_TBW                               0.037513
BIA-BIA_Fat                               0.037357
BIA-BIA_BMR                               0.031965
Fitness_Endurance-Time_Sec     

In [5]:
# Reduced feature list used for modelling; the `sii` target stays last.
interested_item = [
    "Basic_Demos-Age",
    "Physical-Height",
    "Physical-Weight",
    "FGC-FGC_CU",
    "BIA-BIA_BMR",
    "BIA-BIA_DEE",
    "BIA-BIA_Fat",
    "BIA-BIA_SMM",
    "BIA-BIA_TBW",
    "PreInt_EduHx-computerinternet_hoursday",
    "sii",
]

# Keep the `id` column next to the features and retain only complete rows.
train_selected_features = data.loc[:, ["id", *interested_item]].dropna()


In [6]:
# Pre-process BIA-BIA_Fat: readings outside [valid_min, valid_max] are treated
# as invalid and replaced by the median of the remaining readings.
valid_min = 0
valid_max = 50

# Vectorised range check (both bounds inclusive). NaN never satisfies
# `between`, so it ends up masked exactly like an out-of-range value.
fat = train_selected_features['BIA-BIA_Fat']
fat = fat.where(fat.between(valid_min, valid_max))

# The median is taken over the in-range values of every row in this table,
# i.e. it is computed before any train/test split is made further down.
median_fat = fat.median()

# Rebind the frame instead of filling in place, so re-running this cell (or
# holding a reference to the earlier frame) cannot produce surprising state.
train_selected_features = train_selected_features.assign(
    **{'BIA-BIA_Fat': fat.fillna(median_fat)}
)

In [7]:
# Right join: start from every row of phys_df and attach the selected train
# features where both `id` and `sii` match. Rows that are incomplete after
# the join are discarded.
merged = (
    train_selected_features
    .merge(phys_df, how='right', on=['id', 'sii'])
    .dropna()
)
display(merged)

# Row counts of each table, with and without NaN rows.
row_counts = [
    len(frame)
    for frame in (
        merged,
        merged.dropna(),
        train_selected_features,
        train_selected_features.dropna(),
    )
]
print(*row_counts)

Unnamed: 0,id,Basic_Demos-Age,Physical-Height,Physical-Weight,FGC-FGC_CU,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_Fat,BIA-BIA_SMM,BIA-BIA_TBW,...,non_wear_ratio,acc_cnt,acc_cnt_wear,acc_cnt_night,acc_cnt_day,enmo_avg,enmo_wear,enmo_night,enmo_day,enmo_high
0,00115b9f,9.0,56.00,81.6,18.0,1131.43,1923.44,18.82430,26.4798,45.9966,...,0.000000,0.059387,0.059387,0.026096,0.057444,0.047388,0.047388,0.026886,0.047848,0.000231
1,001f3379,13.0,59.50,112.2,12.0,1330.97,1996.45,14.47300,35.3804,63.1265,...,0.655708,0.014682,0.023693,0.010124,0.034350,0.011926,0.016461,0.005247,0.025072,0.000008
2,00f332d1,14.0,66.50,108.0,16.0,1414.24,2969.90,15.10200,42.1074,68.6822,...,0.171246,0.030114,0.034049,0.014085,0.049925,0.030255,0.036441,0.005242,0.065173,0.000317
3,01085eb3,12.0,60.50,178.0,8.0,1551.20,2016.56,14.47300,44.5863,80.7024,...,0.035210,0.036489,0.037325,0.010500,0.053217,0.032946,0.033677,0.005423,0.052328,0.000059
7,02cebf33,12.0,62.00,108.0,30.0,1300.75,2081.19,27.19050,38.5561,60.5743,...,0.711983,0.017473,0.039652,0.013034,0.061417,0.014279,0.034756,0.006837,0.049584,0.000039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,fdc11b96,13.0,66.00,185.0,18.0,1662.35,2659.76,14.47300,54.8182,88.2803,...,0.126315,0.037370,0.039907,0.022745,0.050553,0.028964,0.033266,0.012621,0.043773,0.000293
966,fdf4691f,9.0,58.00,94.6,10.0,1184.61,1895.37,26.16060,29.2277,49.2268,...,0.175629,0.047540,0.053658,0.015692,0.079492,0.045411,0.054394,0.006048,0.086978,0.001387
967,fe9c71d8,9.0,56.75,72.8,8.0,1116.86,2345.40,11.57690,25.8365,44.6590,...,0.137257,0.051195,0.053595,0.022587,0.075386,0.043686,0.045263,0.010077,0.072284,0.000381
968,fecc07d6,7.0,49.00,54.6,4.0,1002.34,1603.74,7.37455,21.0046,39.3149,...,0.939101,0.002205,0.012529,0.001829,0.048425,0.002962,0.015427,0.001789,0.050686,0.000000


572 572 1512 1512


In [8]:
# Target and feature matrix: every merged column except the identifier, the
# target itself and two columns excluded for this experiment.
y = merged['sii']
X = merged.drop(columns=['id', 'sii', "non_wear_ratio", "acc_cnt"])

# Hold out 20 % of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardise the features, then fit a logistic regression whose class
# weights count classes 2 and 3 two and four times as much as classes 0 and 1.
log_reg = LogisticRegression(
    class_weight={0: 1, 1: 1, 2: 2, 3: 4},
    max_iter=200,
    random_state=42,
)
model = make_pipeline(StandardScaler(), log_reg)
model.fit(X_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 55.65%


In [9]:
# Baseline on the train-only feature table (no phys_df columns involved).
y = train_selected_features['sii']
X = train_selected_features.drop(columns=['id', 'sii'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardise, then fit an unweighted logistic regression.
log_reg = LogisticRegression(max_iter=200, random_state=42)
model = make_pipeline(StandardScaler(), log_reg)
model.fit(X_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 60.40%


In [10]:
# Baseline that uses the phys_df columns on their own.
y = phys_df['sii']
X = phys_df.drop(columns=['id', 'sii'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardise, then fit an unweighted logistic regression.
log_reg = LogisticRegression(max_iter=200, random_state=42)
model = make_pipeline(StandardScaler(), log_reg)
model.fit(X_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 54.87%


In [11]:
# Candidate columns: everything in `data` except the identifier.
feat_lst = data.columns.tolist()
feat_lst.remove('id')

# Columns whose name ends in "Season" or starts with "PCIAT" are left out of
# the correlation.
rm_lst = [
    feat for feat in feat_lst
    if feat.endswith("Season") or feat.startswith("PCIAT")
]
feat_lst = [feat for feat in feat_lst if feat not in rm_lst]

# Rank the remaining columns by Pearson correlation with `sii`.
corr_matrix = data[feat_lst].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)


sii                                       1.000000
Physical-Height                           0.373530
Basic_Demos-Age                           0.365990
PreInt_EduHx-computerinternet_hoursday    0.336526
Physical-Weight                           0.325938
Physical-Waist_Circumference              0.272885
FGC-FGC_CU                                0.247364
SDS-SDS_Total_T                           0.232982
SDS-SDS_Total_Raw                         0.229692
BIA-BIA_BMI                               0.227818
Physical-BMI                              0.221125
FGC-FGC_PU                                0.177125
BIA-BIA_Frame_num                         0.165936
FGC-FGC_GSD                               0.163448
FGC-FGC_GSND                              0.149495
Physical-Systolic_BP                      0.135336
FGC-FGC_TL                                0.107623
BIA-BIA_FFMI                              0.102702
BIA-BIA_FMI                               0.077054
BIA-BIA_LST                    

In [12]:
# Candidate columns: everything in `phys_df` except the identifier.
feat_lst = phys_df.columns.tolist()
feat_lst.remove('id')

# Same name-based exclusion as for the train table ("...Season" / "PCIAT...").
rm_lst = [
    feat for feat in feat_lst
    if feat.endswith("Season") or feat.startswith("PCIAT")
]
feat_lst = [feat for feat in feat_lst if feat not in rm_lst]

# Rank the remaining columns by Pearson correlation with `sii`.
corr_matrix = phys_df[feat_lst].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)

sii               1.000000
non_wear_ratio    0.121539
enmo_night       -0.035987
acc_cnt_night    -0.086318
enmo_high        -0.108048
enmo_avg         -0.210345
acc_cnt          -0.235147
enmo_wear        -0.236801
enmo_day         -0.262213
acc_cnt_wear     -0.265003
acc_cnt_day      -0.302568
Name: sii, dtype: float64


In [13]:
# Candidate columns: everything in `merged` except the identifier.
feat_lst = merged.columns.tolist()
feat_lst.remove('id')

# Same name-based exclusion as for the other tables ("...Season" / "PCIAT...").
rm_lst = [
    feat for feat in feat_lst
    if feat.endswith("Season") or feat.startswith("PCIAT")
]
feat_lst = [feat for feat in feat_lst if feat not in rm_lst]

# Rank the remaining columns by Pearson correlation with `sii`.
corr_matrix = merged[feat_lst].corr(method='pearson')
sii_corr = corr_matrix['sii'].sort_values(ascending=False)
print(sii_corr)

sii                                       1.000000
Basic_Demos-Age                           0.433762
Physical-Height                           0.415499
PreInt_EduHx-computerinternet_hoursday    0.400344
Physical-Weight                           0.344015
BIA-BIA_BMR                               0.309847
BIA-BIA_DEE                               0.302919
FGC-FGC_CU                                0.298773
BIA-BIA_TBW                               0.297511
BIA-BIA_SMM                               0.271701
BIA-BIA_Fat                               0.202018
non_wear_ratio                            0.144768
enmo_night                               -0.033235
enmo_high                                -0.070758
acc_cnt_night                            -0.101844
enmo_avg                                 -0.206280
enmo_wear                                -0.237759
acc_cnt                                  -0.248384
enmo_day                                 -0.273086
acc_cnt_wear                   

In [14]:
# Feature matrix: merged table minus the identifier, the target and a set of
# columns excluded for this run (presumably chosen for their weak correlation
# with `sii` -- TODO confirm). Each column is listed exactly once.
X = merged.drop(columns=[
    'id', 'sii',
    "enmo_night", "acc_cnt_night", "BIA-BIA_Fat", "enmo_high", "non_wear_ratio",
])
y = merged['sii']

# 80/20 split. The seed is fixed (same value as the other modelling cells) so
# the reported accuracy is reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features, then fit an unweighted logistic regression.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=300, random_state=42))
model.fit(X_train, y_train)

# Predict on the held-out rows and report accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 60.00%


In [15]:
from sklearn.tree import DecisionTreeClassifier, export_text

# All merged columns except the identifier and the target are used as features.
X = merged.drop(columns=['id', 'sii'])
y = merged['sii']

# 80/20 split with a fixed seed so the tree and its accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classes 2 and 3 are weighted two and four times as much as classes 0 and 1.
classW = {0: 1, 1: 1, 2: 2, 3: 4}

# Shallow decision tree. The estimator is seeded as well, because scikit-learn
# permutes the features at every split and ties could otherwise resolve
# differently between runs.
t = DecisionTreeClassifier(max_depth=3, class_weight=classW, random_state=42)
model = make_pipeline(StandardScaler(), t)
model.fit(X_train, y_train)

# NOTE: the tree is fitted on the scaler's output, so the thresholds printed
# below are in standardised units, not in the original measurement units.
# A plain list of names is passed because older scikit-learn releases do not
# accept a pandas Index for `feature_names`.
print(export_text(t, feature_names=list(X.columns)))

# Predict on the held-out rows and report accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


|--- Basic_Demos-Age <= 0.16
|   |--- enmo_wear <= -0.96
|   |   |--- BIA-BIA_Fat <= -0.69
|   |   |   |--- class: 0.0
|   |   |--- BIA-BIA_Fat >  -0.69
|   |   |   |--- class: 2.0
|   |--- enmo_wear >  -0.96
|   |   |--- acc_cnt <= 1.45
|   |   |   |--- class: 0.0
|   |   |--- acc_cnt >  1.45
|   |   |   |--- class: 0.0
|--- Basic_Demos-Age >  0.16
|   |--- PreInt_EduHx-computerinternet_hoursday <= -0.43
|   |   |--- BIA-BIA_SMM <= 1.00
|   |   |   |--- class: 0.0
|   |   |--- BIA-BIA_SMM >  1.00
|   |   |   |--- class: 1.0
|   |--- PreInt_EduHx-computerinternet_hoursday >  -0.43
|   |   |--- Basic_Demos-Age <= 0.47
|   |   |   |--- class: 1.0
|   |   |--- Basic_Demos-Age >  0.47
|   |   |   |--- class: 2.0

Accuracy: 60.87%


In [16]:
from sklearn import svm

# Feature matrix: merged table minus the identifier, the target and a set of
# columns excluded for this run. Each column is listed exactly once.
X = merged.drop(columns=[
    'id', 'sii',
    "enmo_night", "acc_cnt_night", "BIA-BIA_Fat", "enmo_high", "non_wear_ratio",
])
y = merged['sii']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features, then fit a support-vector classifier with the
# library's default settings (no class weighting is applied here).
clf = svm.SVC()
model = make_pipeline(StandardScaler(), clf)
model.fit(X_train, y_train)

# Predict on the held-out rows and report accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")



Accuracy: 56.52%


In [17]:
# Row counts of the three tables used above, in the order: train-only
# features, merged table, phys_df.
print(*map(len, (train_selected_features, merged, phys_df)))


1512 572 972
