In [16]:
import joblib
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model


model = load_model("models/diabetes_dl_model.keras")
scaler = joblib.load("models/model_accu_69.pkl")

In [17]:
test_df = pd.read_csv("./data/test.csv")
test_df.head(5)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,Female,White,Highschool,Middle,Former,Employed,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,Female,White,Highschool,Middle,Never,Unemployed,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,...,184,Male,White,Highschool,Low,Never,Employed,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,...,128,Male,White,Graduate,Middle,Former,Employed,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,...,133,Male,White,Graduate,Low,Current,Unemployed,0,0,0


In [18]:
test_ids = test_df['id']
X_test = test_df.drop(columns=['id'])

In [19]:
X_test['smoking_status'] = X_test['smoking_status'].map({
    'Never': 0,
    'Former': 1,
    'Current': 2
})

X_test['education_level'] = X_test['education_level'].map({
    'No formal': 0,
    'Highschool': 1,
    'Graduate': 2,
    'Postgraduate': 3
})

X_test['income_level'] = X_test['income_level'].map({
    'Low': 0,
    'Lower-Middle': 1,
    'Middle': 2,
    'Upper-Middle': 3,
    'High': 4
})


In [20]:
X_test = pd.get_dummies(
    X_test,
    columns=['gender', 'ethnicity', 'employment_status'],
    drop_first=True
)


In [21]:
X_test.head(5)

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,cardiovascular_history,gender_Male,gender_Other,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,employment_status_Retired,employment_status_Student,employment_status_Unemployed
0,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,...,0,False,False,False,False,False,True,False,False,False
1,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,...,0,False,False,False,False,False,True,False,False,True
2,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,...,0,True,False,False,False,False,True,False,False,False
3,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,...,0,True,False,False,False,False,True,False,False,False
4,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,...,0,True,False,False,False,False,True,False,False,True


In [23]:
train_encoded = pd.read_csv("./data/train_encoded.csv")


train_cols = train_encoded.drop(
    columns=['diagnosed_diabetes', 'id']
).columns



In [24]:
X_test = X_test.reindex(columns=train_cols, fill_value=0)


In [25]:
continuous_cols = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides'
]

X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])



In [26]:
y_test_prob = model.predict(X_test).ravel()


[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 910us/step


In [27]:
y_test_prob

array([0.5588809 , 0.55937433, 0.68858194, ..., 0.5150477 , 0.59799564,
       0.55333406], dtype=float32)

In [35]:
threshold = 0.6  
y_test_pred = (y_test_prob >= threshold).astype(int)

In [36]:
y_test_pred

array([0, 0, 1, ..., 0, 0, 0])

## submission DataFrame

In [37]:
submission = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': y_test_pred
})


In [38]:
submission

Unnamed: 0,id,diagnosed_diabetes
0,700000,0
1,700001,0
2,700002,1
3,700003,1
4,700004,1
...,...,...
299995,999995,1
299996,999996,1
299997,999997,0
299998,999998,0


In [39]:
submission.shape


(300000, 2)

In [42]:
submission.columns

Index(['id', 'diagnosed_diabetes'], dtype='object')

In [40]:
submission['diagnosed_diabetes'].value_counts()


diagnosed_diabetes
1    153178
0    146822
Name: count, dtype: int64

In [41]:
submission.to_csv("./data/submission.csv", index=False)