# Scalers

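This notebook rebuilds the `StandardScaler` objects for the two models (diabetes diagnosis and diabetes risk score) and saves them to `../models/` with `joblib`.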

Note: The scalers were exported after all of the training (including the other models) had finished; this notebook is kept in this folder for convenience.

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
def create_scaler_1(df, target="diagnosed_diabetes"):
    """Build the StandardScaler for the ``hba1c`` / ``glucose_fasting`` features.

    The frame is split 80/20 with a fixed seed (``random_state=42``, shuffled)
    and the scaler is fitted on the training part only, so held-out rows do not
    influence the statistics.
    NOTE(review): presumably this is the same split the model was trained
    with -- confirm against the training notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared dataset containing ``target`` and at least the columns
        ``hba1c`` and ``glucose_fasting``.
    target : str, default "diagnosed_diabetes"
        Label column; it is removed before the scaler is fitted.

    Returns
    -------
    StandardScaler
        Scaler fitted on exactly two features, in the order
        ``['hba1c', 'glucose_fasting']``.

    Raises
    ------
    KeyError
        If ``target`` or one of the two feature columns is missing from ``df``.
    """
    features = ['hba1c', 'glucose_fasting']

    # Fail early with a readable message instead of a bare pandas lookup error.
    missing = [col for col in features + [target] if col not in df.columns]
    if missing:
        raise KeyError(f"create_scaler_1: column(s) missing from df: {missing}")

    X = df.drop(columns=[target])
    y = df[target]

    X_train, _, _, _ = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        shuffle=True
    )

    # StandardScaler statistics are computed per column, so fitting on just the
    # needed columns gives the same mean/scale as fitting on everything and
    # slicing afterwards -- without requiring every other column to be numeric
    # and without hand-assembling a partially initialised estimator.
    # A plain array (not a DataFrame) is passed so the scaler stores no feature
    # names, matching the previously saved artefact: callers that transform
    # bare arrays keep working without feature-name warnings.
    scaler = StandardScaler()
    scaler.fit(X_train[features].to_numpy())

    return scaler

# Columns kept (in this exact order) before fitting the risk-score scaler.
chosen_columns = [
    # history flags and body measurement
    "family_history_diabetes",
    "hypertension_history",
    "cardiovascular_history",
    "waist_to_hip_ratio",
    # age_* columns
    "age_30-39",
    "age_40-49",
    "age_50-59",
    "age_60-69",
    "age_70-79",
    "age_80+",
    # gender_* columns
    "gender_Male",
    "gender_Other",
    # ethnicity_* columns
    "ethnicity_Black",
    "ethnicity_Hispanic",
    "ethnicity_Other",
    "ethnicity_White",
    # education_level_* columns
    "education_level_Highschool",
    "education_level_No formal",
    "education_level_Postgraduate",
    # income_level_* columns
    "income_level_Low",
    "income_level_Lower-Middle",
    "income_level_Middle",
    "income_level_Upper-Middle",
    # employment_status_* columns
    "employment_status_Retired",
    "employment_status_Student",
    "employment_status_Unemployed",
    # smoking_status_* columns
    "smoking_status_Former",
    "smoking_status_Never",
    # alcohol_consumption_per_week_* columns
    "alcohol_consumption_per_week_Light",
    "alcohol_consumption_per_week_Moderate",
    "alcohol_consumption_per_week_Heavy",
    # physical_activity_minutes_per_week_* columns
    "physical_activity_minutes_per_week_Light",
    "physical_activity_minutes_per_week_Moderate",
    "physical_activity_minutes_per_week_Active",
    "physical_activity_minutes_per_week_Very Active",
    # sleep_hours_per_day_* columns
    "sleep_hours_per_day_Short",
    "sleep_hours_per_day_Normal",
    "sleep_hours_per_day_Long",
    # screen_time_hours_per_day_* columns
    "screen_time_hours_per_day_Moderate",
    "screen_time_hours_per_day_High",
    "screen_time_hours_per_day_Very_High",
    # bmi_* columns
    "bmi_Normal",
    "bmi_Overweight",
    "bmi_Obese_I",
    "bmi_Obese_II",
]

def create_scaler_2(df, target="diabetes_risk_score"):
    """Build the StandardScaler for the 15 features of the risk-score model.

    ``df`` is first restricted to ``chosen_columns`` plus ``target``, then split
    80/20 with a fixed seed (``random_state=42``, shuffled). The scaler is
    fitted on the training part only.
    NOTE(review): presumably this is the same split the model was trained
    with -- confirm against the training notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared dataset containing ``target`` and every name listed in the
        module-level ``chosen_columns``.
    target : str, default "diabetes_risk_score"
        Label column; it is removed before the scaler is fitted.

    Returns
    -------
    StandardScaler
        Scaler fitted on the 15 columns of ``features`` below, in that order.

    Raises
    ------
    KeyError
        If ``target`` or any of ``chosen_columns`` is missing from ``df``.
    """
    # Column order here defines the feature order the returned scaler expects.
    features = [
        "family_history_diabetes",
        "age_60-69",
        "age_50-59",
        "physical_activity_minutes_per_week_Active",
        "age_70-79",
        "age_40-49",
        "waist_to_hip_ratio",
        "age_30-39",
        "physical_activity_minutes_per_week_Very Active",
        "physical_activity_minutes_per_week_Moderate",
        "age_80+",
        "bmi_Normal",
        "bmi_Overweight",
        "bmi_Obese_I",
        "bmi_Obese_II",
    ]

    # Selecting the full chosen set keeps the original schema check: a missing
    # column raises KeyError here, before any fitting happens.
    df = df[chosen_columns + [target]]
    X = df.drop(columns=[target])
    y = df[target]

    X_train, _, _, _ = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        shuffle=True
    )

    # StandardScaler statistics are computed per column, so fitting on just the
    # needed columns gives the same mean/scale as fitting on everything and
    # slicing afterwards -- without hand-assembling a partially initialised
    # estimator.
    # A plain array (not a DataFrame) is passed so the scaler stores no feature
    # names, matching the previously saved artefact: callers that transform
    # bare arrays keep working without feature-name warnings.
    scaler = StandardScaler()
    scaler.fit(X_train[features].to_numpy())

    return scaler

In [3]:
# Load the prepared dataset that create_scaler_1 is fitted on
# (it must contain the "diagnosed_diabetes" target column).
df = pd.read_parquet("../diabetes_dataset_prepared.parquet")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,family_history_diabetes,hypertension_history,cardiovascular_history,waist_to_hip_ratio,glucose_fasting,glucose_postprandial,hba1c,diagnosed_diabetes,glucose_fasting_times_hba1c,glucose_postprandial_times_hba1c,...,hdl_cholesterol_Normal,hdl_cholesterol_High,ldl_cholesterol_Near_optimal,ldl_cholesterol_Borderline_high,ldl_cholesterol_High,ldl_cholesterol_Very_high,triglycerides_Borderline_high,triglycerides_High,insulin_level_Normal,insulin_level_High
0,0,0,0,0.89,136,236,8.18,1,1112.47998,1930.480103,...,1,0,0,0,1,0,0,0,1,0
1,0,0,0,0.8,93,150,5.63,0,523.590027,844.5,...,1,0,0,0,0,0,0,0,1,0
2,1,0,0,0.81,118,195,7.51,1,886.180054,1464.450073,...,0,1,0,0,0,0,0,0,1,0
3,0,0,0,0.88,139,253,9.03,1,1255.169922,2284.589844,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0.78,137,184,7.2,1,986.399963,1324.799927,...,1,0,1,0,0,0,1,0,1,0


In [6]:
# Fit the hba1c / glucose_fasting scaler on df and persist it with joblib.
# joblib.dump returns the list of written file names, which the cell displays.
scaler1 = create_scaler_1(df)
joblib.dump(scaler1, "../models/has_or_not_scaler.pkl")

['../models/has_or_not_scaler.pkl']

In [7]:
# Load the second prepared dataset that create_scaler_2 is fitted on
# (it must contain the "diabetes_risk_score" target column).
df2 = pd.read_parquet("../diabetes_dataset_prepared_2.parquet")
# Preview the first rows as the cell output.
df2.head()

Unnamed: 0,family_history_diabetes,hypertension_history,cardiovascular_history,waist_to_hip_ratio,glucose_fasting,glucose_postprandial,hba1c,diabetes_risk_score,glucose_fasting_times_hba1c,glucose_postprandial_times_hba1c,...,hdl_cholesterol_Normal,hdl_cholesterol_High,ldl_cholesterol_Near_optimal,ldl_cholesterol_Borderline_high,ldl_cholesterol_High,ldl_cholesterol_Very_high,triglycerides_Borderline_high,triglycerides_High,insulin_level_Normal,insulin_level_High
0,0,0,0,0.89,136,236,8.18,1,1112.47998,1930.480103,...,1,0,0,0,1,0,0,0,1,0
1,0,0,0,0.8,93,150,5.63,1,523.590027,844.5,...,1,0,0,0,0,0,0,0,1,0
2,1,0,0,0.81,118,195,7.51,1,886.180054,1464.450073,...,0,1,0,0,0,0,0,0,1,0
3,0,0,0,0.88,139,253,9.03,1,1255.169922,2284.589844,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0.78,137,184,7.2,1,986.399963,1324.799927,...,1,0,1,0,0,0,1,0,1,0


In [9]:
# Fit the risk-score scaler on df2 and persist it with joblib.
# joblib.dump returns the list of written file names, which the cell displays.
scaler2 = create_scaler_2(df2)
joblib.dump(scaler2, "../models/risk_score_scaler.pkl")

['../models/risk_score_scaler.pkl']