In [135]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [136]:
path = "/app/data/playground-series-s5e12/train.csv"
path_test = "/app/data/playground-series-s5e12/test.csv"

In [137]:
df = pd.read_csv(path)
df_test = pd.read_csv(path_test)

In [138]:
encoding_columns = ['ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status', 'gender']

for col in encoding_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

for col in encoding_columns:
    le = LabelEncoder()
    df_test[col] = le.fit_transform(df_test[col]) 

In [139]:
fill_cols = ['family_history_diabetes', 'hypertension_history','cardiovascular_history', 'diagnosed_diabetes']
fill_cols_test = ['family_history_diabetes', 'hypertension_history','cardiovascular_history']


df[fill_cols] = df[fill_cols].fillna(0)
df_test[fill_cols_test] = df_test[fill_cols_test].fillna(0)

In [140]:
df.isna().sum()
df_test.isna().sum()

id                                    0
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0


In [141]:
df.head(5)
df_test.head(5)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,0,4,1,3,1,0,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,0,4,1,3,2,3,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,...,184,1,4,1,1,2,0,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,...,128,1,4,0,3,1,0,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,...,133,1,4,0,1,0,3,0,0,0


In [None]:
def bp_risk(df):
    
    df["bp_risk"] = (
    (df["systolic_bp"] >= 140) | (df["diastolic_bp"] >= 90)
    ).astype(int)

    return df

def advanced_feature_engineering(df):
    # Cholesterol ratios (strong diabetes predictors)
    df['cholesterol_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1e-6)
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1e-6)
    df['triglyceride_hdl_ratio'] = df['triglycerides'] / (df['hdl_cholesterol'] + 1e-6)
    
    # BMI categories - CONVERT TO INT
    df['bmi_category'] = pd.cut(df['bmi'], 
                                bins=[0, 18.5, 25, 30, 100],
                                labels=[0, 1, 2, 3]).astype(int)  
    
    # Waist-to-hip ratio risk
    df['whr_high_risk'] = (df['waist_to_hip_ratio'] > 0.9).astype(int)
    
    # Blood pressure categories
    df['bp_category'] = 0
    df.loc[(df['systolic_bp'] >= 120) & (df['systolic_bp'] < 130), 'bp_category'] = 1
    df.loc[(df['systolic_bp'] >= 130) | (df['diastolic_bp'] >= 80), 'bp_category'] = 2
    
    # Heart rate categories - CONVERT TO INT
    df['hr_category'] = pd.cut(df['heart_rate'],
                               bins=[0, 60, 100, 200],
                               labels=[0, 1, 2]).astype(int)  
    
    return df

def age_based_features(df):
    # Age groups - CONVERT TO INT
    df['age_group'] = pd.cut(df['age'],
                            bins=[0, 30, 45, 60, 100],
                            labels=[0, 1, 2, 3]).astype(int)  # Add .astype(int)
    
    # Age risk categories
    df['age_high_risk'] = (df['age'] >= 45).astype(int)
    
    return df


def lifestyle_features(df, diet_median=None):
    # Calculate median from training data only
    if diet_median is None:
        diet_median = df['diet_score'].median()
    
    df['healthy_lifestyle_score'] = (
        (df['physical_activity_minutes_per_week'] >= 150).astype(int) +
        (df['diet_score'] >= diet_median).astype(int) +  # Use passed median
        (df['sleep_hours_per_day'].between(7, 9)).astype(int) +
        (df['alcohol_consumption_per_week'] <= 7).astype(int)
    )
    
    df['sedentary_score'] = (
        df['screen_time_hours_per_day'] * 7 - 
        df['physical_activity_minutes_per_week'] / 60
    )
    
    df['sleep_adequate'] = df['sleep_hours_per_day'].between(7, 9).astype(int)
    df['sleep_deficit'] = np.maximum(7 - df['sleep_hours_per_day'], 0)
    
    return df, diet_median


def interaction_features(df):
    # Age interactions (risk increases with age)
    df['age_bmi_interaction'] = df['age'] * df['bmi']
    df['age_bp_interaction'] = df['age'] * df['systolic_bp']
    df['age_cholesterol_interaction'] = df['age'] * df['cholesterol_total']
    
    # BMI and lifestyle
    df['bmi_activity_ratio'] = df['bmi'] / (df['physical_activity_minutes_per_week'] + 1)
    df['bmi_diet_interaction'] = df['bmi'] * (10 - df['diet_score']) 
    
    # Medical history combined risk
    df['medical_history_count'] = (
        df['family_history_diabetes'] +
        df['hypertension_history'] +
        df['cardiovascular_history']
    )
    
    return df

def transform_features(df):
    # Polynomial features for key metrics
    df['bmi_squared'] = df['bmi'] ** 2
    df['age_squared'] = df['age'] ** 2
    df['waist_hip_squared'] = df['waist_to_hip_ratio'] ** 2
    
    # Log transforms for skewed distributions
    df['log_triglycerides'] = np.log1p(df['triglycerides'])
    df['log_activity'] = np.log1p(df['physical_activity_minutes_per_week'])
    
    return df


def risk_stratification(df):
    # Metabolic syndrome indicators (3+ = high risk)
    df['metabolic_syndrome_score'] = (
        (df['bmi'] >= 30).astype(int) +
        (df['systolic_bp'] >= 130).astype(int) +
        (df['triglycerides'] >= 150).astype(int) +
        (df['hdl_cholesterol'] < 40).astype(int) +
        (df['waist_to_hip_ratio'] > 0.9).astype(int)
    )
    
    # Overall diabetes risk score
    df['diabetes_risk_score'] = (
        (df['age'] > 45).astype(int) +
        (df['bmi'] >= 25).astype(int) +
        df['family_history_diabetes'] +
        (df['physical_activity_minutes_per_week'] < 150).astype(int) +
        df['hypertension_history']
    )
    
    return df

In [143]:
def feature_engineering(df, diet_median=None, is_train=True):
    df = bp_risk(df)
    df = advanced_feature_engineering(df)
    
    if is_train:
        df, diet_median = lifestyle_features(df)
    else:
        df, _ = lifestyle_features(df, diet_median=diet_median)
    
    df = interaction_features(df)
    df = transform_features(df)
    df = risk_stratification(df)
    df = age_based_features(df)
    
    return df, diet_median

In [144]:
df, diet_median = feature_engineering(df, is_train=True)
df_test, _ = feature_engineering(df_test, diet_median=diet_median, is_train=False)

MODEL TRAINING

In [145]:
X = df.drop(columns=['diagnosed_diabetes', 'id'])
Y = df['diagnosed_diabetes']

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y )

In [147]:
from sklearn.ensemble import (
    StackingClassifier, 
    RandomForestClassifier, 
    ExtraTreesClassifier,
    GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

base_models = [
    ('xgb1', XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.05, random_state=42)),
    ('xgb2', XGBClassifier(n_estimators=500, max_depth=3, learning_rate=0.03, subsample=0.9, random_state=43)),
    ('rf', RandomForestClassifier(n_estimators=500, max_depth=8, random_state=44)),
]

# Stacking
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3
)

# Train
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)

# Results
print("Stacking Accuracy:", accuracy_score(y_test, y_pred))

# Cross-validation
cv_scores = cross_val_score(stacking, X, Y, cv=5, scoring='accuracy')
print(f"CV Scores: {cv_scores}")
print(f"Mean CV: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

Stacking Accuracy: 0.6828539552512533
CV Scores: [0.68346528 0.68320547 0.68338644 0.68253823 0.68303493]
Mean CV: 0.6831 (+/- 0.0003)


In [148]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6828539552512533


In [149]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")


Cross-validation scores: [0.68251009 0.68248716 0.68300436 0.68166709 0.6823319 ]
Mean CV Accuracy: 0.6824 (+/- 0.0004)


In [150]:
X_test = df_test.drop(columns=["id"])
y_pred = model.predict_proba(X_test)[:, 1]

In [151]:
submission = pd.DataFrame({
    "id": df_test["id"],
    "diagnosed_diabetes": y_pred
})

submission.to_csv('submission.csv', index=False)