In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [2]:
# Load the training table, the test table and the sample submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e12/train.csv",
        "/kaggle/input/playground-series-s5e12/test.csv",
        "/kaggle/input/playground-series-s5e12/sample_submission.csv",
    )
)

In [3]:
# Preview the first rows of the training frame (features plus the
# `diagnosed_diabetes` target).
display(train.head())

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


In [4]:
# Show every column name in the training frame (wide frames are elided in the preview).
train.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder # Requires category_encoders library

def encode_features(df, train_df=None, target=None, is_train=True):
    """
    Strategic Encoding based on Cardinality [cite: 75-81]

    Returns an encoded copy of ``df``; the input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Must contain the columns named in ``oh_cols`` and
        ``le_cols`` below, plus ``'ethnicity'`` when target encoding runs.
    train_df : pandas.DataFrame, optional
        Reference (training) frame. When supplied together with
        ``is_train=False``, the dummy levels, label codes and target encoder
        are learned from this frame and then applied to ``df``, so train and
        test end up with identical columns and codes. Ignored when
        ``is_train=True``.
    target : array-like, optional
        Training target, row-aligned with the frame the target encoder is
        fitted on (``df`` when ``is_train`` is true, ``train_df`` otherwise).
        When ``None`` the target-encoding step is skipped and ``'ethnicity'``
        is returned unchanged.
    is_train : bool, default True
        Whether ``df`` itself is the frame the encoders should be fitted on.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the one-hot columns replaced by dummy columns
        (first level dropped), the label columns replaced by int64 codes
        (``-1`` for values not present in the reference frame) and,
        optionally, a target-encoded ``'ethnicity'``.
    """
    df = df.copy()

    # Frame the encoders learn their categories from. Falling back to `df`
    # keeps the historical single-frame behaviour.
    reference = train_df if (train_df is not None and not is_train) else df

    # 1. Low Cardinality (< 10): One-Hot Encoding [cite: 76-77]
    oh_cols = ['gender', 'smoking_status', 'family_history_diabetes']
    for col in oh_cols:
        # Pin the level set to the reference frame so every encoded frame gets
        # the same dummy columns, even when a level is absent from `df`.
        levels = pd.Categorical(reference[col]).categories
        df[col] = pd.Categorical(df[col], categories=levels)
    df = pd.get_dummies(df, columns=oh_cols, drop_first=True)

    # 2. Ordinal/Medium Cardinality: Label Encoding [cite: 78-79]
    # Codes follow the sorted string order of the reference levels (the same
    # numbering a LabelEncoder fitted on the reference would produce), which is
    # alphabetical and not necessarily the semantic ordinal order.
    le_cols = ['education_level', 'income_level', 'employment_status']
    for col in le_cols:
        classes = sorted(reference[col].astype(str).unique())
        codes = pd.Categorical(df[col].astype(str), categories=classes).codes
        df[col] = codes.astype('int64')  # unseen values come out as -1

    # 3. Target Encoding for 'ethnicity' [cite: 80-81]
    # NOTE: In a real GM pipeline, this MUST be done inside CV folds [cite: 83]
    if target is not None:
        te_cols = ['ethnicity']
        te = TargetEncoder(cols=te_cols, smoothing=10) # Using smoothing as per [cite: 84]
        if is_train:
            df[te_cols] = te.fit_transform(df[te_cols], target)
        elif train_df is not None:
            # Inference path: learn the mapping on train, apply it to `df`.
            te.fit(train_df[te_cols], target)
            df[te_cols] = te.transform(df[te_cols])

    return df

In [6]:
# Encode the training frame with the default arguments. No `target` is passed,
# so encode_features skips its target-encoding branch and `ethnicity` stays a
# raw string column; `id` and `diagnosed_diabetes` are carried through as-is.
encoded_df = encode_features(train)
display(encoded_df.head())

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,income_level,employment_status,hypertension_history,cardiovascular_history,diagnosed_diabetes,gender_Male,gender_Other,smoking_status_Former,smoking_status_Never,family_history_diabetes_1
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,2,0,0,0,1.0,False,False,False,False,False
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,4,0,0,0,1.0,False,False,False,True,False
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,2,1,0,0,0.0,True,False,False,True,False
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,2,0,1,0,1.0,False,False,False,False,False
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,4,1,1,0,1.0,True,False,False,True,False


In [7]:
def extract_features(df):
    """
    Tier 1 & 2 Feature Discovery [cite: 91, 96]

    Adds derived blood-pressure, cholesterol, lifestyle and age/BMI features
    to a copy of ``df`` and drops the ``id`` column when present. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``systolic_bp``, ``diastolic_bp``, ``cholesterol_total``,
        ``hdl_cholesterol``, ``physical_activity_minutes_per_week``,
        ``sleep_hours_per_day``, ``age`` and ``bmi``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new feature columns appended.

    Notes
    -----
    ``age_bmi_mean`` is computed from the rows of ``df`` itself, so it differs
    depending on which frame (train, validation fold, test) is passed in.
    """
    df = df.copy()

    # --- Blood Pressure Metrics ---
    # Pulse Pressure: Difference between Systolic and Diastolic [cite: 93]
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    # Mean Arterial Pressure (MAP)
    df['mean_arterial_pressure'] = (df['systolic_bp'] + 2 * df['diastolic_bp']) / 3

    # --- Cholesterol Ratios [cite: 93] ---
    # Non-HDL Cholesterol
    df['non_hdl_cholesterol'] = df['cholesterol_total'] - df['hdl_cholesterol']
    # Cardiac Risk Ratio. A zero HDL value is masked to NaN first so the ratio
    # comes out as missing instead of +/-inf.
    hdl = df['hdl_cholesterol']
    df['cholesterol_hdl_ratio'] = df['cholesterol_total'] / hdl.where(hdl != 0)

    # --- Lifestyle Indicators ---
    # Total activity vs Sleep balance (+1 keeps the denominator non-zero)
    df['activity_sleep_ratio'] = df['physical_activity_minutes_per_week'] / (df['sleep_hours_per_day'] * 7 + 1)

    # --- Group Aggregations (Tier 1) [cite: 94] ---
    # How does this person's BMI compare to others of the same age?
    df['age_bmi_mean'] = df.groupby('age')['bmi'].transform('mean')
    df['bmi_diff_from_age_avg'] = df['bmi'] - df['age_bmi_mean']

    # --- Interaction Features ---
    # Age and BMI interaction is often highly non-linear [cite: 110]
    df['age_bmi_product'] = df['age'] * df['bmi']

    # Drop ID as it is a leakage risk [cite: 38]
    df = df.drop(columns=['id'], errors='ignore')

    return df

In [8]:
# Add the derived blood-pressure, cholesterol, lifestyle and age/BMI features
# to the encoded training frame (extract_features also drops `id`), then preview.
enhanced_df = extract_features(encoded_df)
display(enhanced_df.head())

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,smoking_status_Never,family_history_diabetes_1,pulse_pressure,mean_arterial_pressure,non_hdl_cholesterol,cholesterol_hdl_ratio,activity_sleep_ratio,age_bmi_mean,bmi_diff_from_age_avg,age_bmi_product
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,False,False,42,84.0,141,3.431034,0.925926,25.565422,7.834578,1035.4
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,True,False,43,91.333333,149,3.98,1.569892,25.978136,-2.178136,1190.0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,True,False,6,91.0,129,3.186441,2.992424,25.716036,-1.616036,771.2
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,...,False,False,52,86.333333,128,3.37037,1.54,25.898813,0.701187,1436.4
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,...,True,False,48,76.0,157,4.204082,1.238739,25.898813,2.901187,1555.2


In [9]:
# Count missing values per column after feature extraction.
enhanced_df.isna().sum()

age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
ethnicity                             0
education_level                       0
income_level                          0
employment_status                     0
hypertension_history                  0
cardiovascular_history                0
diagnosed_diabetes                    0
gender_Male                           0
gender_Other                          0
smoking_status_Former                 0


In [10]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def add_lifestyle_clusters(df, n_clusters=3, fit_df=None):
    """
    Advanced Feature Engineering: K-Means [cite: 102]

    Clusters rows on three standardized lifestyle columns and appends the
    cluster id plus the distance to every centroid. Works on a copy; the input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. Must contain
        ``physical_activity_minutes_per_week``, ``sleep_hours_per_day`` and
        ``screen_time_hours_per_day``.
    n_clusters : int, default 3
        Number of K-Means clusters (and of distance columns added).
    fit_df : pandas.DataFrame, optional
        Frame the scaler and K-Means are fitted on. Pass the training frame
        here when annotating validation/test data so that cluster ids and
        distances refer to the same centroids as in training. When ``None``
        (default) the scaler and K-Means are fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``lifestyle_cluster_id`` and
        ``dist_to_lifestyle_cluster_{i}`` for ``i`` in ``range(n_clusters)``.
    """
    df = df.copy()

    # Select lifestyle-related columns
    cluster_features = [
        'physical_activity_minutes_per_week',
        'sleep_hours_per_day',
        'screen_time_hours_per_day'
    ]

    fit_source = df if fit_df is None else fit_df

    # 1. Scale numeric data before K-Means [cite: 104]
    scaler = StandardScaler()
    scaled_fit = scaler.fit_transform(fit_source[cluster_features])

    # 2. Fit K-Means
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

    # 3. Add Cluster ID [cite: 106]
    if fit_df is None:
        # Historical behaviour: fit on and label the very same rows.
        scaled_data = scaled_fit
        df['lifestyle_cluster_id'] = kmeans.fit_predict(scaled_data)
    else:
        # Fit on the reference frame, then assign `df` rows to those centroids
        # using the reference frame's scaling.
        kmeans.fit(scaled_fit)
        scaled_data = scaler.transform(df[cluster_features])
        df['lifestyle_cluster_id'] = kmeans.predict(scaled_data)

    # 4. Add Distance to Centroids [cite: 107]
    # This provides a continuous measure of how 'typical' a person is for their group
    distances = kmeans.transform(scaled_data)
    for i in range(n_clusters):
        df[f'dist_to_lifestyle_cluster_{i}'] = distances[:, i]

    return df

In [11]:
# Append the lifestyle K-Means cluster id and per-centroid distances (default
# of 3 clusters) to the training features, then list the resulting columns.
final_processed_df = add_lifestyle_clusters(enhanced_df)
final_processed_df.columns

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'ethnicity', 'education_level', 'income_level',
       'employment_status', 'hypertension_history', 'cardiovascular_history',
       'diagnosed_diabetes', 'gender_Male', 'gender_Other',
       'smoking_status_Former', 'smoking_status_Never',
       'family_history_diabetes_1', 'pulse_pressure', 'mean_arterial_pressure',
       'non_hdl_cholesterol', 'cholesterol_hdl_ratio', 'activity_sleep_ratio',
       'age_bmi_mean', 'bmi_diff_from_age_avg', 'age_bmi_product',
       'lifestyle_cluster_id', 'dist_to_lifestyle_cluster_0',
       'dist_to_lifestyle_cluster_1', 'dist_to_lifestyle_cluster_2'],
      dtype='object')

In [12]:
# Run the test frame through the same three helpers used for train.
# NOTE(review): each helper is called on the test frame alone, so the label
# codes, dummy columns, per-age BMI means, scaler and K-Means centroids are all
# derived from test rows rather than reused from train — confirm the resulting
# features line up with the training features before modelling on them.
test_encoded = encode_features(test)
enhanced_test = extract_features(test_encoded)
test_df = add_lifestyle_clusters(enhanced_test)

In [13]:
# import pandas as pd
# import numpy as np
# import lightgbm as lgb
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import roc_auc_score

# # 1. Drop ID and separate Target 
# # IDs are dropped to prevent leakage hints [cite: 38]
# X = train.drop(columns=['id', 'diagnosed_diabetes'])
# y = train['diagnosed_diabetes']

# # 2. Define Stratified K-Fold
# # This strategy is chosen for imbalanced/binary data to mimic the test set [cite: 24, 27]
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Containers for results
# oof_preds = np.zeros(len(X))
# cv_scores = []
# feature_importances = pd.DataFrame()

# # 3. The Grandmaster Training Loop [cite: 171]
# for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
#     print(f"--- Training Fold {fold + 1} ---")
    
#     # Split data
#     X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
#     # --- Feature Engineering Tier 1 & 2 [cite: 91, 96] ---
#     X_train_processed = extract_features(X_train)
#     X_train_processed = add_lifestyle_clusters(X_train_processed)
    
#     X_val_processed = extract_features(X_val)
#     X_val_processed = add_lifestyle_clusters(X_val_processed)
    
#     # --- Strategic Encoding [cite: 75, 83] ---
#     # Target encoding MUST be done inside CV folds to prevent target leakage [cite: 83, 85]
#     X_train_processed = encode_features(X_train_processed, target=y_train, is_train=True)
#     X_val_processed = encode_features(X_val_processed, is_train=False) 

#     # --- Grandmaster Numeric Safety Fix ---
#     # LightGBM requires int, float, or bool [cite: 146]
#     X_train_processed = X_train_processed.apply(pd.to_numeric, errors='coerce')
#     X_val_processed = X_val_processed.apply(pd.to_numeric, errors='coerce')
    
#     # Handle any NaNs—letting the tree model handle them or filling with 0 [cite: 68, 70]
#     X_train_processed = X_train_processed.fillna(0)
#     X_val_processed = X_val_processed.fillna(0)

#     # 4. Baseline Model: LightGBM [cite: 44, 119]
#     # We use near-default params as an anchor for CV [cite: 45, 48]
#     model = lgb.LGBMClassifier(
#         n_estimators=2000,
#         learning_rate=0.03,
#         metric='auc', # AUC is the competition metric [cite: 12, 14]
#         importance_type='gain', # Gain provides better feature discovery than weight
#         random_state=42,
#         verbosity=-1
#     )
    
#     model.fit(
#         X_train_processed, y_train,
#         eval_set=[(X_val_processed, y_val)],
#         callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)] # Prevent over-tuning [cite: 125]
#     )
    
#     # 5. Store Out-of-Fold (OOF) predictions & Metrics
#     oof_preds[val_idx] = model.predict_proba(X_val_processed)[:, 1]
#     score = roc_auc_score(y_val, oof_preds[val_idx])
#     cv_scores.append(score)
    
#     # Track feature importance for pruning [cite: 98]
#     fold_importance = pd.DataFrame({
#         'feature': X_train_processed.columns,
#         'importance': model.feature_importances_,
#         'fold': fold + 1
#     })
#     feature_importances = pd.concat([feature_importances, fold_importance], axis=0)

# # Final CV Diagnostic [cite: 28, 30]
# # If CV improves but LB drops, we look for leakage [cite: 87]
# print(f"\nOverall CV AUC: {np.mean(cv_scores):.5f} +/- {np.std(cv_scores):.5f}")

In [14]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans


# Separate features and target
X = train.drop(columns=['id', 'diagnosed_diabetes'])
y = train['diagnosed_diabetes']
X_test = test.drop(columns=['id'])
test_ids = test['id']

# 2. Define Helper Functions (Consolidated)
# NOTE: binding `extract_features` here shadows any definition of the same name
# made by an earlier cell; this version is purely row-wise.
def extract_features(df):
    """Return a copy of ``df`` with row-wise derived features appended.

    Adds ``pulse_pressure``, ``mean_arterial_pressure``,
    ``non_hdl_cholesterol``, ``cholesterol_hdl_ratio`` and
    ``age_bmi_product``. Every feature is computed from values in the same
    row, so the result does not depend on which other rows are present.
    """
    df = df.copy()
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['mean_arterial_pressure'] = (df['systolic_bp'] + 2 * df['diastolic_bp']) / 3
    df['non_hdl_cholesterol'] = df['cholesterol_total'] - df['hdl_cholesterol']
    # The small epsilon keeps the denominator non-zero.
    df['cholesterol_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1e-9)
    df['age_bmi_product'] = df['age'] * df['bmi']
    return df

# Feature extraction is row-wise and fold-independent, so do it once up front
# instead of repeating it for train/val/test inside every fold.
X_features = extract_features(X)
X_test_features = extract_features(X_test)

# Columns used for the lifestyle K-Means clustering.
cluster_cols = ['physical_activity_minutes_per_week', 'sleep_hours_per_day', 'screen_time_hours_per_day']

# 3. Cross-Validation & Inference Loop
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
oof_preds = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"Processing Fold {fold + 1}...")

    # Split (copies, because the fold-specific transforms below overwrite columns)
    X_train_f = X_features.iloc[train_idx].copy()
    X_val_f = X_features.iloc[val_idx].copy()
    y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]
    X_test_f = X_test_features.copy()

    # K-Means Clustering (Fit on Train, Transform others)
    # The cluster columns are overwritten with their standardized values, so
    # the model below sees the scaled versions of these three features.
    scaler = StandardScaler()
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)

    X_train_f[cluster_cols] = scaler.fit_transform(X_train_f[cluster_cols])
    X_val_f[cluster_cols] = scaler.transform(X_val_f[cluster_cols])
    X_test_f[cluster_cols] = scaler.transform(X_test_f[cluster_cols])

    X_train_f['cluster'] = kmeans.fit_predict(X_train_f[cluster_cols])
    X_val_f['cluster'] = kmeans.predict(X_val_f[cluster_cols])
    X_test_f['cluster'] = kmeans.predict(X_test_f[cluster_cols])

    # Encoding (Label Encoding for simplicity/robustness)
    cat_cols = X_train_f.select_dtypes(include=['object']).columns
    for col in cat_cols:
        le = LabelEncoder()
        # Fit on the training fold only.
        X_train_f[col] = le.fit_transform(X_train_f[col].astype(str))
        # Apply the learned codes to val/test with one dictionary lookup per
        # column (instead of one `le.transform` call per row); labels not seen
        # in the training fold map to -1.
        code_of = {label: code for code, label in enumerate(le.classes_)}
        X_val_f[col] = X_val_f[col].astype(str).map(code_of).fillna(-1).astype(int)
        X_test_f[col] = X_test_f[col].astype(str).map(code_of).fillna(-1).astype(int)

    # Train Model
    model = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.03, metric='auc', random_state=42, verbosity=-1)
    model.fit(
        X_train_f, y_train_f,
        eval_set=[(X_val_f, y_val_f)],
        callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)] # Prevent over-tuning [cite: 125]
    )
    # Predict: out-of-fold for validation rows, fold-averaged for test rows
    oof_preds[val_idx] = model.predict_proba(X_val_f)[:, 1]
    test_preds += model.predict_proba(X_test_f)[:, 1] / cv.n_splits

# Report the out-of-fold AUC so the CV quality of this run is visible.
print(f"Overall OOF AUC: {roc_auc_score(y, oof_preds):.5f}")

# 4. Final Submission
submission = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': test_preds
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved successfully!")

  if entities is not ():


Processing Fold 1...
Training until validation scores don't improve for 10 rounds
[10]	valid_0's auc: 0.696229
[20]	valid_0's auc: 0.697686
[30]	valid_0's auc: 0.699454
[40]	valid_0's auc: 0.701455
[50]	valid_0's auc: 0.703243
[60]	valid_0's auc: 0.704661
[70]	valid_0's auc: 0.706485
[80]	valid_0's auc: 0.708003
[90]	valid_0's auc: 0.709519
[100]	valid_0's auc: 0.710793
[110]	valid_0's auc: 0.712438
[120]	valid_0's auc: 0.713157
[130]	valid_0's auc: 0.714362
[140]	valid_0's auc: 0.714909
[150]	valid_0's auc: 0.715734
[160]	valid_0's auc: 0.716445
[170]	valid_0's auc: 0.717115
[180]	valid_0's auc: 0.717696
[190]	valid_0's auc: 0.718206
[200]	valid_0's auc: 0.718882
[210]	valid_0's auc: 0.719329
[220]	valid_0's auc: 0.719671
[230]	valid_0's auc: 0.719989
[240]	valid_0's auc: 0.720425
[250]	valid_0's auc: 0.72076
[260]	valid_0's auc: 0.72096
[270]	valid_0's auc: 0.721332
[280]	valid_0's auc: 0.721574
[290]	valid_0's auc: 0.721775
[300]	valid_0's auc: 0.721941
[310]	valid_0's auc: 0.722129