In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath('../src'))

from data_loader import DiabetesLoader
from preprocessing import get_preprocessor

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Build the loader on the raw training CSV, then load the external
# diabetes dataset into it before asking for the train/validation split.
raw_train_csv = '../data/raw/train.csv'
external_csv = '../data/external/diabetes_dataset.csv'

loader = DiabetesLoader(raw_train_csv)
loader.load_external(external_csv)

# get_data() yields four objects, unpacked here as train/val features and labels.
(X_train, y_train, X_val, y_val) = loader.get_data()

# Preprocessing object; it is fitted on the training features in the next cell.
pipeline = get_preprocessor()

Loading data from /app/data/raw/train.csv
Data Loaded. Shape: (700000, 25)
loading external data from ../data/external/diabetes_dataset.csv
Train X: (640000, 25)  Train y: (640000,)
Val X:   (160000, 25)    Val y:   (160000,)


In [2]:
# Fit the preprocessor on the training features and transform them in one call.
X_train_processed = pipeline.fit_transform(X_train)
# Transform the validation features with the same pipeline object; only
# transform() is called here, so no fitting is requested on validation data.
X_val_processed = pipeline.transform(X_val)

In [3]:
from config import RATIO

# Baseline XGBoost classifier: 100 estimators of max depth 5, AUC as the
# evaluation metric, and the positive class weighted by RATIO from config.
xgb_base = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
    scale_pos_weight=RATIO,
    eval_metric="auc",
)

In [4]:
from preprocessing import is_numeric

# Sanity-check the preprocessed training frame: its shape, how many columns
# is_numeric() accepts, and what the first row looks like.
print(f"processed X train shape: {X_train_processed.shape}")

# One tally per column for which is_numeric() returns a truthy value.
count = sum(
    1
    for column_name in X_train_processed.columns
    if is_numeric(X_train_processed, column_name)
)
print(f"numeric cols: {count}")

print(X_train_processed.iloc[0])

processed X train shape: (640000, 64)
numeric cols: 45
age                                    36.0
alcohol_consumption_per_week            3.0
physical_activity_minutes_per_week    134.0
diet_score                              7.7
sleep_hours_per_day                     7.5
                                      ...  
employment_status_Unemployed            0.0
family_history_diabetes_1               0.0
hypertension_history_1                  0.0
cardiovascular_history_1                0.0
is_external                             0.0
Name: 52811, Length: 64, dtype: float64


In [5]:
from preprocessing import get_top_k_features, add_important_interaction

# Ask the helper for the top k=10 feature names, using the baseline model.
important_cols = get_top_k_features(
    base_model=xgb_base,
    X=X_train_processed,
    y=y_train,
    k=10,
)
print(important_cols)

# Extend the processed training frame with interaction columns built from
# those features.
# NOTE(review): only the training frame is extended here; X_val_processed would
# need the same treatment before it can be scored against these columns —
# confirm that happens elsewhere.
X_train_full = add_important_interaction(
    X_train_processed,
    important_cols,
)

print(f"total cols: {X_train_full.shape[1]}")

['family_history_diabetes_1', 'Age_BMI_Risk', 'Relative_Activity', 'Diet_Activity_Score', 'Chronic_Metabolic_Load', 'physical_activity_minutes_per_week', 'Lipid_Accumulation', 'age', 'is_external', 'Age_WHR_Risk']
combining 8 cols
added 28 cols as interactions
total cols: 92


In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Draw a 100,000-row stratified "scout" sample of the training data so that
# feature elimination runs on a subset; the remainder of the split is discarded.
X_scout, _, y_scout, _ = train_test_split(
    X_train_full,
    y_train,
    random_state=42,
    stratify=y_train,
    train_size=100000,
)

# Recursive feature elimination with 3-fold cross-validation scored by ROC AUC.
# One feature is removed per step, and at least 10 features are always kept.
# The selector itself runs single-process (n_jobs=1); xgb_base carries its own
# n_jobs setting.
selector = RFECV(
    estimator=xgb_base,
    scoring='roc_auc',
    cv=3,
    min_features_to_select=10,
    step=1,
    n_jobs=1,
)

In [None]:
# Run the cross-validated feature elimination on the scout sample.
selector.fit(X=X_scout, y=y_scout)

In [None]:
# Keep only the columns flagged in the fitted selector's support_ mask, then
# take those columns from the full training frame.
keep_mask = selector.support_
optimal_cols = X_scout.columns[keep_mask]
X_train_optimal = X_train_full.loc[:, optimal_cols]

print(f"optimal cols: {len(optimal_cols)}")
print(list(X_train_optimal.columns))