In [7]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Put ../src (resolved against the current working directory) on the import path;
# presumably this is where the project modules imported below live.
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_loader import DiabetesLoader
from adversarial_validation import Validator
from preprocessing import get_preprocessor

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Input locations, relative to the notebook's working directory.
train_csv = '../data/raw/train.csv'
external_csv = '../data/external/diabetes_dataset.csv'

# Build the loader from the training CSV, then load the external dataset through it.
loader = DiabetesLoader(train_csv)
loader.load_external(external_csv)

# Shared preprocessor; a later cell fits it on the full data before adversarial validation.
pipeline = get_preprocessor()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading data from /app/data/raw/train.csv
Data Loaded. Shape: (700000, 25)
loading external data from ../data/external/diabetes_dataset.csv


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from preprocessing import get_xgb_preprocessor, get_lgbm_preprocessor, get_cat_preprocessor
from config import XGB_PARAMS, LGBM_PARAMS, CAT_PARAMS, TARGET, NOMINAL_COLS

# Base learners. Each estimator is built exactly once here and reused by its pipeline,
# so the parameters live in a single place and no unused duplicate models are created.
xgb = XGBClassifier(**XGB_PARAMS)
lgbm = LGBMClassifier(**LGBM_PARAMS)
# cat_features tells CatBoost which columns (NOMINAL_COLS) to treat as categorical.
cat = CatBoostClassifier(**CAT_PARAMS, cat_features=NOMINAL_COLS)

# One pipeline per learner: each model is paired with its own model-specific preprocessor.
pipe_xgb = Pipeline([
    ('preprocessor', get_xgb_preprocessor()),
    ('model', xgb),
])

pipe_lgbm = Pipeline([
    ('preprocessor', get_lgbm_preprocessor()),
    ('model', lgbm),
])

pipe_cat = Pipeline([
    ('preprocessor', get_cat_preprocessor()),
    ('model', cat),
])

# Soft voting averages the predicted class probabilities of the three pipelines.
# VotingClassifier.fit trains clones of the estimators passed here, so the pipeline
# objects above remain unfitted after ensemble.fit().
ensemble = VotingClassifier(
    estimators=[
        ('xgb', pipe_xgb),
        ('lgbm', pipe_lgbm),
        ('cat', pipe_cat),
    ],
    voting='soft',
)

In [9]:
# Features and target as returned by the loader.
# NOTE(review): presumably the training rows plus the external rows loaded earlier — confirm in data_loader.
X,y = loader.get_full()

# Fit the shared preprocessor on every row. The transformed matrix is only used for the
# shape printout and as input to the Validator below; it is not what the ensemble trains on.
X_processed =  pipeline.fit_transform(X)

print(f"full data x: {X_processed.shape}, y: {y.shape}")

# The Validator receives the processed features and the name of the 'is_external' column.
# NOTE(review): presumably it runs adversarial validation and get_mask() marks the rows
# to keep — verify against adversarial_validation.py.
validator = Validator(X_processed, 'is_external')
mask = validator.get_mask()

# The mask is applied to the raw X (not X_processed) because every ensemble member
# is a Pipeline that carries its own preprocessor.
X_masked = X[mask]
y_masked = y[mask]

print(f"After filtered x: {X_masked.shape}, y: {y_masked.shape}")

full data x: (770739, 34), y: (770739,)
After filtered x: (750377, 25), y: (750377,)


In [10]:
# Train the soft-voting ensemble on the filtered rows. Raw (unprocessed) features are
# passed in because each member pipeline applies its own preprocessor.
ensemble.fit(X_masked, y_masked)

In [11]:
from data_loader import TestLoader

test_loader = TestLoader('../data/raw/test.csv')
# NOTE(review): `id` shadows the built-in id(); the name is kept because the
# submission cell further down reads it as the DataFrame index.
X_test, id = test_loader.get_data()

# Every test row gets is_external = 0, the same column the Validator is keyed on
# for the training data.
# assign() returns a new frame instead of writing into the object handed back by the
# loader, so the loader's data is not mutated and the cell stays safe to re-run.
X_test = X_test.assign(is_external=0)

print(f"full test data x: {X_test.shape}")

Loading data from /app/data/raw/test.csv
Test X: (300000, 25)
full test data x: (300000, 25)


In [12]:
import pandas as pd

# predict_proba returns one column per class; column 1 is kept as the submitted score.
class_proba = ensemble.predict_proba(X_test)
y_submit = class_proba[:, 1]

# Single-column frame named after the target and indexed by the test ids.
df_submit = pd.DataFrame({TARGET: y_submit}, index=id)

display(df_submit.head())


Unnamed: 0_level_0,diagnosed_diabetes
id,Unnamed: 1_level_1
700000,0.380972
700001,0.545342
700002,0.680917
700003,0.305335
700004,0.874912


In [16]:
# Count the test rows whose predicted probability is above 0.9.
print(df_submit[TARGET].gt(0.9).sum())

6308


In [14]:
# Export is disabled; uncomment the next line to write the predictions to CSV.
# df_submit.to_csv('../outputs/ensemble_external_no_fe.csv')