<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/Diabets_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import the required packages

In [1]:
# !nvidia-smi

In [2]:
!pip install lightgbm --install-option=--gpu


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: --install-option


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
!pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.9


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
import joblib
import dill as pickle
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Load the training dataset

In [6]:
# Read the diabetes table from the Colab working directory and preview it.
DATA_PATH = '/content/diabetes.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Transformers

In [44]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Append hand-crafted ratio, interaction and bucket features to a frame.

    Nothing is learned in ``fit``; every derived column is computed from the
    rows handed to ``transform``.

    Parameters
    ----------
    w : float, default=0.00001
        Small constant added to the ratio terms so that a zero denominator does
        not raise or produce an infinite value.
    """

    def __init__(self, w=0.00001):
        # Exposed as a constructor argument (same default as before) so that
        # get_params / set_params / clone can see and tune it.
        self.w = w

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the transformer can sit in a Pipeline."""
        return self

    @staticmethod
    def _bucket(values, edges, labels):
        """Left-closed binning of ``values``; returns the labels as numbers."""
        binned = pd.cut(values, bins=edges, labels=labels, right=False)
        return pd.to_numeric(binned, errors='coerce')

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Reads the columns ``Pregnancies``, ``Age``, ``BMI``, ``Glucose`` and
        ``Insulin``. ``X`` itself is not modified.
        """
        data = X.copy()
        w = self.w

        # NOTE: this rank is computed within the batch being transformed, so
        # the same row can get a different value in a different batch.
        # WoEEncoding (defined in this file) also writes a column named
        # 'Pregnancies_rank' and overwrites this one when it runs afterwards.
        data['Pregnancies_rank'] = data['Pregnancies'].rank()

        # Buckets are left-closed ([lo, hi)). The top edge is open-ended so
        # that values at or above the former cap (Age 100, BMI 70, Glucose 200)
        # land in the highest bucket instead of becoming NaN. Values below the
        # first edge still map to NaN.
        data['AgeGroup'] = self._bucket(data['Age'], [20, 30, 50, np.inf], [1, 2, 3])
        data['PregnancyRatio'] = data['Pregnancies'] / (data['Age'] + w)
        data['BMICategory'] = self._bucket(data['BMI'], [0, 18.5, 25, 30, np.inf],
                                           [-1, 0, 2, 3])
        data['GlucoseCategory'] = self._bucket(data['Glucose'], [0, 90, 140, np.inf],
                                               [-1, 0, 1])

        # Fixed-weight linear blend of glucose, BMI and age.
        data['RiskScore'] = (0.5 * data['Glucose'] +
                             0.3 * data['BMI'] +
                             0.2 * data['Age'])

        # `w` is added to numerator and denominator alike, so 0 / 0 yields 1.
        data['InsulinEfficiency'] = ((data['Insulin'] + w) /
                                     (data['Glucose'] + w)).fillna(0)
        data['Glucose_BMI'] = (data['Glucose'] + w) / (data['BMI'] + w)
        data['BMI_Age'] = data['BMI'] * data['Age']

        return data


class WoEEncoding(BaseEstimator, TransformerMixin):
    """Weight-of-Evidence encoder over fixed, hand-chosen bins.

    For each feature listed in ``feature_bins`` the values are cut into the
    given intervals. ``fit`` computes, per interval,
    ``WOE = ln(share of events / share of non-events)`` and the rank of that
    WOE among the feature's intervals. ``transform`` adds ``<feature>_woe`` and
    ``<feature>_rank`` columns; existing columns with those names are
    overwritten.

    The target is expected to be coded 1 (event) / 0 (non-event).
    """

    def __init__(self):
        # Interval edges per feature; the outer edges are infinite so every
        # non-missing value falls into some interval.
        self.feature_bins = {
            'Pregnancies': [-np.inf, 1.7, 5.1, 8.5, np.inf],
            'Glucose': [-np.inf, 90.6, 119.4, 159.2, np.inf],
            'BMI': [-np.inf, 26.84, 38.26, np.inf],
            'PregnancyRatio': [-np.inf, 0.0737, 0.221, np.inf],
            'RiskScore': [-np.inf, 55.61, 77.51, 99.41, np.inf],
            'BMI_Age': [-np.inf, 539.4, 1078.8, 1618.2, np.inf]
        }
        # feature -> {'woe': {interval: value}, 'rank': {interval: value}},
        # filled in by fit().
        self.woe_mappings = {}

    def fit(self, X, y):
        """Learn the per-interval WOE and rank tables from ``X`` and ``y``.

        ``y`` is matched to the rows of ``X`` by position, so arrays, lists and
        Series with a different index all work. ``X`` is not modified.
        """
        # Re-index the target onto X's index: a plain array paired with a frame
        # whose index is not 0..n-1 would otherwise misalign silently.
        target = pd.Series(np.ravel(y), index=X.index, name='target')

        fitted = {}
        for feature, bins in self.feature_bins.items():
            cat_col = f'{feature}_cat'
            # Bin into a separate one-column frame instead of writing a helper
            # column into the caller's X.
            binned = pd.cut(X[feature], bins=bins).to_frame(name=cat_col)
            woe_df = self._calculate_woe(binned, cat_col, target).set_index(cat_col)
            fitted[feature] = {
                'woe': woe_df['WOE'].to_dict(),
                'rank': woe_df['rank'].to_dict()
            }

        self.woe_mappings = fitted
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``<feature>_woe`` / ``<feature>_rank`` added.

        Raises ``KeyError`` if ``fit`` has not been called.
        """
        data = X.copy()
        for feature, bins in self.feature_bins.items():
            binned = pd.cut(data[feature], bins=bins)
            tables = self.woe_mappings[feature]
            data[f'{feature}_woe'] = pd.to_numeric(binned.map(tables['woe']),
                                                   errors='coerce')
            data[f'{feature}_rank'] = pd.to_numeric(binned.map(tables['rank']),
                                                    errors='coerce')

        return data

    def _calculate_woe(self, data, feature_name, y):
        """Build the per-interval WOE table for the binned column ``feature_name``.

        Returns a frame with one row per interval holding the event and
        non-event counts, their shares, ``WOE`` and ``rank``.
        """
        data = data.copy()
        data['target'] = y
        grouped = data.groupby(feature_name, observed=False)['target'].value_counts().unstack(fill_value=0)

        grouped = grouped.rename(columns={1: '# of events', 0: '# of non-events'})
        # If one class is absent from the data, unstack() produces no column
        # for it; add it with zero counts rather than failing with a KeyError.
        for count_col in ('# of events', '# of non-events'):
            if count_col not in grouped.columns:
                grouped[count_col] = 0

        grouped['Percentage events'] = grouped['# of events'] / grouped['# of events'].sum()
        grouped['Percentage non-events'] = grouped['# of non-events'] / grouped['# of non-events'].sum()
        # NOTE: no smoothing is applied, so an interval with zero events or
        # zero non-events gets an infinite (or NaN) WOE.
        grouped['WOE'] = np.log(grouped['Percentage events'] / grouped['Percentage non-events'])
        grouped['rank'] = grouped['WOE'].rank()
        return grouped.reset_index()

In [8]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows the input to a fixed set of columns."""

    def __init__(self, columns):
        # Column label(s) used to index the input in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the instance is returned as is."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured columns."""
        wanted = self.columns
        return X[wanted]

## Model

In [9]:
# Label column, and every other column as the feature matrix.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# 80/20 hold-out split, stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shuffled, seeded 5-fold stratified splitter shared by the models below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [49]:
# Random forest base learner on raw, engineered and WoE columns.
rf_col = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction',
          'Age', 'PregnancyRatio', 'RiskScore', 'InsulinEfficiency', 'Glucose_BMI',
          'BMI_Age', 'Glucose_rank', 'Glucose_woe', 'RiskScore_rank', 'RiskScore_woe']
rf_model = Pipeline(
    [('colum_selector', ColumnSelector(rf_col)),
     ('rf_model', RandomForestClassifier(max_depth=6,
                                      n_estimators=300,
                                      criterion='entropy',
                                      # Seeded like the other models in this cell;
                                      # without it every run gives a different forest
                                      # and a different score.
                                      random_state=42))])

# Gradient-boosting base learner on a compact set of engineered features.
gb_col = [
    'BMI', 'DiabetesPedigreeFunction', 'PregnancyRatio',
    'RiskScore', 'Glucose_BMI', 'BMI_Age',
]
gb_model = Pipeline(steps=[
    ('colum_selector', ColumnSelector(gb_col)),
    ('gb_model', GradientBoostingClassifier(
        criterion='friedman_mse',
        n_estimators=177,
        learning_rate=0.03324793834648156,
        subsample=0.5158427523434093,
        max_depth=3,
        max_features='sqrt',
        min_samples_split=17,
        min_samples_leaf=2,
        min_impurity_decrease=0.03,
        random_state=42,
    )),
])

# L2-regularised logistic regression on standardised raw, bucket and WoE columns.
lg_col = [
    'Pregnancies', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age',
    'Pregnancies_rank', 'AgeGroup', 'PregnancyRatio', 'BMICategory', 'GlucoseCategory',
    'RiskScore', 'InsulinEfficiency', 'Pregnancies_woe', 'Glucose_rank', 'Glucose_woe',
    'BMI_rank', 'BMI_woe', 'PregnancyRatio_rank', 'PregnancyRatio_woe', 'RiskScore_rank',
    'RiskScore_woe', 'BMI_Age_rank', 'BMI_Age_woe',
]
ln_model = Pipeline(steps=[
    ('colum_selector', ColumnSelector(lg_col)),
    ('scaler', StandardScaler()),
    ('ln_model', LogisticRegression(
        solver='lbfgs',
        penalty='l2',
        C=0.12165146592065838,
        max_iter=1000,
    )),
])
# XGBoost base learner (GPU) on four features.
xgb_col = ['Pregnancies', 'BMI', 'RiskScore', 'BMI_Age']
xgb_model = Pipeline(
    [('colum_selector', ColumnSelector(xgb_col)),
     ('xgb_model', XGBClassifier(
            n_estimators=947,
            max_depth=4,
            learning_rate=0.016838244125398805,
            subsample=0.780145921505157,
            colsample_bytree=0.9314800015785268,
            colsample_bylevel=0.5367494060666282,
            colsample_bynode=0.5416300057586382,
            min_child_weight=1,
            gamma=3.249425146724733,
            reg_alpha=0.05071259830380429,
            reg_lambda=0.0006700960560036836,
            scale_pos_weight=4.085968523187387,
            random_state=42,
            # XGBoost 2.x selects the GPU with `device='cuda'` together with
            # `tree_method='hist'`. The older `tree_method='gpu_hist'` and
            # `predictor='gpu_predictor'` spellings are deprecated, and
            # `use_label_encoder` is no longer used by recent releases, so all
            # three are dropped here.
            tree_method='hist',
            device='cuda'
        ))])

# CatBoost base learner (GPU device 0) on the compact engineered feature set.
cat_gb_col = [
    'BMI', 'DiabetesPedigreeFunction', 'PregnancyRatio',
    'RiskScore', 'Glucose_BMI', 'BMI_Age',
]
catboost_model = Pipeline(steps=[
    ('colum_selector', ColumnSelector(cat_gb_col)),
    ('cat_model', CatBoostClassifier(
        loss_function='Logloss',
        iterations=184,
        learning_rate=0.024348893731335558,
        depth=9,
        l2_leaf_reg=1.6550796085362447,
        border_count=207,
        random_strength=6.501714632959864,
        bagging_temperature=0.2639628599253254,
        scale_pos_weight=1.8796468107471038,
        # Left disabled by the author:
        # subsample=0.7680238761358753,
        # bootstrap_type='Bernoulli',
        task_type="GPU",
        devices='0',
        random_seed=42,
    )),
])

# LightGBM base learner: 20 trees at a small learning rate.
lgb_col = [
    'BMI', 'DiabetesPedigreeFunction', 'PregnancyRatio', 'RiskScore',
    'InsulinEfficiency', 'Glucose_BMI', 'BMI_Age',
]
lgb_model = Pipeline(steps=[
    ('colum_selector', ColumnSelector(lgb_col)),
    ('lgb_model', LGBMClassifier(
        learning_rate=0.005,
        n_estimators=20,
        random_state=42,
    )),
])


# Base learners handed to the StackingClassifier as (name, pipeline) pairs.
# Only the random forest is enabled; the other pipelines defined in this cell
# are commented out. The trailing dashes on two entries are the author's own
# marks, and their meaning is not recorded in the notebook.
estemators = [
    ('rf_model', rf_model),
    # ('gb_model', gb_model),
    # ('ln_model', ln_model), ---
    # ('xgb_model', xgb_model), -----
    # ('cat_model', catboost_model),
    # ('lgb_model', lgb_model)
]

# Full pipeline: engineered features -> WoE encoding -> stacked ensemble.
stacking_model = Pipeline(
    [('feature_engineering', FeatureEngineering()),
     ('woe_encoding', WoEEncoding()),
     ('stacking', StackingClassifier(
         estimators=estemators,
         cv=cv,
         final_estimator=RandomForestClassifier(max_depth=6,
                                                n_estimators=300,
                                                criterion='entropy',
                                                # Seed the meta-learner so repeated
                                                # runs do not drift because of it.
                                                random_state=42),
         # The meta-learner sees the stacking step's input features as well
         # as the base learners' predictions.
         passthrough=True
     ))]
)

# ROC AUC of the whole pipeline on each fold of the shared splitter.
cv_roc_auc = cross_val_score(stacking_model, X, y, cv=cv, scoring='roc_auc')

In [50]:
# Mean ROC AUC across the cross-validation folds.
cv_roc_auc.mean()

0.8349706498951782

0.8432990915443745