In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from xgboost import XGBRFClassifier, XGBRFRegressor 

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from catboost import CatBoostClassifier

from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import auc

# I/ Preprocessing

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
df = pd.read_csv('diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
df.head(5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [4]:
print(f"Dataset Shape: {df.shape}")
 
summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
summary = summary.reset_index()
summary['Name'] = summary['index']
summary['Missing'] = df.isnull().sum().values
summary['PercMissing'] = df.isnull().sum().values / df.isnull().count().values
summary['Uniques'] = df.nunique().values
summary['Data type'] = df.dtypes.values
 
print('**Variable Description of  data:**')
summary

Dataset Shape: (70692, 22)
**Variable Description of  data:**


Unnamed: 0,index,dtypes,Name,Missing,PercMissing,Uniques,Data type
0,Diabetes_binary,float64,Diabetes_binary,0,0.0,2,float64
1,HighBP,float64,HighBP,0,0.0,2,float64
2,HighChol,float64,HighChol,0,0.0,2,float64
3,CholCheck,float64,CholCheck,0,0.0,2,float64
4,BMI,float64,BMI,0,0.0,80,float64
5,Smoker,float64,Smoker,0,0.0,2,float64
6,Stroke,float64,Stroke,0,0.0,2,float64
7,HeartDiseaseorAttack,float64,HeartDiseaseorAttack,0,0.0,2,float64
8,PhysActivity,float64,PhysActivity,0,0.0,2,float64
9,Fruits,float64,Fruits,0,0.0,2,float64


In [5]:
target_column = ["Diabetes_binary"]
 
numerical_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth','Age', 'Education', 'Income']
 
categorical_columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 
                    'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']

df[numerical_columns] = df[numerical_columns].astype('float')
df[categorical_columns] = df[categorical_columns].astype('int').astype('category')

df = df[target_column + numerical_columns + categorical_columns]

df[target_column] = df.dropna(subset=target_column)[target_column].astype('int')

In [6]:
df.describe()

Unnamed: 0,Diabetes_binary,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income
count,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0
mean,0.5,29.856985,2.837082,3.752037,5.810417,8.584055,4.920953,5.698311
std,0.500004,7.113954,1.113565,8.155627,10.062261,2.852153,1.029081,2.175196
min,0.0,12.0,1.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,25.0,2.0,0.0,0.0,7.0,4.0,4.0
50%,0.5,29.0,3.0,0.0,0.0,9.0,5.0,6.0
75%,1.0,33.0,4.0,2.0,6.0,11.0,6.0,8.0
max,1.0,98.0,5.0,30.0,30.0,13.0,6.0,8.0


In [7]:
X = df.drop(target_column, axis=1)
X_ = X.copy(deep=True)
y = df[target_column]

In [8]:
for col in X.columns: X[col] = X[col].sample(frac=1-np.random.rand()*0.35)

In [9]:
print(f"Dataset Shape: {X.shape}")
 
summary = pd.DataFrame(X.dtypes,columns=['dtypes'])
summary = summary.reset_index()
summary['Name'] = summary['index']
summary['Missing'] = X.isnull().sum().values
summary['PercMissing'] = X.isnull().sum().values / X.isnull().count().values
summary['Uniques'] = X.nunique().values
summary['Data type'] = X.dtypes.values

print('**Variable Description of  data:**')
summary

Dataset Shape: (70692, 21)
**Variable Description of  data:**


Unnamed: 0,index,dtypes,Name,Missing,PercMissing,Uniques,Data type
0,BMI,float64,BMI,21833,0.308847,78,float64
1,GenHlth,float64,GenHlth,5084,0.071918,5,float64
2,MentHlth,float64,MentHlth,23711,0.335413,31,float64
3,PhysHlth,float64,PhysHlth,14557,0.205921,31,float64
4,Age,float64,Age,2757,0.039,13,float64
5,Education,float64,Education,22729,0.321522,6,float64
6,Income,float64,Income,2879,0.040726,8,float64
7,HighBP,category,HighBP,5331,0.075412,2,category
8,HighChol,category,HighChol,13049,0.184589,2,category
9,CholCheck,category,CholCheck,8098,0.114553,2,category


# _________________________________________________________
# II/ Miss Forest

In [10]:
X_base = X.copy(deep= True)

X_base[numerical_columns] = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_base[numerical_columns])
X_base[categorical_columns] = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='most_frequent').set_output(transform="pandas").fit_transform(X_base[categorical_columns])).astype("category")

In [11]:
X_new = X_base.copy(deep= True)

col_order = X_new.isnull().sum().sort_values().index

d1_new, d2_new, cond, stop_num, stop_cat, iter_ = 1e10, 1e10, True, False, False, 0

while(cond):
    X_old, d1_old, d2_old = X_new.copy(deep = True), d1_new, d2_new
    for col in col_order:
        if ((col in numerical_columns) & ~stop_num) | ((col in categorical_columns) & ~stop_cat):
            train_index, test_index = np.flatnonzero(~X[col].isnull()), np.flatnonzero(X[col].isnull())
            rf = XGBRFRegressor(enable_categorical = True) if col in numerical_columns else XGBRFClassifier(enable_categorical = True)
            rf.fit(X_new.iloc[train_index].drop(col, axis = 1), X_new.loc[train_index, col])
            X_new.loc[test_index, col] = rf.predict(X_new.iloc[test_index].drop(col, axis=1))

    d1_new = np.linalg.norm(X_new[numerical_columns] - X_old[numerical_columns])
    d2_new = np.linalg.norm(X_new[categorical_columns] == X_old[categorical_columns])
    stop_num, stop_cat = (d1_new >= d1_old), (d2_new >= d2_old)

    if (stop_num & stop_cat): cond = False
    iter_+=1

X_mf = X_old.copy(deep = True)
iter_

14

In [12]:
for col in numerical_columns:
    print(col+":")
    train_index, test_index = np.flatnonzero(~X[col].isnull()), np.flatnonzero(X["Income"].isnull())
    print("Test loss: (RMSE)", mean_squared_error(X_mf.loc[test_index, col], X_.loc[test_index, col]))
    print("Test loss: (RMSE - Random)",  mean_squared_error(X_base.loc[test_index, col], X_.loc[test_index, col]))
    print("-"*30)

BMI:
Test loss: (RMSE) 11.4454558658485
Test loss: (RMSE - Random) 13.418198922199512
------------------------------
GenHlth:
Test loss: (RMSE) 0.061401038807171876
Test loss: (RMSE - Random) 0.07910246924334374
------------------------------
MentHlth:
Test loss: (RMSE) 19.570296819602685
Test loss: (RMSE - Random) 22.871328751921585
------------------------------
PhysHlth:
Test loss: (RMSE) 12.759368958968091
Test loss: (RMSE - Random) 21.589106254918594
------------------------------
Age:
Test loss: (RMSE) 0.2278215912551837
Test loss: (RMSE - Random) 0.26954047174879336
------------------------------
Education:
Test loss: (RMSE) 0.36398278991990496
Test loss: (RMSE - Random) 0.35243546243505036
------------------------------
Income:
Test loss: (RMSE) 4.03053870208834
Test loss: (RMSE - Random) 4.717849987815738
------------------------------


In [13]:
for col in categorical_columns:
    print(col+":")
    train_index, test_index = np.flatnonzero(~X[col].isnull()), np.flatnonzero(X["Income"].isnull())
    print("Test loss: (RMSE)", f1_score(X_mf.loc[test_index, col], X_.loc[test_index, col], average= 'macro'))
    print("Test loss: (RMSE - Random)",  f1_score(X_base.loc[test_index, col], X_.loc[test_index, col], average= 'macro'))
    print("-"*30)

HighBP:
Test loss: (RMSE) 0.9766623532047352
Test loss: (RMSE - Random) 0.9614821634563928
------------------------------
HighChol:
Test loss: (RMSE) 0.9382456651458225
Test loss: (RMSE - Random) 0.9090291212597921
------------------------------
CholCheck:
Test loss: (RMSE) 0.9698767447213678
Test loss: (RMSE - Random) 0.9698767447213678
------------------------------
Smoker:
Test loss: (RMSE) 0.911199208933966
Test loss: (RMSE - Random) 0.8836980634536724
------------------------------
Stroke:
Test loss: (RMSE) 0.9821677232919862
Test loss: (RMSE - Random) 0.9820645910320123
------------------------------
HeartDiseaseorAttack:
Test loss: (RMSE) 0.9714432515215716
Test loss: (RMSE - Random) 0.9716622487122817
------------------------------
PhysActivity:
Test loss: (RMSE) 0.9007089624449454
Test loss: (RMSE - Random) 0.8910027589647265
------------------------------
Fruits:
Test loss: (RMSE) 1.0
Test loss: (RMSE - Random) 1.0
------------------------------
Veggies:
Test loss: (RMSE) 0.8

# _________________________________________________________
# III/ Modelisation
## a) Fill na mean/median

In [14]:
 X_train, X_test, y_train, y_test = train_test_split(X_base, y, test_size=0.2, random_state=42)
 
n_train=y.size
n_test=y_test.size
 
n,p=X.shape
n_classes= np.unique(y).size
 
#matrice des probas
lb = LabelBinarizer().fit(y)
Y_train = lb.transform(y_train) 
Y_test = lb.transform(y_test)

Balance_train = np.unique(y_train, return_counts=True) 
Balance_test = np.unique(y_test, return_counts=True)

In [15]:
#Train model
clf_catboost = CatBoostClassifier(cat_features = sorted([X.columns.get_loc(col) for col in categorical_columns]))
clf_catboost = clf_catboost.fit(X_train, y_train, verbose = 100, eval_set = (X_test, y_test))

Learning rate set to 0.085855
0:	learn: 0.6664734	test: 0.6666658	best: 0.6666658 (0)	total: 258ms	remaining: 4m 17s
100:	learn: 0.5072018	test: 0.5159570	best: 0.5159570 (100)	total: 5.74s	remaining: 51.1s
200:	learn: 0.4976915	test: 0.5149951	best: 0.5148486 (182)	total: 11.8s	remaining: 46.8s
300:	learn: 0.4901260	test: 0.5154047	best: 0.5148486 (182)	total: 17.1s	remaining: 39.7s
400:	learn: 0.4835590	test: 0.5162475	best: 0.5148486 (182)	total: 23.1s	remaining: 34.5s
500:	learn: 0.4774220	test: 0.5171395	best: 0.5148486 (182)	total: 28.9s	remaining: 28.8s
600:	learn: 0.4719467	test: 0.5180964	best: 0.5148486 (182)	total: 34.3s	remaining: 22.7s
700:	learn: 0.4669674	test: 0.5190396	best: 0.5148486 (182)	total: 40.1s	remaining: 17.1s
800:	learn: 0.4623282	test: 0.5197928	best: 0.5148486 (182)	total: 45.7s	remaining: 11.4s
900:	learn: 0.4577909	test: 0.5208418	best: 0.5148486 (182)	total: 51.1s	remaining: 5.61s
999:	learn: 0.4535154	test: 0.5216308	best: 0.5148486 (182)	total: 57s	re

In [16]:
#Result model
Y_hat_train = clf_catboost.predict_proba(X_train)
Y_hat_test = clf_catboost.predict_proba(X_test)

In [17]:
#Loss
print("Train loss: ", log_loss(Y_train, Y_hat_train))
print("Test loss: ", log_loss(Y_test, Y_hat_test))
 
print("Test loss (random): ", log_loss(Y_test, np.ones((n_test,2))*0.5))

Train loss:  0.4992771847386423
Test loss:  0.5148485664358327
Test loss (random):  0.6931471805599453


In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, Y_hat_test[:,1])
fpr, tpr, thresholds = roc_curve(y_test, Y_hat_test[:,1])

print("AUC PR (CatBoost - FillNa Mean/Mode): ", auc(recall, precision))
print("AUC ROC (CatBoost - FillNa Mean/Mode): ", auc(fpr, tpr))


AUC PR (CatBoost - FillNa Mean/Mode):  0.7904875295368167
AUC ROC (CatBoost - FillNa Mean/Mode):  0.8202104310727587


# _________________________________________________________
# III/ Modelisation
## b) Fill na RF

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_mf, y, test_size=0.2, random_state=42)
 
n_train=y.size
n_test=y_test.size
 
n,p=X.shape
n_classes= np.unique(y).size
 
#matrice des probas
lb = LabelBinarizer().fit(y)
Y_train = lb.transform(y_train) 
Y_test = lb.transform(y_test)

Balance_train = np.unique(y_train, return_counts=True) 
Balance_test = np.unique(y_test, return_counts=True)

In [20]:
#Train model
clf_catboost = CatBoostClassifier(cat_features = sorted([X.columns.get_loc(col) for col in categorical_columns]))
clf_catboost = clf_catboost.fit(X_train, y_train, verbose = 100, eval_set = (X_test, y_test))

Learning rate set to 0.085855
0:	learn: 0.6647104	test: 0.6648375	best: 0.6648375 (0)	total: 125ms	remaining: 2m 5s
100:	learn: 0.5059165	test: 0.5147995	best: 0.5147995 (100)	total: 7.28s	remaining: 1m 4s
200:	learn: 0.4970306	test: 0.5137994	best: 0.5137422 (191)	total: 14.1s	remaining: 56.2s
300:	learn: 0.4896183	test: 0.5143185	best: 0.5137422 (191)	total: 25.8s	remaining: 59.9s
400:	learn: 0.4827604	test: 0.5150872	best: 0.5137422 (191)	total: 43.4s	remaining: 1m 4s
500:	learn: 0.4764850	test: 0.5160401	best: 0.5137422 (191)	total: 1m	remaining: 59.9s
600:	learn: 0.4704138	test: 0.5170222	best: 0.5137422 (191)	total: 1m 17s	remaining: 51.4s
700:	learn: 0.4648397	test: 0.5178399	best: 0.5137422 (191)	total: 1m 35s	remaining: 40.9s
800:	learn: 0.4595784	test: 0.5189618	best: 0.5137422 (191)	total: 1m 54s	remaining: 28.5s
900:	learn: 0.4543682	test: 0.5198098	best: 0.5137422 (191)	total: 2m 12s	remaining: 14.6s
999:	learn: 0.4496056	test: 0.5205258	best: 0.5137422 (191)	total: 2m 30s

In [21]:
#Loss
print("Train loss: ", log_loss(Y_train, Y_hat_train))
print("Test loss: ", log_loss(Y_test, Y_hat_test))

Train loss:  0.4992771847386423
Test loss:  0.5148485664358327


In [22]:
#Result model
Y_hat_train = clf_catboost.predict_proba(X_train)
Y_hat_test = clf_catboost.predict_proba(X_test)

In [23]:
precision, recall, thresholds = precision_recall_curve(y_test, Y_hat_test[:,1])
fpr, tpr, thresholds = roc_curve(y_test, Y_hat_test[:,1])

print("AUC PR (CatBoost - FillNa MissForest): ", auc(recall, precision))
print("AUC ROC (CatBoost - FillNa MissForest): ", auc(fpr, tpr))

AUC PR (CatBoost - FillNa MissForest):  0.7915241456724408
AUC ROC (CatBoost - FillNa MissForest):  0.8206230374883372
