### Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

### Dataset

In [2]:
# Read the three competition CSVs from the working directory, in the same
# order as before: training data, test data, submission template.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,PatientID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,target
0,42351,1,1,1,29,0,0,0,1,1,...,0,3,0,0,0,0,13,5,8,0
1,135091,1,0,1,30,0,1,2,0,0,...,0,2,0,0,0,0,9,5,6,0
2,201403,0,0,1,31,0,0,0,1,1,...,0,2,0,7,0,0,10,6,8,0
3,72750,0,0,1,36,0,0,2,0,0,...,0,2,0,0,0,0,11,5,6,0
4,133895,0,1,1,29,0,0,0,1,1,...,0,4,0,0,1,1,10,6,7,0


In [4]:
# Model inputs: the training frame without the identifier and the label.
X = train_df.drop(columns=['PatientID', 'target'])
# Label vector, taken as an independent copy of the 'target' column.
y = train_df['target'].copy()

# Rows to score for the submission: the test frame without the identifier.
test = test_df.drop(columns='PatientID')

### Splitting Dataset

In [5]:
# One stratified shuffle split: 75% of the rows go to X_Train/y_Train (used for
# cross-validation below) and 25% to X_Test/y_Test (kept aside as a hold-out).
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=2021)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positions.
train_index, test_index = next(strat_split.split(X, y))
X_Train, X_Test = X.iloc[train_index], X.iloc[test_index]
y_Train, y_Test = y.iloc[train_index], y.iloc[test_index]

### Model fitting

In [6]:
# Shuffled, seeded 10-fold stratified splitter used by the model-fitting loop.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2022)

### XGBoost

In [7]:
# XGBoost classifier: small learning rate with up to 1500 trees; early stopping
# inside the loop decides how many of them each fold actually uses.
model_xgb = XGBClassifier(learning_rate=0.01, subsample=0.7, colsample_bytree=0.9, reg_alpha=10,
                          n_jobs=-1, n_estimators=1500, max_depth=5, random_state=34)

# Fold count is read from the splitter so the averaging at the end cannot drift
# out of sync with the StratifiedKFold configuration.
n_folds = skfold.get_n_splits()

# Per-fold ROC AUC on the fitting rows and on the held-out rows.
train_roc_xgb, val_roc_xgb = [], []

# In-fold predictions: every fold overwrites the rows it trained on, so after
# the loop each entry comes from the last fold that trained on that row.
train_pred_xgb = np.zeros(len(X_Train))
# Out-of-fold predictions, filled from each fold's held-out rows.
val_pred_xgb = np.zeros(len(X_Train))
# Running sums over folds; divided by n_folds after the loop.
test_pred_xgb = np.zeros(len(test))
validation_pred_xgb = np.zeros(len(X_Test))

for fold, (train_index, val_index) in enumerate(skfold.split(X_Train, y_Train), start=1):
    X_train = X_Train.iloc[train_index] ; y_train = y_Train.iloc[train_index].values
    X_val = X_Train.iloc[val_index] ; y_val = y_Train.iloc[val_index].values
    print(f'***************************Fold :{fold}***********************************************')

    # NOTE: the held-out fold is also the early-stopping eval_set, so the
    # validation AUC below is measured on the data that chose the tree count.
    model_xgb.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric="auc",
                  eval_set=[(X_val, y_val)], verbose=250)

    # Use the early-stopped tree count for every prediction made in this fold,
    # including the test and hold-out predictions, so all scores are consistent.
    best_limit = model_xgb.get_booster().best_ntree_limit

    train_pred = model_xgb.predict_proba(X_train, ntree_limit = best_limit)[:,1]
    val_pred = model_xgb.predict_proba(X_val, ntree_limit = best_limit)[:,1]

    train_pred_xgb[train_index] = train_pred
    val_pred_xgb[val_index] = val_pred

    test_pred_xgb += model_xgb.predict_proba(test, ntree_limit = best_limit)[:,1]
    validation_pred_xgb += model_xgb.predict_proba(X_Test, ntree_limit = best_limit)[:,1]

    # Score once, then reuse the values for both the log line and the history.
    train_auc = roc_auc_score(y_train, train_pred)
    val_auc = roc_auc_score(y_val, val_pred)

    print(f'Train score : {train_auc}')
    print(f'Validation score : {val_auc}\n')

    train_roc_xgb.append(train_auc); val_roc_xgb.append(val_auc)

# Turn the accumulated sums into the mean prediction across folds.
test_pred_xgb = test_pred_xgb / n_folds
validation_pred_xgb = validation_pred_xgb / n_folds
print(f'Training ROC score : {np.mean(train_roc_xgb)}')
# The label below is kept unchanged, but the number is the mean cross-validation
# AUC over the held-out folds; it is not computed on `test` or on X_Test.
print(f'Testing ROC score : {np.mean(val_roc_xgb)} +/- {np.std(val_roc_xgb)}')

***************************Fold :1***********************************************
[0]	validation_0-auc:0.82407
[250]	validation_0-auc:0.85413
[500]	validation_0-auc:0.85898
[750]	validation_0-auc:0.85977
[1000]	validation_0-auc:0.85984
[1079]	validation_0-auc:0.85983
Train score : 0.8548752879644992
Validation score : 0.8598786221163094

***************************Fold :2***********************************************
[0]	validation_0-auc:0.81182
[250]	validation_0-auc:0.84304
[500]	validation_0-auc:0.84618
[750]	validation_0-auc:0.84712
[1000]	validation_0-auc:0.84754
[1250]	validation_0-auc:0.84768
[1499]	validation_0-auc:0.84768
Train score : 0.8590275631473014
Validation score : 0.8477394467223942

***************************Fold :3***********************************************
[0]	validation_0-auc:0.81737
[250]	validation_0-auc:0.85040
[500]	validation_0-auc:0.85377
[750]	validation_0-auc:0.85479
[1000]	validation_0-auc:0.85517
[1250]	validation_0-auc:0.85530
[1486]	validation_0-

### Choosing a Threshold

In [8]:
# Pair the cross-validation predictions (val_pred_xgb) with the true labels,
# keeping y_Train's row index. Note this rebinds the name `train_pred`.
train_pred = pd.DataFrame(
    {'Pred_proba': val_pred_xgb, 'Target': y_Train.values},
    index=y_Train.index,
)

In [9]:
# Percentage of Target == 0 rows whose predicted probability is below 0.13.
float(train_pred.loc[train_pred['Target'].eq(0), 'Pred_proba'].lt(0.13).mean() * 100)

79.93169648037932

In [10]:
# Percentage of Target == 1 rows whose predicted probability is at least 0.13.
float(train_pred.loc[train_pred['Target'].eq(1), 'Pred_proba'].ge(0.13).mean() * 100)

72.74394132653062

In [11]:
# Summary statistics restricted to the Target == 1 rows.
train_pred.loc[train_pred['Target'] == 1].describe()

Unnamed: 0,Pred_proba,Target
count,12544.0,12544.0
mean,0.258929,1.0
std,0.169924,0.0
min,0.001949,1.0
25%,0.12085,1.0
50%,0.232234,1.0
75%,0.374559,1.0
max,0.813601,1.0


In [12]:
# Summary statistics restricted to the Target == 0 rows.
train_pred.loc[train_pred['Target'] == 0].describe()

Unnamed: 0,Pred_proba,Target
count,120638.0,120638.0
mean,0.077075,0.0
std,0.105755,0.0
min,0.001515,0.0
25%,0.009162,0.0
50%,0.030296,0.0
75%,0.101537,0.0
max,0.8034,0.0


In [13]:
# Preview the first five rows of the prediction/label frame.
train_pred.head(n=5)

Unnamed: 0,Pred_proba,Target
86585,0.003404,0
71512,0.330358,0
142287,0.008621,0
135730,0.004984,0
64399,0.006419,0


In [14]:
# Hard 0/1 label: 1 where the predicted probability is at least 0.12, else 0.
# The column name (including its spelling) is what the later cells look up.
train_pred['treshold_0.12'] = train_pred['Pred_proba'].ge(0.12).astype('int64')

In [15]:
# F1 of the thresholded cross-validation predictions against the true labels.
f1_score(y_true=train_pred['Target'], y_pred=train_pred['treshold_0.12'])

0.39204521840322515

### Model Evaluation

In [20]:
# NOTE(review): `test_pred` is only assigned in a cell further down the
# notebook, so on a fresh kernel run top-to-bottom this cell raises NameError.
# It should sit directly after the cell that builds `test_pred`.
test_pred.head(n=5)

Unnamed: 0,Pred_proba,treshold_0.12
0,0.022175,0
1,0.029917,0
2,0.133997,1
3,0.021714,0
4,0.154727,1


In [17]:
# Fold-averaged probabilities for the hold-out rows (X_Test), plus the hard
# 0/1 label obtained with the same 0.12 cut-off used on the training side.
validation_pred = pd.DataFrame({'Pred_proba': validation_pred_xgb})
validation_pred['treshold_0.12'] = validation_pred['Pred_proba'].ge(0.12).astype('int64')

In [18]:
# Preview the first five hold-out predictions.
validation_pred.head(n=5)

Unnamed: 0,Pred_proba,treshold_0.12
0,0.003819,0
1,0.056411,0
2,0.003566,0
3,0.004894,0
4,0.02117,0


In [21]:
# F1 of the thresholded hold-out predictions against the hold-out labels.
f1_score(y_true=y_Test, y_pred=validation_pred['treshold_0.12'])

0.39308507342462357

In [19]:
# Fold-averaged probabilities for the competition test rows, plus the hard
# 0/1 label obtained with the 0.12 cut-off.
test_pred = pd.DataFrame({'Pred_proba': test_pred_xgb})
test_pred['treshold_0.12'] = test_pred['Pred_proba'].ge(0.12).astype('int64')

### Submission file

In [25]:
# Fill the submission template's 'target' column with the thresholded test
# predictions and write it out without the index.
# NOTE(review): values are matched to template rows by index label, which
# presumes sample_submission lists patients in the same order as test.csv;
# confirm against the PatientID columns.
sub_1 = sample_submission.assign(target=test_pred['treshold_0.12'])
sub_1.to_csv('Baseline_model.csv', index=False)

In [26]:
# Preview the first five rows of the submission frame.
sub_1.head(n=5)

Unnamed: 0,PatientID,target
0,222135,0
1,66531,0
2,212920,1
3,11402,0
4,136077,1
