
# **GOOD-HEALTH-AND-WELL-BEING**

### ***Setting Up***

### Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.cluster import KMeans

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

### **Dataset Preprocessing**

In [3]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    np.random.seed(seed)
    random.seed(seed)


# Seed value applied to both global RNGs as soon as the helper is defined.
SEED = 2022
seed_everything(SEED)

In [4]:
# Read the three competition files: training data (contains 'target'),
# test data, and the sample submission that later cells copy and fill in.
train_df, test_df, sample_submission = map(
    pd.read_csv,
    (
        'Ml-Olympiad/train.csv',
        'Ml-Olympiad/test.csv',
        'Ml-Olympiad/sample_submission.csv',
    ),
)

In [5]:
# Preview the first three training rows.
train_df.iloc[:3]

Unnamed: 0,PatientID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,target
0,42351,1,1,1,29,0,0,0,1,1,1,0,1,0,3,0,0,0,0,13,5,8,0
1,135091,1,0,1,30,0,1,2,0,0,0,0,0,0,2,0,0,0,0,9,5,6,0
2,201403,0,0,1,31,0,0,0,1,1,1,0,1,0,2,0,7,0,0,10,6,8,0


In [6]:
# Label vector, then the predictors (identifier and label removed).
y = train_df['target']
X_df = train_df.drop(columns = ['PatientID', 'target'])

# The test frame only carries the identifier in addition to the predictors.
test_df = test_df.drop(columns = 'PatientID')

In [7]:
# Total number of missing cells in the training frame and in the test frame.
# (Previously the second value re-read train.csv, so it could only repeat the first.)
train_df.isnull().sum().sum(), test_df.isnull().sum().sum()

(0, 0)

 ### **Feature Engineering and Selection**

**1. Feature Interaction**

In [8]:
# Original features plus every pairwise product (no bias column, no squared terms).
feat_interact = PolynomialFeatures(interaction_only = True, include_bias = False)
train_fi = feat_interact.fit_transform(X_df)
# Apply the transformer fitted on the training frame; it must not be refitted on test data.
test_fi = feat_interact.transform(test_df)

# Positional names col_1 .. col_N, with N taken from the transformer output
# instead of a hard-coded 231 so the cell survives a change in the input columns.
col_names = [f'col_{n}' for n in range(1, train_fi.shape[1] + 1)]

train_fi = pd.DataFrame(train_fi, columns = col_names)
test_fi = pd.DataFrame(test_fi, columns = col_names)

In [9]:
# Shapes of the interaction-feature frames (train, test).
tuple(frame.shape for frame in (train_fi, test_fi))

((177576, 231), (76104, 231))

In [10]:
# Interaction columns kept for modelling ('col_18' is deliberately left out).
imp_featues = [
    'col_208', 'col_34', 'col_22', 'col_57', 'col_53', 'col_26', 'col_119',
    'col_63', 'col_38', 'col_39', 'col_23', 'col_226', 'col_44', 'col_17',
    'col_74', 'col_42', 'col_37', 'col_6', 'col_75',
]

# Append the selected interaction columns to the original predictors, column-wise.
X, test = (
    pd.concat([base, interactions[imp_featues]], axis = 1)
    for base, interactions in ((X_df, train_fi), (test_df, test_fi))
)

**2. Clustering**

In [11]:
# Four groups of columns; each group is clustered on its own and the resulting
# cluster id is stored as a new column in both X and test.
cluster_1 = ['HighBP', 'HighChol', 'CholCheck', 'BMI']
cluster_2 = ['Smoker', 'Stroke','Diabetes', 'PhysActivity']
cluster_3 = ['Fruits', 'Veggies', 'HvyAlcoholConsump','AnyHealthcare']
cluster_4 = ['NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth']

cluster_zip = [('cluster_1',cluster_1),('cluster_2',cluster_2),('cluster_3',cluster_3),('cluster_4',cluster_4)]

for cluster_name, cluster_col in cluster_zip:
  # Explicit random_state: without it the cluster ids depend on the state of
  # NumPy's global RNG, i.e. on which cells ran before this one.
  cluster = KMeans(n_clusters = 4, random_state = SEED)
  # Fit on the training rows only, then label both frames with the same model.
  cluster.fit(X[cluster_col])
  X[cluster_name] = cluster.predict(X[cluster_col])
  test[cluster_name] = cluster.predict(test[cluster_col])

**3. One-Hot Encoding of `Diabetes`**

In [12]:
# Turn Diabetes into strings in both frames so that pd.get_dummies in the
# following cell one-hot encodes it.
X['Diabetes'] = X['Diabetes'].map(str)
test['Diabetes'] = test['Diabetes'].map(str)

In [13]:
# One-hot encode train and test in a single pass. Encoding the two frames
# separately can silently produce different columns (or drop a different
# first level) when a category is missing from one of them.
combined = pd.get_dummies(pd.concat([X, test], keys = ['train', 'test']), drop_first = True)
# Split back on the outer key; each part keeps its original row index.
X = combined.xs('train')
test = combined.xs('test')

### **Splitting Dataset**

In [14]:
# Single stratified 75/25 split: *_Train is used for cross-validation,
# *_Test is kept aside as a hold-out set.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size = 0.25, random_state = 2021)
train_index, test_index = next(strat_split.split(X, y))
X_Train, X_Test = X.iloc[train_index], X.iloc[test_index]
y_Train, y_Test = y.iloc[train_index], y.iloc[test_index]

**CV fold**

In [16]:
# Shuffled, stratified 10-fold splitter reused by every model's CV loop.
skfold = StratifiedKFold(
    n_splits = 10,
    shuffle = True,
    random_state = 2022,
)

# **XGBoost CV**

In [17]:
# XGBoost classifier with fixed hyper-parameters.
model_xgb = XGBClassifier(objective='binary:logistic', max_depth =  10,
          learning_rate = 0.06808766268462589, colsample_bytree = 0.5124699707194202,
             subsample =  0.4553243247037658, reg_alpha =  1.5936907054484504, random_state = 117)

# Per-fold ROC AUC on the fitting part and on the held-out part of each fold.
train_roc_xgb, val_roc_xgb = [], []

# In-sample predictions; every fold overwrites the rows it trained on.
train_pred_xgb = np.zeros(len(X_Train))
# Out-of-fold predictions: each row is filled once, by the fold that held it out.
val_pred_xgb = np.zeros(len(X_Train))
# Sums over folds; divided by the number of folds after the loop.
test_pred_xgb = np.zeros(len(test))
validation_pred_xgb = np.zeros(len(X_Test))
fold = 0
# Take the fold count from the splitter so the averages below stay correct
# if n_splits is ever changed (it used to be a hard-coded 10).
n_folds = skfold.get_n_splits()

for train_index, val_index in skfold.split(X_Train, y_Train):
    X_train = X_Train.iloc[train_index] ; y_train = y_Train.iloc[train_index].values
    X_val = X_Train.iloc[val_index] ; y_val = y_Train.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')

    # The held-out part of the fold doubles as the early-stopping set.
    model_xgb.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric="auc",
                       eval_set=[(X_val, y_val)],verbose=250)

    train_pred = model_xgb.predict_proba(X_train, ntree_limit = model_xgb.get_booster().best_ntree_limit)[:,1]
    val_pred = model_xgb.predict_proba(X_val, ntree_limit = model_xgb.get_booster().best_ntree_limit)[:,1]

    train_pred_xgb[train_index] = train_pred
    val_pred_xgb[val_index] = val_pred

    test_pred_xgb += model_xgb.predict_proba(test)[:,1]
    validation_pred_xgb += model_xgb.predict_proba(X_Test)[:,1]

    # Score once per fold and reuse the values for both printing and bookkeeping.
    train_auc = roc_auc_score(y_train, train_pred)
    val_auc = roc_auc_score(y_val, val_pred)
    print(f'Train score : {train_auc}')
    print(f'Validation score : {val_auc}\n')

    train_roc_xgb.append(train_auc); val_roc_xgb.append(val_auc)

test_pred_xgb = test_pred_xgb / n_folds
validation_pred_xgb = validation_pred_xgb / n_folds
print(f'Training ROC score : {np.mean(train_roc_xgb)}')
print(f'Testing ROC score : {np.mean(val_roc_xgb)} +/- {np.std(val_roc_xgb)}')

***************************Fold :1***********************************************
[0]	validation_0-auc:0.835437
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.856378
Train score : 0.8795144646980229
Validation score : 0.8578562738436175

***************************Fold :2***********************************************
[0]	validation_0-auc:0.82246
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.844709
Train score : 0.8828473047426211
Validation score : 0.8452279410210615

***************************Fold :3***********************************************
[0]	validation_0-auc:0.831524
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.851927
Train score : 0.8791251624265176
Validation score : 0.8529934977303398

***************************Fold :4***********************************************
[0]	validation_0-auc:0.838622
Will train until validation_0-auc hasn't improved

### **Choosing a Threshold**

In [18]:
# Out-of-fold XGBoost probabilities next to the true labels, indexed like y_Train.
train_prediction = pd.DataFrame(dict(XGBoost_Prob = val_pred_xgb, Target = y_Train))

In [19]:
# Distribution of predicted probabilities for the positive class (Target == 1).
train_prediction.loc[train_prediction['Target'] == 1].describe()

Unnamed: 0,XGBoost_Prob,Target
count,12544.0,12544.0
mean,0.267931,1.0
std,0.163892,0.0
min,0.009081,1.0
25%,0.136217,1.0
50%,0.241273,1.0
75%,0.373489,1.0
max,0.868208,1.0


In [20]:
# Distribution of predicted probabilities for the negative class (Target == 0).
train_prediction.loc[train_prediction['Target'] == 0].describe()

Unnamed: 0,XGBoost_Prob,Target
count,120638.0,120638.0
mean,0.09692,0.0
std,0.103169,0.0
min,0.008152,0.0
25%,0.022955,0.0
50%,0.067303,0.0
75%,0.123228,0.0
max,0.874465,0.0


In [63]:
# Label a row positive when its out-of-fold probability reaches 0.2, then score with F1.
xgb_train_pred = train_prediction['XGBoost_Prob'].ge(0.2).astype('int64')
f1_score(y_Train, xgb_train_pred)

0.41449916989485336

### **Model Evaluation**

In [64]:
# Fold-averaged XGBoost probabilities on the hold-out set, scored at the same 0.2 cut-off.
val_prediction = pd.DataFrame(dict(XGBoost_Prob = validation_pred_xgb, Target = y_Test))
xgb_val_pred = val_prediction['XGBoost_Prob'].ge(0.2).astype('int64')
f1_score(y_Test, xgb_val_pred)

0.42258198422581983

### **Model Prediction**

In [65]:
# Fold-averaged XGBoost probabilities for the test rows, thresholded at 0.2.
test_prediction = pd.DataFrame(dict(XGBoost_Prob = test_pred_xgb))
xgb_test_pred = test_prediction['XGBoost_Prob'].ge(0.2).astype('int64')

# Fill a copy of the sample submission with the labels and write it out.
sub_1 = sample_submission.assign(target = xgb_test_pred)
sub_1.to_csv('Ml-Olympiad/Xgboost_unbal.csv', index = False)

# **LightGBM CV**

In [24]:
# LightGBM classifier with fixed hyper-parameters.
# NOTE(review): num_leaves=120 is larger than the 2**5 = 32 leaves a depth-5 tree can hold — confirm this is intended.
model_lgb = LGBMClassifier(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    n_estimators = 500,
    learning_rate = 0.008,
    max_depth = 5,
    num_leaves = 120,
    min_child_samples = 10,
    colsample_bytree = 0.9,
    subsample = 0.7,
    subsample_freq = 2,
    reg_lambda = 2,
    reg_alpha = 5,
    random_state = 34,
)

In [25]:
# Per-fold ROC AUC on the fitting part and on the held-out part of each fold.
train_roc_lgb, val_roc_lgb = [], []

# In-sample predictions; every fold overwrites the rows it trained on.
train_pred_lgb = np.zeros(len(X_Train))
# Out-of-fold predictions: each row is filled once, by the fold that held it out.
val_pred_lgb = np.zeros(len(X_Train))
# Sums over folds; divided by the number of folds after the loop.
test_pred_lgb = np.zeros(len(test))
validation_pred_lgb = np.zeros(len(X_Test))
fold = 0
# Take the fold count from the splitter so the averages below stay correct
# if n_splits is ever changed (it used to be a hard-coded 10).
n_folds = skfold.get_n_splits()

for train_index, val_index in skfold.split(X_Train, y_Train):
    X_train = X_Train.iloc[train_index] ; y_train = y_Train.iloc[train_index].values
    X_val = X_Train.iloc[val_index] ; y_val = y_Train.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')

    # The held-out part of the fold doubles as the early-stopping set.
    model_lgb.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=200,
                               verbose=250)
    train_pred = model_lgb.predict_proba(X_train,  num_iteration = model_lgb.best_iteration_)[:,1]
    val_pred = model_lgb.predict_proba(X_val, num_iteration = model_lgb.best_iteration_)[:,1]

    train_pred_lgb[train_index] = train_pred
    val_pred_lgb[val_index] = val_pred

    test_pred_lgb += model_lgb.predict_proba(test)[:,1]
    validation_pred_lgb += model_lgb.predict_proba(X_Test)[:,1]

    # Score once per fold and reuse the values for both printing and bookkeeping.
    train_auc = roc_auc_score(y_train, train_pred)
    val_auc = roc_auc_score(y_val, val_pred)
    print(f'Train score : {train_auc}')
    print(f'Validation score : {val_auc}\n')

    train_roc_lgb.append(train_auc); val_roc_lgb.append(val_auc)

test_pred_lgb = test_pred_lgb / n_folds
validation_pred_lgb = validation_pred_lgb / n_folds
print(f'Training ROC score : {np.mean(train_roc_lgb)}')
print(f'Testing ROC score : {np.mean(val_roc_lgb)} +/- {np.std(val_roc_lgb)}')

***************************Fold :1***********************************************
Training until validation scores don't improve for 200 rounds.
[250]	valid_0's auc: 0.857124
[500]	valid_0's auc: 0.85924
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.85924
Train score : 0.852909444812952
Validation score : 0.859240491614444

***************************Fold :2***********************************************
Training until validation scores don't improve for 200 rounds.
[250]	valid_0's auc: 0.842628
[500]	valid_0's auc: 0.845278
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.845278
Train score : 0.8543661314569557
Validation score : 0.8452780720618852

***************************Fold :3***********************************************
Training until validation scores don't improve for 200 rounds.
[250]	valid_0's auc: 0.851738
[500]	valid_0's auc: 0.853995
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.853995
Train score

### **Choosing Threshold**

In [26]:
# Store the out-of-fold LightGBM probabilities next to the XGBoost ones (positional assignment from a NumPy array).
train_prediction['LightGBM_Prob'] = val_pred_lgb

In [27]:
# Probability summary for rows whose true label is 1.
is_positive = train_prediction['Target'] == 1
train_prediction.loc[is_positive].describe()

Unnamed: 0,XGBoost_Prob,Target,LightGBM_Prob
count,12544.0,12544.0,12544.0
mean,0.267931,1.0,0.249302
std,0.163892,0.0,0.157925
min,0.009081,1.0,0.0056
25%,0.136217,1.0,0.119747
50%,0.241273,1.0,0.229223
75%,0.373489,1.0,0.355282
max,0.868208,1.0,0.730671


In [28]:
# Probability summary for rows whose true label is 0.
is_negative = train_prediction['Target'] == 0
train_prediction.loc[is_negative].describe()

Unnamed: 0,XGBoost_Prob,Target,LightGBM_Prob
count,120638.0,120638.0,120638.0
mean,0.09692,0.0,0.077756
std,0.103169,0.0,0.10069
min,0.008152,0.0,0.005577
25%,0.022955,0.0,0.011976
50%,0.067303,0.0,0.033697
75%,0.123228,0.0,0.10354
max,0.874465,0.0,0.723518


### **Model Evaluation**

In [61]:
# F1 of the out-of-fold LightGBM predictions at the 0.2 cut-off.
lgb_train_pred = train_prediction['LightGBM_Prob'].ge(0.2).astype('int64')
f1_score(train_prediction['Target'], lgb_train_pred)

0.4206873859591397

In [75]:
# Hold-out F1 for the fold-averaged LightGBM probabilities at the 0.2 cut-off.
val_prediction['LightGBM_Prob'] = validation_pred_lgb
lgb_val_pred = val_prediction['LightGBM_Prob'].ge(0.2).astype('int64')
f1_score(y_Test, lgb_val_pred)

0.425044404973357

### **LGBM Model Submission**

In [66]:
# Threshold the fold-averaged LightGBM test probabilities at 0.2.
test_prediction['LightGBM_Prob'] = test_pred_lgb
lgb_test_pred = test_prediction['LightGBM_Prob'].ge(0.2).astype('int64')

# Fill a copy of the sample submission with the labels and write it out.
sub_2 = sample_submission.assign(target = lgb_test_pred)
sub_2.to_csv('Ml-Olympiad/LGBM_unbal.csv', index = False)

# **CATBOOST**

In [32]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [33]:
from catboost import CatBoostClassifier

In [34]:
# CatBoost classifier; AUC is the evaluation metric and the best iteration is kept.
model_cat =  CatBoostClassifier(random_seed=34,use_best_model=True,
                          n_estimators=5000,silent=True,eval_metric='AUC')

# Per-fold ROC AUC on the fitting part and on the held-out part of each fold.
train_roc_cat, val_roc_cat = [], []

# In-sample predictions; every fold overwrites the rows it trained on.
train_pred_cat = np.zeros(len(X_Train))
# Out-of-fold predictions: each row is filled once, by the fold that held it out.
val_pred_cat = np.zeros(len(X_Train))
# Sums over folds; divided by the number of folds after the loop.
test_pred_cat = np.zeros(len(test))
validation_pred_cat = np.zeros(len(X_Test))
fold = 0
# Take the fold count from the splitter so the averages below stay correct
# if n_splits is ever changed (it used to be a hard-coded 10).
n_folds = skfold.get_n_splits()

for train_index, val_index in skfold.split(X_Train, y_Train):
    X_train = X_Train.iloc[train_index] ; y_train = y_Train.iloc[train_index].values
    X_val = X_Train.iloc[val_index] ; y_val = y_Train.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')

    # The held-out part of the fold doubles as the early-stopping set.
    model_cat.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=200,
                               verbose=250,use_best_model=True)
    train_pred = model_cat.predict_proba(X_train)[:,1]
    val_pred = model_cat.predict_proba(X_val)[:,1]

    train_pred_cat[train_index] = train_pred
    val_pred_cat[val_index] = val_pred

    test_pred_cat += model_cat.predict_proba(test)[:,1]
    validation_pred_cat += model_cat.predict_proba(X_Test)[:,1]

    # Score once per fold and reuse the values for both printing and bookkeeping.
    train_auc = roc_auc_score(y_train, train_pred)
    val_auc = roc_auc_score(y_val, val_pred)
    print(f'Train score : {train_auc}')
    print(f'Validation score : {val_auc}\n')

    train_roc_cat.append(train_auc); val_roc_cat.append(val_auc)

test_pred_cat = test_pred_cat / n_folds
validation_pred_cat = validation_pred_cat / n_folds
print(f'Training ROC score : {np.mean(train_roc_cat)}')
print(f'Testing ROC score : {np.mean(val_roc_cat)} +/- {np.std(val_roc_cat)}')

***************************Fold :1***********************************************
Learning rate set to 0.051321
0:	test: 0.8117714	best: 0.8117714 (0)	total: 83.9ms	remaining: 6m 59s
250:	test: 0.8594563	best: 0.8597276 (125)	total: 16s	remaining: 5m 3s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8597275685
bestIteration = 125

Shrink model to first 126 iterations.
Train score : 0.853053617261198
Validation score : 0.8597275685058176

***************************Fold :2***********************************************
Learning rate set to 0.051321
0:	test: 0.8013081	best: 0.8013081 (0)	total: 138ms	remaining: 11m 31s
250:	test: 0.8474953	best: 0.8475026 (246)	total: 33.1s	remaining: 10m 25s
500:	test: 0.8476325	best: 0.8476774 (431)	total: 53.7s	remaining: 8m 2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8476773939
bestIteration = 431

Shrink model to first 432 iterations.
Train score : 0.8645136649782799
Validation score : 0.847677393872

### **Choosing Threshold**

In [35]:
# Store the out-of-fold CatBoost probabilities as a further column (positional assignment from a NumPy array).
train_prediction['CatBoost_Prob'] = val_pred_cat

In [36]:
# Per-model probability summary restricted to true positives.
train_prediction.query('Target == 1').describe()

Unnamed: 0,XGBoost_Prob,Target,LightGBM_Prob,CatBoost_Prob
count,12544.0,12544.0,12544.0,12544.0
mean,0.267931,1.0,0.249302,0.258343
std,0.163892,0.0,0.157925,0.168871
min,0.009081,1.0,0.0056,0.002457
25%,0.136217,1.0,0.119747,0.122539
50%,0.241273,1.0,0.229223,0.235412
75%,0.373489,1.0,0.355282,0.366528
max,0.868208,1.0,0.730671,0.883919


In [37]:
# Per-model probability summary restricted to true negatives.
train_prediction.query('Target == 0').describe()

Unnamed: 0,XGBoost_Prob,Target,LightGBM_Prob,CatBoost_Prob
count,120638.0,120638.0,120638.0,120638.0
mean,0.09692,0.0,0.077756,0.077
std,0.103169,0.0,0.10069,0.105689
min,0.008152,0.0,0.005577,0.001428
25%,0.022955,0.0,0.011976,0.009044
50%,0.067303,0.0,0.033697,0.02895
75%,0.123228,0.0,0.10354,0.103026
max,0.874465,0.0,0.723518,0.87295


### **Model Evaluation**

In [67]:
# F1 of the out-of-fold CatBoost predictions at the 0.2 cut-off.
cat_train_pred = train_prediction['CatBoost_Prob'].ge(0.2).astype('int64')
f1_score(train_prediction['Target'], cat_train_pred)

0.4207148067104304

In [68]:
# Hold-out F1 for the fold-averaged CatBoost probabilities at the 0.2 cut-off.
val_prediction['CatBoost_Prob'] = validation_pred_cat
cat_val_pred = val_prediction['CatBoost_Prob'].ge(0.2).astype('int64')
f1_score(y_Test, cat_val_pred)

0.4255725190839695

### **CatBoost Model Submission**

In [69]:
# Threshold the fold-averaged CatBoost test probabilities at 0.2.
test_prediction['CatBoost_Prob'] = test_pred_cat
cat_test_pred = test_prediction['CatBoost_Prob'].ge(0.2).astype('int64')

# Fill a copy of the sample submission with the labels and write it out.
sub_3 = sample_submission.assign(target = cat_test_pred)
sub_3.to_csv('Ml-Olympiad/CatBoost_unbal.csv', index = False)

# **Model Blending**

In [76]:
def _weighted_blend(frame):
    """Return 0.5 * XGBoost + 0.3 * LightGBM + 0.2 * CatBoost probabilities of `frame`."""
    return 0.5 * frame['XGBoost_Prob'] + 0.3 * frame['LightGBM_Prob'] + 0.2 * frame['CatBoost_Prob']


# Same fixed weights for the out-of-fold, hold-out and test predictions.
train_prediction['Blend'] = _weighted_blend(train_prediction)
val_prediction['Blend'] = _weighted_blend(val_prediction)
test_prediction['Blend'] = _weighted_blend(test_prediction)

In [79]:
# F1 of the blended out-of-fold probabilities at the 0.2 cut-off.
blend_train_pred = train_prediction['Blend'].ge(0.2).astype('int64')
f1_score(train_prediction['Target'], blend_train_pred)

0.4195940336110396

In [80]:
# First five hold-out rows with every model's probability and the blend.
val_prediction.head(5)

Unnamed: 0,XGBoost_Prob,Target,CatBoost_Prob,LightGBM_Prob,Blend
153205,0.028577,0,0.004041,0.006975,0.017189
136740,0.074955,0,0.051071,0.053474,0.063734
118183,0.028408,0,0.004223,0.006245,0.016922
150106,0.029316,0,0.005371,0.006564,0.017701
163046,0.046891,0,0.024138,0.025851,0.036029


In [81]:
# Hold-out F1 of the blended probabilities at the 0.2 cut-off.
blend_val_pred = val_prediction['Blend'].ge(0.2).astype('int64')
f1_score(val_prediction['Target'], blend_val_pred)

0.42632161402911384

In [82]:
# Threshold the blended test probabilities at 0.2 and write the submission file.
sub_4 = sample_submission.assign(
    target = test_prediction['Blend'].ge(0.2).astype('int64')
)
sub_4.to_csv('Ml-Olympiad/Blending_unbal.csv', index = False)

In [86]:
# Re-display the first five hold-out rows.
val_prediction.head(5)

Unnamed: 0,XGBoost_Prob,Target,CatBoost_Prob,LightGBM_Prob,Blend
153205,0.028577,0,0.004041,0.006975,0.017189
136740,0.074955,0,0.051071,0.053474,0.063734
118183,0.028408,0,0.004223,0.006245,0.016922
150106,0.029316,0,0.005371,0.006564,0.017701
163046,0.046891,0,0.024138,0.025851,0.036029


In [88]:
# Remove a 'Stack' column left behind by an earlier run of the stacking cells.
# On a fresh kernel the column does not exist yet (it is only created further
# down), so errors='ignore' keeps Restart & Run All from failing with KeyError.
train_prediction.drop(columns = 'Stack', errors = 'ignore', inplace = True)

# **Model Stacking**

In [89]:
# Base-model probability columns fed to the stacker. Selecting them by name
# (rather than dropping everything else) gives all three frames the same
# columns in the same order and keeps leftovers such as an old 'Stack' column
# out; the frames were built in different cell orders, so their column order differs.
stack_features = ['XGBoost_Prob', 'LightGBM_Prob', 'CatBoost_Prob']

# Out-of-fold predictions: training data for the stacker.
X_stack_T = train_prediction[stack_features]
y_stack_T = train_prediction['Target']

# Hold-out predictions.
X_stack_v = val_prediction[stack_features]
y_stack_v = val_prediction['Target']

# Test predictions (no labels available).
X_stack_t = test_prediction[stack_features]

In [84]:
from sklearn.linear_model import LinearRegression

In [90]:
# Linear stacker fitted on the out-of-fold base-model probabilities.
# NOTE(review): LinearRegression output is an unbounded score, not a probability — confirm the 0.2 cut-off used later is appropriate.
stack_model = LinearRegression().fit(X_stack_T, y_stack_T)

# Score all three prediction frames with the fitted stacker.
test_prediction['Stack'] = stack_model.predict(X_stack_t)
val_prediction['Stack'] = stack_model.predict(X_stack_v)
train_prediction['Stack'] = stack_model.predict(X_stack_T)

In [96]:
# Threshold the stacked test scores at 0.2 and write the submission file.
sub_5 = sample_submission.assign(
    target = test_prediction['Stack'].ge(0.2).astype('int64')
)
sub_5.to_csv('Ml-Olympiad/Stacking_unbal.csv', index = False)