In [1]:
import numpy as np 
import pandas as pd 
import pickle

from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


# **SOME DEFAULT FUNCTIONS**

In [2]:
def convert_probability_to_1_0_score(y_pred_proba, threshold):
    y_pred = np.zeros(y_pred_proba.shape[0])
    for i in range(y_pred_proba.shape[0]):
        if y_pred_proba[i][1] >= threshold:
            y_pred[i] = 1
    return y_pred

In [3]:
def convert_categorical_data_to_numerical(df, categorical_columns):
    for column in categorical_columns:
        number = LabelEncoder()
        df[column] = number.fit_transform(df[column].astype('str')).astype(float)
        le_name_mapping = dict(zip(number.classes_, number.transform(number.classes_)))
        print(column)
        print(le_name_mapping)

# **GLOBAL VARIABLES**

In [4]:
FEATURES = ['is_Al', 'Al_crackles', 'Al_wheezes', 'is_Ar', 'Ar_crackles', 'Ar_wheezes', 'is_Pl', 
            'Pl_crackles', 'Pl_wheezes', 'is_Pr', 'Pr_crackles', 'Pr_wheezes', 'is_Ll', 'Ll_crackles', 
            'Ll_wheezes', 'is_Lr', 'Lr_crackles', 'Lr_wheezes', 'is_Tc', 'Tc_crackles', 'Tc_wheezes', 
            'age', 'sex', 'adult_bmi', 'child_weight', 'child_height']
CATEGORICAL_COLUMNS = ['sex']
PATH_TO_PATIENT_INFO_DF = '/kaggle/input/notebookd311de3272/train_data.csv'
TEST_SIZE = 0.1
MAX_ITERATIONS = 500
THRESHOLD = 0.5
FILE_NAME_TO_SAVE_MODEL = "is_healthy_model.pkl"


# **READING DATASET**

In [5]:
patient_info_df = pd.read_csv(PATH_TO_PATIENT_INFO_DF)
patient_info_df

Unnamed: 0.1,Unnamed: 0,pid,is_Al,Al_crackles,Al_wheezes,is_Ar,Ar_crackles,Ar_wheezes,is_Pl,Pl_crackles,...,Lr_wheezes,is_Tc,Tc_crackles,Tc_wheezes,age,sex,adult_bmi,child_weight,child_height,disease
0,0,101,True,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,3.00,F,0.00,19.0,99.0,URTI
1,1,102,False,0.0,0.0,True,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,0.75,F,0.00,9.8,73.0,Healthy
2,2,103,False,0.0,0.0,True,0.0,0.666667,False,0.000000,...,0.0,False,0.0,0.0,70.00,F,33.00,0.0,0.0,Asthma
3,3,104,True,0.0,0.0,True,0.0,0.714286,True,0.000000,...,0.0,False,0.0,0.0,70.00,F,28.47,0.0,0.0,COPD
4,4,105,False,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,True,0.0,0.0,7.00,F,0.00,32.0,135.0,URTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,121,222,False,0.0,0.0,True,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,60.00,M,0.00,0.0,0.0,COPD
122,122,223,True,0.0,0.0,True,0.0,1.000000,True,0.000000,...,0.0,False,0.0,0.0,0.00,0,0.00,0.0,0.0,COPD
123,123,224,True,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,True,0.0,0.0,10.00,F,0.00,32.3,143.0,Healthy
124,124,225,False,0.0,0.0,False,0.0,0.000000,True,0.000000,...,0.0,False,0.0,0.0,0.83,M,0.00,7.8,74.0,Healthy


# **CONVERTING CATEGORICAL VALUES**

In [6]:
convert_categorical_data_to_numerical(patient_info_df, CATEGORICAL_COLUMNS)
patient_info_df

sex
{'0': 0, 'F': 1, 'M': 2}


Unnamed: 0.1,Unnamed: 0,pid,is_Al,Al_crackles,Al_wheezes,is_Ar,Ar_crackles,Ar_wheezes,is_Pl,Pl_crackles,...,Lr_wheezes,is_Tc,Tc_crackles,Tc_wheezes,age,sex,adult_bmi,child_weight,child_height,disease
0,0,101,True,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,3.00,1.0,0.00,19.0,99.0,URTI
1,1,102,False,0.0,0.0,True,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,0.75,1.0,0.00,9.8,73.0,Healthy
2,2,103,False,0.0,0.0,True,0.0,0.666667,False,0.000000,...,0.0,False,0.0,0.0,70.00,1.0,33.00,0.0,0.0,Asthma
3,3,104,True,0.0,0.0,True,0.0,0.714286,True,0.000000,...,0.0,False,0.0,0.0,70.00,1.0,28.47,0.0,0.0,COPD
4,4,105,False,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,True,0.0,0.0,7.00,1.0,0.00,32.0,135.0,URTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,121,222,False,0.0,0.0,True,0.0,0.000000,False,0.000000,...,0.0,False,0.0,0.0,60.00,2.0,0.00,0.0,0.0,COPD
122,122,223,True,0.0,0.0,True,0.0,1.000000,True,0.000000,...,0.0,False,0.0,0.0,0.00,0.0,0.00,0.0,0.0,COPD
123,123,224,True,0.0,0.0,False,0.0,0.000000,False,0.000000,...,0.0,True,0.0,0.0,10.00,1.0,0.00,32.3,143.0,Healthy
124,124,225,False,0.0,0.0,False,0.0,0.000000,True,0.000000,...,0.0,False,0.0,0.0,0.83,2.0,0.00,7.8,74.0,Healthy


# **LABELS CREATING**

In [7]:
patient_info_df['is_healthy'] = [1 if d == 'Healthy' else 0 for d in patient_info_df.disease]

# **SPLITTING DATA INTO TRAIN AND TEST**

In [8]:
X_train, X_test, y_train, y_test = train_test_split(patient_info_df[FEATURES], patient_info_df['is_healthy'], test_size=0.2, random_state=44)

# **TRAINING GRADIENT BOOSTING**

In [9]:
clf = GradientBoostingClassifier().fit(X_train, y_train)

# **CONVERTING PROBABILITY TO 0 AND 1 SCORES**

In [10]:
y_pred = convert_probability_to_1_0_score(clf.predict_proba(np.array(X_test)), THRESHOLD)

# **BUILDING CONFUSION MATRIX**

In [11]:
cf_matrix = confusion_matrix(y_pred, y_test)
print(cf_matrix)

[[19  0]
 [ 1  6]]


# **CALCULATING CLASSIFICATION REPORT**

In [12]:
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     1.0000    0.9500    0.9744        20
           1     0.8571    1.0000    0.9231         6

    accuracy                         0.9615        26
   macro avg     0.9286    0.9750    0.9487        26
weighted avg     0.9670    0.9615    0.9625        26



# **MODEL SAVING**

In [13]:
with open(FILE_NAME_TO_SAVE_MODEL, 'wb') as file:
    pickle.dump(clf, file)