In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

from catboost import CatBoostClassifier, Pool

import pickle

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation / future-behaviour warnings from pandas and
# scikit-learn. Consider narrowing it to specific categories or modules.
warnings.filterwarnings('ignore')

In [2]:
# CSV path is relative to the notebook's working directory.
file = 'synthetic_asthma_dataset.csv'
# Load the full table; later cells read and modify `data`.
# NOTE(review): with default settings, read_csv parses the literal string
# 'None' as a missing value. If the raw file stores 'None' as a real category
# (e.g. in Allergies / Comorbidities), those entries arrive here as NaN —
# confirm against the raw CSV.
data = pd.read_csv(file)

# EDA

In [3]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
data.head()

Unnamed: 0,Patient_ID,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma,Asthma_Control_Level
0,ASTH100000,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0,
1,ASTH100001,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.6,2,297.6,22.9,0,
2,ASTH100002,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0,
3,ASTH100003,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.6,1,438.0,40.1,1,Poorly Controlled
4,ASTH100004,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0,


In [4]:
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Patient_ID               10000 non-null  object 
 1   Age                      10000 non-null  int64  
 2   Gender                   10000 non-null  object 
 3   BMI                      10000 non-null  float64
 4   Smoking_Status           10000 non-null  object 
 5   Family_History           10000 non-null  int64  
 6   Allergies                7064 non-null   object 
 7   Air_Pollution_Level      10000 non-null  object 
 8   Physical_Activity_Level  10000 non-null  object 
 9   Occupation_Type          10000 non-null  object 
 10  Comorbidities            5033 non-null   object 
 11  Medication_Adherence     10000 non-null  float64
 12  Number_of_ER_Visits      10000 non-null  int64  
 13  Peak_Expiratory_Flow     10000 non-null  float64
 14  FeNO_Level             

In [5]:
# Summary statistics; with default arguments only the numeric columns are
# included, so the object-typed (categorical) columns are not shown here.
data.describe()

Unnamed: 0,Age,BMI,Family_History,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,44.9307,25.05332,0.3034,0.497998,1.0159,400.88409,25.10142,0.2433
std,25.653559,4.874466,0.459749,0.224809,1.020564,97.531113,9.840184,0.429096
min,1.0,15.0,0.0,0.0,0.0,150.0,5.0,0.0
25%,23.0,21.6,0.0,0.32,0.0,334.8,18.2,0.0
50%,45.0,25.0,0.0,0.5,1.0,402.5,25.0,0.0
75%,67.0,28.4,1.0,0.67,2.0,468.7,31.7,0.0
max,89.0,45.0,1.0,0.99,6.0,600.0,63.9,1.0


## Unique Values

In [6]:
# List the distinct values observed in every column (NaN included, since
# Series.unique() does not drop missing values).
print("\nUnique values for each column:\n")
for column_name, column_values in data.items():
    print(f"{column_name}: {column_values.unique()}")


Unique values for each column:

Patient_ID: ['ASTH100000' 'ASTH100001' 'ASTH100002' ... 'ASTH109997' 'ASTH109998'
 'ASTH109999']
Age: [52 15 72 61 21 83 87 75 88 24  3 22 53  2 30 38 64 60 33 76 58 89 49 59
 42 80 62 47 51 55  7 73 39 18  4 14  9 84 71 44  8 35 78 81 36 50  6 54
 63 34 74 48 40 85 82 26 41 29 45 65  1 11  5 28 12 23 37 86 27 79 77 43
 13 32 66 57 70 56 19 68 17 69 16 20 67 31 25 46 10]
Gender: ['Female' 'Male' 'Other']
BMI: [27.6 24.6 17.6 16.8 30.2 27.8 32.3 29.7 23.1 15.  28.  19.9 24.1 27.1
 21.8 20.9 23.5 19.8 23.8 20.3 22.4 22.8 35.6 28.8 24.4 27.2 29.6 26.4
 24.3 20.1 23.4 24.2 30.3 21.  20.6 21.4 22.  26.6 21.1 25.6 23.9 29.2
 26.8 22.2 31.  38.1 21.7 22.6 18.5 29.3 34.6 23.6 20.5 25.4 28.2 27.9
 18.8 23.2 25.  31.3 31.5 30.8 30.7 16.  31.1 29.9 20.2 18.1 17.9 16.7
 27.  20.4 28.3 32.8 24.5 21.6 29.5 26.2 33.9 36.4 19.5 26.3 26.5 22.9
 35.5 22.3 28.5 20.8 21.5 32.6 28.4 16.5 31.2 16.2 36.2 25.8 18.4 16.1
 27.3 33.4 30.6 22.7 18.  23.  19.6 21.2 27.7 22.5 25.5 1

In [7]:
# Number of distinct values per column; NaN is excluded from the count by
# default, so columns with missing values report only their real levels.
print(data.nunique())

Patient_ID                 10000
Age                           89
Gender                         3
BMI                          254
Smoking_Status                 3
Family_History                 2
Allergies                      4
Air_Pollution_Level            3
Physical_Activity_Level        3
Occupation_Type                2
Comorbidities                  3
Medication_Adherence         100
Number_of_ER_Visits            7
Peak_Expiratory_Flow        3468
FeNO_Level                   497
Has_Asthma                     2
Asthma_Control_Level           3
dtype: int64


In [8]:
# Frequency table for every column; dropna=False keeps NaN as its own row so
# the amount of missing data is visible next to the real categories.
for column_name, column_values in data.items():
    print(f"\nValues in '{column_name}':")
    print(column_values.value_counts(dropna=False))


Values in 'Patient_ID':
Patient_ID
ASTH100000    1
ASTH106670    1
ASTH106663    1
ASTH106664    1
ASTH106665    1
             ..
ASTH103333    1
ASTH103334    1
ASTH103335    1
ASTH103336    1
ASTH109999    1
Name: count, Length: 10000, dtype: int64

Values in 'Age':
Age
17    139
62    134
58    131
87    130
13    129
     ... 
50     96
83     95
5      95
31     95
7      89
Name: count, Length: 89, dtype: int64

Values in 'Gender':
Gender
Female    4814
Male      4786
Other      400
Name: count, dtype: int64

Values in 'BMI':
BMI
15.0    232
25.5     98
27.2     95
25.6     94
23.0     92
       ... 
40.3      1
39.0      1
41.9      1
40.5      1
41.6      1
Name: count, Length: 254, dtype: int64

Values in 'Smoking_Status':
Smoking_Status
Never      6070
Former     2487
Current    1443
Name: count, dtype: int64

Values in 'Family_History':
Family_History
0    6966
1    3034
Name: count, dtype: int64

Values in 'Allergies':
Allergies
NaN         2936
Dust        2479
Pollen   

In [9]:
# Count the missing entries in each column (isna is the same check as isnull).
missing_per_column = data.isna().sum()
print("\nEmpty values for each columns:\n", missing_per_column)


Empty values for each columns:
 Patient_ID                    0
Age                           0
Gender                        0
BMI                           0
Smoking_Status                0
Family_History                0
Allergies                  2936
Air_Pollution_Level           0
Physical_Activity_Level       0
Occupation_Type               0
Comorbidities              4967
Medication_Adherence          0
Number_of_ER_Visits           0
Peak_Expiratory_Flow          0
FeNO_Level                    0
Has_Asthma                    0
Asthma_Control_Level       7567
dtype: int64


# Feature Engineering

In [10]:
# Remove Asthma_Control_Level before modelling.
# Its missing count (7567) equals the number of non-asthma rows implied by the
# Has_Asthma mean (0.2433 of 10000), so it appears to be populated only for
# asthma patients and would leak the target into the features.
#
# Rebind instead of dropping in place, and tolerate the column already being
# gone, so that re-running this cell does not raise KeyError.
data = data.drop(columns='Asthma_Control_Level', errors='ignore')
data.head()

Unnamed: 0,Patient_ID,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
0,ASTH100000,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0
1,ASTH100001,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.6,2,297.6,22.9,0
2,ASTH100002,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0
3,ASTH100003,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.6,1,438.0,40.1,1
4,ASTH100004,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0


In [11]:
# Give missing Allergies / Comorbidities an explicit 'None' category so they
# can be used as categorical features.
#
# The filled columns are assigned back to `data`. Calling
# `data[col].fillna(..., inplace=True)` operates on a column selection; under
# pandas Copy-on-Write that selection is a temporary object and `data` would
# be left unchanged.
label_columns = ['Allergies', 'Comorbidities']
data[label_columns] = data[label_columns].fillna('None')

print('\nWe check again if there are any empty values:\n')
print(data[label_columns].isnull().sum())

print('\n')
print(data['Allergies'].value_counts())
print(data['Comorbidities'].value_counts())


We check again if there are any empty values:

Allergies        0
Comorbidities    0
dtype: int64


Allergies
None        2936
Dust        2479
Pollen      1999
Pets        1585
Multiple    1001
Name: count, dtype: int64
Comorbidities
None            4967
Diabetes        2029
Hypertension    2018
Both             986
Name: count, dtype: int64


# CatBoost

In [12]:
# Patient_ID is a unique row identifier, not a predictive feature.
# DataFrame.drop already returns a new frame, so no separate copy of `data`
# is needed before dropping.
data_catboost = data.drop(columns='Patient_ID')

# Features / target.
X = data_catboost.drop(columns='Has_Asthma')
y = data_catboost['Has_Asthma']

# 70/30 split, stratified on the target so both parts keep the same class
# ratio; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [13]:
# Positional indices of the string-valued columns that CatBoost should treat
# as categorical features.
categorical_features_indices = [X.columns.get_loc(col) for col in ['Gender', 'Smoking_Status', 'Allergies', 'Air_Pollution_Level', 'Physical_Activity_Level', 'Occupation_Type', 'Comorbidities']]

# Early stopping (and CatBoost's best-iteration selection) is driven by the
# eval_set. Using the test set for that tunes the model on the very data used
# for the final metrics, so a separate validation split is carved out of the
# training data and the test set is kept for evaluation only.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print("\nTraining CatBoost Classifier")
cat_model = CatBoostClassifier(iterations=500,
                               learning_rate=0.1,
                               depth=6,
                               l2_leaf_reg=3,
                               loss_function='Logloss',
                               eval_metric='Accuracy',
                               random_seed=42,
                               verbose=0,
                               early_stopping_rounds=10,
                               cat_features=categorical_features_indices
                              )

cat_model.fit(X_fit, y_fit, eval_set=(X_val, y_val))

# Hard labels for the threshold-based metrics, positive-class probabilities
# for ROC AUC.
y_pred_cat = cat_model.predict(X_test)
y_pred_proba_cat = cat_model.predict_proba(X_test)[:, 1]

accuracy_cat = accuracy_score(y_test, y_pred_cat)
precision_cat = precision_score(y_test, y_pred_cat)
recall_cat = recall_score(y_test, y_pred_cat)
f1_cat = f1_score(y_test, y_pred_cat)
roc_auc_cat = roc_auc_score(y_test, y_pred_proba_cat)

# NOTE(review): the saved run of this notebook reported 1.0000 for every
# metric. A perfect score usually means the label is a deterministic function
# of the features or a feature leaks the target — check feature importances
# before trusting the model.
print(f"CatBoost Accuracy: {accuracy_cat:.4f}")
print(f"CatBoost Precision: {precision_cat:.4f}")
print(f"CatBoost Recall: {recall_cat:.4f}")
print(f"CatBoost F1-Score: {f1_cat:.4f}")
print(f"CatBoost ROC AUC: {roc_auc_cat:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_cat))


Training CatBoost Classifier
CatBoost Accuracy: 1.0000
CatBoost Precision: 1.0000
CatBoost Recall: 1.0000
CatBoost F1-Score: 1.0000
CatBoost ROC AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2270
           1       1.00      1.00      1.00       730

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [14]:
# Serialise the fitted classifier to disk with pickle.
# Only unpickle this file from a trusted location: loading a pickle executes
# arbitrary code embedded in it.
model_path = 'asthma_prediction.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(cat_model, model_file)

print("Model saved successfully!")

Model saved successfully!
