In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_curve, RocCurveDisplay, PrecisionRecallDisplay, auc, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

## Loading the Dataset
The dataset is loaded, unnecessary columns such as `id` are removed, and `age` is converted into years.

In [None]:
# Read the semicolon-separated CSV and discard the "id" column in one step
df = pd.read_csv("cardio_train_data/cardio_train.csv", sep=";").drop(columns=["id"])
# Express age in years (the raw column is divided by 365)
df["age"] = df["age"] / 365
df.head(10)

### Checking the dataset for null values

In [None]:
# Summary of column dtypes and non-null counts
df.info()

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

### Splitting the features into numerical and categorical features

In [None]:
# Number of unique values each feature has; low counts indicate categorical features
df.nunique()

In [72]:
# Categorical columns; note the list also contains the target column "cardio" (kept last)
categorical_columns = ["gender","cholesterol","gluc","smoke","alco","active","cardio"]
# Continuous columns that are later plotted and standardised
numerical_columns = ["age","height","weight","ap_hi","ap_lo"]

## Feature Selection
### Pre-processing numerical features

In [None]:
# Summary statistics of the numerical features
df[numerical_columns].describe()

#### Creating boxplot for `ap_lo` and `ap_hi` and filtering unrealistic values

In [None]:
# One box plot per blood-pressure column, drawn side by side
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
for axis, column in zip(ax, ("ap_lo", "ap_hi")):
    sns.boxplot(df, x=column, ax=axis)
    axis.set_title(column)
    axis.set_xlabel(None)
fig.suptitle("Box Plot of `ap_hi` and `ap_lo`")
plt.tight_layout()

**Observation**: The blood pressure values should fall within a physiologically realistic range. Values outside this range will be filtered. We choose the threshold of 370/360 mm Hg as given [here](<https://pubmed.ncbi.nlm.nih.gov/7741618/#:~:text=The%20highest%20pressure%20recorded%20in,005).&text=BP%20was%20recorded%20in%2010,maximal%20lifting%20with%20slow%20exhalation.>).

In [75]:
# Filtering unrealistic blood pressure values:
# keep rows with 0 <= ap_hi <= 370 and 0 <= ap_lo <= 360 (bounds inclusive)
ap_hi_in_range = df["ap_hi"].between(0, 370)
ap_lo_in_range = df["ap_lo"].between(0, 360)
df = df[ap_hi_in_range & ap_lo_in_range].reset_index(drop=True)

In [None]:
# Re-draw the blood-pressure box plots on the filtered data
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
for position, column in enumerate(["ap_lo", "ap_hi"]):
    panel = ax[position]
    sns.boxplot(df, x=column, ax=panel)
    panel.set_title(column)
    panel.set_xlabel(None)
fig.suptitle("Box Plot of `ap_hi` and `ap_lo`")
plt.tight_layout()

#### Creating histogram plot with `cardio` as hue

In [None]:
# Histogram (with KDE) of every numerical feature, coloured by `cardio`.
# The grid is laid out three panels per row.
n_cols = 3
n_rows = int(np.ceil(len(numerical_columns) / n_cols))
# squeeze=False keeps `ax` two-dimensional even when there is a single row
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 6), squeeze=False)
panels = ax.ravel()
for panel, col in zip(panels, numerical_columns):
    sns.histplot(data=df, x=col, bins=20, hue="cardio", ax=panel, kde=True)
    panel.set_ylabel(None)
# Remove every unused panel (zero, one or two of them) instead of assuming
# that exactly one axis is left over after the loop.
for spare in panels[len(numerical_columns):]:
    fig.delaxes(spare)
plt.tight_layout()

#### Creating boxplot with `cardio` as hue

In [None]:
# Box plot of every numerical feature, one box per `cardio` class.
# The label is cast to string on a plotting copy only, so `df` keeps its
# original dtype for the cells that follow.
plot_df = df.assign(cardio=df["cardio"].astype(str))
n_cols = 3
n_rows = int(np.ceil(len(numerical_columns) / n_cols))
# squeeze=False keeps `ax` two-dimensional even when there is a single row
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 6), squeeze=False)
panels = ax.ravel()
for panel, col in zip(panels, numerical_columns):
    sns.boxplot(data=plot_df, x=col, y="cardio", ax=panel)
# Remove every unused panel instead of assuming exactly one is left over
for spare in panels[len(numerical_columns):]:
    fig.delaxes(spare)
plt.tight_layout()

#### Scaling numerical features

The numerical features are scaled using a standard scaler, as the data is normally distributed, which is evident from the histogram plot.

In [None]:
# Standardise the numerical features (zero mean, unit variance) in place.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df.head(10)

#### Checking correlation between numerical features using correlation matrix

In [None]:
# Pairwise correlation between the numerical features (pandas default method)
corr_matrix = df[numerical_columns].corr()
corr_matrix

Thus, no two numerical features are related.

### Pre-processing categorical features ###

#### Looking at categorical features

In [None]:
# First ten rows of the categorical columns (target `cardio` included)
df[categorical_columns].head(10)

In [None]:
# Count plot for each categorical feature, three per row; the target
# `cardio` gets the middle panel of the last row.
grid_rows = int(np.ceil(len(categorical_columns) / 3))
fig, ax = plt.subplots(nrows=grid_rows, ncols=3, figsize=(12, 12))
for position, feature in enumerate(categorical_columns[:-1]):
    row, column = divmod(position, 3)
    sns.countplot(data=df, x=feature, ax=ax[row, column])
    ax[row, column].set_ylabel(None)
sns.countplot(data=df, x="cardio", ax=ax[2, 1])
# The two outer panels of the last row stay empty, so remove them
for unused in (ax[2, 0], ax[2, 2]):
    fig.delaxes(ax=unused)
plt.tight_layout()

#### Creating countplot with `cardio` as hue

In [None]:
# Count plot of each categorical feature (target excluded), split by `cardio`
df["cardio"] = df["cardio"].astype(str)
grid_rows = int(np.ceil(len(categorical_columns[:-1]) / 3))
fig, ax = plt.subplots(nrows=grid_rows, ncols=3, figsize=(12, 6))
for position, feature in enumerate(categorical_columns[:-1]):
    row, column = divmod(position, 3)
    panel = ax[row, column]
    sns.countplot(data=df, x=feature, hue="cardio", ax=panel)
    panel.set_ylabel(None)
plt.tight_layout()

#### Checking relation between categorical features

In [84]:
def calc_chi2(df, feature1, feature2):
    """Run a chi-squared test of independence between two categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    feature1, feature2 : str
        Names of the columns to cross-tabulate.

    Returns
    -------
    tuple of (float, float)
        The chi^2 statistic and the p-value computed from the contingency
        table of the two columns.
    """
    # The table must contain observed counts only. Adding the "All" margins
    # would make the totals look like an extra category in each direction,
    # inflating the degrees of freedom and therefore the p-value.
    contingency_table = pd.crosstab(df[feature1], df[feature2])
    # For 2x2 tables scipy applies Yates' continuity correction by default.
    chi2_value, p_value, _, _ = chi2_contingency(contingency_table)
    return chi2_value, p_value

In [None]:
# Square tables (feature x feature) holding the chi^2 statistic and the
# p-value for every pair of categorical features.
chi2_matrix = pd.DataFrame(0.0, index=categorical_columns, columns=categorical_columns)
p_matrix = pd.DataFrame(0.0, index=categorical_columns, columns=categorical_columns)

for row_feature in categorical_columns:
    for col_feature in categorical_columns:
        statistic, p_val = calc_chi2(df, row_feature, col_feature)
        chi2_matrix.loc[row_feature, col_feature] = statistic
        p_matrix.loc[row_feature, col_feature] = p_val

# Heatmaps: chi^2 statistics on the left, p-values on the right
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
for axis, matrix, palette in zip(ax, (chi2_matrix, p_matrix), ("Reds", "Blues_r")):
    sns.heatmap(matrix, ax=axis, cmap=palette)
    axis.set_aspect("equal")
plt.tight_layout()

In [None]:
# Display the table of pairwise p-values
p_matrix

The p-value of the feature `alco` when compared with `cardio` is greater than 0.05, which implies that `alco` is independent of the target variable `cardio`. Also, the p-value of `alco` when compared with the other categorical features is less than 0.05, which implies that `alco` is dependent on all the other categorical features. Thus, we can safely discard `alco` from our dataset.

In [87]:
# Remove the "alco" feature from the working frame
df = df.drop(columns=["alco"])

## Splitting the dataset into training and test

In [None]:
# Preview the frame that goes into the train/test split
df.head(10)

In [89]:
# Features are every column except the target; the target is cast back to int
X = df.drop(["cardio"], axis=1)
y = df["cardio"].astype(int)
# A fixed random_state makes the 80/20 split (and every score derived from it)
# reproducible; stratify=y keeps the class ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Class counts of the target in the training and test partitions
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for axis, labels, title in zip(ax, (y_train, y_test), ("Training", "Test")):
    sns.countplot(data=df, x=labels, ax=axis)
    axis.set_title(title)
plt.tight_layout()

## Model Selection

### Logistic Regression

#### Creating and training the Logistic Regression model

In [91]:
# Fit logistic regression (C=0.5) and predict labels for the test set
LR = LogisticRegression(C=0.5).fit(X_train, y_train)
y_pred = LR.predict(X_test)

#### Calculating the accuracy scores using confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = LR.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(LR, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(LR, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### K Nearest Neighbors

#### Creating and training the K Nearest Neighbors Classifier

In [96]:
# Fit k-nearest neighbours (k=500) and predict labels for the test set
KNN = KNeighborsClassifier(n_neighbors=500).fit(X_train, y_train)
y_pred = KNN.predict(X_test)

#### Calculating the accuracy scores using confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = KNN.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(KNN, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(KNN, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### Support Vector Classifier with Kernels

#### Creating and training Kernel Support Vector Classifier

In [101]:
# Fit an RBF-kernel support vector classifier and predict labels for the test set
SVM = SVC(kernel="rbf", gamma="auto", C=1.0).fit(X_train, y_train)
y_pred = SVM.predict(X_test)

#### Calculating the accuracy scores using confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point. This SVC is fitted without probability
# estimates, so use the signed distance to the decision boundary as the score.
y_score = SVM.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(SVM, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(SVM, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### Decision Tree Classifier

#### Creating and training Decision Tree Classifier

In [106]:
# Fit a depth-5 Gini decision tree and predict labels for the test set.
# random_state pins the tie-breaking between equally good splits so the
# reported scores can be reproduced.
Decision_Tree = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
Decision_Tree = Decision_Tree.fit(X_train, y_train)
y_pred = Decision_Tree.predict(X_test)

#### Calculating the accuracy scores using confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = Decision_Tree.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Decision_Tree, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(Decision_Tree, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### Bagging Classifier

#### Creating and training Bagging Classifier

In [111]:
# Fit a bagging ensemble (50 estimators, each on 75% of the training rows)
# and predict labels for the test set. random_state fixes the bootstrap
# draws so the reported scores can be reproduced.
BC = BaggingClassifier(n_estimators=50, max_samples=0.75, random_state=42)
BC = BC.fit(X_train, y_train)
y_pred = BC.predict(X_test)

#### Calculating the accuracy scores using confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = BC.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(BC, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(BC, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### Random Forest Classifier

#### Creating and training Random Forest Classifier

In [116]:
# Fit a 50-tree random forest (entropy criterion) and predict labels for the
# test set. random_state fixes the bootstrap and feature sampling so the
# reported scores can be reproduced.
Random_Forest = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=42)
Random_Forest = Random_Forest.fit(X_train, y_train)
y_pred = Random_Forest.predict(X_test)

#### Calculating the accuracy scores using confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = Random_Forest.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Random_Forest, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(Random_Forest, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### Gradient Boosting Classifier

#### Creating and training Gradient Boosting Classifier

In [121]:
# Fit gradient boosting (100 stages, learning rate 0.05) and predict labels
# for the test set. random_state makes the fit reproducible.
Gradient_Boost = GradientBoostingClassifier(learning_rate=0.05, n_estimators=100, random_state=42)
Gradient_Boost = Gradient_Boost.fit(X_train, y_train)
y_pred = Gradient_Boost.predict(X_test)

#### Calculating the accuracy scores using confusion matrix 

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = Gradient_Boost.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Gradient_Boost, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(Gradient_Boost, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

### AdaBoost Classifier

#### Creating and training AdaBoost Classifier

In [126]:
# Fit AdaBoost with 100 estimators and predict labels for the test set.
# random_state makes the fit reproducible.
AdaBoost = AdaBoostClassifier(n_estimators=100, random_state=42)
AdaBoost = AdaBoost.fit(X_train, y_train)
y_pred = AdaBoost.predict(X_test)

#### Calculating the accuracy scores using confusion matrix 

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted class)
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Headline metrics on the test set, expressed as percentages
accuracy, f1, precision, recall = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1_score: {f1:.2f}%")

#### Plotting the ROC_AUC_Curve and Precision_Recall_Curve

In [None]:
# ROC needs a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the probability of class 1.
y_score = AdaBoost.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall and ROC curves for the fitted model. `from_estimator` is a
# classmethod that computes everything from the estimator itself, so no
# display instance has to be built first.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(AdaBoost, X_test, y_test, ax=ax[0])
ax[0].set_title("Precision-Recall curve")
RocCurveDisplay.from_estimator(AdaBoost, X_test, y_test, ax=ax[1])
ax[1].set_title("ROC curve")
plt.show()

So far, we have calculated the accuracy scores for each model without performing any cross-validation or parameter tuning. The best model so far is the Support Vector Machine, with an accuracy score of 73.48% and an F1 score of 71.74%.

### Classification models with Hyperparameter Tuning

In [131]:
# Search space for hyperparameter tuning: one entry per classifier, holding an
# unfitted estimator ("model") and the grid of values to try ("params_grid").
models = {
    "LR" : {
        "model": LogisticRegression(penalty="l2"),
        "params_grid": {
            "C" : [0.1,1,10,100]
        }
    },

    "KNN" : {
        "model": KNeighborsClassifier(),
        "params_grid": {
            "n_neighbors" : [10,100,200,500]
        }
    },

    "SVM": {
        "model": SVC(gamma="auto"),
        "params_grid": {
            "kernel" : ["linear", "rbf"],
            "C": [0.1,1,10,100]
        }
    },

    "Decision_Tree" : {
        "model": DecisionTreeClassifier(),
        "params_grid": {
            "criterion": ["gini", "entropy"],
            "max_depth": [3,5,10]
        }
    },

    "Bagging":{
        "model": BaggingClassifier(),
        "params_grid": {
            "n_estimators" : [10, 50, 100, 500],
            # Floats are fractions of the training set. An integer 1 would
            # mean "draw exactly one sample", so the full set must be 1.0.
            "max_samples": [0.5, 0.75, 1.0]
        }
    },

    "Random_Forest": {
        "model" : RandomForestClassifier(),
        "params_grid" : {
            "n_estimators": [10,50,100,500],
            "criterion" : ["gini", "entropy"]
        }

    },

    "Gradient_Boosting": {
        "model": GradientBoostingClassifier(),
        "params_grid": {
            "learning_rate" : [0.01, 0.05, 0.1, 0.5],
            "n_estimators": [50,100,500,1000]
        }

    },

    "AdaBoost": {
        "model": AdaBoostClassifier(),
        "params_grid": {
            "n_estimators": [10,50,100,500],
        }
    }
}

In [None]:
# Run a 10-fold cross-validated grid search for every configured classifier
# and collect the best parameter combination and its mean CV score.
score_records = []

for model_name, param in models.items():
    # The grid is stored under "params_grid"; looking it up as "params"
    # raises a KeyError before any search is run.
    classifier = GridSearchCV(estimator=param["model"], param_grid=param["params_grid"], cv=10)
    classifier.fit(X_train, y_train)
    score_records.append({
        "model" : model_name,
        "best_params" : classifier.best_params_,
        "best_score" : classifier.best_score_
    })

scores = pd.DataFrame(score_records, columns=["model", "best_params", "best_score"])
scores

Parameter grids are created for each of the above classifiers, and the best classification model, together with its parameters, is selected as our final model.