In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, RocCurveDisplay, PrecisionRecallDisplay

## Loading the Dataset
The dataset is loaded, the unnecessary `id` column is removed, and `age` is converted into years.

In [None]:
# Read the semicolon-separated file, discard the "id" column and rescale "age"
# by 1/365 (presumably days -> years, as the section text states).
df = pd.read_csv("cardio_data/cardio.csv", sep=";").drop(columns=["id"])
df["age"] = df["age"] / 365
df.head(10)

### Checking the dataset for null values

In [None]:
# Print the dtype and non-null count of every column.
df.info()

In [None]:
# Count missing (NaN) entries per column.
df.isna().sum()

### Splitting the features into numerical and categorical features

In [None]:
# Number of distinct values per column; used to decide which features are
# treated as categorical and which as numerical.
df.nunique()

In [None]:
# Columns handled as categorical. "cardio" (last entry) is the prediction target;
# later cells slice it off with [:-1] when only the features are wanted.
categorical_columns = ["gender","cholesterol","gluc","smoke","alco","active","cardio"]
# Columns handled as numerical; these are plotted as distributions and standardized below.
numerical_columns = ["age","height","weight","ap_hi","ap_lo"]

## Feature Selection
### Pre-processing numerical features

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical features.
df[numerical_columns].describe()

#### Creating boxplot for `ap_lo` and `ap_hi` and filtering unrealistic values

In [None]:
# One box plot per blood-pressure column, side by side, on the unfiltered data.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

for panel, column in zip(ax, ("ap_lo", "ap_hi")):
    sns.boxplot(df, x=column, ax=panel)
    panel.set_title(column)
    panel.set_xlabel(None)  # the title already names the feature

fig.suptitle("Box Plot of 'ap_hi' and 'ap_lo'")
plt.tight_layout()

**Observation**: The blood pressure values should fall within a physiologically realistic range. Values outside this range will be filtered. We choose the threshold of 370/360 mm Hg as given [here](<https://pubmed.ncbi.nlm.nih.gov/7741618/#:~:text=The%20highest%20pressure%20recorded%20in,005).&text=BP%20was%20recorded%20in%2010,maximal%20lifting%20with%20slow%20exhalation.>).

In [None]:
# Filter unrealistic blood pressure values: keep rows with ap_hi in [0, 370]
# and ap_lo in [0, 360] (both bounds inclusive), then renumber the index.
ap_hi_in_range = df["ap_hi"].between(0, 370)
ap_lo_in_range = df["ap_lo"].between(0, 360)
df = df[ap_hi_in_range & ap_lo_in_range].reset_index(drop=True)

In [None]:
# Same pair of box plots as before, redrawn from the current (filtered) frame.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

for panel, column in zip(ax, ("ap_lo", "ap_hi")):
    sns.boxplot(df, x=column, ax=panel)
    panel.set_title(column)
    panel.set_xlabel(None)  # the title already names the feature

fig.suptitle("Box Plot of 'ap_hi' and 'ap_lo'")
plt.tight_layout()

#### Creating histogram plot with `cardio` as hue

In [None]:
# Histogram (20 bins, KDE overlay) of every numerical feature, coloured by "cardio",
# laid out on a grid that is three panels wide.
n_rows = int(np.ceil(len(numerical_columns) / 3))
fig, ax = plt.subplots(nrows=n_rows, ncols=3, figsize=(12, 6))

for k, col in enumerate(numerical_columns):
    r, c = divmod(k, 3)  # grid position of the k-th feature
    sns.histplot(data=df, x=col, bins=20, hue="cardio", ax=ax[r, c], kde=True)
    ax[r, c].set_ylabel(None)

# The grid slot right after the last feature is unused; remove that empty panel.
k += 1
r, c = divmod(k, 3)
fig.delaxes(ax=ax[r, c])

plt.suptitle("Histogram plots for numerical features")
plt.tight_layout()

#### Creating boxplot with `cardio` as hue

In [None]:
# Cast the target to string so seaborn treats it as a categorical y axis, then
# draw one horizontal box plot per numerical feature, split by "cardio".
df["cardio"] = df["cardio"].astype(str)

n_rows = int(np.ceil(len(numerical_columns) / 3))
fig, ax = plt.subplots(nrows=n_rows, ncols=3, figsize=(12, 6))

for k, col in enumerate(numerical_columns):
    r, c = divmod(k, 3)  # grid position of the k-th feature
    sns.boxplot(data=df, x=col, y="cardio", ax=ax[r, c])

# The grid slot right after the last feature is unused; remove that empty panel.
k += 1
r, c = divmod(k, 3)
fig.delaxes(ax=ax[r, c])

plt.suptitle("Boxplots for numerical features")
plt.tight_layout()

#### Scaling numerical features

The numerical features are scaled using a standard scaler, as the histogram plots above suggest that the data is approximately normally distributed.

In [None]:
# Standardize the numerical features (zero mean, unit variance) in place.
# fit_transform replaces the separate fit/transform calls; the former
# `scaled_data` variable only held the fitted scaler itself and was never used.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics leak into training -- consider
# fitting on the training rows only.
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = scaled_numerical_data
df.head(10)

#### Checking correlation between numerical features using correlation matrix

In [None]:
# Pairwise correlation (pandas default: Pearson) between the numerical features.
corr_matrix = df[numerical_columns].corr()
corr_matrix

In [None]:
# Visualize the correlation matrix computed above as a heatmap.
sns.heatmap(data= corr_matrix)
plt.show()

Thus, no two numerical features are correlated.

### Pre-processing categorical features ###

#### Looking at categorical features

In [None]:
# Preview the first ten rows of the categorical columns (target included).
df[categorical_columns].head(10)

In [None]:
# Count plot for each categorical feature (rows 0-1 of the grid); the target
# "cardio" gets the centre panel of the last row.
n_rows = int(np.ceil(len(categorical_columns) / 3))
fig, ax = plt.subplots(nrows=n_rows, ncols=3, figsize=(12, 12))

for k, col in enumerate(categorical_columns[:-1]):
    r, c = divmod(k, 3)
    sns.countplot(data=df, x=col, ax=ax[r, c])
    ax[r, c].set_ylabel(None)

sns.countplot(data=df, x="cardio", ax=ax[2, 1])
# Remove the two empty panels flanking the target plot.
for spare in (ax[2, 0], ax[2, 2]):
    fig.delaxes(ax=spare)

plt.suptitle("Countplots for categorical features")
plt.tight_layout()

#### Creating countplot with `cardio` as hue

In [None]:
# Count plot of every categorical feature (target excluded), with bars split
# by the "cardio" label. The target is cast to string so it acts as a
# categorical hue.
df["cardio"] = df["cardio"].astype(str)

feature_columns = categorical_columns[:-1]
n_rows = int(np.ceil(len(feature_columns) / 3))
fig, ax = plt.subplots(nrows=n_rows, ncols=3, figsize=(12, 6))

for k, col in enumerate(feature_columns):
    r, c = divmod(k, 3)
    sns.countplot(data=df, x=col, hue="cardio", ax=ax[r, c])
    ax[r, c].set_ylabel(None)

plt.suptitle("Countplots with 'cardio' as hue")
plt.tight_layout()

#### Checking relation between categorical features

In [None]:
def calc_chi2(df, feature1, feature2):
    """Chi-squared test of independence between two categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    feature1, feature2 : str
        Names of the columns to cross-tabulate.

    Returns
    -------
    tuple of (float, float)
        The chi-squared statistic and its p-value, as returned by
        ``scipy.stats.chi2_contingency``.
    """
    # The table must hold observed counts only. With margins=True the "All"
    # totals row/column would be treated as an extra category, inflating the
    # degrees of freedom (and disabling the 2x2 continuity correction), which
    # yields wrong statistics and p-values.
    contingency_table = pd.crosstab(df[feature1], df[feature2])
    chi2_value, p_value, _, _ = chi2_contingency(contingency_table)
    return chi2_value, p_value

In [None]:
# Pairwise chi^2 statistics and p-values between all categorical columns
# (target included), stored as square matrices indexed by column name.
n_categorical = len(categorical_columns)
chi2_matrix = np.zeros((n_categorical, n_categorical))
p_matrix = np.zeros_like(chi2_matrix)

for i, col1 in enumerate(categorical_columns):
    for j, col2 in enumerate(categorical_columns):
        chi2_matrix[i, j], p_matrix[i, j] = calc_chi2(df, col1, col2)

chi2_matrix = pd.DataFrame(chi2_matrix, index=categorical_columns, columns=categorical_columns)
p_matrix = pd.DataFrame(p_matrix, index=categorical_columns, columns=categorical_columns)

# Heatmaps of both matrices side by side.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
heatmap_specs = (
    (chi2_matrix, "Reds", "Chi^2 values"),
    (p_matrix, "Blues_r", "p values"),
)
for panel, (matrix, colormap, title) in zip(ax, heatmap_specs):
    sns.heatmap(matrix, ax=panel, cmap=colormap)
    panel.set_aspect("equal")
    panel.set_title(title)
plt.tight_layout()

In [None]:
# Show the numeric p-values behind the right-hand heatmap.
p_matrix

The p-value for `alco` against `cardio` is greater than 0.05, so we cannot reject the hypothesis that `alco` is independent of the target variable `cardio`. Also, the p-values for `alco` against the other categorical features are less than 0.05, which suggests that `alco` is associated with each of the other categorical features. Thus, we can safely discard `alco` from our dataset.

In [None]:
# Remove the "alco" feature, judged uninformative by the chi^2 analysis.
df = df.drop(columns=["alco"])

## Splitting the dataset into training and test sets

In [None]:
# Preview the pre-processed frame that is about to be split.
df.head(10)

In [None]:
# Separate features from the target and hold out 30% of the rows for testing.
# "cardio" was cast to str for plotting, so it is converted back to int here.
X = df.drop(["cardio"], axis=1)
y = df["cardio"].astype(int)
# A fixed seed makes the split (and every score derived from it) reproducible;
# stratifying on y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Compare the class balance of the training and test targets.
# The label vectors are passed directly: `data=df` was unrelated to them
# (different length after the split) and has been dropped.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel, labels, title in zip(ax, (y_train, y_test), ("Training", "Test")):
    sns.countplot(x=labels, ax=panel)
    panel.set_title(title)
plt.tight_layout()

The training and test have similar distribution of target variable `cardio`.

## Model Selection

### Logistic Regression
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search logistic regression over regularisation strength C and solver,
# ranking candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10], "solver": ["lbfgs", "newton-cholesky", "saga"]}
# Seeded splitter: without random_state the folds (and the ranking) change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

Parameter grids are created for various classification models, and the three best models, each with its best parameters, are selected for further prediction. Here, the recall score is used for selection because we want to maximise the number of true positives relative to false negatives, i.e. miss as few actual positive cases as possible.

$$
\begin{align*}
    \text{Recall} = \frac{\text{True Positive}}{\text{True Positive} + \text{False Negative}}
\end{align*}
$$

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting

In [None]:
# Fit logistic regression on the full training split with the selected settings
# (best_params holds exactly the tuned keys "C" and "solver"), then predict the test set.
LR = LogisticRegression(**best_params)
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted logistic regression.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(LR, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(LR, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### K-Nearest Neighbors
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search k-nearest neighbours over the neighbourhood size, ranking
# candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"n_neighbors": [10, 50, 100, 200, 500]}
# Seeded splitter: without random_state the folds (and the ranking) change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting

In [None]:
# Fit k-NN on the full training split with the selected neighbourhood size
# (best_params holds only the tuned key "n_neighbors"), then predict the test set.
KNN = KNeighborsClassifier(**best_params)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted k-NN model.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(KNN, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(KNN, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Linear Support Vector Classifier
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search a linear SVM over regularisation strength C and loss function,
# ranking candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"C": [1e-6, 1e-5, 1e-4, 1e-3], "loss": ["hinge", "squared_hinge"]}
# Seeded splitter: without random_state the folds (and the ranking) change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting

In [None]:
# Fit the linear SVM on the full training split with the selected settings
# (best_params holds exactly the tuned keys "C" and "loss"), then predict the test set.
Linear_SVM = LinearSVC(**best_params)
Linear_SVM.fit(X_train, y_train)
y_pred = Linear_SVM.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted linear SVM.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Linear_SVM, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(Linear_SVM, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Support Vector Classifier
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search a kernel SVM over regularisation strength C and kernel coefficient
# gamma, ranking candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"C": [0.01, 0.1, 1], "gamma": ["scale", 1, 10, 100]}
# Seeded splitter: without random_state the folds (and the ranking) change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting

In [None]:
# Fit the kernel SVM on the full training split with the selected settings
# (best_params holds exactly the tuned keys "C" and "gamma"), then predict the test set.
SVM = SVC(**best_params)
SVM.fit(X_train, y_train)
y_pred = SVM.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted kernel SVM.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(SVM, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(SVM, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Decision Tree
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search a decision tree over split criterion and maximum depth, ranking
# candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"criterion": ["gini", "entropy"], "max_depth": [5, 10, 20]}
# Seeds on both the splitter and the tree: without them the folds, the
# tie-breaking inside the tree and hence the ranking change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting 

In [None]:
# Fit the decision tree on the full training split with the selected settings
# and predict the test set. The seed makes the fitted tree (and therefore the
# reported scores) reproducible across runs.
Decision_Tree = DecisionTreeClassifier(
    criterion=best_params["criterion"],
    max_depth=best_params["max_depth"],
    random_state=42,
).fit(X_train, y_train)
y_pred = Decision_Tree.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted decision tree.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Decision_Tree, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(Decision_Tree, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Bagging
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search a bagging ensemble over the number of estimators and the fraction
# of training rows drawn per estimator, ranking candidates by recall on 5
# random 70/30 splits of the training data.
# max_samples must be a float to mean "fraction of the data": the integer 1
# asks for exactly ONE sample per base estimator, so 1.0 is used for 100%.
param_grid = {"n_estimators": [50, 100, 500, 1000], "max_samples": [0.5, 0.75, 1.0]}
# Seeds on both the splitter and the ensemble: without them the folds, the
# bootstrap draws and hence the ranking change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=BaggingClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting

In [None]:
# Fit the bagging ensemble on the full training split with the selected
# settings and predict the test set. The seed fixes the bootstrap draws so the
# fitted ensemble (and therefore the reported scores) is reproducible.
BC = BaggingClassifier(
    n_estimators=best_params["n_estimators"],
    max_samples=best_params["max_samples"],
    random_state=42,
)
BC = BC.fit(X_train, y_train)
y_pred = BC.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted bagging ensemble.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(BC, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(BC, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Random Forest
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search a random forest over the number of trees and the split criterion,
# ranking candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"n_estimators": [100, 500, 1000], "criterion": ["gini", "entropy"]}
# Seeds on both the splitter and the forest: without them the folds, the
# bootstrap/feature sampling and hence the ranking change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting 

In [None]:
# Fit the random forest on the full training split with the selected settings
# and predict the test set. The seed fixes the bootstrap/feature sampling so
# the fitted forest (and therefore the reported scores) is reproducible.
Random_Forest = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    criterion=best_params["criterion"],
    random_state=42,
)
Random_Forest = Random_Forest.fit(X_train, y_train)
y_pred = Random_Forest.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted random forest.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Random_Forest, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(Random_Forest, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Gradient Boosting
#### Defining parameter grid and performing Cross Validation

In [None]:
# Grid-search gradient boosting over learning rate and number of boosting
# stages, ranking candidates by recall on 5 random 70/30 splits of the training data.
param_grid = {"learning_rate": [0.1, 0.5, 1, 10], "n_estimators": [3, 5, 10, 50]}
# Seeds on both the splitter and the booster: without them the folds (and
# therefore the ranking) change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination with the best mean recall in the grid search above.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting

In [None]:
# Fit gradient boosting on the full training split with the selected settings
# and predict the test set. The seed makes the fitted model (and therefore the
# reported scores) reproducible across runs.
Gradient_Boost = GradientBoostingClassifier(
    learning_rate=best_params["learning_rate"],
    n_estimators=best_params["n_estimators"],
    random_state=42,
)
Gradient_Boost = Gradient_Boost.fit(X_train, y_train)
y_pred = Gradient_Boost.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted gradient-boosting model.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(Gradient_Boost, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(Gradient_Boost, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()

### Adaptive Boosting
#### Defining parameter grid and performing cross validation

In [None]:
# Grid-search AdaBoost over the number of boosting rounds, ranking candidates
# by recall on 5 random 70/30 splits of the training data.
param_grid = {"n_estimators": [10, 50, 100]}
# Seeded splitter: without random_state the folds (and the ranking) change on every run.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
classifier = GridSearchCV(
    # GridSearchCV needs an estimator INSTANCE; passing the bare class makes
    # fit() fail when the estimator is cloned.
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="recall",
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

# Candidates ordered from best to worst mean recall.
parameters = pd.DataFrame(classifier.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values(by="rank_test_score")
parameters

In [None]:
# Parameter combination selected by the preceding grid search.
best_params = classifier.best_params_
print(best_params)

#### Training the classifier and predicting 

In [None]:
# Fit AdaBoost on the full training split with the selected number of rounds
# and predict the test set. The seed makes the fitted model (and therefore the
# reported scores) reproducible across runs.
AdaBoost = AdaBoostClassifier(
    n_estimators=best_params["n_estimators"],
    random_state=42,
).fit(X_train, y_train)
y_pred = AdaBoost.predict(X_test)

#### Computing the confusion matrix and accuracy scores

In [None]:
# Confusion matrix of the test predictions (rows: actual class, columns: predicted class).
con_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
con_matrix

In [None]:
# Test-set scores, each expressed as a percentage.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1_score: {f1:.2f}%",
    sep="\n",
)

#### Plotting ROC curve and Precision-Recall curve

In [None]:
# Precision-recall and ROC curves for the fitted AdaBoost model.
# `from_estimator` is a classmethod that derives the curves from the model's
# continuous scores, so it is called on the class directly. The previous code
# built throw-away display objects from percentage scalars and from a
# roc_curve computed on hard 0/1 predictions; none of that reached the plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(AdaBoost, X_test, y_test, ax=ax[0])
RocCurveDisplay.from_estimator(AdaBoost, X_test, y_test, ax=ax[1])
ax[0].set_title("Precision-Recall Curve")
ax[1].set_title("ROC Curve")
plt.show()