This notebook contains a selection of the experiments performed during the experimentation stage of the voting classifier. 
<br>**Note:** *Close to 50-60 experiments were conducted to arrive at the final classifier model*. 
<br>It is not feasible to include all of them in a single notebook; that would require multiple notebooks that would not convey anything particularly useful.
<br>Since they are computationally expensive to run, *only the most significant experiments*, *which are also mentioned in the dissertation's analysis*, are included here.

In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from tpot import TPOTClassifier
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the dataset from "prepared_df.csv" (path is relative to the working directory).
df = pd.read_csv("prepared_df.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
## Splitting of the dataset into target (y) and predictors (X)
y = df["outcome"]
X = df.drop(columns=["outcome"])

In [None]:
from sklearn.model_selection import train_test_split

# Split the unscaled data into training and testing sets (70-30 split).
# random_state=42 keeps the row partition reproducible; later cells redo this
# split on X_scaled with the same settings.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Post Scaling

In [None]:
# Preview the predictor matrix.
X.head()

In [None]:
# List the predictor column names.
X.columns

In [None]:
# Per-column dtypes of the predictors.
X.dtypes

In [None]:
# Columns treated as categorical; they are kept out of the scaling step.
categorical_columns = [
    'hypertensive',
    'atrialfibrillation',
    'diabetes',
    'deficiencyanemias',
    'depression',
    'Hyperlipemia',
    'Renal_failure',
    'COPD',
    'gendera',
]
X_Categorical = X[categorical_columns]
X_Categorical.head()

In [None]:
# Every column that is not in X_Categorical is treated as numerical.
X_Numerical = X.loc[:, ~X.columns.isin(X_Categorical.columns)]


In [None]:
# Preview the numerical predictors that will be scaled.
X_Numerical.head()

In [None]:
# Standardise the numerical features WITHOUT leaking test-set statistics.
#
# The scaler must learn its mean/std from training rows only. The train/test
# partition produced by train_test_split depends only on the number of rows,
# test_size and random_state, so splitting row positions here with the same
# settings (test_size=0.3, random_state=42) selects exactly the rows that the
# later split of X_scaled assigns to the training set.
train_positions, _ = train_test_split(
    np.arange(len(X_Numerical)), test_size=0.3, random_state=42
)

# Initialize StandardScaler and fit it on the training rows only
scaler = StandardScaler()
scaler.fit(X_Numerical.iloc[train_positions])

# Apply the train-fitted transformation to every row (train and test alike)
X_Numerical_scaled = scaler.transform(X_Numerical)

# Restore column names and the original index so the concat below aligns row-wise
X_Numerical_scaled_df = pd.DataFrame(X_Numerical_scaled, columns=X_Numerical.columns, index=X_Numerical.index)

# Categorical columns are passed through unscaled
X_scaled = pd.concat([X_Categorical, X_Numerical_scaled_df], axis=1)

In [None]:
# Preview the combined (categorical + scaled numerical) feature matrix.
X_scaled.head()

In [None]:
# Per-column dtypes after scaling and concatenation.
X_scaled.dtypes

In [None]:
# dtype of the target column.
y.dtype

In [None]:
from sklearn.model_selection import train_test_split

# Split the scaled data into training and testing sets (70-30 split).
# This overwrites the X_train/X_test/y_train/y_test produced from the unscaled X.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

##  Post Balancing

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd



# One-column frame holding the target so seaborn can count each class.
df_y = pd.DataFrame({'target': y})

# Dark theme for this figure; note this changes the global matplotlib style.
plt.style.use("dark_background")

# Draw the class counts on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='target', data=df_y, palette='pastel', ax=ax)

ax.set_title('Class Imbalance of the Target Variable', fontsize=16)
ax.set_xlabel('Class', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

plt.show()


In [None]:
# Switch matplotlib back to its default style after the dark-background figure.
plt.style.use("default")

In [None]:
from sklearn.model_selection import train_test_split

# Split the scaled data into training and testing sets (70-30 split).
# Same arguments as the earlier split of X_scaled; re-running it here restores
# the un-resampled training split before the balancing step below.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [None]:
import pandas as pd
from imblearn.over_sampling import ADASYN

# Balance the training split only (X_train / y_train); the test split is not touched.

# Initializing ADASYN with a fixed seed
adasyn = ADASYN(random_state=42)

# Resampling the training data
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Report the per-class counts after resampling
print("Class distribution after ADASYN:", y_train_resampled.value_counts(), sep="\n")




In [None]:
# From here on, X_train / y_train refer to the ADASYN-resampled training data.
X_train, y_train = X_train_resampled, y_train_resampled

## Experimentations

* *Close to 50-60 experiments were conducted to arrive at the final classifier model*. 
* Since they are computationally expensive to run, *only the most significant experiments*, *which are also mentioned in the dissertation's analysis*, are included here.

**Note:** The in-depth analysis and the comprehensive result tables presented in the dissertation were prepared manually using MS Excel. <br>Only basic tables are shown in these notebooks.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Hyper-parameter grids searched by GridSearchCV in the experiment cells below.
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'class_weight': [None, 'balanced']
}

rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'class_weight': [None, 'balanced']
}

xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [1, 2, 5]
}

knn_param_grid = {
    'n_neighbors': [5, 10, 15],
    'weights': ['uniform', 'distance']
}

nb_param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}

gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

# Parameter grid for LDA.
# Shrinkage is only supported by the 'lsqr' and 'eigen' solvers; combining
# solver='svd' with shrinkage='auto' makes the fit fail. A list of two sub-grids
# keeps the same valid candidates, in the same order, without the failing one.
lda_param_grid = [
    {'solver': ['svd']},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']},
]

# Base estimators handed to GridSearchCV; seeds are fixed where the estimator accepts one.
svm_model = SVC(probability=True, random_state=42)  # probability=True: predict_proba is needed for soft voting
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
knn_model = KNeighborsClassifier()
nb_model = GaussianNB()
gb_model = GradientBoostingClassifier(random_state=42)
lda_model = LinearDiscriminantAnalysis()

### 2 Base learners

**$$**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd
import logging

# Raise the 'xgboost' logger to ERROR level so lower-severity records are dropped.
logging.getLogger('xgboost').setLevel(logging.ERROR)

# Tune both base learners for recall with 3-fold CV (fit() returns the fitted search).
knn_grid = GridSearchCV(knn_model, param_grid=knn_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
lda_grid = GridSearchCV(lda_model, param_grid=lda_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
knn_model_best = knn_grid.best_estimator_
lda_model_best = lda_grid.best_estimator_

# Equal-weight soft-voting ensemble of the tuned learners, trained on the full training data.
voting_clf = VotingClassifier(
    estimators=[('knn', knn_model_best), ('lda', lda_model_best)],
    voting='soft',
    weights=[1, 1],
).fit(X_train, y_train)

# Positive-class probabilities on the test data, cut at a custom threshold.
threshold = 0.4
voting_probs = voting_clf.predict_proba(X_test)[:, 1]
voting_preds = (voting_probs > threshold).astype(int)

# Sensitivity and specificity are derived from the confusion-matrix cells.
conf_matrix = confusion_matrix(y_test, voting_preds)
tn, fp, fn, tp = conf_matrix.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Remaining metrics; AUC is computed from the probabilities, not the thresholded labels.
accuracy = accuracy_score(y_test, voting_preds)
precision = precision_score(y_test, voting_preds)
recall = recall_score(y_test, voting_preds)
f1 = f1_score(y_test, voting_preds)
auc_roc = roc_auc_score(y_test, voting_probs)

for label, value in (
    ("Accuracy", accuracy),
    ("Sensitivity", sensitivity),
    ("Specificity", specificity),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC ROC Score", auc_roc),
):
    print(f"{label}: {value:.4f}")

In [None]:
import pandas as pd

# Metrics produced by the preceding evaluation cell, in display order.
metrics = dict(zip(
    ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1-Score', 'AUC ROC Score'],
    [accuracy, sensitivity, specificity, precision, recall, f1, auc_roc],
))

# Two-column frame: metric name and its value.
df_metrics = pd.DataFrame({'Metric': list(metrics), 'Value': list(metrics.values())})

# CSS rules for the rendered table.
font_rule = ('font-family', 'Times New Roman')
header_rules = [('border', '2px solid black'), font_rule, ('font-size', '12pt'), ('font-weight', 'bold')]
cell_rules = [('border', '1px solid black'), font_rule, ('font-size', '12pt')]
caption_rules = [('caption-side', 'top'), ('font-size', '14pt'), ('font-weight', 'bold'), ('color', 'black'), font_rule]

styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': header_rules},        # header cells: bold with a thicker border
        {'selector': 'td', 'props': cell_rules},          # body cells
        {'selector': 'caption', 'props': caption_rules},  # caption placed above the table
    ])
    .set_properties(border='1px solid black')             # border on every data cell
    .set_caption("2 Base Learners: KNN and LDA")
    .hide(axis="index")                                   # drop the row-number column
)

styled_table

**$$**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import logging

# Raise the 'xgboost' logger to ERROR level so lower-severity records are dropped.
logging.getLogger('xgboost').setLevel(logging.ERROR)

# Recall-driven 3-fold grid searches for the two base learners.
svm_grid = GridSearchCV(svm_model, param_grid=svm_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
rf_grid = GridSearchCV(rf_model, param_grid=rf_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
svm_model_best = svm_grid.best_estimator_
rf_model_best = rf_grid.best_estimator_

# Equal-weight soft-voting ensemble of the tuned SVM and Random Forest.
voting_clf = VotingClassifier(
    estimators=[('svm', svm_model_best), ('rf', rf_model_best)],
    voting='soft',
    weights=[1, 1],
).fit(X_train, y_train)

# Positive-class probabilities on the test data, cut at a custom threshold.
threshold = 0.4
voting_probs = voting_clf.predict_proba(X_test)[:, 1]
voting_preds = (voting_probs > threshold).astype(int)

# Confusion-matrix based rates.
conf_matrix = confusion_matrix(y_test, voting_preds)
tn, fp, fn, tp = conf_matrix.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Remaining metrics; AUC uses the raw probabilities.
accuracy = accuracy_score(y_test, voting_preds)
precision = precision_score(y_test, voting_preds)
recall = recall_score(y_test, voting_preds)
f1 = f1_score(y_test, voting_preds)
auc_roc = roc_auc_score(y_test, voting_probs)

for label, value in (
    ("Accuracy", accuracy),
    ("Sensitivity", sensitivity),
    ("Specificity", specificity),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC ROC Score", auc_roc),
):
    print(f"{label}: {value:.4f}")

In [None]:
import pandas as pd


# Column-oriented layout: one list of labels, one list of the matching values.
metric_labels = ["Accuracy", "Sensitivity", "Specificity", "Precision", "Recall", "F1-Score", "AUC ROC Score"]
metric_scores = [accuracy, sensitivity, specificity, precision, recall, f1, auc_roc]
metrics = {"Metric": metric_labels, "Value": metric_scores}

# Plain-text table without the row index.
metrics_df = pd.DataFrame(metrics)
print(metrics_df.to_string(index=False))


In [None]:
import pandas as pd

# Metrics produced by the SVM + Random Forest evaluation cell, in display order.
metrics = dict(zip(
    ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1-Score', 'AUC ROC Score'],
    [accuracy, sensitivity, specificity, precision, recall, f1, auc_roc],
))

df_metrics = pd.DataFrame({'Metric': list(metrics), 'Value': list(metrics.values())})

# CSS rules for header and body cells (this variant keeps the index and default caption style).
font_rule = ('font-family', 'Times New Roman')
header_rules = [('border', '2px solid black'), font_rule, ('font-size', '12pt'), ('font-weight', 'bold')]
cell_rules = [('border', '1px solid black'), font_rule, ('font-size', '12pt')]

styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': header_rules},  # header cells: bold with a thicker border
        {'selector': 'td', 'props': cell_rules},    # body cells
    ])
    .set_properties(border='1px solid black')       # border on every data cell
    .set_caption("2 Base Learners: SVM and Random Forest")
)

styled_table


In [None]:
import pandas as pd

# Metrics produced by the SVM + Random Forest evaluation cell, in display order.
metrics = dict(zip(
    ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1-Score', 'AUC ROC Score'],
    [accuracy, sensitivity, specificity, precision, recall, f1, auc_roc],
))

df_metrics = pd.DataFrame({'Metric': list(metrics), 'Value': list(metrics.values())})

# CSS rules for the rendered table.
font_rule = ('font-family', 'Times New Roman')
header_rules = [('border', '2px solid black'), font_rule, ('font-size', '12pt'), ('font-weight', 'bold')]
cell_rules = [('border', '1px solid black'), font_rule, ('font-size', '12pt')]
caption_rules = [('caption-side', 'top'), ('font-size', '14pt'), ('font-weight', 'bold'), ('color', 'black'), font_rule]

styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': header_rules},        # header cells: bold with a thicker border
        {'selector': 'td', 'props': cell_rules},          # body cells
        {'selector': 'caption', 'props': caption_rules},  # caption placed above the table
    ])
    .set_properties(border='1px solid black')             # border on every data cell
    .set_caption("2 Base Learners: SVM and Random Forest")
    .hide(axis="index")                                   # drop the row-number column
)

styled_table

In [None]:
import matplotlib.pyplot as plt


# Labels and values of the most recently evaluated model, in matching order.
metric_names = ["Accuracy", "Sensitivity", "Specificity", "Precision", "Recall", "F1-Score", "AUC ROC Score"]
metric_values = [accuracy, sensitivity, specificity, precision, recall, f1, auc_roc]

# Horizontal bar chart on an explicit Axes; the x-axis is fixed to [0, 1].
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(metric_names, metric_values, color='skyblue')
ax.set_xlabel('Metric Value')
ax.set_title('Model Performance Metrics')
ax.set_xlim(0, 1)
plt.show()

**$$**

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import logging

# Raise the 'xgboost' logger to ERROR level so lower-severity records are dropped.
logging.getLogger('xgboost').setLevel(logging.ERROR)

# Recall-driven 3-fold grid searches for XGBoost and k-NN.
xgb_grid = GridSearchCV(xgb_model, param_grid=xgb_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
knn_grid = GridSearchCV(knn_model, param_grid=knn_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
xgb_model_best = xgb_grid.best_estimator_
knn_model_best = knn_grid.best_estimator_

# Equal-weight soft-voting ensemble, trained on the entire training set.
voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_model_best), ('knn', knn_model_best)],
    voting='soft',
    weights=[1, 1],
).fit(X_train, y_train)

# Positive-class probabilities on the test set, cut at a custom threshold.
threshold = 0.4
voting_probs = voting_clf.predict_proba(X_test)[:, 1]
voting_preds = (voting_probs > threshold).astype(int)

# Confusion-matrix based rates.
conf_matrix = confusion_matrix(y_test, voting_preds)
tn, fp, fn, tp = conf_matrix.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Remaining test-set metrics; AUC uses the raw probabilities.
accuracy = accuracy_score(y_test, voting_preds)
precision = precision_score(y_test, voting_preds)
recall = recall_score(y_test, voting_preds)
f1 = f1_score(y_test, voting_preds)
auc_roc = roc_auc_score(y_test, voting_probs)

print("Test Set Evaluation:")
for label, value in (
    ("Accuracy", accuracy),
    ("Sensitivity", sensitivity),
    ("Specificity", specificity),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC ROC Score", auc_roc),
):
    print(f"  {label}: {value:.4f}")


In [None]:
import pandas as pd

# Metrics obtained from the XGBoost + KNN evaluation cell
metrics = {
    'Accuracy': accuracy,
    'Sensitivity': sensitivity,
    'Specificity': specificity,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC ROC Score': auc_roc
}

# One row per metric: name and value
df_metrics = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])

# Style the DataFrame
styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '2px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt'), ('font-weight', 'bold')]},  # Bold border and styling for headers
        {'selector': 'td', 'props': [('border', '1px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt')]},  # Ordinary border and styling for cells
        {'selector': 'caption', 'props': [('caption-side', 'top'), ('font-size', '14pt'), ('font-weight', 'bold'), ('color', 'black'), ('font-family', 'Times New Roman')]},  # Style for caption
    ])
    .set_properties(**{'border': '1px solid black'})  # Border for all cells
    .set_caption("2 Base Learners: XGBoost and KNN")  # Caption shown above the table (typo "XGBooost" corrected)
)

# Remove the index column (row numbers)
styled_table = styled_table.hide(axis="index")


styled_table

### With 3 Base Learners

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import logging

# Raise the 'xgboost' logger to ERROR level so lower-severity records are dropped.
logging.getLogger('xgboost').setLevel(logging.ERROR)

# Recall-driven 3-fold grid searches for the three base learners.
knn_grid = GridSearchCV(knn_model, param_grid=knn_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
rf_grid = GridSearchCV(rf_model, param_grid=rf_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
xgb_grid = GridSearchCV(xgb_model, param_grid=xgb_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
knn_model_best = knn_grid.best_estimator_
rf_model_best = rf_grid.best_estimator_
xgb_model_best = xgb_grid.best_estimator_

# Soft-voting ensemble; XGBoost's probabilities count double (weights 1:1:2).
voting_clf = VotingClassifier(
    estimators=[
        ('knn', knn_model_best),
        ('rf', rf_model_best),
        ('xgb', xgb_model_best),
    ],
    voting='soft',
    weights=[1, 1, 2],
).fit(X_train, y_train)

# Positive-class probabilities on the test data, cut at a custom threshold.
threshold = 0.4
voting_probs = voting_clf.predict_proba(X_test)[:, 1]
voting_preds = (voting_probs > threshold).astype(int)

# Confusion-matrix based rates.
conf_matrix = confusion_matrix(y_test, voting_preds)
tn, fp, fn, tp = conf_matrix.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Remaining test-set metrics; AUC uses the raw probabilities.
accuracy = accuracy_score(y_test, voting_preds)
precision = precision_score(y_test, voting_preds)
recall = recall_score(y_test, voting_preds)
f1 = f1_score(y_test, voting_preds)
auc_roc = roc_auc_score(y_test, voting_probs)

# Output the metrics
print("Test Metrics:")
for label, value in (
    ("Accuracy", accuracy),
    ("Sensitivity", sensitivity),
    ("Specificity", specificity),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC ROC Score", auc_roc),
):
    print(f"  {label}: {value:.4f}")


In [None]:
import pandas as pd

# Metrics obtained from the KNN + Random Forest + XGBoost evaluation cell
metrics = {
    'Accuracy': accuracy,
    'Sensitivity': sensitivity,
    'Specificity': specificity,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC ROC Score': auc_roc
}

# One row per metric: name and value
df_metrics = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])

# Style the DataFrame
styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '2px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt'), ('font-weight', 'bold')]},  # Bold border and styling for headers
        {'selector': 'td', 'props': [('border', '1px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt')]},  # Ordinary border and styling for cells
        {'selector': 'caption', 'props': [('caption-side', 'top'), ('font-size', '14pt'), ('font-weight', 'bold'), ('color', 'black'), ('font-family', 'Times New Roman')]},  # Style for caption
    ])
    .set_properties(**{'border': '1px solid black'})  # Border for all cells
    .set_caption("3 Base Learners: RF, XGBoost and KNN")  # Caption shown above the table (typo "XGBooost" corrected)
)

# Remove the index column (row numbers)
styled_table = styled_table.hide(axis="index")


styled_table

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import logging

# Raise the 'xgboost' logger to ERROR level (no XGBoost model is fitted in this cell).
logging.getLogger('xgboost').setLevel(logging.ERROR)

# Recall-driven 3-fold grid searches for the three base learners.
svm_grid = GridSearchCV(svm_model, param_grid=svm_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
knn_grid = GridSearchCV(knn_model, param_grid=knn_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
rf_grid = GridSearchCV(rf_model, param_grid=rf_param_grid, scoring='recall', cv=3).fit(X_train, y_train)
svm_model_best = svm_grid.best_estimator_
knn_model_best = knn_grid.best_estimator_
rf_model_best = rf_grid.best_estimator_

# Soft-voting ensemble; Random Forest's probabilities count double (weights 1:1:2).
voting_clf = VotingClassifier(
    estimators=[
        ('svm', svm_model_best),
        ('knn', knn_model_best),
        ('rf', rf_model_best),
    ],
    voting='soft',
    weights=[1, 1, 2],
).fit(X_train, y_train)

# Positive-class probabilities on the test data, cut at a custom threshold.
threshold = 0.4
voting_probs = voting_clf.predict_proba(X_test)[:, 1]
voting_preds = (voting_probs > threshold).astype(int)

# Confusion-matrix based rates.
conf_matrix = confusion_matrix(y_test, voting_preds)
tn, fp, fn, tp = conf_matrix.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Remaining test-set metrics; AUC uses the raw probabilities.
accuracy = accuracy_score(y_test, voting_preds)
precision = precision_score(y_test, voting_preds)
recall = recall_score(y_test, voting_preds)
f1 = f1_score(y_test, voting_preds)
auc_roc = roc_auc_score(y_test, voting_probs)

# Output the metrics
print("Test Metrics:")
for label, value in (
    ("Accuracy", accuracy),
    ("Sensitivity", sensitivity),
    ("Specificity", specificity),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC ROC Score", auc_roc),
):
    print(f"  {label}: {value:.4f}")

In [None]:
import pandas as pd

# Metrics produced by the SVM + KNN + Random Forest evaluation cell, in display order.
metrics = dict(zip(
    ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1-Score', 'AUC ROC Score'],
    [accuracy, sensitivity, specificity, precision, recall, f1, auc_roc],
))

# Two-column frame: metric name and its value.
df_metrics = pd.DataFrame({'Metric': list(metrics), 'Value': list(metrics.values())})

# CSS rules for the rendered table.
font_rule = ('font-family', 'Times New Roman')
header_rules = [('border', '2px solid black'), font_rule, ('font-size', '12pt'), ('font-weight', 'bold')]
cell_rules = [('border', '1px solid black'), font_rule, ('font-size', '12pt')]
caption_rules = [('caption-side', 'top'), ('font-size', '14pt'), ('font-weight', 'bold'), ('color', 'black'), font_rule]

styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': header_rules},        # header cells: bold with a thicker border
        {'selector': 'td', 'props': cell_rules},          # body cells
        {'selector': 'caption', 'props': caption_rules},  # caption placed above the table
    ])
    .set_properties(border='1px solid black')             # border on every data cell
    .set_caption("3 Base Learners: RF, SVM and KNN")
    .hide(axis="index")                                   # drop the row-number column
)

styled_table

## Final Model with cross-validation

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import logging

# Suppress XGBoost log records below ERROR level
logging.getLogger('xgboost').setLevel(logging.ERROR)

# Rebuild the un-resampled training split.
# X_train / y_train were overwritten with ADASYN output earlier in the notebook.
# Cross-validating on that data puts synthetic points (interpolated from rows that
# land in the validation fold) into the training folds, and scores the model on
# synthetic rows, which inflates every metric. Re-running the split with the same
# arguments recovers exactly the original training rows.
X_train_raw, _, y_train_raw, _ = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Set up cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Initialize lists to store metrics for each fold
metrics = {
    'accuracy': [],
    'recall': [],
    'precision': [],
    'f1': [],
    'auc_roc': [],
    'sensitivity': [],
    'specificity': []
}

# Perform cross-validation on real (un-resampled) rows only
for train_index, val_index in cv.split(X_train_raw, y_train_raw):
    X_fold_train, X_cv_val = X_train_raw.iloc[train_index], X_train_raw.iloc[val_index]
    y_fold_train, y_cv_val = y_train_raw.iloc[train_index], y_train_raw.iloc[val_index]

    # Oversample the training part of the fold only; the validation part stays untouched
    X_cv_train, y_cv_train = ADASYN(random_state=42).fit_resample(X_fold_train, y_fold_train)

    # Grid search for SVM
    svm_grid = GridSearchCV(svm_model, param_grid=svm_param_grid, scoring='recall', cv=3)
    svm_grid.fit(X_cv_train, y_cv_train)
    svm_model_best = svm_grid.best_estimator_

    # Grid search for RandomForest
    rf_grid = GridSearchCV(rf_model, param_grid=rf_param_grid, scoring='recall', cv=3)
    rf_grid.fit(X_cv_train, y_cv_train)
    rf_model_best = rf_grid.best_estimator_

    # Grid search for XGBoost
    xgb_grid = GridSearchCV(xgb_model, param_grid=xgb_param_grid, scoring='recall', cv=3)
    xgb_grid.fit(X_cv_train, y_cv_train)
    xgb_model_best = xgb_grid.best_estimator_

    # Grid search for k-NN
    knn_grid = GridSearchCV(knn_model, param_grid=knn_param_grid, scoring='recall', cv=3)
    knn_grid.fit(X_cv_train, y_cv_train)
    knn_model_best = knn_grid.best_estimator_

    # Defining the Voting Classifier (soft voting, XGBoost weighted highest)
    voting_clf = VotingClassifier(
        estimators=[
            ('svm', svm_model_best),
            ('rf', rf_model_best),
            ('xgb', xgb_model_best),
            ('knn', knn_model_best)
        ],
        voting='soft',
        weights=[0.5,1,2,0.5]
    )

    # Training the Voting Classifier on the oversampled training fold
    voting_clf.fit(X_cv_train, y_cv_train)

    # Predicting probabilities on the real validation fold and applying the custom threshold
    voting_probs = voting_clf.predict_proba(X_cv_val)[:, 1]
    threshold = 0.4
    voting_preds = (voting_probs > threshold).astype(int)

    # Calculate metrics (AUC uses the probabilities, not the thresholded labels)
    accuracy = accuracy_score(y_cv_val, voting_preds)
    recall = recall_score(y_cv_val, voting_preds)
    precision = precision_score(y_cv_val, voting_preds)
    f1 = f1_score(y_cv_val, voting_preds)
    auc_roc = roc_auc_score(y_cv_val, voting_probs)

    # Confusion matrix to calculate sensitivity and specificity
    conf_matrix = confusion_matrix(y_cv_val, voting_preds)
    tn, fp, fn, tp = conf_matrix.ravel()

    # Calculate Sensitivity and Specificity
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)

    # Store metrics for the fold
    metrics['accuracy'].append(accuracy)
    metrics['recall'].append(recall)
    metrics['precision'].append(precision)
    metrics['f1'].append(f1)
    metrics['auc_roc'].append(auc_roc)
    metrics['sensitivity'].append(sensitivity)
    metrics['specificity'].append(specificity)

    # Print metrics for the current fold
    print(f"Fold {len(metrics['accuracy'])}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Sensitivity: {sensitivity:.4f}")
    print(f"  Specificity: {specificity:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  AUC ROC Score: {auc_roc:.4f}\n")

# Average metrics across all folds
fold_averages = {name: sum(values) / len(values) for name, values in metrics.items()}
avg_accuracy = fold_averages['accuracy']
avg_recall = fold_averages['recall']
avg_precision = fold_averages['precision']
avg_f1 = fold_averages['f1']
avg_auc_roc = fold_averages['auc_roc']
avg_sensitivity = fold_averages['sensitivity']
avg_specificity = fold_averages['specificity']

print(f"\nAverage Accuracy: {avg_accuracy:.4f}")
print(f"Average Sensitivity: {avg_sensitivity:.4f}")
print(f"Average Specificity: {avg_specificity:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")
print(f"Average AUC ROC Score: {avg_auc_roc:.4f}")


In [None]:
import pandas as pd

# Fold-averaged metrics from the cross-validation cell, in display order.
metrics = dict(zip(
    ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1-Score', 'AUC ROC Score'],
    [avg_accuracy, avg_sensitivity, avg_specificity, avg_precision, avg_recall, avg_f1, avg_auc_roc],
))

# Two-column frame: metric name and its averaged value.
df_metrics = pd.DataFrame({'Metric': list(metrics), 'Value': list(metrics.values())})

# CSS rules for the rendered table.
font_rule = ('font-family', 'Times New Roman')
header_rules = [('border', '2px solid black'), font_rule, ('font-size', '12pt'), ('font-weight', 'bold')]
cell_rules = [('border', '1px solid black'), font_rule, ('font-size', '12pt')]
caption_rules = [('caption-side', 'top'), ('font-size', '14pt'), ('font-weight', 'bold'), ('color', 'black'), font_rule]

styled_table = (
    df_metrics.style
    .set_table_styles([
        {'selector': 'th', 'props': header_rules},        # header cells: bold with a thicker border
        {'selector': 'td', 'props': cell_rules},          # body cells
        {'selector': 'caption', 'props': caption_rules},  # caption placed above the table
    ])
    .set_properties(border='1px solid black')             # border on every data cell
    .set_caption("Final Model")
    .hide(axis="index")                                   # drop the row-number column
)

# Display the styled table
styled_table