### Notebook Overview
This notebook contains the complete comparison with the state-of-the-art models.
<br>This comparison of model performance is also used to analyse the effect of scaling and balancing. Standard scaling and class balancing using **ADASYN** are applied here.
<br>*Multiple scaling techniques were analysed to check which best improved the state-of-the-art models' performance. Multiple class balancing techniques were also implemented to analyse the state-of-the-art models' performance, and the ADASYN balancing technique was then chosen.*
<br>*However, these experiments with different scaling and balancing techniques are not included here due to computational limitations and the need for a comprehensive coding notebook.*
<br>For the remaining steps in this study's implementation, such as scaling and class balancing, please refer to the other notebooks included in this project.

In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from tpot import TPOTClassifier
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the dataset from prepared_df.csv (path is relative to the working directory).
df = pd.read_csv("prepared_df.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column summary (dtypes, non-null counts, memory usage).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
 # Show the first rows of df again (repeats the preview from the earlier cell).
 df.head()

In [None]:
# Separate the target column "outcome" (y) from the remaining predictor columns (X).
y = df["outcome"]
X = df.drop(columns="outcome")

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (70-30 split).
# random_state=42 fixes the shuffle so the partition is reproducible; no
# `stratify` argument is passed, so class proportions are not forced to match
# between the two sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Keep the cell output readable: hide sklearn's FutureWarning and UserWarning messages.
for _category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category, module="sklearn")


# Benchmark classifiers, keyed by the display name used in the printed report.
# Every estimator that accepts a seed is pinned to random_state=42.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(probability=True, random_state=42)),  # probability=True enables predict_proba
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("CatBoost", CatBoostClassifier(random_state=42, verbose=0)),
    ("LightGBM", LGBMClassifier(random_state=42, verbose=-1)),  # Suppress LightGBM outputs
    ("AdaBoost", AdaBoostClassifier(random_state=42, algorithm="SAMME")),  # Set algorithm to SAMME
    ("Naive Bayes", GaussianNB()),
    ("k-NN", KNeighborsClassifier()),
    ("LDA", LinearDiscriminantAnalysis()),
])


# results[name] -> (metrics dict for Class 1, classification report as text)
results = {}
target_names = ['Class 0', 'Class 1']

for name, model in models.items():
    # Fit on the current training split, then score the held-out split
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

    # Same report twice: once as printable text, once as a dict for lookups
    class_report_str = classification_report(y_test, test_preds, target_names=target_names)
    class_1_report = classification_report(
        y_test, test_preds, target_names=target_names, output_dict=True
    )['Class 1']

    # Unpacking four values assumes a 2x2 (binary) confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, test_preds).ravel()

    results[name] = (
        {
            "Class 1": {
                "Accuracy": accuracy_score(y_test, test_preds),
                "Sensitivity": tp / (tp + fn),
                "Specificity": tn / (tn + fp),
                "Precision": class_1_report['precision'],
                "Recall": class_1_report['recall'],
                "F1-Score": class_1_report['f1-score'],
                "AUC ROC Score": roc_auc_score(y_test, test_probs),
            }
        },
        class_report_str,
    )

# Printing the results
for model_name, (metrics, class_report_str) in results.items():
    print(f"**For {model_name}:**")
    for cls, cls_metrics in metrics.items():
        print(f"Metrics for {cls}:")
        print("\n".join(f"{metric}: {value:.4f}" for metric, value in cls_metrics.items()))
        print("\n")

    print(f"Classification Report - Test Data for {model_name}:")
    print(class_report_str)
    print("\n")

In [None]:
import pandas as pd


# Column layout of the summary table; 'Model' is moved into the index below.
columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'Sensitivity', 'Specificity', 'F1-Score', 'AUC ROC Score']

# One record per model, read from the 'Class 1' metrics stored in `results`
# (each value of `results` is a (metrics, classification-report-text) pair).
rows = [
    {'Model': model_name, **{col: metrics['Class 1'][col] for col in columns[1:]}}
    for model_name, (metrics, _) in results.items()
]

# Build the frame in one step from the records. This keeps the metric columns
# numeric (a concat of Series followed by a transpose yields object columns)
# and gives an empty table instead of raising when `results` is empty.
df_results = pd.DataFrame(rows, columns=columns).set_index('Model')

# Styling the DataFrame
styled_table = (
    df_results.style
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '2px solid black')]},  # Bold border for headers
        {'selector': 'td', 'props': [('border', '1px solid black')]},  # Ordinary border for other cells
        {'selector': 'th.col_heading.level0', 'props': [('border', '2px solid black')]},  # Bold border for metric names
    ])
    .set_properties(**{'border': '1px solid black'})  # Ordinary border for all cells
    .set_table_attributes('style="border-collapse:collapse"')
)

# Last expression of the cell: render the styled table
styled_table

In [None]:
import pandas as pd


# Column layout of the summary table; 'Model' is moved into the index below.
columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'Sensitivity', 'Specificity', 'F1-Score', 'AUC ROC Score']

# One record per model, read from the 'Class 1' metrics stored in `results`
# (each value of `results` is a (metrics, classification-report-text) pair).
rows = [
    {'Model': model_name, **{col: metrics['Class 1'][col] for col in columns[1:]}}
    for model_name, (metrics, _) in results.items()
]

# Build the frame in one step from the records. This keeps the metric columns
# numeric (a concat of Series followed by a transpose yields object columns)
# and gives an empty table instead of raising when `results` is empty.
df_results = pd.DataFrame(rows, columns=columns).set_index('Model')

# Styling the DataFrame
styled_table = (
    df_results.style
    .format('{:.4f}')  # Format all cells to 4 decimal places
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '2px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt'), ('font-weight', 'bold')]},  # Bold border and styling for headers
        {'selector': 'td', 'props': [('border', '1px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt')]},  # Ordinary border and styling for cells
        {'selector': 'th.col_heading.level0', 'props': [('border', '2px solid black')]},  # Bold border for metric names
    ])
    .set_properties(**{'border': '1px solid black'})  # Ordinary border for all cells
    .set_table_attributes('style="border-collapse:collapse"')
)

# Last expression of the cell: render the styled table
styled_table

### Post Scaling

In [None]:
# Preview the predictor frame before any scaling is applied.
X.head()

In [None]:
# List the predictor column names (used below to pick the categorical subset).
X.columns

In [None]:
# Show the dtype of every predictor column.
X.dtypes

In [None]:
# Predictors handled as categorical: they are set aside here and rejoined,
# unscaled, with the scaled numerical columns further down.
categorical_columns = [
    'hypertensive', 'atrialfibrillation', 'diabetes', 'deficiencyanemias', 'depression',
    'Hyperlipemia', 'Renal_failure', 'COPD', 'gendera',
]
X_Categorical = X[categorical_columns]
X_Categorical.head()

In [None]:
# Everything that is not in the categorical subset is treated as numerical.
X_Numerical = X.drop(X_Categorical.columns, axis=1)


In [None]:
# Preview the numerical predictors that will be standardised.
X_Numerical.head()

In [None]:
# Initializing StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training rows only, so that the mean / standard
# deviation of the held-out rows never leak into the features the models see.
# train_test_split shuffles by row position from (n_samples, random_state), so
# using the same test_size=0.3 / random_state=42 as the split cells of this
# notebook selects exactly the rows that later form X_train.
train_positions = train_test_split(
    np.arange(len(X_Numerical)), test_size=0.3, random_state=42
)[0]
scaler.fit(X_Numerical.iloc[train_positions])

# Apply the training-set statistics to every row (train and test alike)
X_Numerical_scaled = scaler.transform(X_Numerical)

# Restore column names and the original index so the concat below aligns row-wise
X_Numerical_scaled_df = pd.DataFrame(X_Numerical_scaled, columns=X_Numerical.columns, index=X_Numerical.index)

# Categorical columns are passed through unscaled, followed by the scaled numerical ones
X_scaled = pd.concat([X_Categorical, X_Numerical_scaled_df], axis=1)

In [None]:
# Preview the combined frame (categorical columns first, then scaled numerical ones).
X_scaled.head()

In [None]:
# Check the column dtypes after scaling and re-joining.
X_scaled.dtypes

In [None]:
# dtype of the target series.
y.dtype

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (70-30 split).
# Uses the scaled features with the same test_size and random_state as the
# split of the unscaled X earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [None]:
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Keep the cell output readable: hide sklearn's FutureWarning and UserWarning messages.
for _category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category, module="sklearn")


# Fresh, unfitted instances of the benchmark classifiers, keyed by display name.
# Every estimator that accepts a seed is pinned to random_state=42.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(probability=True, random_state=42)),  # probability=True enables predict_proba
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("CatBoost", CatBoostClassifier(random_state=42, verbose=0)),
    ("LightGBM", LGBMClassifier(random_state=42, verbose=-1)),  # Suppress LightGBM outputs
    ("AdaBoost", AdaBoostClassifier(random_state=42, algorithm="SAMME")),  # Set algorithm to SAMME
    ("Naive Bayes", GaussianNB()),
    ("k-NN", KNeighborsClassifier()),
    ("LDA", LinearDiscriminantAnalysis()),
])

# results[name] -> (metrics dict for Class 1, classification report as text)
results = {}
target_names = ['Class 0', 'Class 1']

for name, model in models.items():
    # Fit on the current training split, then score the held-out split
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

    # Same report twice: once as printable text, once as a dict for lookups
    class_report_str = classification_report(y_test, test_preds, target_names=target_names)
    class_1_report = classification_report(
        y_test, test_preds, target_names=target_names, output_dict=True
    )['Class 1']

    # Unpacking four values assumes a 2x2 (binary) confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, test_preds).ravel()

    # Arranging output metrics in a systematic form for Class 1
    results[name] = (
        {
            "Class 1": {
                "Accuracy": accuracy_score(y_test, test_preds),
                "Sensitivity": tp / (tp + fn),
                "Specificity": tn / (tn + fp),
                "Precision": class_1_report['precision'],
                "Recall": class_1_report['recall'],
                "F1-Score": class_1_report['f1-score'],
                "AUC ROC Score": roc_auc_score(y_test, test_probs),
            }
        },
        class_report_str,
    )

# Printing the results
for model_name, (metrics, class_report_str) in results.items():
    print(f"**For {model_name}:**")
    for cls, cls_metrics in metrics.items():
        print(f"Metrics for {cls}:")
        print("\n".join(f"{metric}: {value:.4f}" for metric, value in cls_metrics.items()))
        print("\n")

    print(f"Classification Report - Test Data for {model_name}:")
    print(class_report_str)
    print("\n")

In [None]:
import pandas as pd


# Column layout of the summary table; 'Model' is moved into the index below.
columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'Sensitivity', 'Specificity', 'F1-Score', 'AUC ROC Score']

# One record per model, read from the 'Class 1' metrics stored in `results`
# (each value of `results` is a (metrics, classification-report-text) pair).
rows = [
    {'Model': model_name, **{col: metrics['Class 1'][col] for col in columns[1:]}}
    for model_name, (metrics, _) in results.items()
]

# Build the frame in one step from the records. This keeps the metric columns
# numeric (a concat of Series followed by a transpose yields object columns)
# and gives an empty table instead of raising when `results` is empty.
df_results = pd.DataFrame(rows, columns=columns).set_index('Model')

# Styling the DataFrame
styled_table = (
    df_results.style
    .format('{:.4f}')  # Format all cells to 4 decimal places
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '2px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt'), ('font-weight', 'bold')]},  # Bold border and styling for headers
        {'selector': 'td', 'props': [('border', '1px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt')]},  # Ordinary border and styling for cells
        {'selector': 'th.col_heading.level0', 'props': [('border', '2px solid black')]},  # Bold border for metric names
    ])
    .set_properties(**{'border': '1px solid black'})  # Ordinary border for all cells
    .set_table_attributes('style="border-collapse:collapse"')
)

# Last expression of the cell: render the styled table
styled_table

##  Post Balancing

* *Multiple Balancing Techniques were implemented to check which optimised the performance of the benchmark models the best.*
* *These experiments with different scaling and balancing technique implementations and their analysis are not included here due to computational limitations and the need for a comprehensive coding notebook.*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd



# Wrap the target in a one-column frame so seaborn can address it by name
df_y = pd.DataFrame({'target': y})

# Apply the dark theme to this figure only. The context manager restores the
# previous rcParams when the block exits, so later figures are not affected
# even if no explicit style reset is run afterwards.
with plt.style.context("dark_background"):
    # Creating the plot
    plt.figure(figsize=(6, 4))
    sns.countplot(x='target', data=df_y, palette='pastel')

    plt.title('Class Imbalance of the Target Variable', fontsize=16)
    plt.xlabel('Class', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)

    # Render while the dark style is still active
    plt.show()


In [None]:
# Switch matplotlib back to its default style for any later figures.
plt.style.use("default")

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (70-30 split).
# Re-creates the split of the scaled features so that X_train / y_train are
# the original (not yet resampled) training rows before balancing.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [None]:
import pandas as pd
from imblearn.over_sampling import ADASYN


# ADASYN oversampler with a fixed seed so the synthetic rows are reproducible
adasyn = ADASYN(random_state=42)

# Only the training split is resampled; X_test / y_test are not touched here
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Report how many training samples each class has after resampling
print("Class distribution after ADASYN:", y_train_resampled.value_counts(), sep="\n")


In [None]:
# From here on the models are trained on the ADASYN-resampled training data.
X_train, y_train = X_train_resampled, y_train_resampled

In [None]:
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Keep the cell output readable: hide sklearn's FutureWarning messages.
warnings.filterwarnings(action="ignore", category=FutureWarning, module="sklearn")

# Fresh, unfitted instances of the benchmark classifiers, keyed by display name.
# Every estimator that accepts a seed is pinned to random_state=42.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(probability=True, random_state=42)),  # probability=True enables predict_proba
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("CatBoost", CatBoostClassifier(random_state=42, verbose=0)),
    ("LightGBM", LGBMClassifier(random_state=42, verbose=-1)),  # Suppress LightGBM outputs
    ("AdaBoost", AdaBoostClassifier(random_state=42, algorithm="SAMME")),  # Set algorithm to SAMME
    ("Naive Bayes", GaussianNB()),
    ("k-NN", KNeighborsClassifier()),
    ("LDA", LinearDiscriminantAnalysis()),
])


# results[name] -> (metrics dict for Class 1, classification report as text)
results = {}
target_names = ['Class 0', 'Class 1']

# Iterating over models
for name, model in models.items():
    # Train on the current training data, then predict on the testing data
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

    # Classification report for test data: printable text plus a dict for lookups
    class_report_str = classification_report(y_test, test_preds, target_names=target_names)
    class_1_report = classification_report(
        y_test, test_preds, target_names=target_names, output_dict=True
    )['Class 1']

    # Confusion matrix counts feed sensitivity and specificity; unpacking four
    # values assumes a 2x2 (binary) matrix
    tn, fp, fn, tp = confusion_matrix(y_test, test_preds).ravel()

    # Arranging output metrics in a systematic form for Class 1
    results[name] = (
        {
            "Class 1": {
                "Accuracy": accuracy_score(y_test, test_preds),
                "Sensitivity": tp / (tp + fn),
                "Specificity": tn / (tn + fp),
                "Precision": class_1_report['precision'],
                "Recall": class_1_report['recall'],
                "F1-Score": class_1_report['f1-score'],
                "AUC ROC Score": roc_auc_score(y_test, test_probs),
            }
        },
        class_report_str,
    )

# Printing the results
for model_name, (metrics, class_report_str) in results.items():
    print(f"**For {model_name}:**")
    for cls, cls_metrics in metrics.items():
        print(f"Metrics for {cls}:")
        print("\n".join(f"{metric}: {value:.4f}" for metric, value in cls_metrics.items()))
        print("\n")

    print(f"Classification Report - Test Data for {model_name}:")
    print(class_report_str)
    print("\n")

In [None]:
import pandas as pd

# Column layout of the summary table; 'Model' is moved into the index below.
columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'Sensitivity', 'Specificity', 'F1-Score', 'AUC ROC Score']

# One record per model, read from the 'Class 1' metrics stored in `results`
# (each value of `results` is a (metrics, classification-report-text) pair).
rows = [
    {'Model': model_name, **{col: metrics['Class 1'][col] for col in columns[1:]}}
    for model_name, (metrics, _) in results.items()
]

# Build the frame in one step from the records. This keeps the metric columns
# numeric (a concat of Series followed by a transpose yields object columns)
# and gives an empty table instead of raising when `results` is empty.
df_results = pd.DataFrame(rows, columns=columns).set_index('Model')

# Styling the DataFrame
styled_table = (
    df_results.style
    .format('{:.4f}')  # Format all cells to 4 decimal places
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '2px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt'), ('font-weight', 'bold')]},  # Bold border and styling for headers
        {'selector': 'td', 'props': [('border', '1px solid black'), ('font-family', 'Times New Roman'), ('font-size', '12pt')]},  # Ordinary border and styling for cells
        {'selector': 'th.col_heading.level0', 'props': [('border', '2px solid black')]},  # Bold border for metric names
    ])
    .set_properties(**{'border': '1px solid black'})  # Ordinary border for all cells
    .set_table_attributes('style="border-collapse:collapse"')
)

# Last expression of the cell: render the styled table
styled_table