# Evaluation Criteria Notebook

This notebook accompanies the second assignment, which covers the evaluation criteria. It was created as a companion to the associated report, in order to test suitable tools and to ensure that the evaluation methodology is adequate.

In [None]:
#---------------------------------- IMPORTANT PACKAGES --------------------------------------------#
import numpy as np
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import os
from PIL import Image
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score, plot_roc_curve, precision_recall_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier,RandomForestRegressor



In [None]:
#----------------------------------- DATA LOADING -------------------------------------------#
# Read 'heart.csv' (path is relative to the notebook's working directory)
# into the DataFrame that every later cell refers to as `data`.
data = pd.read_csv(filepath_or_buffer='heart.csv')

In [None]:
# A notebook cell only renders its *last* expression. The original bare
# `data.head()` / `data.shape` lines were evaluated and silently discarded,
# so they are shown explicitly here.
display(data.head())   # `display` is provided by IPython/Jupyter
print(data.shape)      # (rows, columns)
data.info()            # prints its own summary (dtypes, non-null counts)
#Statistics
data.describe().T


In [None]:
#----------------------------------- DATA PREPROCESSING -------------------------------------------#
# NOTE: this cell rebinds `data` (filter -> encode -> scale), so it is not
# idempotent: re-run the DATA LOADING cell before executing it a second time.

# Outliers
# The original filters were bare expressions whose results were thrown away,
# and the second one used the misspelled column 'OldPeak' (KeyError; the
# column is spelled 'Oldpeak' everywhere else in this notebook).
# NOTE(review): the original condition was `> 0`. `>= 0` is used on the
# assumption that only negative Oldpeak values are the intended outliers and
# that 0 is a legitimate reading -- confirm against the report.
valid_rows = (data['RestingBP'] != 0) & (data['Oldpeak'] >= 0)
# reset_index keeps the row labels contiguous after dropping rows.
data = data.loc[valid_rows].reset_index(drop=True)

# Scaling
robust_scale = RobustScaler()
standard_scale = StandardScaler()
# Encoding
ohe= OneHotEncoder()

# Categorical Data Encoding
# OneHotEncoder.fit_transform expects 2-D input and returns a multi-column
# matrix, so its result cannot be assigned back to a single column as the
# original code attempted. pd.get_dummies performs the one-hot encoding and
# returns a DataFrame directly; drop_first avoids a redundant dummy per feature.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
data1 = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=int)

# Scaling Numerical Data
# RobustScaler works column-wise, so one call over all numeric columns gives
# the same values as scaling each column separately.
# NOTE(review): the scaler is fitted on the full dataset *before* the
# dev/test split, which leaks test-set statistics into training. Fitting it
# on the development split only (e.g. inside a Pipeline) would avoid this.
numeric_cols = ['Oldpeak', 'Age', 'RestingBP', 'Cholesterol', 'MaxHR']
scaled_values = robust_scale.fit_transform(data1[numeric_cols])
# `data1` stays the encoded-but-unscaled frame; `data` (used by the following
# cells) becomes the encoded *and* scaled frame.
data = data1.assign(**dict(zip(numeric_cols, scaled_values.T)))


In [None]:
#----------------------------------- SPLIT DATA -------------------------------------------#
# Separate the label column from the predictors.
target = data['HeartDisease']
features = data.drop(columns='HeartDisease')
# Hold out 20% as the final test set. `stratify=target` keeps the class
# proportions equal in both parts, consistent with the stratified folds used
# for cross-validation; random_state pins the shuffle for reproducibility.
x_dev, x_test, y_dev, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=2,
    stratify=target,
)

In [None]:
# Metrics Function
def metrics(y_test, y_pred, classifier, x_eval=None):
    """Report hold-out performance of a fitted binary classifier.

    Draws an annotated confusion-matrix heatmap, prints the classification
    report and the ROC AUC computed from the predicted labels, and plots the
    ROC curve of ``classifier``.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation set.
    y_pred : array-like
        Labels predicted by ``classifier`` for the evaluation set.
    classifier : fitted estimator
        Estimator used to draw the ROC curve.
    x_eval : array-like, optional
        Features matching ``y_test``, used for the ROC curve. When omitted,
        the module-level ``x_test`` is used, which is what the original
        implementation always (implicitly) did.

    Notes
    -----
    The 2x2 annotation layout assumes a binary problem in which both classes
    occur in ``y_test``/``y_pred``.
    """
    if x_eval is None:
        # Backward-compatible default: fall back to the notebook-level hold-out set.
        x_eval = x_test

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    names = ['True Neg','False Pos','False Neg','True Pos']
    counts = cm.flatten()
    percentages = ['{0:.2%}'.format(value) for value in counts / np.sum(cm)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
    labels = np.asarray(labels).reshape(2,2)
    # Explicit axes so the heatmap never lands on a leftover figure.
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=labels, fmt='', ax=ax)
    ax.set(title='Confusion Matrix', xlabel='Predicted label', ylabel='True label')

    # Precision, Recall, F1 score
    print(classification_report(y_test,y_pred))
    print(roc_auc_score(y_test,y_pred))

    # ROC
    # `plot_roc_curve` was removed from scikit-learn in 1.2; prefer its
    # replacement and only fall back to the old function on old releases.
    try:
        from sklearn.metrics import RocCurveDisplay
        draw_roc = RocCurveDisplay.from_estimator
    except (ImportError, AttributeError):
        from sklearn.metrics import plot_roc_curve as draw_roc
    draw_roc(classifier, x_eval, y_test)
    plt.title('ROC_AUC_Plot')
    plt.show()
    

In [None]:
# Model
def model(classifier, x_dev, x_test, y_dev, y_test, n_splits=9):
    """Cross-validate ``classifier`` on the development set, then evaluate it
    on the hold-out test set.

    For every stratified fold the classifier is fitted on the training part
    and its classification report and ROC AUC on the validation part are
    printed. Afterwards the classifier is refitted on the complete
    development set and scored on the test set via ``metrics``.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn style classifier (``fit`` / ``predict``).
    x_dev, y_dev : pandas.DataFrame, pandas.Series
        Development features and labels (must support ``.iloc``).
    x_test, y_test : array-like
        Hold-out features and labels.
    n_splits : int, default 9
        Number of stratified folds.
    """
    kf = model_selection.StratifiedKFold(n_splits=n_splits)
    for fold, (trn_, val_) in enumerate(kf.split(X=x_dev, y=y_dev), start=1):
        # split() yields *positional* indices, so .iloc is required; the
        # original `.loc[trn_, x_dev]` was both label-based and malformed.
        X_train, y_train = x_dev.iloc[trn_], y_dev.iloc[trn_]
        X_valid, y_valid = x_dev.iloc[val_], y_dev.iloc[val_]

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_valid)
        print(f"The fold is : {fold} : ")
        print(classification_report(y_valid, y_pred))
        # This is ROC AUC on predicted labels, not accuracy, so label it as such.
        auc = roc_auc_score(y_valid, y_pred)
        print(f"The ROC AUC for Fold {fold} : {auc}")

    # After the loop the classifier has only seen the last fold's training
    # subset; refit on all development data before the final evaluation.
    classifier.fit(x_dev, y_dev)
    y_pred = classifier.predict(x_test)
    # NOTE: `metrics` draws the ROC curve from the notebook-level `x_test`
    # unless it is given the features explicitly.
    metrics(y_test, y_pred, classifier)


In [None]:
#----------------------------------- SUPPORT VECTOR MACHINE MODEL -------------------------------------------#
# Cross-validate and test a support vector classifier (C=10, gamma=10).
model(
    classifier=SVC(C=10, gamma=10, random_state=0),
    x_dev=x_dev,
    x_test=x_test,
    y_dev=y_dev,
    y_test=y_test,
)


In [None]:
#------------------------------------  K-NEAREST NEIGHBOR ------------------------------------------#
# Cross-validate and test a k-nearest-neighbours classifier with k=5.
model(
    classifier=KNeighborsClassifier(n_neighbors=5),
    x_dev=x_dev,
    x_test=x_test,
    y_dev=y_dev,
    y_test=y_test,
)


In [None]:
#-------------------------------------- RANDOM FOREST CLASSIFIER ----------------------------------------#
# The original call had a stray ')' after the classifier, which closed
# `model(` early and made the cell a SyntaxError.
model(
    RandomForestClassifier(
        random_state=0,
        n_estimators=100,
        min_samples_split=5,
        max_depth=10,
    ),
    x_dev,
    x_test,
    y_dev,
    y_test,
)


In [None]:
#-------------------------------------- ADABOOST ----------------------------------------#
# hyperparameter tuning
# The original call had a stray ')' after the classifier, which closed
# `model(` early and made the cell a SyntaxError.
model(
    AdaBoostClassifier(random_state=0, n_estimators=100, learning_rate=0.001),
    x_dev,
    x_test,
    y_dev,
    y_test,
)


In [None]:
#------------------------------------ FAIRNESS EVALUATION ------------------------------------------#

In [None]:
#-------------------------------------- PERFORMANCE METRICS ----------------------------------------#

## References
- https://www.kaggle.com/code/durgancegaur/a-guide-to-any-classification-problem
- https://www.kaggle.com/code/durgancegaur/data-imbalance-eda-87-auc?scriptVersionId=88319036