In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
df=pd.read_csv("heart_clean_final.csv")

In [7]:
df2=df

In [9]:
df2

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,62,0,0,138,294.0,1,1,106,0,1.9,1,3,2,0
1,58,0,0,100,248.0,0,0,122,0,1.0,1,0,2,1
2,58,1,0,114,318.0,0,2,140,0,3.5,0,3,1,0
3,54,1,0,122,286.0,0,0,116,1,3.2,1,2,2,0
4,71,0,0,112,149.0,0,1,125,0,1.6,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,60,0,2,120,178.0,1,1,100,0,0.0,2,0,2,1
179,64,1,2,140,335.0,0,1,158,0,0.0,2,0,2,0
180,68,0,2,120,211.0,0,0,115,0,1.5,1,0,2,1
181,44,0,2,108,141.0,0,1,175,0,0.6,1,0,2,1


In [11]:
indep_X=df.drop("target",axis=1)
dep_Y=df["target"]

In [13]:
def selectkbest(indep_X,dep_Y,n):
    test=SelectKBest(score_func=chi2,k=n)
    fit1=test.fit(indep_X,dep_Y)
    selectk_features=fit1.transform(indep_X)
    selected_feature_names = indep_X.columns[fit1.get_support()]
    print("Selected features:", list(selected_feature_names))
    return selectk_features,selected_feature_names

In [15]:
def split_scalar(indep_X,dep_Y):
    X_train,X_test,y_train,y_test=train_test_split(indep_X,dep_Y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    X_train_scaled=sc.fit_transform(X_train)
    X_test_scaled=sc.transform(X_test)
    return X_train_scaled,X_test_scaled,y_train,y_test

In [17]:
def cm_prediction(classifier,X_test,y_test):
    y_pred=classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,f1_score
    cm=confusion_matrix(y_test,y_pred)
    Accuracy=accuracy_score(y_test,y_pred)
    report=classification_report(y_test,y_pred)
    f1_macro=f1_score(y_test,y_pred,average='weighted')
    print("The f1_macro value for best parameter {}:".format(classifier.best_params_),f1_macro)
    return Accuracy,report,cm,f1_macro

In [19]:
def logistic(X_train_scaled,y_train,X_test_scaled,y_test):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    param_grid = [{'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.1, 1, 10], 'max_iter': [1000]},
    {'solver': ['liblinear'], 'penalty': ['l1'], 'C': [0.1, 1, 10]}]
    classifier= GridSearchCV(LogisticRegression(), param_grid, refit = True,n_jobs=-1,scoring='f1_weighted')
    classifier.fit(X_train_scaled,y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test_scaled,y_test)
    return  classifier,Accuracy,report,cm,f1_macro

In [21]:
def svm(X_train,y_train,X_test,y_test):
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    param_grid = {'kernel':['linear','rbf'],'gamma':['auto','scale'],'C':[0.01,0.1,1,10]} 
    classifier = GridSearchCV(SVC(probability=True), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted') 
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test,y_test)
    return  classifier,Accuracy,report,cm,f1_macro

In [23]:
def Naive(X_train,y_train,X_test,y_test):
    from sklearn.naive_bayes import GaussianNB
    from sklearn.model_selection import GridSearchCV
    param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}
    classifier = GridSearchCV(GaussianNB(), param_grid,scoring='f1_weighted', n_jobs=-1)
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1

In [25]:
def knn(X_train,y_train,X_test,y_test):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import GridSearchCV
    param_grid = {'n_neighbors':[3,5,7,9],'weights':['uniform','distance'],'algorithm':['auto', 'ball_tree', 'kd_tree'],'metric':['minkowski'],'p':[2]} 
    classifier = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True,n_jobs=-1,scoring='f1_weighted') 
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1_macro

In [27]:
def Decision(X_train,y_train,X_test,y_test):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV
    param_grid = {'criterion':['gini','entropy'],'splitter':['best','random'],'max_depth':[5,10,20],'min_samples_split':[2,5,10],'min_samples_leaf':[1,2,4]} 
    classifier = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, refit = True,n_jobs=-1,scoring='f1_weighted') 
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1_macro

In [29]:
def random(X_train,y_train,X_test,y_test):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    param_grid = {'criterion':['gini','entropy'],'max_features': ['sqrt','log2'],'n_estimators':[100,200,300],"max_depth":[5,10,20],"min_samples_split":[2,5],"min_samples_leaf":[1,2,4],"class_weight":["balanced"]} 
    classifier = GridSearchCV(RandomForestClassifier(), param_grid, refit = True,n_jobs=-1,scoring='f1_weighted') 
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1_macro

In [31]:
def Xgboost(X_train,y_train,X_test,y_test):
    from xgboost import XGBClassifier
    from sklearn.model_selection import GridSearchCV
    param_grid = {'max_depth':[5,10,20],'learning_rate':[0.01,0.1,0.2],'subsample':[0.5,0.7,1.0],'colsample_bytree':[0.5,0.7,1.0],"gamma":[0,1]} 
    classifier = GridSearchCV(XGBClassifier(random_state=42), param_grid, refit = True,n_jobs=-1,scoring='f1_weighted') 
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1_macro  

In [33]:
def AdaBoost(X_train,y_train,X_test,y_test):
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.model_selection import GridSearchCV
    param_grid={'n_estimators':[50,100,200],"learning_rate":[0.01,0.1,0.5]}
    classifier = GridSearchCV(AdaBoostClassifier(random_state=42),param_grid,refit=True,n_jobs=-1,scoring='f1_weighted')
    classifier.fit(X_train,y_train)
    Accuracy,report,cm,f1_macro=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1_macro   

In [35]:
acclog=[]
accsvm=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]
accxg=[]
accada=[]

selectk_features,selected_feature_names=selectkbest(indep_X,dep_Y,8)
X_train, X_test, y_train, y_test = train_test_split(selectk_features, dep_Y, test_size=0.25, random_state=0)
classifier,Accuracy,report,cm,f1_macro=logistic(X_train,y_train,X_test,y_test)
acclog.append(Accuracy)

classifier,Accuracy,report,cm,f1_macro=svm(X_train,y_train,X_test,y_test)  
accsvm.append(Accuracy)
    
classifier,Accuracy,report,cm,f1_macro=knn(X_train,y_train,X_test,y_test)  
accknn.append(Accuracy)
    
classifier,Accuracy,report,cm,f1_macro=Naive(X_train,y_train,X_test,y_test)  
accnav.append(Accuracy)
    
classifier,Accuracy,report,cm,f1_macro=Decision(X_train,y_train,X_test,y_test)  
accdes.append(Accuracy)
    
classifier,Accuracy,report,cm,f1_macro=random(X_train,y_train,X_test,y_test)  
accrf.append(Accuracy)

classifier,Accuracy,report,cm,f1_macro=Xgboost(X_train,y_train,X_test,y_test)  
accxg.append(Accuracy)

classifier,Accuracy,report,cm,f1_macro=AdaBoost(X_train,y_train,X_test,y_test)  
accada.append(Accuracy)


Selected features: ['age', 'sex', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']
The f1_macro value for best parameter {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}: 0.8931268772052606
Fitting 5 folds for each of 16 candidates, totalling 80 fits
The f1_macro value for best parameter {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}: 0.9335130560170527
The f1_macro value for best parameter {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}: 0.790997442455243
The f1_macro value for best parameter {'var_smoothing': 1e-06}: 0.9130434782608695
The f1_macro value for best parameter {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'random'}: 0.7531969309462915
The f1_macro value for best parameter {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}: 0.8640814727771249
The f1_macro value fo

In [37]:
def selectk_Classification(acclog,accsvm,accknn,accnav,accdes,accrf,accxg,accada):
    dataframe=pd.DataFrame(index=["chi2"],columns=['Logistic','SVM','KNN','Naive','Decision','Random','XgBoost','AdaBoost'])#,'LightGBM'])
    for number,idex in enumerate(dataframe.index):
        dataframe.loc[idex,'Logistic']=acclog[number]
        dataframe.loc[idex,'SVM']=accsvm[number]
        dataframe.loc[idex,'KNN']=accknn[number]
        dataframe.loc[idex,'Naive']=accnav[number]
        dataframe.loc[idex,'Decision']=accdes[number]
        dataframe.loc[idex,'Random']=accrf[number]
        dataframe.loc[idex,'XgBoost']=accxg[number]
        dataframe.loc[idex,'AdaBoost']=accada[number]
    return dataframe    

In [39]:
result=selectk_Classification(acclog,accsvm,accknn,accnav,accdes,accrf,accxg,accada)#,acclgbm)
highlighted_result=result.style.highlight_max(axis=1, color='lightgreen')
highlighted_result

Unnamed: 0,Logistic,SVM,KNN,Naive,Decision,Random,XgBoost,AdaBoost
chi2,0.891304,0.934783,0.804348,0.913043,0.73913,0.869565,0.869565,0.891304
