In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
import pickle
from sklearn.feature_selection import chi2

In [2]:
import warnings
warnings.filterwarnings("ignore") 

In [3]:
df=pd.read_csv("heart_clean_final.csv")
df2=df

In [4]:
indep_X=df2.drop('target',axis=1)
dep_Y=df2['target']

In [5]:
def split_scalar(indep_X,dep_Y):
    X_train,X_test,y_train,y_test=train_test_split(indep_X,dep_Y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,y_train,y_test

In [6]:
def selectkbest(indep_X,dep_Y,n):
    test=SelectKBest(score_func=chi2,k=n)
    fit1=test.fit(indep_X,dep_Y)
    selectk_features=fit1.transform(indep_X)
    selected_feature_names = indep_X.columns[fit1.get_support()]
    print("Selected features:", list(selected_feature_names))
    return selectk_features,selected_feature_names

In [17]:
def cm_prediction(classifier,X_test,y_test):
    y_pred=classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(y_test,y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    Accuracy=accuracy_score(y_test,y_pred)
    report=classification_report(y_test,y_pred)
    from sklearn.metrics import f1_score
    f1=f1_score(y_test,y_pred,average='binary')
    return Accuracy,report,cm,f1

In [19]:
def logistic(X_train,y_train,X_test,y_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0,max_iter=1000)
    classifier.fit(X_train,y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1  

In [21]:
def svm_linear(X_train,y_train,X_test,y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return  classifier,Accuracy,report,cm,f1

In [23]:
def svm_NL(X_train,y_train,X_test,y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return  classifier,Accuracy,report,cm,f1

In [25]:
def Naive(X_train,y_train,X_test,y_test):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1

In [27]:
def knn(X_train,y_train,X_test,y_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1

In [29]:
def Decision(X_train,y_train,X_test,y_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1     

In [31]:
def random(X_train,y_train,X_test,y_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1

In [33]:
def Xgboost(X_train,y_train,X_test,y_test):
    from xgboost import XGBClassifier
    classifier =XGBClassifier()
    classifier.fit(X_train, y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1     
    

In [35]:
def AdaBoost(X_train,y_train,X_test,y_test):
    from sklearn.ensemble import AdaBoostClassifier
    classifier=AdaBoostClassifier()
    classifier.fit(X_train,y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1   

In [37]:
def LightGBM(X_train,y_train,X_test,y_test):
    from lightgbm import LGBMClassifier
    classifier=LGBMClassifier(verbose=-1)
    classifier.fit(X_train,y_train)
    Accuracy,report,cm,f1=cm_prediction(classifier,X_test,y_test)
    return classifier,Accuracy,report,cm,f1 
    

In [39]:
acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]
accxg=[]
accada=[]
acclgbm=[]

for n in range(1,13):
    selectk_features,selected_feature_names=selectkbest(indep_X,dep_Y,n)
    X_train, X_test, y_train, y_test = train_test_split(selectk_features, dep_Y, test_size=0.25, random_state=0)
    classifier,Accuracy,report,cm,f1=logistic(X_train,y_train,X_test,y_test)
    acclog.append(Accuracy)
    
    classifier,Accuracy,report,cm,f1=svm_linear(X_train,y_train,X_test,y_test)  
    accsvml.append(Accuracy)
        
    classifier,Accuracy,report,cm,f1=svm_NL(X_train,y_train,X_test,y_test)  
    accsvmnl.append(Accuracy)
        
    classifier,Accuracy,report,cm,f1=knn(X_train,y_train,X_test,y_test)  
    accknn.append(Accuracy)
        
    classifier,Accuracy,report,cm,f1=Naive(X_train,y_train,X_test,y_test)  
    accnav.append(Accuracy)
        
    classifier,Accuracy,report,cm,f1=Decision(X_train,y_train,X_test,y_test)  
    accdes.append(Accuracy)
        
    classifier,Accuracy,report,cm,f1=random(X_train,y_train,X_test,y_test)  
    accrf.append(Accuracy)
    
    classifier,Accuracy,report,cm,f1=Xgboost(X_train,y_train,X_test,y_test)  
    accxg.append(Accuracy)

    classifier,Accuracy,report,cm,f1=AdaBoost(X_train,y_train,X_test,y_test)  
    accada.append(Accuracy)

    classifier,Accuracy,report,cm,f1=LightGBM(X_train,y_train,X_test,y_test)  
    acclgbm.append(Accuracy)


Selected features: ['thalach']
Selected features: ['thalach', 'ca']
Selected features: ['thalach', 'oldpeak', 'ca']
Selected features: ['age', 'thalach', 'oldpeak', 'ca']
Selected features: ['age', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'cp', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'sex', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'sex', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
Selected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
Selected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
Selected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


In [40]:
def selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf,accxg,accada,acclgbm):
    dataframe=pd.DataFrame(index=[f"n={i}" for i in range(1,13)],columns=['Logistic','SVMl','SVMnl','KNN','Naive','Decision','Random','XgBoost','AdaBoost','LightGBM'])
    for number,idex in enumerate(dataframe.index):
        dataframe.loc[idex,'Logistic']=acclog[number]
        dataframe.loc[idex,'SVMl']=accsvml[number]
        dataframe.loc[idex,'SVMnl']=accsvmnl[number]
        dataframe.loc[idex,'KNN']=accknn[number]
        dataframe.loc[idex,'Naive']=accnav[number]
        dataframe.loc[idex,'Decision']=accdes[number]
        dataframe.loc[idex,'Random']=accrf[number]
        dataframe.loc[idex,'XgBoost']=accxg[number]
        dataframe.loc[idex,'AdaBoost']=accada[number]
        dataframe.loc[idex,'LightGBM']=acclgbm[number]
    return dataframe      

In [43]:
result=selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf,accxg,accada,acclgbm)
highlighted_result=result.style.highlight_max(axis=1, color='lightgreen')
highlighted_result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random,XgBoost,AdaBoost,LightGBM
n=1,0.782609,0.782609,0.76087,0.76087,0.782609,0.630435,0.673913,0.73913,0.782609,0.804348
n=2,0.847826,0.804348,0.782609,0.782609,0.804348,0.717391,0.804348,0.847826,0.76087,0.804348
n=3,0.847826,0.826087,0.782609,0.782609,0.847826,0.695652,0.717391,0.717391,0.73913,0.76087
n=4,0.847826,0.804348,0.782609,0.782609,0.804348,0.76087,0.847826,0.76087,0.717391,0.804348
n=5,0.891304,0.869565,0.782609,0.782609,0.869565,0.782609,0.782609,0.847826,0.826087,0.891304
n=6,0.891304,0.891304,0.782609,0.782609,0.891304,0.76087,0.847826,0.847826,0.826087,0.847826
n=7,0.891304,0.891304,0.782609,0.76087,0.913043,0.782609,0.847826,0.782609,0.826087,0.869565
n=8,0.934783,0.913043,0.782609,0.76087,0.913043,0.826087,0.826087,0.826087,0.913043,0.891304
n=9,0.913043,0.913043,0.782609,0.76087,0.913043,0.847826,0.826087,0.826087,0.913043,0.869565
n=10,0.934783,0.913043,0.782609,0.73913,0.891304,0.847826,0.847826,0.869565,0.913043,0.869565


In [45]:
f1log=[]
f1svml=[]
f1svmnl=[]
f1knn=[]
f1nav=[]
f1des=[]
f1rf=[]
f1xg=[]
f1ada=[]
f1lgbm=[]
for n in range(1,13):
    selectk_features,selected_feature_names=selectkbest(indep_X,dep_Y,n)
    X_train, X_test, y_train, y_test = train_test_split(selectk_features, dep_Y, test_size=0.25, random_state=0)
    classifier,Accuracy,report,cm,f1=logistic(X_train,y_train,X_test,y_test)
    f1log.append(f1)
    
    classifier,Accuracy,report,cm,f1=svm_linear(X_train,y_train,X_test,y_test)  
    f1svml.append(f1)
        
    classifier,Accuracy,report,cm,f1=svm_NL(X_train,y_train,X_test,y_test)  
    f1svmnl.append(f1)
        
    classifier,Accuracy,report,cm,f1=knn(X_train,y_train,X_test,y_test)  
    f1knn.append(f1)
        
    classifier,Accuracy,report,cm,f1=Naive(X_train,y_train,X_test,y_test)  
    f1nav.append(f1)
        
    classifier,Accuracy,report,cm,f1=Decision(X_train,y_train,X_test,y_test)  
    f1des.append(f1)
        
    classifier,Accuracy,report,cm,f1=random(X_train,y_train,X_test,y_test)  
    f1rf.append(f1)
    
    classifier,Accuracy,report,cm,f1=Xgboost(X_train,y_train,X_test,y_test)  
    f1xg.append(f1)

    classifier,Accuracy,report,cm,f1=AdaBoost(X_train,y_train,X_test,y_test)  
    f1ada.append(f1)

    classifier,Accuracy,report,cm,f1=LightGBM(X_train,y_train,X_test,y_test)  
    f1lgbm.append(f1)




Selected features: ['thalach']
Selected features: ['thalach', 'ca']
Selected features: ['thalach', 'oldpeak', 'ca']
Selected features: ['age', 'thalach', 'oldpeak', 'ca']
Selected features: ['age', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'cp', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'sex', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'ca']
Selected features: ['age', 'sex', 'cp', 'chol', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
Selected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
Selected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
Selected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


In [47]:
def selectk_Classification(f1log,f1svml,f1svmnl,f1knn,f1nav,f1des,f1rf,f1xg,f1ada,f1lgbm):
    dataframe=pd.DataFrame(index=[f"n={i}" for i in range(1,13)],columns=['Logistic','SVMl','SVMnl','KNN','Naive','Decision','Random','XgBoost','AdaBoost','LightGBM'])
    for number,idex in enumerate(dataframe.index):
        dataframe.loc[idex,'Logistic']=f1log[number]
        dataframe.loc[idex,'SVMl']=f1svml[number]
        dataframe.loc[idex,'SVMnl']=f1svmnl[number]
        dataframe.loc[idex,'KNN']=f1knn[number]
        dataframe.loc[idex,'Naive']=f1nav[number]
        dataframe.loc[idex,'Decision']=f1des[number]
        dataframe.loc[idex,'Random']=f1rf[number]
        dataframe.loc[idex,'XgBoost']=f1xg[number]
        dataframe.loc[idex,'AdaBoost']=f1ada[number]
        dataframe.loc[idex,'LightGBM']=f1lgbm[number]
    return dataframe     

In [49]:
result=selectk_Classification(f1log,f1svml,f1svmnl,f1knn,f1nav,f1des,f1rf,f1xg,f1ada,f1lgbm)
highlighted_result=result.style.highlight_max(axis=1, color='lightgreen')
highlighted_result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random,XgBoost,AdaBoost,LightGBM
n=1,0.861111,0.861111,0.860759,0.853333,0.861111,0.738462,0.776119,0.837838,0.875,0.876712
n=2,0.90411,0.883117,0.878049,0.868421,0.873239,0.816901,0.876712,0.906667,0.84507,0.876712
n=3,0.906667,0.891892,0.878049,0.868421,0.901408,0.8,0.811594,0.816901,0.828571,0.849315
n=4,0.906667,0.876712,0.878049,0.868421,0.873239,0.849315,0.901408,0.849315,0.826667,0.873239
n=5,0.933333,0.918919,0.878049,0.868421,0.914286,0.861111,0.861111,0.906667,0.891892,0.931507
n=6,0.931507,0.933333,0.878049,0.868421,0.927536,0.849315,0.90411,0.90411,0.891892,0.906667
n=7,0.931507,0.933333,0.878049,0.853333,0.942857,0.857143,0.901408,0.868421,0.897436,0.921053
n=8,0.958904,0.944444,0.878049,0.853333,0.944444,0.882353,0.888889,0.891892,0.945946,0.933333
n=9,0.944444,0.945946,0.878049,0.853333,0.944444,0.898551,0.891892,0.891892,0.945946,0.921053
n=10,0.958904,0.945946,0.878049,0.842105,0.929577,0.898551,0.906667,0.921053,0.945946,0.921053
