In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
    def rfeFeature(indep_X, dep_Y, n):
        rfelist=[]
        selected_columns_dict = {}  # To store the selected features for each model
        
        log_model = LogisticRegression(solver='saga')
        RF=RandomForestClassifier(n_estimators=10, criterion='entropy')
        DT=DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0)
        svc_model=SVC(kernel='linear', random_state=0)
        
        rfemodellist=[log_model, svc_model, RF, DT]
        
        for i in rfemodellist:
            print(f"Applying RFE for {i}")
            log_rfe = RFE(estimator=i, n_features_to_select=n)
            log_fit = log_rfe.fit(indep_X, dep_Y)
            
            # Get the selected features (transformed dataset)
            log_rfe_feature = log_fit.transform(indep_X)
            
            # Get the names of selected columns
            selected_columns = indep_X.columns[log_fit.support_]
            
            # Store the selected features for each model
            selected_columns_dict[str(i)] = selected_columns.tolist()
            
            # Append the transformed features to the list
            rfelist.append(log_rfe_feature)
            
        return rfelist, selected_columns_dict

In [3]:
    def split_scalar(indep_X, dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
        sc=StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        return X_train, X_test, y_train, y_test

In [4]:
    def cm_prediction(classifier, X_test):
        y_pred = classifier.predict(X_test)
        #from sklearn.metrics import confusion_matrix
        cm= confusion_matrix(y_test, y_pred)
        #from sklearn.metrics import accuracy_score
        #from sklearn.metrics import classification_report
        accuracy=accuracy_score(y_test, y_pred)
        report=classification_report(y_test, y_pred)
        return classifier, accuracy, report, X_test, y_test, cm

In [5]:
    def logistic(X_train, y_train, X_test):
        from sklearn.linear_model import LogisticRegression
        classifier=LogisticRegression(random_state=0)
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [6]:
    def svm_linear(X_train, y_train, X_test):
        from sklearn.svm import SVC
        classifier=SVC(kernel='linear', random_state=0)
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [7]:
    def svm_NL(X_train, y_train, X_test):
        from sklearn.svm import SVC
        classifier=SVC(kernel='rbf', random_state=0)
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [8]:
    def Navie(X_train, y_train, X_test):
        from sklearn.naive_bayes import GaussianNB
        classifier=GaussianNB()
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [9]:
    def knn(X_train, y_train, X_test):
        from sklearn.neighbors import KNeighborsClassifier
        classifier=KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [10]:
    def Decision(X_train, y_train, X_test):
        from sklearn.tree import DecisionTreeClassifier
        classifier=DecisionTreeClassifier(criterion='entropy', random_state=0)
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [11]:
    def random(X_train, y_train, X_test):
        from sklearn.ensemble import RandomForestClassifier
        classifier=RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
        classifier.fit(X_train, y_train)
        classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
        return classifier, Accuracy, report, X_test, y_test, cm

In [12]:
    def rfe_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
        rfedataframe = pd.DataFrame(index=['Logistic', 'SVC', 'DecisionTree', 'Random'], 
                                    columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random'])
        for number, value in enumerate(rfedataframe.index):
            rfedataframe.loc[value, 'Logistic'] = acclog[number]
            rfedataframe.loc[value, 'SVMl'] = accsvml[number]
            rfedataframe.loc[value, 'SVMnl'] = accsvmnl[number]
            rfedataframe.loc[value, 'KNN'] = accknn[number]
            rfedataframe.loc[value, 'Navie'] = accnav[number]
            rfedataframe.loc[value, 'Decision'] = accdes[number]
            rfedataframe.loc[value, 'Random'] = accrf[number]
        return rfedataframe

In [13]:
dataset1=pd.read_csv("prep.csv", index_col=None)

In [14]:
df2=dataset1

In [15]:
df2=pd.get_dummies(df2, drop_first=True)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [16]:
indep_X=df2.drop('classification_yes', axis=1)
dep_Y=df2['classification_yes']

In [17]:
rfelist, selected_columns_dict=rfeFeature(indep_X, dep_Y, 3)
rfelist

Applying RFE for LogisticRegression(solver='saga')




Applying RFE for SVC(kernel='linear', random_state=0)
Applying RFE for RandomForestClassifier(criterion='entropy', n_estimators=10)
Applying RFE for DecisionTreeClassifier(max_features='sqrt', random_state=0)


[array([[ 3.07735602, 12.51815562, 38.86890244],
        [ 0.7       , 10.7       , 34.        ],
        [ 0.6       , 12.        , 34.        ],
        ...,
        [ 6.        ,  9.1       , 26.        ],
        [ 6.8       ,  8.5       , 38.86890244],
        [ 1.        , 16.3       , 53.        ]]),
 array([[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        ...,
        [0., 1., 0.],
        [0., 1., 1.],
        [0., 0., 1.]]),
 array([[ 3.07735602, 12.51815562, 38.86890244],
        [ 0.7       , 10.7       , 34.        ],
        [ 0.6       , 12.        , 34.        ],
        ...,
        [ 6.        ,  9.1       , 26.        ],
        [ 6.8       ,  8.5       , 38.86890244],
        [ 1.        , 16.3       , 53.        ]]),
 array([[ 3.07735602, 38.86890244,  0.        ],
        [ 0.7       , 34.        ,  0.        ],
        [ 0.6       , 34.        ,  0.        ],
        ...,
        [ 6.        , 26.        ,  0.        ],
        [ 6.8       , 38.8

In [18]:
for model, selected_columns in selected_columns_dict.items():
    print(f"Selected features for {model}: {selected_columns}")

Selected features for LogisticRegression(solver='saga'): ['sc', 'hrmo', 'pcv']
Selected features for SVC(kernel='linear', random_state=0): ['sg_d', 'dm_yes', 'appet_yes']
Selected features for RandomForestClassifier(criterion='entropy', n_estimators=10): ['sc', 'hrmo', 'pcv']
Selected features for DecisionTreeClassifier(max_features='sqrt', random_state=0): ['sc', 'pcv', 'sg_d']


In [19]:
acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

In [20]:
for i in rfelist:
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)

    classifier, Accuracy, report, X_test, y_test, cm = logistic(X_train, y_train, X_test)
    acclog.append(Accuracy)
    
    classifier, Accuracy, report, X_test, y_test, cm = svm_linear(X_train, y_train, X_test)
    accsvml.append(Accuracy)

    classifier, Accuracy, report, X_test, y_test, cm = svm_NL(X_train, y_train, X_test)
    accsvmnl.append(Accuracy)

    classifier, Accuracy, report, X_test, y_test, cm = knn(X_train, y_train, X_test)
    accknn.append(Accuracy)

    classifier, Accuracy, report, X_test, y_test, cm = Navie(X_train, y_train, X_test)
    accnav.append(Accuracy)

    classifier, Accuracy, report, X_test, y_test, cm = Decision(X_train, y_train, X_test)
    accdes.append(Accuracy)

    classifier, Accuracy, report, X_test, y_test, cm = random(X_train, y_train, X_test)
    accrf.append(Accuracy)

result = rfe_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

In [21]:
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Logistic,0.94,0.94,0.94,0.94,0.9,0.91,0.92
SVC,0.87,0.87,0.87,0.87,0.87,0.87,0.87
DecisionTree,0.94,0.94,0.94,0.94,0.9,0.91,0.92
Random,0.93,0.93,0.94,0.95,0.74,0.95,0.97
