In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np 
import time
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier   
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


In [2]:
def rfeFeature(indep_x,dep_y,n):
    rfelist=[]
    
    log_model=LogisticRegression(solver='lbfgs')
    
    RF=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    #NB=GaussianNB()
    
    DT=DecisionTreeClassifier(criterion='gini',max_features='sqrt',splitter='best',random_state=0)
    svc_model=SVC(kernel="linear",random_state=0)
    
    #knn=KNeighborsClassifier(n_neighbors=5,metrics='minkowski',p=2)
    rfemodellist=[log_model,svc_model,RF,DT]
    
    for i in rfemodellist:
        print(i)
        log_rfe=RFE(estimator=i, n_features_to_select=n)
        log_fit=log_rfe.fit(indep_x,dep_y)
        
        selected_features = indep_x.columns[log_fit.support_]
        print("Selected Features:", selected_features)
        log_rfe_feature=log_fit.transform(indep_x)
        rfelist.append(log_rfe_feature)
    return rfelist           
   

In [3]:
def split_scalar(indep_x,dep_y):
    x_train,x_test,y_train,y_test=train_test_split(indep_x,dep_y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test



In [4]:
def cm_prediction(classifier,x_test):
    y_pred=classifier.predict(x_test)
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(y_test,y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    accuracy=accuracy_score(y_test,y_pred)
    report=classification_report(y_test,y_pred)
    return classifier,accuracy, report,x_test,y_test,cm
 

In [5]:
def Logistic(x_train,y_train,x_test):
    from sklearn.linear_model import LogisticRegression
    classifier= LogisticRegression(random_state=0)
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm


def svm_linear(x_train,y_train,x_test):
    from sklearn.svm import SVC
    classifier= SVC(kernel='linear',random_state=0)
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm

    
def svm_NL(x_train,y_train,x_test):
    from sklearn.svm import SVC
    classifier= SVC(kernel='rbf',random_state=0)
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm 

def knn(x_train,y_train,x_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier= KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm

def Naive(x_train,y_train,x_test):
    from sklearn.naive_bayes import GaussianNB
    classifier= GaussianNB()
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm

def Decision(x_train,y_train,x_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier= DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm

def random(x_train,y_train,x_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    classifier.fit(x_train,y_train)
    classifier,accuracy, report,x_test,y_test,cm=cm_prediction(classifier,x_test)
    return classifier,accuracy, report,x_test,y_test,cm


In [6]:
def rfe_classification(acclog,accsvml,accsnmnl,accknn,accnav,accdes,accrf):
    rfedataframe=pd.DataFrame(index=['Logistic','SVC','Decision Tree','Random'],
                              columns=['Logistic','SVMLIN','SVMNL','KNN','Naive','Decision','Random'])
    
    for number, idex in enumerate(rfedataframe.index):
        rfedataframe['Logistic'][idex]= acclog [number]
        rfedataframe[ 'SVMLIN'][idex]= accsvml [number]
        rfedataframe['SVMNL'][idex]= accsnmnl [number]
        rfedataframe['KNN'][idex]= accknn [number]
        rfedataframe['Naive'][idex]= accnav [number]
        rfedataframe['Decision'][idex]=accdes[number]
        rfedataframe['Random'][idex]=  accrf [number]
    return rfedataframe
    
   




In [7]:
dataset1=pd.read_csv('prep.csv',index_col=None)

In [8]:
df2=dataset1
df2

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [9]:
df2=pd.get_dummies(df2,drop_first=True)
indep_x=df2.drop('classification_yes',1)
dep_y=df2['classification_yes']
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [21]:
rfelist=rfeFeature(indep_x,dep_y,6)

acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Selected Features: Index(['al', 'hrmo', 'sg_c', 'sg_d', 'htn_yes', 'dm_yes'], dtype='object')
SVC(kernel='linear', random_state=0)
Selected Features: Index(['al', 'sg_c', 'sg_d', 'rbc_normal', 'dm_yes', 'appet_yes'], dtype='object')
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
Selected Features: Index(['al', 'bgr', 'sc', 'hrmo', 'pcv', 'rc'], dtype='object')
DecisionTreeClassifier(max_features='sqrt', random_state=0)
Selected Features: Index(['sc', 'hrmo', 'pcv', 'sg_c', 'rbc_normal', 'dm_yes'], dtype='object')


In [22]:
for i in rfelist:
    x_train,x_test,y_train,y_test=split_scalar(i,dep_y)

    classifier, accuracy, report, x_test, y_test, cm = Logistic(x_train, y_train, x_test)
    acclog.append(accuracy)
    classifier,accuracy, report,x_test,y_test,cm=svm_linear(x_train,y_train,x_test)
    accsvml.append(accuracy)
    classifier,accuracy, report,x_test,y_test,cm=svm_NL(x_train,y_train,x_test)
    accsvmnl.append(accuracy)
    classifier,accuracy, report,x_test,y_test,cm=knn(x_train,y_train,x_test)
    accknn.append(accuracy)
    classifier,accuracy, report,x_test,y_test,cm=Naive(x_train,y_train,x_test)
    accnav.append(accuracy)
    classifier,accuracy, report,x_test,y_test,cm=Decision(x_train,y_train,x_test)
    accdes.append(accuracy)
    classifier,accuracy, report,x_test,y_test,cm=random(x_train,y_train,x_test)
    accrf.append(accuracy)

    
result=rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)
    
    

In [20]:
result
#4

Unnamed: 0,Logistic,SVMLIN,SVMNL,KNN,Naive,Decision,Random
Logistic,0.95,0.95,0.95,0.95,0.95,0.95,0.95
SVC,0.96,0.96,0.96,0.96,0.96,0.96,0.96
Decision Tree,0.97,0.97,0.97,0.98,0.87,0.95,0.97
Random,0.98,0.98,0.92,0.98,0.81,0.98,0.98


In [17]:
result
#5

Unnamed: 0,Logistic,SVMLIN,SVMNL,KNN,Naive,Decision,Random
Logistic,0.98,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.99,0.99,0.99,0.99,0.99,0.99,0.99
Decision Tree,0.97,0.97,0.98,0.97,0.91,0.96,0.98
Random,0.95,0.98,0.93,0.94,0.85,0.97,0.98


In [23]:
result
#6

Unnamed: 0,Logistic,SVMLIN,SVMNL,KNN,Naive,Decision,Random
Logistic,0.98,0.98,0.98,0.98,0.98,0.99,0.98
SVC,0.99,0.99,0.99,0.99,0.99,0.99,0.99
Decision Tree,0.98,0.98,0.99,0.96,0.92,0.95,0.98
Random,0.96,0.96,0.97,0.95,0.85,0.97,0.96
