In [1]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
dataset=pd.read_csv('prep.csv')
dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [3]:
independent=dataset.drop('classification_yes',axis=1)
dependent=dataset['classification_yes']

In [4]:
def RecursiveFeatureElimination(independent,dependent,n):
    rfelist=[]
    
    log=LogisticRegression(solver='lbfgs',random_state=0)
    svcl=SVC(kernel='linear',random_state=0)
    dtc=DecisionTreeClassifier(criterion='entropy',max_features='sqrt',splitter='best',random_state=0)
    rfc=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    
    rfemodellist=[log,svcl,dtc,rfc]
    
    from sklearn.feature_selection import RFE
    for estimator in rfemodellist:
        rfe=RFE(estimator,n_features_to_select=n)
        rfe_fit=rfe.fit(independent,dependent)
        rfe_features=rfe_fit.transform(independent)
        rfelist.append(rfe_features)
    return rfelist

In [5]:
def split_scalar(independent,dependent):
    from sklearn.model_selection import train_test_split
    X_train,X_test,Y_train,Y_test=train_test_split(independent,dependent,test_size=0.5,random_state=0)
    
    from sklearn.preprocessing import StandardScaler
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,Y_train,Y_test

In [6]:
def pred_confusion(classifier,X_test,Y_test):
    Y_pred=classifier.predict(X_test)
    
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    
    cm=confusion_matrix(Y_test,Y_pred)
    accuracy=accuracy_score(Y_test,Y_pred)
    classification_report=classification_report(Y_test,Y_pred)
    return cm,accuracy,classification_report

In [7]:
def logistic(X_train,Y_train):
    classifier=LogisticRegression(random_state=0)
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

def svml(X_train,Y_train):
    classifier=SVC(kernel='linear',random_state=0)
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

def svmnl(X_train,Y_train):
    classifier=SVC(kernel='rbf',random_state=0)
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

def decisiontree(X_train,Y_train):
    classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

def randomforest(X_train,Y_train):
    classifier=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

def naive_bayes(X_train,Y_train):
    classifier=GaussianNB()
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

def knn(X_train,Y_train):
    classifier=KNeighborsClassifier(n_neighbors=15,metric='minkowski',p=2)
    classifier.fit(X_train,Y_train)
    
    cm,accuracy,classification_report=pred_confusion(classifier,X_test,Y_test)
    return classifier,cm,accuracy,classification_report

In [8]:
def selectkclassification(acclog,accsvml,accsvmnl,accdt,accrf,accnb,accknn):
    Dataframe=pd.DataFrame(index=['Logistic','SVML','DecisionTree','RandomForest'],
                           columns=['Logistic','SVML','SVMNL','DecisionTree','RandomForest','NaiveBayes','KNN'])

    for number,index in enumerate(Dataframe.index):
        Dataframe['Logistic'][index]=acclog[number]
        Dataframe['SVML'][index]=accsvml[number]
        Dataframe['SVMNL'][index]=accsvmnl[number]
        Dataframe['DecisionTree'][index]=accdt[number]
        Dataframe['RandomForest'][index]=accrf[number]
        Dataframe['NaiveBayes'][index]=accnb[number]
        Dataframe['KNN'][index]=accknn[number]
    return Dataframe

In [9]:
acclog=[]
accsvml=[]
accsvmnl=[]
accdt=[]
accrf=[]
accnb=[]
accknn=[]

In [33]:
rfelist=RecursiveFeatureElimination(independent,dependent,5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [34]:
rfelist

[array([[3., 1., 0., 0., 0.],
        [2., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        ...,
        [3., 1., 0., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0.]]),
 array([[3., 1., 0., 0., 1.],
        [2., 1., 0., 0., 1.],
        [1., 0., 0., 0., 1.],
        ...,
        [3., 1., 0., 1., 0.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]]),
 array([[148.11267606,  57.48210526,   3.07735602,  12.51815562,
           0.        ],
        [148.11267606,  22.        ,   0.7       ,  10.7       ,
           0.        ],
        [ 99.        ,  23.        ,   0.6       ,  12.        ,
           0.        ],
        ...,
        [110.        , 115.        ,   6.        ,   9.1       ,
           0.        ],
        [207.        ,  80.        ,   6.8       ,   8.5       ,
           0.        ],
        [100.        ,  49.        ,   1.        ,  16.3       ,
           0.        ]]),
 array([[  3.        , 148.11267606,   3.07735602,  12.518155

In [29]:
for i in rfelist:
    X_train,X_test,Y_train,Y_test=split_scalar(i,dependent)
    
    classifier,cm,accuracy,classification_report=logistic(X_train,Y_train)
    acclog.append(accuracy)
    
    classifier,cm,accuracy,classification_report=svml(X_train,Y_train)
    accsvml.append(accuracy)
    
    classifier,cm,accuracy,classification_report=svmnl(X_train,Y_train)
    accsvmnl.append(accuracy)
    
    classifier,cm,accuracy,classification_report=decisiontree(X_train,Y_train)
    accdt.append(accuracy)
    
    classifier,cm,accuracy,classification_report=randomforest(X_train,Y_train)
    accrf.append(accuracy)
    
    classifier,cm,accuracy,classification_report=naive_bayes(X_train,Y_train)
    accnb.append(accuracy)
    
    classifier,cm,accuracy,classification_report=knn(X_train,Y_train)
    accknn.append(accuracy)

In [30]:
result=selectkclassification(acclog,accsvml,accsvmnl,accdt,accrf,accnb,accknn)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  Dataframe['Logistic'][index]=acclog[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFram

In [23]:
result #7

Unnamed: 0,Logistic,SVML,SVMNL,DecisionTree,RandomForest,NaiveBayes,KNN
Logistic,0.97,0.98,0.97,0.98,0.98,0.98,0.92
SVML,0.95,0.965,0.95,0.965,0.965,0.945,0.95
DecisionTree,0.97,0.96,0.965,0.96,0.96,0.83,0.93
RandomForest,0.97,0.965,0.975,0.925,0.955,0.91,0.97


In [19]:
result #6

Unnamed: 0,Logistic,SVML,SVMNL,DecisionTree,RandomForest,NaiveBayes,KNN
Logistic,0.97,0.98,0.97,0.98,0.98,0.98,0.92
SVML,0.95,0.965,0.95,0.965,0.965,0.945,0.95
DecisionTree,0.97,0.96,0.965,0.96,0.96,0.83,0.93
RandomForest,0.97,0.965,0.975,0.925,0.955,0.91,0.97


In [15]:
result #5

Unnamed: 0,Logistic,SVML,SVMNL,DecisionTree,RandomForest,NaiveBayes,KNN
Logistic,0.97,0.98,0.97,0.98,0.98,0.98,0.92
SVML,0.95,0.965,0.95,0.965,0.965,0.945,0.95
DecisionTree,0.97,0.96,0.965,0.96,0.96,0.83,0.93
RandomForest,0.97,0.965,0.975,0.925,0.955,0.91,0.97


In [27]:
result #4

Unnamed: 0,Logistic,SVML,SVMNL,DecisionTree,RandomForest,NaiveBayes,KNN
Logistic,0.97,0.98,0.97,0.98,0.98,0.98,0.92
SVML,0.95,0.965,0.95,0.965,0.965,0.945,0.95
DecisionTree,0.97,0.96,0.965,0.96,0.96,0.83,0.93
RandomForest,0.97,0.965,0.975,0.925,0.955,0.91,0.97


In [31]:
result #3

Unnamed: 0,Logistic,SVML,SVMNL,DecisionTree,RandomForest,NaiveBayes,KNN
Logistic,0.97,0.98,0.97,0.98,0.98,0.98,0.92
SVML,0.95,0.965,0.95,0.965,0.965,0.945,0.95
DecisionTree,0.97,0.96,0.965,0.96,0.96,0.83,0.93
RandomForest,0.97,0.965,0.975,0.925,0.955,0.91,0.97


In [None]:
# For all the models, we get the same accuracy. Hence we can choose Logistic model for RFE selection and SVM Linear for model selection with k=3.