## SelectK Algorithm - Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
def selectkbest(indep_x,dep_y,n):
    SK = SelectKBest(score_func = chi2,k=n)
    SKfit = SK.fit(indep_x,dep_y)
    selectK_features = SKfit.transform(indep_x)
    selected_indices = SKfit.get_support(indices=True) 
    selected_features_names = indep_x.columns[selected_indices]
    return selectK_features, selected_features_names.tolist()

def split_scalar(indep_x,dep_y):
    x_train,x_test,y_train,y_test = train_test_split(indep_x, dep_y, test_size=0.2, random_state=0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def cm_prediction(classifier,x_test,y_test):
    y_pred = classifier.predict(x_test)
    
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test,y_pred)
    
    from sklearn.metrics import accuracy_score
    Accuracy = accuracy_score(y_test,y_pred)
    
    from sklearn.metrics import classification_report
    Report = classification_report(y_test,y_pred)

    return classifier,cm,Accuracy,Report,x_test,y_test

def logistic(x_train,y_train,x_test):
    
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 

def svm_linear(x_train,y_train,x_test):
    classifier = SVC(kernel='linear',random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 

def svm_nonlinear(x_train,y_train,x_test):
    classifier = SVC(kernel = 'rbf', random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 

def knn(x_train,y_train,x_test):
    classifier = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski')
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 

def naive(x_train,y_train,x_test):   
    classifier = GaussianNB()
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 
    
def decision(x_train,y_train,x_test):
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 

def random(x_train,y_train,x_test):
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=0)
    classifier.fit(x_train,y_train)
    return cm_prediction(classifier,x_test,y_test) 



In [3]:
def selectk_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdc,accrf):
    
    dataframe = pd.DataFrame(index=['ChiSquare'], columns = ['Logistic','SVM_L','SVM_NL','KNN','Naive','Decision','Random'])
    for index in dataframe.index:
        dataframe['Logistic'][index] = acclog
        dataframe['SVM_L'][index] = accsvml
        dataframe['SVM_NL'][index] = accsvmnl
        dataframe['KNN'][index] = accknn 
        dataframe['Naive'][index] = accnav 
        dataframe['Decision'][index] = accdc
        dataframe['Random'][index] = accrf
    return dataframe

In [4]:
raw_dataset = pd.read_csv("CKD.csv",index_col=None)
raw_dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [5]:
df = raw_dataset

In [6]:
df = pd.get_dummies(df,dtype=int,drop_first=True)

In [7]:
df

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [8]:
df['classification_yes'].value_counts()

classification_yes
1    249
0    150
Name: count, dtype: int64

In [9]:
indep_x = df.drop('classification_yes', axis = 1)
dep_y = df['classification_yes']

In [46]:
kbest, selected_features_names = selectkbest(indep_x,dep_y,7)

acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdc = []
accrf = []

In [47]:
selected_features_names

['al', 'bgr', 'bu', 'sc', 'hrmo', 'pcv', 'wc']

In [48]:
kbest

array([[3.00000000e+00, 1.48112676e+02, 5.74821053e+01, ...,
        1.25181556e+01, 3.88689024e+01, 8.40819113e+03],
       [2.00000000e+00, 1.48112676e+02, 2.20000000e+01, ...,
        1.07000000e+01, 3.40000000e+01, 1.23000000e+04],
       [1.00000000e+00, 9.90000000e+01, 2.30000000e+01, ...,
        1.20000000e+01, 3.40000000e+01, 8.40819113e+03],
       ...,
       [3.00000000e+00, 1.10000000e+02, 1.15000000e+02, ...,
        9.10000000e+00, 2.60000000e+01, 9.20000000e+03],
       [0.00000000e+00, 2.07000000e+02, 8.00000000e+01, ...,
        8.50000000e+00, 3.88689024e+01, 8.40819113e+03],
       [0.00000000e+00, 1.00000000e+02, 4.90000000e+01, ...,
        1.63000000e+01, 5.30000000e+01, 8.50000000e+03]])

In [49]:
kbest.shape

(399, 7)

In [50]:
x_train,x_test,y_train,y_test = split_scalar(kbest,dep_y)

In [51]:
classifier,cm,Accuracy,Report,x_test,y_test = logistic(x_train,y_train,x_test)
acclog.append(Accuracy)

classifier,cm,Accuracy,Report,x_test,y_test = svm_linear(x_train,y_train,x_test)
accsvml.append(Accuracy)

classifier,cm,Accuracy,Report,x_test,y_test = svm_nonlinear(x_train,y_train,x_test)
accsvmnl.append(Accuracy)

classifier,cm,Accuracy,Report,x_test,y_test = knn(x_train,y_train,x_test)
accknn.append(Accuracy)

classifier,cm,Accuracy,Report,x_test,y_test = naive(x_train,y_train,x_test)
accnav.append(Accuracy)

classifier,cm,Accuracy,Report,x_test,y_test = decision(x_train,y_train,x_test)
accdc.append(Accuracy)

classifier,cm,Accuracy,Report,x_test,y_test = random(x_train,y_train,x_test)
accrf.append(Accuracy)

result = selectk_classification(acclog[0],accsvml[0],accsvmnl[0],accknn[0],accnav[0],accdc[0],accrf[0])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic'][index] = acclog
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or S

In [24]:
result
#5

Unnamed: 0,Logistic,SVM_L,SVM_NL,KNN,Naive,Decision,Random
ChiSquare,0.9375,0.9375,0.9375,0.8875,0.8625,0.9375,0.925


In [31]:
result
#3

Unnamed: 0,Logistic,SVM_L,SVM_NL,KNN,Naive,Decision,Random
ChiSquare,0.825,0.825,0.825,0.8,0.8125,0.8625,0.8125


In [38]:
result
#4

Unnamed: 0,Logistic,SVM_L,SVM_NL,KNN,Naive,Decision,Random
ChiSquare,0.8625,0.825,0.8375,0.825,0.8125,0.85,0.9125


In [45]:
result
#6

Unnamed: 0,Logistic,SVM_L,SVM_NL,KNN,Naive,Decision,Random
ChiSquare,0.95,0.9625,0.95,0.9375,0.9125,0.95,0.95


In [52]:
result
#7

Unnamed: 0,Logistic,SVM_L,SVM_NL,KNN,Naive,Decision,Random
ChiSquare,0.975,0.975,0.975,0.975,0.9125,0.9625,0.975


### K = 7 ---> 7 features are the best input for SelectK Algorithm with High Accuracy