In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def selectkbest(indep_X, dep_Y, n):
    # Create a SelectKBest object with chi2 as the scoring function and n as the number of features to select
    test = SelectKBest(score_func=chi2, k=n)
    
    # Fit the SelectKBest object to the data
    fit1 = test.fit(indep_X, dep_Y)
    
    # Transform the data to select the top k features
    selectk_features = fit1.transform(indep_X)
    
    # Get the names of selected features
    selected_columns = indep_X.columns[fit1.get_support()]
    
    return selectk_features, selected_columns

def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test
    
def cm_prediction(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, cm

def logistic(X_train, y_train, X_test, y_test):       
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def svm_linear(X_train, y_train, X_test, y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def svm_NL(X_train, y_train, X_test, y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def Navie(X_train, y_train, X_test, y_test):       
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def knn(X_train, y_train, X_test, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def Decision(X_train, y_train, X_test, y_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def random(X_train, y_train, X_test, y_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, Accuracy, report, cm

def selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf): 
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random'])
    for number, idex in enumerate(dataframe.index):      
        dataframe['Logistic'][idex] = acclog[number]       
        dataframe['SVMl'][idex] = accsvml[number]
        dataframe['SVMnl'][idex] = accsvmnl[number]
        dataframe['KNN'][idex] = accknn[number]
        dataframe['Navie'][idex] = accnav[number]
        dataframe['Decision'][idex] = accdes[number]
        dataframe['Random'][idex] = accrf[number]
    return dataframe

In [2]:
dataset1=pd.read_csv("prep.csv",index_col=None)

df2=dataset1

In [3]:
df2

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [4]:
df2 = pd.get_dummies(df2, drop_first=True)

In [5]:
indep_X = df2.drop('classification_yes', axis=1)
dep_Y = df2['classification_yes']

In [6]:
# Get selected features and their names
selected_features, selected_columns = selectkbest(indep_X, dep_Y, 6)

# Display the selected features shape
print("Selected Features Shape:", selected_features.shape)

# Display the names of the selected columns
print("Selected Feature Columns:")
print(selected_columns)

Selected Features Shape: (399, 6)
Selected Feature Columns:
Index(['al', 'bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object')


In [7]:
kbest=selectkbest(indep_X,dep_Y,6)       

acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

In [8]:
kbest

(array([[3.00000000e+00, 1.48112676e+02, 5.74821053e+01, 3.07735602e+00,
         3.88689024e+01, 8.40819113e+03],
        [2.00000000e+00, 1.48112676e+02, 2.20000000e+01, 7.00000000e-01,
         3.40000000e+01, 1.23000000e+04],
        [1.00000000e+00, 9.90000000e+01, 2.30000000e+01, 6.00000000e-01,
         3.40000000e+01, 8.40819113e+03],
        ...,
        [3.00000000e+00, 1.10000000e+02, 1.15000000e+02, 6.00000000e+00,
         2.60000000e+01, 9.20000000e+03],
        [0.00000000e+00, 2.07000000e+02, 8.00000000e+01, 6.80000000e+00,
         3.88689024e+01, 8.40819113e+03],
        [0.00000000e+00, 1.00000000e+02, 4.90000000e+01, 1.00000000e+00,
         5.30000000e+01, 8.50000000e+03]]),
 Index(['al', 'bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object'))

In [9]:
# Split and scale the dataset
X_train, X_test, y_train, y_test = split_scalar(selected_features, dep_Y)

In [11]:
classifier, Accuracy, report, cm = logistic(X_train, y_train, X_test, y_test)
acclog.append(Accuracy)

classifier, Accuracy, report, cm = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(Accuracy)

classifier, Accuracy, report, cm = svm_NL(X_train, y_train, X_test, y_test)
accsvmnl.append(Accuracy)

classifier, Accuracy, report, cm = knn(X_train, y_train, X_test, y_test)
accknn.append(Accuracy)

classifier, Accuracy, report, cm = Navie(X_train, y_train, X_test, y_test)
accnav.append(Accuracy)

classifier, Accuracy, report, cm = Decision(X_train, y_train, X_test, y_test)
accdes.append(Accuracy)

classifier, Accuracy, report, cm = random(X_train, y_train, X_test, y_test)
accrf.append(Accuracy)

In [13]:
result = selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic'][idex] = acclog[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFra

In [15]:
result
#k=5

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.95,0.96,0.96,0.93,0.89,0.97,0.97


In [17]:
result
#k=3

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.95,0.96,0.96,0.93,0.89,0.97,0.97


In [19]:
result
#k=4

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.95,0.96,0.96,0.93,0.89,0.97,0.97


In [21]:
result
#k=6

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.95,0.96,0.96,0.93,0.89,0.97,0.97
