In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
import pickle
from sklearn.model_selection import train_test_split

In [2]:
def featureselect(indepX,depY,n):
    sel=SelectKBest(score_func=chi2,k=n)
    selfit=sel.fit(indepX,depY)
    selkfeat=selfit.transform(indepX)
    selected_indices = selfit.get_support(indices=True)
    feature_names = indepX.columns[selected_indices]
    return selkfeat,feature_names

In [3]:
def split_scaler(indepX,depY):
    x_train,x_test,y_train,y_test=train_test_split(indepX,depY,test_size=0.30,random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

In [4]:
def cmreport(classifier,x_test):
    y_pred=classifier.predict(x_test)
    cm=confusion_matrix(y_test,y_pred)
    accuracy=accuracy_score(y_test,y_pred)
    clr=classification_report(y_test,y_pred)
    return classifier,cm,accuracy,clr,x_test,y_test

In [5]:
def logistic(x_train,y_train,x_test):
    lrclassifier=LogisticRegression(random_state=0)
    lrclassifier.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(lrclassifier,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [6]:
def svmlinear(x_train,y_train,x_test):
    from sklearn.svm import SVC
    svli=SVC(kernel='linear',random_state=0)   
    svli.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(svli,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [7]:
def svmnl(x_train,y_train,x_test):
    from sklearn.svm import SVC
    svnl=SVC(kernel='rbf',random_state=0)
    svnl.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(svnl,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [8]:
def naive(x_train,y_train,x_test):
    from sklearn.naive_bayes import GaussianNB
    GNB=GaussianNB()
    GNB.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(GNB,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [9]:
def Knn(x_train,y_train,x_test):
    from sklearn.neighbors import KNeighborsClassifier
    KNC=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    KNC.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(KNC,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [10]:
def Decision(x_train,y_train,x_test):
    from sklearn.tree import DecisionTreeClassifier
    DTC=DecisionTreeClassifier(criterion='entropy',random_state=0)
    DTC.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(DTC,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [11]:
def Random(x_train,y_train,x_test):
    from sklearn.ensemble import RandomForestClassifier
    RFC=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    RFC.fit(x_train,y_train)
    classifier,cm,accuracy,clr,x_test,y_test=cmreport(RFC,x_test)
    return classifier,cm,accuracy,clr,x_test,y_test

In [12]:
def selectkconfusion(acclog,accsvmli,accsvmnl,accnaive,accKNC,accdes,accran):
    df=pd.DataFrame(index=['chisquare'],columns=['Logistic','SVMLinear','SVMnonlinear','GaussianNB','KNeighbors','Decision','Random'])
    for number,idex in enumerate(df.index):
        df['Logistic'][idex]=acclog[number]
        df['SVMLinear'][idex]=accsvmli[number]
        df['SVMnonlinear'][idex]=accsvmnl[number]
        df['GaussianNB'][idex]=accnaive[number]
        df['KNeighbors'][idex]=accKNC[number]
        df['Decision'][idex]=accdes[number]
        df['Random'][idex]=accran[number]
        return df

In [13]:
dataset=pd.read_csv("prep.csv")

In [14]:
dataset=pd.get_dummies(dataset,drop_first=True)

In [15]:
indepX=dataset.drop('classification_yes',axis=1)
depY=dataset['classification_yes']

In [29]:
kbest,features=featureselect(indepX,depY,5)
print("The columns selected by selectkbestalgorithm:\n")
for col in features:
    print(col)

The columns selected by selectkbestalgorithm:

bgr
bu
sc
pcv
wc


In [23]:
x_train,x_test,y_train,y_test=split_scaler(kbest,depY)

In [18]:
acclog=[]
accsvmli=[]
accsvmnl=[]
accnaive=[]
accKNC=[]
accdes=[]
accran=[]

In [19]:
classifier,cm,accuracy,clr,x_test,y_test=logistic(x_train,y_train,x_test)
acclog.append(accuracy)
classifier,cm,accuracy,clr,x_test,y_test=svmlinear(x_train,y_train,x_test)
accsvmli.append(accuracy)
classifier,cm,accuracy,clr,x_test,y_test=svmnl(x_train,y_train,x_test)
accsvmnl.append(accuracy)
classifier,cm,accuracy,clr,x_test,y_test=naive(x_train,y_train,x_test)
accnaive.append(accuracy)
classifier,cm,accuracy,clr,x_test,y_test=naive(x_train,y_train,x_test)
accKNC.append(accuracy)
classifier,cm,accuracy,clr,x_test,y_test=Decision(x_train,y_train,x_test)
accdes.append(accuracy)
classifier,cm,accuracy,clr,x_test,y_test=Decision(x_train,y_train,x_test)
accran.append(accuracy)

In [20]:
result=selectkconfusion(acclog,accsvmli,accsvmnl,accnaive,accKNC,accdes,accran)
result

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Logistic'][idex]=acclog[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Ser

Unnamed: 0,Logistic,SVMLinear,SVMnonlinear,GaussianNB,KNeighbors,Decision,Random
chisquare,0.941667,0.941667,0.95,0.841667,0.841667,0.958333,0.958333
