# RFE: another feature selection method

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw data; "CKD.csv" is resolved relative to the working directory.
dataset = pd.read_csv("CKD.csv")

In [3]:
# Dummy-encode the frame (integer indicators, first level of each encoded column
# dropped), then separate the target column from the predictors.
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dependent = dataset["classification_yes"]
independent = dataset.drop(columns="classification_yes")

In [4]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# 1. First, we select the n best features using RFE

In [5]:

def rfefeature(independent,dependent,n):
    """Run RFE once per base estimator and collect the reduced feature sets.

    Three estimators are used, in this order: logistic regression, a
    linear-kernel SVC and a random forest. Each one is wrapped in ``RFE`` with
    ``n_features_to_select=n``, fitted on ``independent``/``dependent`` as
    passed in, and then used to transform ``independent``.

    Returns a list with one transformed feature matrix per estimator, in the
    order given above. Each estimator is printed before it is fitted.
    """
    # RFE ranks features with the wrapped estimator, so different estimators
    # can end up keeping different features for the same n.
    log_model = LogisticRegression(max_iter=1000, solver="liblinear", random_state=0)
    linear_svc = SVC(kernel="linear", random_state=0)
    forest = RandomForestClassifier(n_estimators=3, criterion="entropy", random_state=0)

    selected_sets = []
    for i in (log_model, linear_svc, forest):
        print(i)
        model = RFE(estimator=i, n_features_to_select=n)
        logrfe_fit = model.fit(independent, dependent)
        # transform() keeps only the columns the fitted selector retained.
        selected_sets.append(logrfe_fit.transform(independent))
    return selected_sets

## 1. What happens here: when i = log_model, the best n features are selected. Logistic regression ranks features by their
# weight (coefficient), i.e. how much a feature influences the prediction and improves model performance. The larger the
# magnitude of the weight, the more important the feature.

# 2. The same applies to the linear SVM.

# 3. With RF, the best n features are decided by how much each feature reduces impurity in the decision trees. A feature
# that increases impurity hurts model performance.

# 4. So when RFE(estimator=i, n_features_to_select=n) is fitted, the least important features are removed step by step
#    until only the n most important features are left. estimator and n_features_to_select are the parameters of RFE.

# 5. logrfe_fit=model.fit(independent,dependent): the RFE model is trained on the inputs against the corresponding output,
#    which is the step that selects the best n features.

# 6. logrfe_fit.transform(independent): this keeps only the top n features.

In [30]:
# One reduced feature matrix per RFE base estimator, keeping 3 features each.
rfelist = rfefeature(independent, dependent, n=3)

LogisticRegression(max_iter=1000, random_state=0, solver='liblinear')
SVC(kernel='linear', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=3, random_state=0)


# 2. Once we have selected our best features, we split the data and standardise it

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def split_scaler(independent,dependent):
    """Split the data 75/25 into train/test parts and standardise the features.

    The split uses ``test_size=.25`` and ``random_state=0``. The scaler is
    fitted on the training features only and then applied to both feature
    parts; the targets are returned untouched.

    Returns ``(x_train, x_test, y_train, y_test)``.
    """
    parts = train_test_split(independent, dependent, test_size=.25, random_state=0)
    train_features, test_features, train_target, test_target = parts

    # Fit on the training part only so the test part does not shape the scaling.
    scaler = StandardScaler().fit(train_features)
    return (
        scaler.transform(train_features),
        scaler.transform(test_features),
        train_target,
        test_target,
    )



In [32]:
# One independent, initially empty accuracy list per classifier.
# NOTE(review): only the first four are appended to in the cells shown here;
# the remaining three look reserved for models not present in this notebook.
(
    acclog_reg,
    accsvmlin,
    accsvm_non,
    accRF,
    accDEC_tree,
    accnavie_bayes,
    accknn,
) = ([] for _ in range(7))

In [33]:
# Split and scale every RFE feature matrix in turn. Each pass rebinds the same
# four names, so after the loop they hold the split of the last matrix in rfelist.
for i in rfelist:
    x_train, x_test, y_train, y_test = split_scaler(i, dependent)

# 3. Now we create the models and look at their results

In [34]:
def cm_prediction(model,x_test,y_true=None):
    """Predict on ``x_test`` and score the predictions against the true labels.

    "cm" stands for confusion matrix.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x_test
        Feature matrix handed to ``model.predict``.
    y_true
        True labels belonging to ``x_test``. Optional for backward
        compatibility: when omitted, the module-level ``y_test`` is used,
        which is what this function always did before. Pass it explicitly to
        avoid scoring against a stale global.

    Returns
    -------
    tuple
        ``(cm, report, acc_score)``: the confusion matrix, the classification
        report and the accuracy score.

    Raises
    ------
    NameError
        If ``y_true`` is omitted and no module-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix

    if y_true is None:
        # Legacy behaviour: fall back to the global set by the splitting cell.
        y_true = y_test

    y_pred = model.predict(x_test)
    # The metric functions take the true labels first, then the predictions.
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    acc_score = accuracy_score(y_true, y_pred)
    return cm, report, acc_score

In [35]:
def logistic(x_train,y_train,x_test,y_test):
    """Fit a logistic regression (``random_state=0``) and score it on the test data.

    Returns ``(model, x_test, y_test, cm, report, acc_score)``; ``x_test`` and
    ``y_test`` are handed back unchanged.

    NOTE(review): scoring is delegated to ``cm_prediction(model, x_test)``,
    which is not given ``y_test``; here that parameter is only passed through
    to the return value.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)
    # The shared helper produces the confusion matrix, report and accuracy.
    confusion, summary, accuracy = cm_prediction(classifier, x_test)
    return classifier, x_test, y_test, confusion, summary, accuracy

In [36]:
# Fit logistic regression on the current split and record its accuracy.
(model, x_test, y_test, cm, report, acc_score) = logistic(
    x_train, y_train, x_test, y_test
)
acclog_reg.append(acc_score)
acclog_reg  # bare expression: shown as the cell output

[0.91]

In [37]:
def svmlinear(x_train,y_train,x_test,y_test):
    """Fit a linear-kernel SVC (``random_state=0``) and score it on the test data.

    Returns ``(model, x_test, y_test, cm, report, acc_score)``; ``x_test`` and
    ``y_test`` are handed back unchanged.

    NOTE(review): scoring is delegated to ``cm_prediction(model, x_test)``,
    which is not given ``y_test``; here that parameter is only passed through
    to the return value.
    """
    from sklearn.svm import SVC

    classifier = SVC(kernel="linear", random_state=0)
    classifier.fit(x_train, y_train)
    confusion, summary, accuracy = cm_prediction(classifier, x_test)
    return classifier, x_test, y_test, confusion, summary, accuracy

In [38]:
def svmnonlinear(x_train,y_train,x_test,y_test):
    """Fit an RBF-kernel SVC (``random_state=0``) and score it on the test data.

    Returns ``(model, x_test, y_test, cm, report, acc_score)``; ``x_test`` and
    ``y_test`` are handed back unchanged.

    NOTE(review): scoring is delegated to ``cm_prediction(model, x_test)``,
    which is not given ``y_test``; here that parameter is only passed through
    to the return value.
    """
    from sklearn.svm import SVC

    classifier = SVC(kernel="rbf", random_state=0)
    classifier.fit(x_train, y_train)
    confusion, summary, accuracy = cm_prediction(classifier, x_test)
    return classifier, x_test, y_test, confusion, summary, accuracy

In [39]:
# Fit the RBF-kernel SVC on the current split and record its accuracy.
(model, x_test, y_test, cm, report, acc_score) = svmnonlinear(
    x_train, y_train, x_test, y_test
)
accsvm_non.append(acc_score)
accsvm_non  # bare expression: shown as the cell output

[0.93]

In [40]:
# Fit the linear-kernel SVC on the current split and record its accuracy.
(model, x_test, y_test, cm, report, acc_score) = svmlinear(
    x_train, y_train, x_test, y_test
)
accsvmlin.append(acc_score)
accsvmlin  # bare expression: shown as the cell output

[0.92]

In [41]:
def RF(x_train,y_train,x_test,y_test):
    """Fit a 3-tree, entropy-criterion random forest and score it on the test data.

    The forest uses ``random_state=0``. Returns
    ``(model, x_test, y_test, cm, report, acc_score)``; ``x_test`` and
    ``y_test`` are handed back unchanged.

    NOTE(review): scoring is delegated to ``cm_prediction(model, x_test)``,
    which is not given ``y_test``; here that parameter is only passed through
    to the return value.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(
        n_estimators=3, criterion="entropy", random_state=0
    )
    forest.fit(x_train, y_train)
    confusion, summary, accuracy = cm_prediction(forest, x_test)
    return forest, x_test, y_test, confusion, summary, accuracy

In [42]:
# Fit the random forest on the current split and record its accuracy.
(model, x_test, y_test, cm, report, acc_score) = RF(
    x_train, y_train, x_test, y_test
)
accRF.append(acc_score)
accRF  # bare expression: shown as the cell output
    

[0.94]

In [45]:
def rfe_classification(acclog_reg,accsvmlin,accsvm_non,accRF):
    """Arrange the collected accuracies in a 4x4 table.

    Each argument is a sequence of accuracy scores for one classifier and
    fills the column of the same classifier ("logistic", "svml", "svmnl",
    "RF"). Row ``k`` of the table is taken from position ``k`` of every
    sequence. The row labels are fixed to "logistic", "svml", "svmnl", "RF";
    they are not derived from the inputs, so the caller must supply the
    scores in that order.

    Returns the filled ``pandas.DataFrame``.

    Raises ``IndexError`` if any sequence has fewer than four entries.
    """
    labels = ["logistic", "svml", "svmnl", "RF"]
    dataframe = pd.DataFrame(index=labels, columns=labels)
    scores_by_column = {
        "logistic": acclog_reg,
        "svml": accsvmlin,
        "svmnl": accsvm_non,
        "RF": accRF,
    }
    for number, idex in enumerate(dataframe.index):
        for column, scores in scores_by_column.items():
            # Write through .loc: chained indexing (df[col][row] = value) may
            # assign to a temporary object and leave the frame unchanged.
            dataframe.loc[idex, column] = scores[number]
    return dataframe

In [46]:
# For every RFE feature matrix: split and scale it, train each of the four
# classifiers and append its accuracy to that classifier's list.
# NOTE(review): the accuracy lists are not reset here, so scores appended by the
# earlier single-run cells are still at position 0 and end up in the first table row.
for i in rfelist:
    x_train, x_test, y_train, y_test = split_scaler(i, dependent)
    for trainer, scores in (
        (logistic, acclog_reg),
        (svmlinear, accsvmlin),
        (svmnonlinear, accsvm_non),
        (RF, accRF),
    ):
        model, x_test, y_test, cm, report, acc_score = trainer(
            x_train, y_train, x_test, y_test
        )
        scores.append(acc_score)

result1 = rfe_classification(acclog_reg, accsvmlin, accsvm_non, accRF)

In [29]:
result1# accuracy table kept from a run with n=5 selected features

Unnamed: 0,logistic,svml,svmnl,RF
logistic,0.95,0.98,0.98,0.96
svml,0.98,0.98,0.98,0.98
svmnl,0.98,0.98,0.98,0.98
RF,0.98,0.98,0.98,0.98


In [47]:
result1# accuracy table for the run with n=3 selected features

Unnamed: 0,logistic,svml,svmnl,RF
logistic,0.91,0.92,0.93,0.94
svml,0.94,0.94,0.94,0.94
svmnl,0.87,0.87,0.87,0.94
RF,0.91,0.92,0.93,0.87
