## RFE (Recursive Feature Elimination) - Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle
import matplotlib.pyplot as plt

In [2]:
def RFEfeatures(indep_x,dep_y,n):
    rfelist = []
    
    log_model = LinearRegression()  
    svc_model = SVC(kernel='linear',random_state = 0)
    dc_model = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_features='sqrt', random_state = 0)
    rf_model = RandomForestRegressor(n_estimators = 10, criterion = 'absolute_error', max_features = 'sqrt')
    
    rfemodellist = [log_model,svc_model,dc_model,rf_model]
    for i in rfemodellist:
        rfe = RFE(estimator = i, n_features_to_select = n)
        rfe.fit(indep_x,dep_y)
        rfe_features = rfe.transform(indep_x)
        rfelist.append(rfe_features)
    return rfelist

def split_scalar(indep_x,dep_y):
    x_train,x_test,y_train,y_test = train_test_split(indep_x, dep_y, test_size=0.2, random_state=0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def R2_prediction(regressor,x_test,y_test):
    y_pred = regressor.predict(x_test)
    
    from sklearn.metrics import r2_score
    R2_score = r2_score(y_test,y_pred)
    return R2_score

def linear(x_train,y_train,x_test):
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(x_train,y_train)
    R2_score = R2_prediction(regressor,x_test,y_test)
    return regressor,R2_score,x_test,y_test

def svm_linear(x_train,y_train,x_test):
    from sklearn.svm import SVC
    regressor = SVC(kernel='linear',random_state=0)
    regressor.fit(x_train,y_train)
    R2_score = R2_prediction(regressor,x_test,y_test)
    return regressor,R2_score,x_test,y_test

def svm_nonlinear(x_train,y_train,x_test):
    from sklearn.svm import SVC
    regressor = SVC(kernel = 'rbf', random_state=0)
    regressor.fit(x_train,y_train)
    R2_score = R2_prediction(regressor,x_test,y_test)
    return regressor,R2_score,x_test,y_test

def decision(x_train,y_train,x_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_features='sqrt', random_state = 0)
    regressor.fit(x_train,y_train)
    R2_score = R2_prediction(regressor,x_test,y_test)
    return regressor,R2_score,x_test,y_test

def random(x_train,y_train,x_test):
    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators = 10, criterion = 'absolute_error', max_features = 'sqrt', random_state=0)
    regressor.fit(x_train,y_train)
    R2_score = R2_prediction(regressor,x_test,y_test)
    return regressor,R2_score,x_test,y_test

def rfe_regression(r2log,r2svml,r2svmnl,r2dc,r2rf):
    
    rfedataframe = pd.DataFrame(index=['Logistic','SVC','Decision','Random'], columns = ['Logistic','SVM_L','SVM_NL','Decision','Random'])
    for index in rfedataframe.index:
        rfedataframe['Logistic'][index] = r2log
        rfedataframe['SVM_L'][index] = r2svml 
        rfedataframe['SVM_NL'][index] = r2svmnl
        rfedataframe['Decision'][index] = r2dc 
        rfedataframe['Random'][index] = r2rf
    return rfedataframe

In [3]:
raw_dataset = pd.read_csv('CKD.csv',index_col=None)
raw_dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [4]:
df = raw_dataset

In [5]:
df = pd.get_dummies(df, dtype=int, drop_first=True)
df

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [6]:
df['classification_yes'].value_counts()

classification_yes
1    249
0    150
Name: count, dtype: int64

In [7]:
indep_x = df.drop('classification_yes',axis=1)
dep_y = df['classification_yes']

In [20]:
rfelist = RFEfeatures(indep_x,dep_y,8)

r2log = []
r2svml = []
r2svmnl = []
r2dc = []
r2rf = []

In [21]:
for i in rfelist:
    x_train,x_test,y_train,y_test = split_scalar(i,dep_y)
    
    regressor,R2_score,x_test,y_test = linear(x_train,y_train,x_test)
    r2log.append(R2_score)
    
    regressor,R2_score,x_test,y_test = svm_linear(x_train,y_train,x_test)
    r2svml.append(R2_score)
    
    regressor,R2_score,x_test,y_test = svm_nonlinear(x_train,y_train,x_test)
    r2svmnl.append(R2_score)
    
    regressor,R2_score,x_test,y_test = decision(x_train,y_train,x_test)
    r2dc.append(R2_score)
    
    regressor,R2_score,x_test,y_test = random(x_train,y_train,x_test)
    r2rf.append(R2_score)
    
result = rfe_regression(r2log[0],r2svml[0],r2svmnl[0],r2dc[0],r2rf[0])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  rfedataframe['Logistic'][index] = r2log
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or

In [13]:
result
#5


Unnamed: 0,Logistic,SVM_L,SVM_NL,Decision,Random
Logistic,0.651926,0.791667,0.791667,0.797934,0.735417
SVC,0.651926,0.791667,0.791667,0.797934,0.735417
Decision,0.651926,0.791667,0.791667,0.797934,0.735417
Random,0.651926,0.791667,0.791667,0.797934,0.735417


In [16]:
result
#6

Unnamed: 0,Logistic,SVM_L,SVM_NL,Decision,Random
Logistic,0.656725,0.84375,0.84375,0.808061,0.822917
SVC,0.656725,0.84375,0.84375,0.808061,0.822917
Decision,0.656725,0.84375,0.84375,0.808061,0.822917
Random,0.656725,0.84375,0.84375,0.808061,0.822917


In [19]:
result
#7

Unnamed: 0,Logistic,SVM_L,SVM_NL,Decision,Random
Logistic,0.65796,0.84375,0.84375,0.856963,0.848958
SVC,0.65796,0.84375,0.84375,0.856963,0.848958
Decision,0.65796,0.84375,0.84375,0.856963,0.848958
Random,0.65796,0.84375,0.84375,0.856963,0.848958


In [22]:
result
#8

Unnamed: 0,Logistic,SVM_L,SVM_NL,Decision,Random
Logistic,0.746682,0.895833,0.895833,0.868164,0.933594
SVC,0.746682,0.895833,0.895833,0.868164,0.933594
Decision,0.746682,0.895833,0.895833,0.868164,0.933594
Random,0.746682,0.895833,0.895833,0.868164,0.933594
