## 2-2. Feature Selection Using Wrapper Method

### A. Import Python libraries

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
from feature_selection import filter_method as ft

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_classif
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score, mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import csv

import warnings

warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", FutureWarning, )
%matplotlib inline

### B. Set City Name and Import City Data

In [2]:
# set city name
city_data = "1_goyang_city.xlsx"
CITY_NAME_Eng = "GoYang-City"
results = {}

In [3]:
# Read excel file using pandas
df = pd.read_excel(open(f"../../data/{city_data}", 'rb'), sheet_name="training", header=4, index_col=0)
# Remove unnecessary columns for this analysis
df = df.iloc[2:]
# Change Date Format and Set Date as index
df.index = pd.to_datetime(df.index.str.strip(), format='%Y-%m')
df.index.name = "date"
# Change data format from "Object" to "Float"
df["water_supply"] = df.water_supply.astype(float)
df["Total_Population"] = df.Total_Population.astype(float)
# Delete unnecessary columns 
df.drop(columns=df.columns[19:21], inplace=True)
df.drop(columns=df.columns[22:23], inplace=True)
# Select clean data
df = df.loc["2010-01-01":]
df

Unnamed: 0_level_0,water_supply,Total_Population,Households,Population_per_Households,Male_Population,Female_Population,Male_Female_Ratio,Population_aging_Ratio,Power_usage,Num_of_Business,...,personal_expense,benefits_vs_personal_expense,employment_ratio,employment_insurance_ratio,Average_Temp,Monthly_Rainfall,Average_Relative_Humadity,Ground_Temp,Average_Wind,Average_Pressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,282265.709677,939497.0,353741.0,2.655889,463878.0,475619.0,0.975314,0.088,434436000.0,20326.0,...,104371555.0,0.191,99001.0,0.105377,-4.5,29.3,0.65,-1.2,2.3,1013.6
2010-02-01,273685.892857,940639.0,354266.0,2.655177,464518.0,476121.0,0.975630,0.088,418156000.0,20685.0,...,104371555.0,0.191,99893.0,0.106197,1.4,55.3,0.59,1.4,2.4,1010.7
2010-03-01,269918.193548,940982.0,354003.0,2.658119,464740.0,476242.0,0.975848,0.088,346653000.0,20809.0,...,104371555.0,0.191,99678.0,0.105930,4.3,82.5,0.59,5.0,2.9,1009.6
2010-04-01,274462.700000,941737.0,354192.0,2.658832,465148.0,476589.0,0.975994,0.088,356701000.0,21857.0,...,104371555.0,0.191,101594.0,0.107879,9.5,62.8,0.54,10.8,2.9,1007.4
2010-05-01,288537.806452,941724.0,354157.0,2.659058,465110.0,476614.0,0.975863,0.089,313793000.0,21739.0,...,104371555.0,0.191,102073.0,0.108390,17.2,124.0,0.62,18.7,2.6,1000.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-01,354382.451613,1080896.0,454793.0,2.376677,528972.0,551924.0,0.958415,0.140,487867544.0,46416.0,...,218316665.0,0.293,171845.0,0.158984,25.9,211.2,0.74,28.2,2.1,998.6
2021-09-01,343537.800000,1080787.0,455501.0,2.372743,528911.0,551876.0,0.958387,0.141,401433572.0,45514.0,...,218316665.0,0.293,172771.0,0.159857,22.6,131.0,0.71,24.6,2.3,1003.4
2021-10-01,340126.806452,1080240.0,455845.0,2.369753,528683.0,551557.0,0.958528,0.142,358286760.0,45839.0,...,218316665.0,0.293,173486.0,0.160599,15.6,57.0,0.70,16.7,2.1,1011.0
2021-11-01,335109.300000,1079722.0,456376.0,2.365861,528390.0,551332.0,0.958388,0.142,372991744.0,46076.0,...,218316665.0,0.293,173831.0,0.160996,8.2,62.4,0.68,7.9,2.1,1009.1


### C. Wrapper Methods

* Split Data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=['water_supply'], axis=1),
    df['water_supply'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((115, 24), (29, 24))

#### 1) Step forward feature selection

#### - find correlated features

In [5]:
# find and remove correlated features

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', set(corr_features) )
print('correlated features: ', len(set(corr_features)) )

correlated features:  {'Population_per_Households', 'personal_expense', 'Male_Population', 'employment_insurance_ratio', 'Num_of_Business', 'Population_aging_Ratio', 'Average_Pressure', 'annual_household_income', 'Male_Female_Ratio', 'Female_Population', 'Households', 'benefits_vs_personal_expense', 'employment_ratio', 'Ground_Temp', 'High_School_Graduate_num'}
correlated features:  15


#### - removed correlated features

In [6]:
# removed correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

X_train.shape, X_test.shape

((115, 9), (29, 9))

In [7]:
X_train.columns

Index(['Total_Population', 'Power_usage', 'Business_above_100', 'complex_area',
       'High_School_Graduate_ratio', 'Average_Temp', 'Monthly_Rainfall',
       'Average_Relative_Humadity', 'Average_Wind'],
      dtype='object')

#### - Step Forward Feature Selection: 5개

In [8]:
# step forward feature selection

sfs = SFS(RandomForestRegressor(n_estimators=200, n_jobs=4, random_state=10), 
           k_features=X_train.shape[1], 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='r2',
           cv=2)

sfs = sfs.fit(X_train, y_train)


[2023-09-01 09:42:44] Features: 1/9 -- score: 0.6093864911312856
[2023-09-01 09:42:50] Features: 2/9 -- score: 0.8976577547463245
[2023-09-01 09:42:54] Features: 3/9 -- score: 0.8977105657262625
[2023-09-01 09:42:58] Features: 4/9 -- score: 0.8956563925689062
[2023-09-01 09:43:01] Features: 5/9 -- score: 0.8937401765163095
[2023-09-01 09:43:03] Features: 6/9 -- score: 0.8889457950665651
[2023-09-01 09:43:05] Features: 7/9 -- score: 0.885237851497253
[2023-09-01 09:43:06] Features: 8/9 -- score: 0.8754718338106604
[2023-09-01 09:43:07] Features: 9/9 -- score: 0.8718911876063906

From the logs above, we see that after ~9 features, adding more features does not really improve performance.

In [9]:
# indices of the selected columns
sfs.k_feature_idx_

(0, 1, 2, 3, 4, 5, 6, 7, 8)

In [10]:
# selected columns
results = {"step forward feature selection": X_train.columns[list(sfs.k_feature_idx_)].values}

X_train.columns[list(sfs.k_feature_idx_)]

Index(['Total_Population', 'Power_usage', 'Business_above_100', 'complex_area',
       'High_School_Graduate_ratio', 'Average_Temp', 'Monthly_Rainfall',
       'Average_Relative_Humadity', 'Average_Wind'],
      dtype='object')

#### - Compare performance of feature subsets

In [11]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict(X_train)
    print('Random Forests R2: {}'.format(r2_score(y_train, pred)))
    
    print('Test set')
    pred = rf.predict(X_test)
    print('Random Forests R2: {}'.format(r2_score(y_test, pred)))

In [12]:
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['Total_Population', 'Power_usage', 'Business_above_100', 'complex_area',
       'High_School_Graduate_ratio', 'Average_Temp', 'Monthly_Rainfall',
       'Average_Relative_Humadity', 'Average_Wind'],
      dtype='object')

In [13]:
# evaluate performance of algorithm built
# using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests R2: 0.9697862769022803
Test set
Random Forests R2: 0.9273250474798512


In [14]:
# and for comparison, we train random forests using
# all features (except the correlated ones, which we removed already)

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests R2: 0.9697862769022803
Test set
Random Forests R2: 0.9273250474798512


#### 2) Step backward feature selection


#### - Step Backward Feature Selection

In [15]:
# step backward feature selection algorithm

sfs = SFS(RandomForestRegressor(n_estimators=10, n_jobs=4, random_state=10), 
           k_features=X_train.shape[1], 
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='r2',
           cv=2)

sfs = sfs.fit(X_train, y_train)

In [16]:
sfs.k_feature_names_

('Total_Population',
 'Power_usage',
 'Business_above_100',
 'complex_area',
 'High_School_Graduate_ratio',
 'Average_Temp',
 'Monthly_Rainfall',
 'Average_Relative_Humadity',
 'Average_Wind')

#### - Compare performance of feature subsets

In [17]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict(X_train)
    print('Random Forests R2: {}'.format(r2_score(y_train, pred)))
    
    print('Test set')
    pred = rf.predict(X_test)
    print('Random Forests R2: {}'.format(r2_score(y_test, pred)))

In [18]:
selected_feat= list(sfs.k_feature_names_)
results["step backward feature selection"] = selected_feat

selected_feat

['Total_Population',
 'Power_usage',
 'Business_above_100',
 'complex_area',
 'High_School_Graduate_ratio',
 'Average_Temp',
 'Monthly_Rainfall',
 'Average_Relative_Humadity',
 'Average_Wind']

In [19]:
# evaluate performance of algorithm built
# using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests R2: 0.9697862769022803
Test set
Random Forests R2: 0.9273250474798512


In [20]:
# and for comparison, we train random forests using
# all features (except the correlated ones, which we removed already)

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests R2: 0.9697862769022803
Test set
Random Forests R2: 0.9273250474798512


#### 3)  Exhaustive Feature Selection

In [21]:
# exhaustive search

# in order to shorter search time for the demonstration
# i will ask the algorithm to try all possible 10 and 11
# feature combinations

# if you have access to a multicore or distributed computer
# system you can try more greedy searches

efs = EFS(RandomForestRegressor(n_estimators=200,
                                n_jobs=4,
                                random_state=0,
                                max_depth=4),
          min_features=X_train.shape[1],
          max_features=X_train.shape[1],
          scoring='r2',
          print_progress=True,
          cv=2)

efs = efs.fit(X_train, y_train)

Features: 1/1

In [22]:
X_train.columns[list(efs.best_idx_)]

Index(['Total_Population', 'Power_usage', 'Business_above_100', 'complex_area',
       'High_School_Graduate_ratio', 'Average_Temp', 'Monthly_Rainfall',
       'Average_Relative_Humadity', 'Average_Wind'],
      dtype='object')

#### - Compare performance of feature subsets

In [23]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict(X_train)
    print('Random Forests R2: {}'.format(r2_score(y_train, pred)))
    
    print('Test set')
    pred = rf.predict(X_test)
    print('Random Forests R2: {}'.format(r2_score(y_test, pred)))

In [24]:
selected_feat = X_train.columns[list(efs.best_idx_)]

results["Exhaustive Feature Selection"] = selected_feat.values

selected_feat

Index(['Total_Population', 'Power_usage', 'Business_above_100', 'complex_area',
       'High_School_Graduate_ratio', 'Average_Temp', 'Monthly_Rainfall',
       'Average_Relative_Humadity', 'Average_Wind'],
      dtype='object')

In [25]:
results_f = pd.DataFrame()
results_f["step forward feature selection"] = [list(results["step forward feature selection"])]
results_f["step backward feature selection"] = [list(results["step backward feature selection"])]
results_f["Exhaustive Feature Selection"] = [list(results["Exhaustive Feature Selection"])]
results_f.to_csv(f'./results/{CITY_NAME_Eng}_wrapper_results.csv')
results_f

Unnamed: 0,step forward feature selection,step backward feature selection,Exhaustive Feature Selection
0,"[Total_Population, Power_usage, Business_above...","[Total_Population, Power_usage, Business_above...","[Total_Population, Power_usage, Business_above..."


In [26]:
# evaluate performance of algorithm built
# using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests R2: 0.9697862769022803
Test set
Random Forests R2: 0.9273250474798512


In [27]:
# and for comparison, we train random forests using
# all features (except the correlated ones, which we removed already)

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests R2: 0.9697862769022803
Test set
Random Forests R2: 0.9273250474798512
