# Exhaustive Search

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [None]:
# Load both inputs: only the first 10,000 rows of dataset_2.csv, and all of housing.csv.
dataset1 = pd.read_csv(
    "dataset_2.csv",
    nrows=10000,
)
dataset2 = pd.read_csv("housing.csv")

## Separate into train and test sets

In [None]:
# Restrict the second dataset to its int64/float64 columns
numerics = ['int64', 'float64']
numerical_vars = dataset2.select_dtypes(include=numerics).columns.tolist()
dataset2 = dataset2[numerical_vars]

In [None]:
# Split each dataset into a feature matrix and a target vector.
fs1 = dataset1.drop(columns=['target'])     # dataset1 features
fs2 = dataset2.drop(columns=['SalePrice'])  # dataset2 features

dv1 = dataset1['target']     # dataset1 target
dv2 = dataset2['SalePrice']  # dataset2 target

# Hold out 30% of each dataset for testing; random_state=0 fixes the shuffle
# so the same rows land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    fs1, dv1, test_size=0.3, random_state=0
)
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    fs2, dv2, test_size=0.3, random_state=0
)

In [None]:
# Replace missing values with 0 in the second dataset's feature splits.
# The result is rebound instead of using inplace=True: x_train2 / x_test2 were
# derived from dataset2 by train_test_split, and mutating such derived frames
# in place is what can trigger pandas' SettingWithCopyWarning.
x_train2 = x_train2.fillna(0)
x_test2 = x_test2.fillna(0)

## Exhaustive Feature Selection

In [None]:
# Exhaustive feature selection for the classification task: every subset of
# 1 to 2 features is scored by 2-fold cross-validated accuracy.
efs = EFS(
    estimator=RandomForestClassifier(
        n_estimators=5, n_jobs=4, random_state=0, max_depth=2
    ),
    min_features=1,
    max_features=2,
    scoring='accuracy',
    print_progress=False,
    cv=2,
)

# Run the search on x_train / y_train
efs = efs.fit(x_train, y_train)

In [None]:
# Column positions of the best-scoring feature subset found by the selector
efs.best_idx_

In [None]:
# Translate the selected column positions into the matching column names of x_train
selected_feature = x_train.columns[list(efs.best_idx_)]

In [None]:
# Display the names of the selected features
selected_feature

In [None]:
# Exhaustive feature selection for the regression task: every subset of
# 1 to 3 features is scored by 2-fold cross-validated R^2, with progress printed.
efs = EFS(
    estimator=RandomForestRegressor(
        n_estimators=5, n_jobs=4, random_state=0, max_depth=2
    ),
    min_features=1,
    max_features=3,
    scoring='r2',
    print_progress=True,
    cv=2,
)

# Run the search on x_train2 / y_train2
efs = efs.fit(x_train2, y_train2)

In [None]:
# Column positions of the best-scoring feature subset found by the selector
efs.best_idx_

In [None]:
# Column names of x_train2 at the selected positions
x_train2.columns[list(efs.best_idx_)]