<a href="https://colab.research.google.com/github/UtharaS123/Feature-Selection/blob/main/feature_selection_optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Selection with Optuna

In [None]:
!pip install -q optuna

## Requirements

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import StratifiedKFold

import optuna
from optuna.samplers import TPESampler

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
class FeatureSelectionOptuna:
    """
    Optuna objective that searches for the best subset of features.

    Each trial draws one True / False flag per feature, trains the model on the
    selected columns for every cross-validation split and returns the average
    validation loss plus an optional penalty proportional to the number of
    features used. Trials that select no feature at all are pruned, since a
    model cannot be fitted on zero columns.

    Parameters:

    - model (object): The predictive model to evaluate; this should be any object that implements fit() and predict() methods.
                      The same object is re-fitted in place for every split of every trial.
    - loss_fn (function): The loss function to use for evaluating the model performance. This function should take the true labels and the
                          predictions as inputs and return a loss value.
    - features (list of str): A list containing the names of all possible features that can be selected for the model.
    - X (DataFrame): The complete set of feature data (pandas DataFrame) from which subsets will be selected for training the model.
    - y (Series): The target variable associated with the X data (pandas Series).
    - splits (iterable of tuples): Each tuple contains two elements, the train indices and the validation indices (positional).
                                   The iterable is materialized into a list so that a one-shot generator can be reused by every trial.
    - penalty (float, optional): A factor used to penalize the objective function based on the number of features used.

    Raises:

    - ValueError: If `splits` is empty.
    """

    def __init__(self,
                 model,
                 loss_fn,
                 features,
                 X,
                 y,
                 splits,
                 penalty=0):

        self.model = model
        self.loss_fn = loss_fn
        self.features = features
        self.X = X
        self.y = y
        # Materialize the splits: a generator would be exhausted after the first trial
        self.splits = list(splits)
        self.penalty = penalty

        if not self.splits:
            raise ValueError("`splits` must contain at least one (train_idx, valid_idx) pair.")

    def __call__(self,
                 trial: "optuna.trial.Trial"):
        """
        Evaluate one trial.

        Returns the loss averaged over all splits plus `penalty` times the number
        of selected features. Raises optuna.TrialPruned when the trial selects
        no feature.
        """

        # Select True / False for each feature
        selected_features = [trial.suggest_categorical(name, [True, False]) for name in self.features]

        # List with names of selected features
        selected_feature_names = [name for name, selected in zip(self.features, selected_features) if selected]

        # Nothing to train on: skip this trial instead of letting model.fit() fail
        if not selected_feature_names:
            raise optuna.TrialPruned()

        # Optional: adds a penalty for the amount of features used
        total_penalty = len(selected_feature_names) * self.penalty

        # Column selection does not depend on the split, so do it only once
        X_selected = self.X[selected_feature_names]

        loss = 0

        for train_idx, valid_idx in self.splits:
            X_train = X_selected.iloc[train_idx]
            y_train = self.y.iloc[train_idx]
            X_valid = X_selected.iloc[valid_idx]
            y_valid = self.y.iloc[valid_idx]

            # Train model, get predictions and accumulate loss
            self.model.fit(X_train, y_train)
            pred = self.model.predict(X_valid)

            loss += self.loss_fn(y_valid, pred)

        # Take the average loss across all splits
        loss /= len(self.splits)

        # Add the penalty to the loss
        return loss + total_penalty

## Data

In [None]:
SEED = 32

# Load data
filename = "train.csv" # train.csv from https://www.kaggle.com/datasets/iabhishekofficial/mobile-price-classification

df = pd.read_csv(filename)

# Train - test split, stratified on the target (last column)
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.iloc[:, -1], random_state=SEED)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# The last column is the target variable; every other column is a feature.
# Slicing with :-1 (instead of a hard-coded column count) keeps features and
# target consistent with each other whatever the number of columns in the file.
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]

# Stratified kfold over the train set for cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
splits = list(skf.split(X_train, y_train))

## All Features

We fit the random forest using all features as a benchmark.

In [None]:
# Baseline: random forest trained on every available feature
model = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)
preds = model.predict(X_test)

# Per-class report followed by the weighted F1 on the held-out test set
weighted_f1 = f1_score(y_test, preds, average='weighted')
print(classification_report(y_test, preds))
print(f"Global F1: {weighted_f1}")

              precision    recall  f1-score   support

           0       0.90      0.93      0.92       100
           1       0.80      0.78      0.79       100
           2       0.83      0.82      0.82       100
           3       0.95      0.95      0.95       100

    accuracy                           0.87       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.87      0.87       400

Global F1: 0.8695638871323328


## Feature Selection

### Optuna

We run an Optuna study for 100 trials and retrieve the best trial.

In [None]:
def loss_fn(y_true, y_pred):
  """
  Loss to minimize: the weighted F1 score with its sign flipped,
  so that a better F1 gives a lower loss.
  """
  return -f1_score(y_true, y_pred, average='weighted')

features = X_train.columns.tolist()

model = RandomForestClassifier(random_state=SEED)

# Seeded TPE sampler; the study minimizes the (negative F1) loss
sampler = TPESampler(seed=SEED)
study = optuna.create_study(direction="minimize", sampler=sampler)

# The first trial evaluates the model with every feature switched on
default_features = dict.fromkeys(features, True)
study.enqueue_trial(default_features)

# Objective: cross-validated loss plus a small per-feature penalty
objective = FeatureSelectionOptuna(
    model=model,
    loss_fn=loss_fn,
    features=features,
    X=X_train,
    y=y_train,
    splits=splits,
    penalty=1e-4,
)
study.optimize(objective, n_trials=100)

[I 2024-05-07 13:27:31,859] A new study created in memory with name: no-name-79f43521-f2a5-448d-8082-43c5155c177f
[I 2024-05-07 13:27:34,317] Trial 0 finished with value: -0.8638106948596054 and parameters: {'battery_power': True, 'blue': True, 'clock_speed': True, 'dual_sim': True, 'fc': True, 'four_g': True, 'int_memory': True, 'm_dep': True, 'mobile_wt': True, 'n_cores': True, 'pc': True, 'px_height': True, 'px_width': True, 'ram': True, 'sc_h': True, 'sc_w': True, 'talk_time': True, 'three_g': True, 'touch_screen': True, 'wifi': True}. Best is trial 0 with value: -0.8638106948596054.
[I 2024-05-07 13:27:36,383] Trial 1 finished with value: -0.28782853702378036 and parameters: {'battery_power': True, 'blue': False, 'clock_speed': False, 'dual_sim': False, 'fc': True, 'four_g': False, 'int_memory': True, 'm_dep': False, 'mobile_wt': True, 'n_cores': False, 'pc': True, 'px_height': True, 'px_width': False, 'ram': False, 'sc_h': False, 'sc_w': False, 'talk_time': False, 'three_g': Fals

In [None]:
# Names of the features flagged True in the best trial
selected_features = [ft for ft, keep in study.best_params.items() if keep]
selected_features

['battery_power',
 'blue',
 'dual_sim',
 'fc',
 'mobile_wt',
 'px_height',
 'px_width',
 'ram',
 'sc_w']

In [None]:
# Retrain the random forest using only the features selected by Optuna

X_train_selected = X_train.loc[:, selected_features].copy()
X_test_selected = X_test.loc[:, selected_features].copy()

model = RandomForestClassifier(random_state=SEED).fit(X_train_selected, y_train)
preds = model.predict(X_test_selected)

# Per-class report followed by the weighted F1 on the held-out test set
weighted_f1 = f1_score(y_test, preds, average='weighted')
print(classification_report(y_test, preds))
print(f"Global F1: {weighted_f1}")

              precision    recall  f1-score   support

           0       0.91      0.94      0.93       100
           1       0.84      0.83      0.83       100
           2       0.84      0.84      0.84       100
           3       0.94      0.92      0.93       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400

Global F1: 0.8823930394871305


### Chi-Squared

Filter method that keeps the 10 features with the highest chi-squared test statistic. Uses `SelectKBest` from scikit-learn.

In [None]:
# Score every feature against the target with the chi-squared test
skb = SelectKBest(score_func=chi2, k=10).fit(X_train, y_train)

# Side-by-side table: feature name and its chi-squared score
cols = pd.DataFrame(X_train.columns)
scores = pd.DataFrame(skb.scores_)
featureScores = pd.concat(objs=[cols, scores], axis=1)
featureScores.columns = ['feature', 'score']

# Show the 10 best-scoring features
featureScores.nlargest(10, 'score')

Unnamed: 0,feature,score
13,ram,750616.905156
11,px_height,11671.29834
0,battery_power,10905.619087
12,px_width,7397.70634
8,mobile_wt,109.440113
6,int_memory,62.487665
15,sc_w,19.316377
14,sc_h,11.805343
16,talk_time,11.401764
10,pc,10.86563


In [None]:
# Keep the 10 features with the highest chi-squared score
top_by_score = featureScores.nlargest(10, 'score')
selected_features = top_by_score['feature'].values

X_train_selected = df_train.loc[:, selected_features].copy()
X_test_selected = df_test.loc[:, selected_features].copy()

# Retrain on the reduced feature set and evaluate on the test set
model = RandomForestClassifier(random_state=SEED).fit(X_train_selected, y_train)
preds = model.predict(X_test_selected)

weighted_f1 = f1_score(y_test, preds, average='weighted')
print(classification_report(y_test, preds))
print(f"Global F1: {weighted_f1}")

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       100
           1       0.83      0.85      0.84       100
           2       0.84      0.83      0.83       100
           3       0.95      0.93      0.94       100

    accuracy                           0.89       400
   macro avg       0.89      0.89      0.89       400
weighted avg       0.89      0.89      0.89       400

Global F1: 0.8877508043276541


### Forward Search

Wrapper method using `SequentialFeatureSelector` from scikit-learn. At each step, it adds the feature that yields the highest performance increase.

In [None]:
model = RandomForestClassifier(random_state=SEED)

# Forward selection of 10 features, evaluated on the same CV splits as the Optuna study
sfs = SequentialFeatureSelector(model, n_features_to_select=10, cv=splits).fit(X_train, y_train)

# get_support() is a boolean mask over the columns of X_train
selected_features = X_train.columns[sfs.get_support()].tolist()
print(selected_features)

['battery_power', 'blue', 'fc', 'mobile_wt', 'px_height', 'px_width', 'ram', 'talk_time', 'three_g', 'touch_screen']


In [None]:
# Restrict train and test sets to the features picked by forward search
X_train_selected = df_train.loc[:, selected_features].copy()
X_test_selected = df_test.loc[:, selected_features].copy()

# Retrain on the reduced feature set and evaluate on the test set
model = RandomForestClassifier(random_state=SEED).fit(X_train_selected, y_train)
preds = model.predict(X_test_selected)

weighted_f1 = f1_score(y_test, preds, average='weighted')
print(classification_report(y_test, preds))
print(f"Global F1: {weighted_f1}")

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       100
           1       0.84      0.83      0.83       100
           2       0.83      0.83      0.83       100
           3       0.93      0.92      0.92       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400

Global F1: 0.8798717597890443


### Feature Importance

Uses the feature importances estimated by the random forest to select the 10 most important features.

In [None]:
# Fit on all features to obtain the forest's importance estimates
model = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

# One row per feature with its estimated importance
importance = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': model.feature_importances_,
    }
)

# Show the 10 most important features
importance.nlargest(10, 'importance')

Unnamed: 0,feature,importance
13,ram,0.483098
0,battery_power,0.076937
12,px_width,0.058843
11,px_height,0.058158
8,mobile_wt,0.039197
6,int_memory,0.033332
16,talk_time,0.030169
10,pc,0.028992
15,sc_w,0.028824
14,sc_h,0.027064


In [None]:
# Keep the 10 features with the highest estimated importance
top_by_importance = importance.nlargest(10, 'importance')
selected_features = top_by_importance['feature'].values

X_train_selected = df_train.loc[:, selected_features].copy()
X_test_selected = df_test.loc[:, selected_features].copy()

# Retrain on the reduced feature set and evaluate on the test set
model = RandomForestClassifier(random_state=SEED).fit(X_train_selected, y_train)
preds = model.predict(X_test_selected)

weighted_f1 = f1_score(y_test, preds, average='weighted')
print(classification_report(y_test, preds))
print(f"Global F1: {weighted_f1}")

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       100
           1       0.82      0.84      0.83       100
           2       0.82      0.83      0.83       100
           3       0.97      0.91      0.94       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400

Global F1: 0.8830488107413922
