# Feature selection

This notebook aims to identify the most relevant features for predicting handwritten digits in the MNIST dataset.
Feature selection helps reduce dimensionality, improve model performance, and gain insights into important factors influencing the target.

## 0. Data Load

In [1]:
import tensorflow as tf

# MNIST ships with Keras; load_data() hands back a (train, test) pair,
# each of which is an (images, labels) tuple.
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [5]:
# Scale pixel intensities from [0, 255] down to [0, 1].
# The division is guarded so the cell is idempotent: re-running it (this
# notebook's execution counts show cells were run out of order) must not
# divide already-scaled data by 255 a second time.
if train_images.max() > 1.0:
    train_images = train_images / 255.0
if test_images.max() > 1.0:
    test_images = test_images / 255.0

In [6]:
# Sanity check before flattening: (n_images, height, width).
train_images.shape

(60000, 28, 28)

## 1. Univariate Selection
We start with univariate selection, which scores each feature independently using a univariate statistical test. Here we use the chi-squared (chi2) test, which requires non-negative features, to select the 100 best features from our dataset.

In [7]:
# Feature selectors expect a 2-D (samples, features) matrix, so unroll each
# image into a single row; -1 lets NumPy infer the per-image pixel count.
train_images = train_images.reshape(len(train_images), -1)

In [3]:
# After the reshape each image is one row of 784 pixel values (28 * 28).
train_images.shape

(60000, 784)

In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Number of pixels (features) to keep.
N_SELECTED_FEATURES = 100

# Score every pixel against the digit label with the chi-squared test
# (applicable because pixel intensities are non-negative) and keep the
# N_SELECTED_FEATURES highest-scoring columns.
selector = SelectKBest(score_func=chi2, k=N_SELECTED_FEATURES)
X_selected = selector.fit_transform(train_images, train_labels)

# Reduce the test split with the selector fitted on the training data.
# transform() takes the feature matrix (the images), not the labels, and the
# test images have not been flattened yet, so unroll them to rows first.
X_test_kbest = selector.transform(test_images.reshape(len(test_images), -1))

In [6]:
# Expect (n_samples, 100): only the k=100 selected pixel columns remain.
X_selected.shape

(60000, 100)

In [20]:
# Selected-pixel values for the training sample at row 98 (98 is a sample
# index here, not a pixel index).
# NOTE(review): the saved output is uint8 with values in 0-255, so it was
# produced before the /255.0 scaling cell ran -- re-run the notebook top to
# bottom to refresh it.
X_selected[98]

array([  0,   0,   0,   0,   0,  15, 108, 233, 253, 255, 180, 101,   0,
         0, 253, 244,   0,   0,   0,   0,   0, 252, 253, 240,   0,   0,
         0,   0,   0,   0, 252, 252, 253,   0,   0,   0,   0,   0,   0,
         0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 116,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   5,   0,  24,   0,   0,   0], dtype=uint8)

In [10]:
# Display the reduced feature matrix (NumPy abbreviates large arrays with "...").
X_selected

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [10]:
# Column positions, within the flattened 784-pixel image, of the features
# the chi-squared selector kept.
selected_feature_indices = selector.get_support(indices=True)
print(f"Selected feature indices: {selected_feature_indices}")

Selected feature indices: [ 98  99 100 101 102 150 151 152 153 154 155 156 301 302 322 323 329 330
 331 344 345 349 350 351 357 358 359 371 372 373 376 377 378 385 386 387
 398 399 400 401 406 413 414 415 426 427 428 429 433 441 442 454 455 456
 457 458 461 468 469 470 482 483 484 485 486 487 488 489 496 497 510 511
 512 513 514 515 516 523 524 538 539 540 541 542 543 544 566 567 568 569
 570 594 595 596 597 624 625 711 712 713]


In [21]:
# Confirm the full training matrix still has all 784 columns after selection.
train_images.shape

(60000, 784)

## 2. Recursive Feature Elimination


In [26]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


# Baseline model trained on every pixel. The 'sag' solver is stochastic, so
# the seed is pinned to make the fit repeatable between runs.
lr = LogisticRegression(solver='sag', max_iter=1000, random_state=42)
lr.fit(train_images, train_labels)

# Recursive feature elimination: refit the model repeatedly, discarding the
# lowest-weighted pixels each round until 100 remain. RFE works on its own
# clones of the estimator, so `lr` above stays fitted on the full pixel set.
# step=0.1 drops about 10% of the original features per round, which keeps
# the number of refits to roughly ten instead of one per removed feature.
# NOTE: this is still the most expensive cell in the notebook.
rfe = RFE(estimator=lr, n_features_to_select=100, step=0.1)
X_selected_rfe = rfe.fit_transform(train_images, train_labels)
selected_feature_indices_rfe = rfe.get_support(indices=True)


## 3. Feature Importance


In [9]:
from sklearn.ensemble import ExtraTreesClassifier

# Train a seeded ensemble of 100 extremely randomized trees on all pixels and
# print the impurity-based importance it assigns to each one.
# fit() returns the estimator itself, so construction and training can be chained.
etc = ExtraTreesClassifier(
    random_state=42,
    n_estimators=100,
).fit(train_images, train_labels)
print(etc.feature_importances_)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 2.46992662e-07 3.23736810e-07 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.43229412e-06 1.43818423e-06
 2.14323761e-06 2.83628994e-06 3.23021418e-06 3.47894069e-06
 4.61295232e-06 4.34318348e-06 3.80982949e-06 9.23656502e-06
 4.88484290e-06 2.96102969e-06 2.53133811e-06 4.11472418e-06
 2.84787216e-06 2.03323108e-06 1.72133533e-06 1.03647871e-06
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.43572430e-07
 5.50864158e-07 6.13408903e-07 2.23280952e-06 1.01168684e-05
 1.68273746e-05 2.216251

In [5]:
from sklearn.decomposition import PCA

# Feature extraction (not selection): project the pixels onto the three
# leading principal components.
pca = PCA(n_components=3)
# fit() returns the estimator, so `fit` and `pca` name the same fitted object.
fit = pca.fit(train_images)
# Fraction of the total variance captured by each retained component.
print(f"Explained Variance: {fit.explained_variance_ratio_}")

Explained Variance: [0.09704664 0.07095924 0.06169088]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature selection using Random Forest feature importance.
# The forest is seeded (same seed as the ExtraTrees cell) so the importance
# ranking, and therefore the selected pixel set, is reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_images, train_labels)

# argsort() is ascending: take the last 100 positions, then reverse them so
# the most important pixel comes first.
top_features_indices = clf.feature_importances_.argsort()[-100:][::-1]
# NOTE: this rebinds X_selected, replacing the chi-squared selection made in
# section 1; columns are ordered by decreasing importance.
X_selected = train_images[:, top_features_indices]

In [None]:
# Print the importance score the fitted random forest assigns to each pixel.
# NOTE(review): this prints the whole importance vector; consider showing only
# the top-ranked entries to keep the output readable.
print(clf.feature_importances_)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 3.39181190e-07 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 7.30657758e-07 1.78397329e-06
 2.67997373e-06 1.07324332e-06 3.55337190e-07 2.74614060e-06
 3.03043669e-06 2.35037252e-06 2.62653134e-06 5.78633661e-06
 1.00541318e-05 1.82891923e-06 3.67074923e-07 1.08926080e-06
 4.81646574e-07 3.02580826e-07 3.65451538e-07 3.69645203e-07
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.70505975e-07
 0.00000000e+00 6.01028335e-07 3.36866117e-07 1.35414445e-05
 8.78838067e-06 3.593522