# FEATURE SELECTION

Welcome to Tirendaz Academy.
In this notebook, I'm going to talk about feature selection. 
Happy learning 😀

## Loading the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the wine data; header=None because the file is loaded without a header
# row and the column names are assigned in the next cell.
df = pd.read_csv("wine.data", header = None)

In [3]:
# Name all 14 columns: the class label first, followed by the 13 features.
df.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [4]:
# Preview the first rows to confirm the column names line up with the values.
df.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
# List the distinct values in the "Class label" column.
np.unique(df["Class label"])

array([1, 2, 3], dtype=int64)

## Data Preprocessing

In [6]:
# Feature matrix: every column except the first one (the class label).
X = df.iloc[:, 1:].values

In [7]:
# Target vector: the first column, which holds the class label.
y = df.iloc[:,0].values

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 25% of the samples for testing. The split is stratified on y and
# uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

## Data Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Scaler instance shared by the training and test transforms below.
stdscaler = StandardScaler()

In [12]:
# Fit the scaler on the training split and transform it in one step.
# NOTE(review): X_train_std is not referenced again in the cells shown here;
# the forest below is fitted on the unscaled X_train.
X_train_std = stdscaler.fit_transform(X_train)

In [13]:
# Transform the test split with the already-fitted scaler (transform only, no refit).
X_test_std = stdscaler.transform(X_test)

## Building the Model

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Random forest with 500 trees; random_state is fixed so results are repeatable.
forest = RandomForestClassifier(n_estimators=500, random_state=1)

In [22]:
# Feature names, i.e. every column name except the leading class label.
# NOTE(review): this cell's execution count (22) is out of sequence with its
# neighbours; re-run with "Restart Kernel -> Run All" to confirm the notebook
# works top to bottom.
feat_labels = df.columns[1:]

In [17]:
# Train the forest on the unscaled training features (X_train, not X_train_std).
forest.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=1)

## Feature Selection

In [18]:
# Per-feature importance scores exposed by the fitted forest.
importances = forest.feature_importances_

In [19]:
# Feature positions ordered from most to least important:
# argsort sorts ascending, and [::-1] reverses that order.
indices = np.argsort(importances)[::-1]

In [20]:
# Display the ranked feature positions.
indices

array([12,  6,  9, 11,  0, 10,  5,  4,  1,  3,  8,  2,  7], dtype=int64)

In [23]:
# Print rank, name and importance for every feature, most important first.
for f, feature_idx in enumerate(indices[:X_train.shape[1]]):
    print(f"{f+1}", feat_labels[feature_idx], importances[feature_idx])

1 Proline 0.19425196709852016
2 Flavanoids 0.15967892986778753
3 Color intensity 0.14907323774360948
4 OD280/OD315 of diluted wines 0.12223110107587146
5 Alcohol 0.11479722353879414
6 Hue 0.07259491714547278
7 Total phenols 0.050824811977803086
8 Magnesium 0.031922184349841815
9 Malic acid 0.028867963800009225
10 Alcalinity of ash 0.025900122601886087
11 Proanthocyanins 0.022483463782451273
12 Ash 0.014381677761740612
13 Nonflavanoid phenols 0.012992399256212534


## Feature Selection with the SelectFromModel Class

In [24]:
from sklearn.feature_selection import SelectFromModel

In [25]:
# Wrap the forest fitted above; prefit=True tells the selector to use it as-is
# instead of fitting it again. The threshold is the importance cut-off used to
# decide which features are kept.
selector = SelectFromModel(estimator=forest, threshold=0.1, prefit=True)

In [26]:
# Reduce the training features to the columns the selector retains.
X_selected = selector.transform(X_train)

In [27]:
# Print rank, name and importance for as many top-ranked features as there are
# columns in X_selected.
# NOTE(review): this reads the first k entries of `indices` rather than asking
# the selector which columns it kept.
for f, feature_idx in enumerate(indices[:X_selected.shape[1]]):
    print(f"{f+1}", feat_labels[feature_idx], importances[feature_idx])

1 Proline 0.19425196709852016
2 Flavanoids 0.15967892986778753
3 Color intensity 0.14907323774360948
4 OD280/OD315 of diluted wines 0.12223110107587146
5 Alcohol 0.11479722353879414


Follow us on [YouTube](https://youtube.com/c/TirendazAcademy), [Twitter](https://twitter.com/tirendazacademy), [GitHub](https://github.com/tirendazacademy), [Medium](https://tirendazacademy.medium.com) 😀😎