<a href="https://colab.research.google.com/github/Tejeswini-98/CLASS-NOTE-ML-ALGORITHMNS/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Feature Engineering

Whatever you do with features is called Feature Engineering.

* Feature Elimination    - dropping the features.
* Feature Addition       - adding some features.
* Feature Transformation - transforming the given feature values onto another scale - Log Transformation, Sqrt Transformation, etc.
* Feature Selection      - deciding which features are important out of many features and choosing those features for model building.

#### Feature Selection Techniques

* sklearn - SelectFromModel
* sklearn - RFE (i.e., Recursive Feature Elimination)

## 1. Import Necessary libraries

In [None]:
import warnings

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

## 2. Import Data

In [None]:
cancer_data = load_breast_cancer()

In [None]:
# Assemble the feature matrix and the label column into a single frame so the
# whole dataset can be inspected in one place.
cancer_df = pd.DataFrame(
    cancer_data.data,
    columns=cancer_data.feature_names,
).assign(target=cancer_data.target)
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


## 3. Model Building

In [None]:
# Predictors are every column except the label. The double brackets keep y as
# a one-column DataFrame rather than a Series.
X = cancer_df.drop(columns='target')
y = cancer_df[['target']]

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=12,
    stratify=y,
)

In [None]:
X_train.shape,y_train.shape

((455, 30), (455, 1))

In [None]:
X_test.shape,y_test.shape

((114, 30), (114, 1))

# FEATURE SELECTION TECHNIQUES

## 1. SelectFromModel Technique

In [None]:
# Seed the forest so the importance ranking - and therefore the set of selected
# columns - is identical on every run. (An unseeded forest can pick a different
# number of features each time the cell is executed.)
# With no explicit threshold and max_features=None, SelectFromModel keeps the
# features whose importance is at or above the mean importance.
select_from_model = SelectFromModel(
    estimator=RandomForestClassifier(random_state=12),
    max_features=None,
)
select_from_model.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier())

In [None]:
select_from_model.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

In [None]:
X_train.columns[select_from_model.get_support()]

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concave points'],
      dtype='object')

In [None]:
# Reduce both splits to the columns chosen by the fitted selector.
X_train_selected_features, X_test_selected_features = (
    select_from_model.transform(X_train),
    select_from_model.transform(X_test),
)

In [None]:
X_train_selected_features.shape

(455, 8)

In [None]:
X_test_selected_features.shape

(114, 9)

In [None]:
def run_RandomForestClassifier(X_train,y_train,X_test,y_test,random_state=12):
    """Fit a random forest on the training split and print test-split metrics.

    The function is feature-count agnostic: it is used both with the full
    feature matrix and with the reduced matrices produced by a selector.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Training and evaluation features; both must have the same columns.
    y_train, y_test : array-like
        Binary class labels for the two splits. A single-column 2-D
        ``y_train`` is flattened to 1-D before fitting.
    random_state : int, default=12
        Seed for the forest, so repeated calls on the same data print the
        same scores.

    Returns
    -------
    None
        Accuracy, precision, recall and the confusion matrix are printed.
    """
    # scikit-learn expects 1-D labels for single-output classification; a
    # column vector only works silently when warnings are suppressed. Flatten
    # that one case and leave every other label shape as it is.
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    rf_classifier = RandomForestClassifier(random_state=random_state)
    rf_classifier.fit(X_train,y_fit)

    y_pred = rf_classifier.predict(X_test)

    print('Accuracy Score  : ',round(accuracy_score(y_test,y_pred),4))
    print('Precision Score : ',round(precision_score(y_test,y_pred),4))
    print('Recall Score    : ',round(recall_score(y_test,y_pred),4))
    print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

In [None]:
%%time
run_RandomForestClassifier(X_train,y_train,X_test,y_test) #30features

Accuracy Score  :  0.9561
Precision Score :  0.9718
Recall Score    :  0.9583
Confusion Matrix:
 [[40  2]
 [ 3 69]]
Wall time: 298 ms


In [None]:
%%time
run_RandomForestClassifier(X_train_selected_features,y_train,X_test_selected_features,y_test) #9features

Accuracy Score  :  0.9386
Precision Score :  0.9577
Recall Score    :  0.9444
Confusion Matrix:
 [[39  3]
 [ 4 68]]
Wall time: 266 ms


## 2. RFE - Recursive Feature Elimination

In [None]:
# Seed the forest so the elimination order - and the surviving columns - are
# reproducible across runs.
# n_features_to_select=None lets RFE fall back to its default of keeping half
# of the input features.
rfe_model = RFE(
    estimator=RandomForestClassifier(random_state=12),
    n_features_to_select=None,
)
rfe_model.fit(X_train, y_train)

RFE(estimator=RandomForestClassifier())

In [None]:
rfe_model.get_support()

array([ True,  True,  True,  True, False, False,  True,  True, False,
       False,  True, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True,  True, False])

In [None]:
X_train.columns[rfe_model.get_support()]

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'radius error', 'area error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')

In [None]:
len(X_train.columns[rfe_model.get_support()])

15

In [None]:
# Keep only the RFE-selected columns in both splits.
X_train_rfe_selected, X_test_rfe_selected = (
    rfe_model.transform(X_train),
    rfe_model.transform(X_test),
)

In [None]:
X_train_rfe_selected.shape

(455, 15)

In [None]:
X_test_rfe_selected.shape

(114, 15)

In [None]:
%%time
run_RandomForestClassifier(X_train_rfe_selected,y_train,X_test_rfe_selected,y_test)

Accuracy Score  :  0.9737
Precision Score :  0.9726
Recall Score    :  0.9861
Confusion Matrix:
 [[40  2]
 [ 1 71]]
Wall time: 266 ms


### Pick the important features using the Gradient Boosting algorithm and train a RandomForestClassifier on them

In [None]:
# Rank the features with a seeded gradient-boosting model and eliminate the
# weakest ones until 13 remain.
rfe_model = RFE(
    estimator=GradientBoostingClassifier(random_state=12),
    n_features_to_select=13,
)
rfe_model.fit(X_train, y_train)

RFE(estimator=GradientBoostingClassifier(random_state=12),
    n_features_to_select=13)

In [None]:
rfe_model.get_support()

array([False,  True, False,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False,  True, False,
       False,  True,  True,  True, False,  True, False, False,  True,
        True,  True, False])

In [None]:
X_train.columns[rfe_model.get_support()]

Index(['mean texture', 'mean area', 'mean concavity', 'mean concave points',
       'area error', 'concavity error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst area', 'worst concavity',
       'worst concave points', 'worst symmetry'],
      dtype='object')

In [None]:
len(X_train.columns[rfe_model.get_support()])

13

In [None]:
# Project both splits onto the 13 columns kept by the gradient-boosting RFE.
X_train_rfe_selected, X_test_rfe_selected = (
    rfe_model.transform(X_train),
    rfe_model.transform(X_test),
)

In [None]:
X_train_rfe_selected.shape

(455, 13)

In [None]:
X_test_rfe_selected.shape

(114, 13)

In [None]:
%%time
run_RandomForestClassifier(X_train_rfe_selected,y_train,X_test_rfe_selected,y_test)

Accuracy Score  :  0.9561
Precision Score :  0.9718
Recall Score    :  0.9583
Confusion Matrix:
 [[40  2]
 [ 3 69]]
Wall time: 291 ms


### How to decide the optimal number of features for training?

In [None]:
# Try every possible feature count, from a single feature up to all columns of
# X_train, and print the random-forest scores obtained with each subset.
# The upper bound is read from the data instead of being hard-coded, so the
# sweep stays correct if the feature matrix changes.
# NOTE(review): the scores printed here are computed on X_test, so choosing the
# feature count from this printout tunes on the test split; a validation split
# or cross-validation on the training data would avoid that.
for i in range(1, X_train.shape[1] + 1):
    rfe_model = RFE(estimator = GradientBoostingClassifier(random_state=12),n_features_to_select=i)
    rfe_model.fit(X_train,y_train)
    X_train_rfe_selected = rfe_model.transform(X_train)
    X_test_rfe_selected  = rfe_model.transform(X_test)
    print('Selected Features : ',i)
    run_RandomForestClassifier(X_train_rfe_selected,y_train,X_test_rfe_selected,y_test)
    print('************************************************')

Selected Features :  1
Accuracy Score  :  0.8158
Precision Score :  0.8696
Recall Score    :  0.8333
Confusion Matrix:
 [[33  9]
 [12 60]]
************************************************
Selected Features :  2
Accuracy Score  :  0.9035
Precision Score :  0.9296
Recall Score    :  0.9167
Confusion Matrix:
 [[37  5]
 [ 6 66]]
************************************************
Selected Features :  3
Accuracy Score  :  0.9211
Precision Score :  0.9315
Recall Score    :  0.9444
Confusion Matrix:
 [[37  5]
 [ 4 68]]
************************************************
Selected Features :  4
Accuracy Score  :  0.9649
Precision Score :  0.9595
Recall Score    :  0.9861
Confusion Matrix:
 [[39  3]
 [ 1 71]]
************************************************
Selected Features :  5
Accuracy Score  :  0.9474
Precision Score :  0.9583
Recall Score    :  0.9583
Confusion Matrix:
 [[39  3]
 [ 3 69]]
************************************************
Selected Features :  6
Accuracy Score  :  0.9561
Precision S

In [None]:
# Refit the gradient-boosting RFE with the feature count settled on after the
# sweep: 7 columns.
rfe_model = RFE(
    estimator=GradientBoostingClassifier(random_state=12),
    n_features_to_select=7,
)
rfe_model.fit(X_train, y_train)

RFE(estimator=GradientBoostingClassifier(random_state=12),
    n_features_to_select=7)

In [None]:
X_train.columns[rfe_model.get_support()]

Index(['mean texture', 'mean concavity', 'mean concave points', 'area error',
       'worst texture', 'worst area', 'worst concave points'],
      dtype='object')

In [None]:
# Project both splits onto the 7 columns kept by the refitted RFE.
X_train_rfe_selected, X_test_rfe_selected = (
    rfe_model.transform(X_train),
    rfe_model.transform(X_test),
)

In [None]:
X_train_rfe_selected.shape

(455, 7)

In [None]:
X_test_rfe_selected.shape

(114, 7)

In [None]:
%%time
run_RandomForestClassifier(X_train_rfe_selected,y_train,X_test_rfe_selected,y_test) #7features

Accuracy Score  :  0.9737
Precision Score :  0.9726
Recall Score    :  0.9861
Confusion Matrix:
 [[40  2]
 [ 1 71]]
Wall time: 261 ms


In [None]:
%%time
run_RandomForestClassifier(X_train,y_train,X_test,y_test)

Accuracy Score  :  0.9561
Precision Score :  0.9718
Recall Score    :  0.9583
Confusion Matrix:
 [[40  2]
 [ 3 69]]
Wall time: 311 ms


# THE END!!!