<a href="https://colab.research.google.com/github/Siddha-Regilla/Data-Science-Classroom-notebooks/blob/main/32.%20Feature_Engineering_28_03_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Univariate Feature Selection

In [1]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Read the Pima Indians diabetes CSV, supplying the column names explicitly
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)

# Split the raw values: the first 8 columns are the predictors, column 8 is the label
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Score every predictor against the label with the chi-squared test and keep the 4 best
test = SelectKBest(chi2, k=4)
fit = test.fit(X, Y)

# Print the per-feature chi-squared scores using 3 decimal places
set_printoptions(precision=3)
print(fit.scores_)

# Reduce X to the 4 selected columns
features = fit.transform(X)


#For regression: f_regression, mutual_info_regression
#For classification: chi2, f_classif, mutual_info_classif

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


#### Recursive Feature Elimination

In [2]:
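# RFE (Recursive Feature Elimination) -
# e.g. if we build a model with 3 features and its accuracy is 90%, and next we build a model with 2 of those features and its accuracy is 95%,
# then we can remove that third feature and build the model without it. In this way RFE keeps removing the least important features.
# Note: scikit-learn's RFE decides which feature to drop from the fitted model's coef_ / feature_importances_, not by re-checking accuracy.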

In [3]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Read the Pima Indians diabetes CSV, supplying the column names explicitly
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)
array = dataframe.values
# Predictors are the first 8 columns; the label is column 8
X, Y = array[:, :8], array[:, 8]
# Base estimator handed to RFE in the next cell (iteration cap set to 400)
model = LogisticRegression(max_iter=400)



In [4]:
# Apply RFE and keep the 3 most important features.
# step=1 means one feature is eliminated per iteration until only 3 remain.
rfe = RFE(estimator=model, n_features_to_select=3, step=1)
fit = rfe.fit(X, Y)

In [5]:
# Number of features RFE retained (matches n_features_to_select=3 passed above)
fit.n_features_

3

In [6]:
# Selected features: boolean mask with one entry per input column, True = kept by RFE
fit.support_

array([ True, False, False, False, False,  True,  True, False])

In [7]:
# Feature ranking: one rank per input column; rank 1 marks the selected features,
# and higher numbers mark features that were eliminated earlier
fit.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

#### Feature Importance using Decision Tree

In [8]:
# Feature Importance with a Decision Tree Classifier
from pandas import read_csv
from sklearn.tree import  DecisionTreeClassifier
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# first 8 columns are the predictors, column 8 is the label
X = array[:,0:8]
Y = array[:,8]
# feature importance
# random_state is fixed so the tree (and therefore the importances) is the same on every run;
# without it, ties between equally good splits are broken randomly and the numbers drift.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, Y)
# one importance per input column, in the same order as `names` (excluding 'class')
print(model.feature_importances_)

[0.061 0.309 0.097 0.026 0.046 0.229 0.118 0.115]


In [9]:
# Important features based on Chi2 (highest score first) are - 'test', 'plas', 'age', 'mass' & 'preg' (with k=4 only the first four are kept)
# Important features based on RFE are - 'preg', 'mass' & 'pedi'
# Important features based on Decision Tree are - 'plas', 'mass', 'pedi' & 'age'
# Total important features (union of the above) - 'preg', 'plas', 'test', 'mass', 'pedi' & 'age'

# 1st use the decision tree model & find out what the 4 most important features are.
# Next try RFE. It will also add a couple of important features.
# And then use the chi2 method and add its important features.
# Build a model with these important features and check the R2 value (for a classification target such as 'class', use accuracy or another classification metric instead).
# If we think that adding any more variables will help improve the R2 value, then add those features.

# While working on a project we can check the R2 value (or accuracy, for a classification problem) for each of the following and choose the best model accordingly:
# 1. Perform feature selection and check the R2 value
# 2. Perform PCA on the whole data and check the R2 value
# 3. Perform PCA on the selected features and check the R2 value

# Whichever gives the best result can be used
