# Feature Engineering



__Features__: Variables we use to help predict our target.


__Feature Engineering__: reasons to combine, transform, or drop variables:
- Variables that are highly correlated can be combined into one variable.
- Some variables are computationally expensive to calculate.
- Some information can lead to discrimination or is otherwise unethical to use.
- Continuous variables may contain noise.
- Combining variables reduces the number of dimensions, which lessens the __curse of dimensionality__.

Goal in feature engineering:

I want to make it easy for the computer to see the patterns.

Algorithmic Feature Selection:


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
# Read the student data; this CSV separates fields with semicolons, not commas.
df = pd.read_csv('student-mat.csv', delimiter=';')

In [3]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(395, 33)

In [4]:
# Peek at the first five raw rows.
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [5]:
# Per-column dtype and non-null count; object columns are the ones encoded later.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [6]:
# Summary statistics for the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,395.0,16.696203,1.276043,15.0,16.0,17.0,18.0,22.0
Medu,395.0,2.749367,1.094735,0.0,2.0,3.0,4.0,4.0
Fedu,395.0,2.521519,1.088201,0.0,2.0,2.0,3.0,4.0
traveltime,395.0,1.448101,0.697505,1.0,1.0,1.0,2.0,4.0
studytime,395.0,2.035443,0.83924,1.0,1.0,2.0,2.0,4.0
failures,395.0,0.334177,0.743651,0.0,0.0,0.0,0.0,3.0
famrel,395.0,3.944304,0.896659,1.0,4.0,4.0,5.0,5.0
freetime,395.0,3.235443,0.998862,1.0,3.0,3.0,4.0,5.0
goout,395.0,3.108861,1.113278,1.0,2.0,3.0,4.0,5.0
Dalc,395.0,1.481013,0.890741,1.0,1.0,1.0,2.0,5.0


In [7]:
# Number of distinct values in each column.
df.nunique()

school         2
sex            2
age            8
address        2
famsize        2
Pstatus        2
Medu           5
Fedu           5
Mjob           5
Fjob           5
reason         4
guardian       3
traveltime     4
studytime      4
failures       4
schoolsup      2
famsup         2
paid           2
activities     2
nursery        2
higher         2
internet       2
romantic       2
famrel         5
freetime       5
goout          5
Dalc           5
Walc           5
health         5
absences      34
G1            17
G2            17
G3            18
dtype: int64

In [8]:
# Count missing values per column.
df.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [9]:
# Isolate the object-dtype columns: they get one-hot encoded,
# and their names are kept so the originals can be dropped afterwards.
obj_df = df.select_dtypes(include='O')
columns_to_drop = obj_df.columns.to_numpy()

In [10]:
# One-hot encode the object columns, dropping the first level of each
# so the dummy columns are not perfectly collinear.
dummy_df = pd.get_dummies(data=obj_df, dummy_na=False, drop_first=True)

# Append the dummy columns to the frame side by side (row index is shared).
df = pd.concat(objs=[df, dummy_df], axis='columns')

In [11]:
# Remove the original object columns now that their dummy encodings exist.
df = df.drop(columns=columns_to_drop)

In [12]:
# Preview the fully numeric frame after encoding.
df.head(n=5)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1,0,1,0,0,0,1,1,0,0
1,17,1,1,1,2,0,5,3,3,1,...,0,0,0,1,0,0,0,1,1,0
2,15,1,1,1,2,3,4,3,2,2,...,1,0,1,0,1,0,1,1,1,0
3,15,4,2,1,3,0,3,2,2,1,...,1,0,0,1,1,1,1,1,1,1
4,16,3,3,1,2,0,4,3,2,1,...,0,0,0,1,1,0,1,1,0,0


In [13]:
# Hold out 20% of the rows as the test set; the rest is train + validate.
train_validate, test = train_test_split(df, random_state=123, test_size=0.2)

# Divide the remaining rows into train and validate (30% to validate).
train, validate = train_test_split(train_validate, random_state=123, test_size=0.3)

In [14]:
# Feature matrices: every column except the target column G3.
X_train, X_validate, X_test = (
    split.drop(columns=['G3']) for split in (train, validate, test)
)

# Target vectors: the G3 column of each split.
y_train, y_validate, y_test = (
    split['G3'] for split in (train, validate, test)
)

In [15]:
from sklearn.preprocessing import MinMaxScaler

In [16]:
# Learn each feature's min and max from the training split only, so no
# information from validate or test leaks into the scaling.
scaler = MinMaxScaler(copy=True)
scaler.fit(X_train)

# Apply the same train-fitted scaling to every split.
X_train_scaled, X_validate_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_validate, X_test)
)

In [17]:
# Re-wrap each scaled array as a DataFrame so every split keeps its
# original column labels and row index.
X_train_scaled = pd.DataFrame(X_train_scaled,
                              columns=X_train.columns.values,
                              index=X_train.index.values)

# Wrap the *validate* array here. X_train_scaled is already a DataFrame at
# this point, so passing it with the validate index would make pandas
# reindex the train rows onto labels they do not contain, leaving the
# validate frame full of NaN.
X_validate_scaled = pd.DataFrame(X_validate_scaled,
                                 columns=X_validate.columns.values,
                                 index=X_validate.index.values)

X_test_scaled = pd.DataFrame(X_test_scaled,
                             columns=X_test.columns.values,
                             index=X_test.index.values)

In [18]:
# (rows, columns) of the scaled training features.
X_train_scaled.shape

(221, 41)

In [19]:
# Preview the scaled training features.
X_train_scaled.head(n=5)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
142,0.0,1.0,1.0,0.0,0.666667,0.0,0.75,0.25,0.25,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
326,0.333333,0.75,0.75,0.0,0.0,0.0,0.75,0.5,1.0,0.5,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
88,0.166667,0.5,0.5,0.333333,0.333333,0.333333,0.75,0.75,0.25,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
118,0.333333,0.25,0.75,0.666667,0.333333,0.333333,1.0,0.25,0.75,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
312,0.666667,0.25,0.5,0.0,0.333333,0.333333,0.75,1.0,0.25,0.25,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


## SelectKBest: Create a Feature Selector Object

In [20]:
from sklearn.feature_selection import SelectKBest, f_regression

In [21]:
# Univariate selector: score each feature with f_regression and keep the top 13.
f_selector = SelectKBest(score_func=f_regression, k=13)

In [22]:
# Score the features on the training split only.
f_selector = f_selector.fit(X=X_train_scaled, y=y_train)

In [23]:
# Reduce the scaled training features to the columns SelectKBest kept.
X_train_reduced = f_selector.transform(X=X_train_scaled)

In [24]:
# Same row count as before; the column count should equal k.
X_train_reduced.shape

(221, 13)

In [25]:
# Boolean mask over the input columns: True where SelectKBest kept the feature.
f_support = f_selector.get_support(indices=False)
print(f_support)

[ True  True  True  True  True  True False False False False False False
 False  True  True False  True False False False False  True False False
 False False False False False False  True False  True False False False
 False False  True False False]


In [26]:
# Names of the columns SelectKBest kept, in their original column order.
f_feature = X_train_scaled.columns[f_support].tolist()

## Recursive Feature Elimination: RFE

Wrapper method

Recursively build model after model with fewer and fewer features.

In [27]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [28]:
# Initialize the linear regression model that RFE will use as its estimator.
lm = LinearRegression()

In [29]:
# Recursive feature elimination down to 13 features, using the linear model.
# n_features_to_select is passed by keyword because scikit-learn 1.0+
# rejects it as a positional argument (TypeError).
rfe = RFE(estimator=lm, n_features_to_select=13)

# Fit on the training split and get back only the retained columns.
X_rfe = rfe.fit_transform(X_train_scaled, y_train)

In [32]:
# Boolean mask over the input columns: True where RFE kept the feature.
rfe_mask = rfe.get_support()

In [35]:
# Keep only the RFE-selected columns, preserving labels and row index.
X_train_scaled_reduced = X_train_scaled.loc[:, rfe_mask]

In [36]:
# Preview the RFE-selected features.
X_train_scaled_reduced.head(n=5)

Unnamed: 0,age,traveltime,failures,famrel,absences,G1,G2,Mjob_health,Mjob_other,Mjob_services,schoolsup_yes,famsup_yes,internet_yes
142,0.0,0.0,0.0,0.75,0.035714,0.357143,0.578947,0.0,0.0,0.0,0.0,1.0,1.0
326,0.333333,0.0,0.0,0.75,0.053571,0.714286,0.789474,0.0,1.0,0.0,0.0,0.0,1.0
88,0.166667,0.333333,0.333333,0.75,0.214286,0.5,0.526316,0.0,0.0,1.0,0.0,0.0,1.0
118,0.333333,0.666667,0.333333,1.0,0.357143,0.357143,0.368421,0.0,1.0,0.0,0.0,1.0,1.0
312,0.666667,0.0,0.333333,0.75,0.053571,0.642857,0.578947,0.0,1.0,0.0,0.0,0.0,1.0


In [37]:
# Names of the columns RFE kept, in their original column order.
rfe_feature = list(X_train_scaled_reduced.columns)

In [38]:
# Both lists follow the original column order, so they are equal only if
# RFE and SelectKBest kept exactly the same columns.
rfe_feature == f_feature

False