# Feature Engineering II

Best Practice:

1. Fill missing values (imputation)
2. everything else (onehot, binning, others)
3. Scaling
4. fit the model
5. do the same for the test set (without .fit!!!)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the penguin data set; fields in this CSV are separated by semicolons.
df = pd.read_csv('penguins_simple.csv', delimiter=';')
# Show the first three rows as a quick sanity check of the parsed columns.
df.head(n=3)

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Adelie,39.1,18.7,181.0,3750.0,MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE


In [3]:
# Target vector: the species label the model should predict.
y = df.loc[:, 'Species']
# Feature matrix: every column except the first one (selected by position);
# in this file the first column is 'Species', so the label is left out of X.
X = df.iloc[:, 1:]

In [4]:
# Hold out part of the data for evaluation; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

# Shapes of the four pieces, in the order they were returned.
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((249, 5), (84, 5), (249,), (84,))

#### Feature Engineering

In [5]:
# 1. create a feature engineering tool
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and the
# old name was removed in 1.4, so try the new name first and fall back to the old one.
try:
    ohc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # older scikit-learn that only knows `sparse`
    ohc = OneHotEncoder(sparse=False, handle_unknown='ignore')

# 2. fit with the training data (some columns of it)
# For a transformer, .fit() only learns which categories occur in the column;
# it is not model training. Fit on the training data only, never on the test data.
ohc.fit(Xtrain[['Sex']])

# 3. transform the training data
onehot_sex = ohc.transform(Xtrain[['Sex']])
onehot_sex = pd.DataFrame(onehot_sex)  # dense array -> DataFrame with a fresh 0..n-1 index
onehot_sex.head()

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,0.0,1.0
3,1.0,0.0
4,0.0,1.0


In [8]:
Xtrain

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
313,55.9,17.0,228.0,5600.0,MALE
258,43.6,13.9,217.0,4900.0,FEMALE
31,38.8,20.0,190.0,3950.0,MALE
300,47.5,14.0,212.0,4875.0,FEMALE
185,53.5,19.9,205.0,4500.0,MALE
...,...,...,...,...,...
188,50.9,19.1,196.0,3550.0,MALE
71,37.2,19.4,184.0,3900.0,MALE
106,39.7,17.7,193.0,3200.0,FEMALE
270,45.5,15.0,220.0,5000.0,MALE


In [None]:
# quantile strategy: different bin width, same number of penguins in each
# uniform strategy: same bin width, different number of penguins in each (like a histogram)

In [9]:
# 1. create a feature engineering tool: five quantile-based bins per column,
#    with each bin encoded as its own one-hot column
k = KBinsDiscretizer(strategy='quantile', encode='onehot', n_bins=5)

# 2. fit with the training data (some columns of it)
k.fit(Xtrain[['Culmen Length (mm)', 'Body Mass (g)']])

# 3. transform the training data
bins = k.transform(Xtrain[['Culmen Length (mm)', 'Body Mass (g)']])
# The result is a sparse matrix; convert it to a dense one so that we can look at it.
# Always do this unless your data set is really big.
bins = pd.DataFrame(bins.toarray())
bins.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [10]:
Xtrain.head()

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
313,55.9,17.0,228.0,5600.0,MALE
258,43.6,13.9,217.0,4900.0,FEMALE
31,38.8,20.0,190.0,3950.0,MALE
300,47.5,14.0,212.0,4875.0,FEMALE
185,53.5,19.9,205.0,4500.0,MALE


In [11]:
# Replace the shuffled row labels with 0..n-1 so that the rows line up with the
# one-hot and bin DataFrames (both built from arrays) when they are concatenated.
# NOTE: inplace=True modifies Xtrain itself; the old row labels are kept as a
# new 'index' column.
Xtrain.reset_index(inplace=True)    # prepare for pd.concat
# Feature that is passed through without any encoding or binning.
unmodified = Xtrain[['Flipper Length (mm)']]

In [12]:
Xtrain.head()

Unnamed: 0,index,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,313,55.9,17.0,228.0,5600.0,MALE
1,258,43.6,13.9,217.0,4900.0,FEMALE
2,31,38.8,20.0,190.0,3950.0,MALE
3,300,47.5,14.0,212.0,4875.0,FEMALE
4,185,53.5,19.9,205.0,4500.0,MALE


In [13]:
unmodified.head()

Unnamed: 0,Flipper Length (mm)
0,228.0
1,217.0
2,190.0
3,212.0
4,205.0


In [14]:
onehot_sex.shape, bins.shape, unmodified.shape

((249, 2), (249, 10), (249, 1))

In [15]:
# The model needs a single feature table, so glue the three pieces together
# side by side; rows are matched on their index labels.
Xtrain_fe = pd.concat(objs=[onehot_sex, bins, unmodified], axis='columns')
Xtrain_fe.shape

(249, 13)

In [None]:
# we could process this further, e.g. scaling
Xtrain_fe.head(3)

In [18]:
scaler = MinMaxScaler()  # scales every column independently
# Xtrain_fe mixes integer column labels (one-hot and bin columns) with a string
# label (the flipper column). scikit-learn >= 1.2 raises a TypeError for
# DataFrames with mixed-type column names, so pass the plain values instead.
scaler.fit(Xtrain_fe.to_numpy())
Xtrain_scaled = scaler.transform(Xtrain_fe.to_numpy()) # output is a numpy array, not a df



In [19]:
pd.DataFrame(Xtrain_scaled).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.949153
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.762712
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.305085


In [23]:
Xtrain_scaled

array([[0.        , 1.        , 0.        , ..., 0.        , 1.        ,
        0.94915254],
       [1.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.76271186],
       [0.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.30508475],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3559322 ],
       [0.        , 1.        , 0.        , ..., 0.        , 1.        ,
        0.81355932],
       [1.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.15254237]])

#### Model building

In [24]:
# Logistic regression classifier with default settings.
m = LogisticRegression()
# Optimize the coefficients using the training data only.
m.fit(X=Xtrain_scaled, y=ytrain)

LogisticRegression()

#### Evaluation

In [None]:
# Score of the fitted model on the same data it was trained on.
train_accuracy = m.score(X=Xtrain_scaled, y=ytrain)
train_accuracy

### Now the same for the test data

* we already did ohc.fit()
* we only need to transform()
* NEVER FIT ANYTHING WITH TEST DATA!!!

IN THIS STEP WE NEVER FIT

In [None]:
# Apply the transformers that were fitted on the training data; no .fit() here.
test_flipper = Xtest[['Flipper Length (mm)']].reset_index(drop=True)
test_bins = k.transform(Xtest[['Culmen Length (mm)', 'Body Mass (g)']])
test_ohc = ohc.transform(Xtest[['Sex']])

tuple(piece.shape for piece in (test_ohc, test_bins, test_flipper))

In [None]:
# Wrap both results in DataFrames so they can be concatenated with test_flipper.
test_bins = pd.DataFrame(test_bins.toarray())  # sparse -> dense first
test_ohc = pd.DataFrame(test_ohc)

tuple(piece.shape for piece in (test_ohc, test_bins, test_flipper))

In [None]:
# Same column order as for the training features: one-hot, bins, flipper length.
Xtest_fe = pd.concat(objs=[test_ohc, test_bins, test_flipper], axis='columns')
Xtest_fe.shape

In [None]:
# Only transform here: the scaler was already fitted on the training features.
# Pass the plain values because Xtest_fe mixes integer and string column labels,
# which scikit-learn >= 1.2 rejects with a TypeError for DataFrame input.
Xtest_scaled = scaler.transform(Xtest_fe.to_numpy())
Xtest_scaled.shape

In [None]:
train_accuracy

In [None]:
# Score on the held-out test data, to be compared against train_accuracy.
test_accuracy = m.score(X=Xtest_scaled, y=ytest)
test_accuracy

* test == training and both are high : GOOD! (the difference should not exceed 5-10%)
* test < training : **Overfitting** (the model is too powerful, take features out)
* training is low : **Underfitting** (the model is not powerful enough, add more features and/or more data)
* test > training : strange; possible causes are a) sampling bias (a lucky draw of the test set) or b) the model is heavily biased (lots of constraints added)

In [None]:
# Coefficients of the model's first class, rounded to three decimals
# (presumably Adelie, assuming the classes are ordered alphabetically).
m.coef_[0].round(decimals=3)