## Chapter 5: Basic Feature Engineering

In [1]:
import pandas as pd
import numpy as np

#### Date Time Features

In [2]:
# Load the daily minimum temperature series. parse_dates converts the 'Date'
# column to datetimes at read time, which the .dt accessor needs further down.
csv_path = 'data/daily-minimum-temperatures.csv'
df = pd.read_csv(csv_path, parse_dates=['Date'])

In [3]:
# Bare expression as the last line of the cell: the notebook renders the frame.
df

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8
...,...,...
3645,1990-12-27,14.0
3646,1990-12-28,13.6
3647,1990-12-29,13.5
3648,1990-12-30,15.7


In [4]:
# Sanity check: look at the element type of the parsed 'Date' column.
first_date = df['Date'].values[0]
type(first_date)

numpy.datetime64

In [5]:
# Derive calendar features from the parsed dates through the .dt accessor.
# The new columns are added to df itself.
date_parts = df['Date'].dt
df['month'] = date_parts.month
df['day'] = date_parts.day
df.head()

Unnamed: 0,Date,Temp,month,day
0,1981-01-01,20.7,1,1
1,1981-01-02,17.9,1,2
2,1981-01-03,18.8,1,3
3,1981-01-04,14.6,1,4
4,1981-01-05,15.8,1,5


#### Lag Features - lag 1

In [6]:
# Lag-1 frame: the previous observation (series shifted down one row) sits next
# to the current one, so each row pairs the value at t with the value at t+1.
# The first row has no predecessor and therefore holds NaN.
temps = df['Temp']
df2 = pd.DataFrame({'Temp at t': temps.shift(1), 'Temp at t+1': temps})
df2.head()

Unnamed: 0,Temp at t,Temp at t+1
0,,20.7
1,20.7,17.9
2,17.9,18.8
3,18.8,14.6
4,14.6,15.8


#### Lag Features - lag 3

In [7]:
# Lag-3 frame: the three previous observations as inputs plus the current value
# as the target. Shifting by 3, 2 and 1 rows yields the t-2, t-1 and t columns;
# the unshifted series is the t+1 column.
temps = df['Temp']
lagged = [temps.shift(k) for k in (3, 2, 1)]
df3 = pd.concat(lagged + [temps], axis=1)
df3.columns = ['t-2', 't-1', 't', 't+1']
df3.head()

Unnamed: 0,t-2,t-1,t,t+1
0,,,,20.7
1,,,20.7,17.9
2,,20.7,17.9,18.8
3,20.7,17.9,18.8,14.6
4,17.9,18.8,14.6,15.8


#### Rolling Mean Features / Sliding Window Features

In [14]:
# Rolling mean over the two most recent past observations. Shifting by one row
# first keeps the current value (the target, t+1) out of its own window.
previous = df['Temp'].shift(1)
pair_mean = previous.rolling(window=2).mean()
df4 = pd.concat([pair_mean, df['Temp']], axis=1)
df4.columns = ['mean(t-1,t)', 't+1']
df4.head()

Unnamed: 0,"mean(t-1,t)",t+1
0,,20.7
1,,17.9
2,19.3,18.8
3,18.35,14.6
4,16.7,15.8


#### Rolling Stats Features

In [17]:
# Window statistics (min / mean / max) over three past observations, with the
# unshifted series as the target column.
# NOTE(review): with a shift of width - 1 = 2 rows the window stops at t-1, so
# the observation at t is not part of the statistics -- confirm this gap is
# intended (a shift of 1 would make the window end at t).
width = 3
window = df['Temp'].shift(width - 1).rolling(window=width)
df5 = pd.concat([window.min(), window.mean(), window.max(), df['Temp']], axis=1)
df5.columns = ['min', 'mean', 'max', 't+1']
df5.head()

Unnamed: 0,min,mean,max,t+1
0,,,,20.7
1,,,,17.9
2,,,,18.8
3,,,,14.6
4,17.9,19.133333,20.7,15.8


#### Expanding Window Features

In [22]:
# Expanding-window statistics: each row summarises every observation from the
# start of the series up to and including that row. The target is the next
# observation, obtained by shifting the series up one row.
temps = df['Temp']
history = temps.expanding()
summary = [history.min(), history.mean(), history.max()]
df6 = pd.concat(summary + [temps.shift(-1)], axis=1)
df6.columns = ['min', 'mean', 'max','t+1']
df6.head()

Unnamed: 0,min,mean,max,t+1
0,20.7,20.7,20.7,17.9
1,17.9,19.3,20.7,18.8
2,17.9,19.133333,20.7,14.6
3,14.6,18.0,20.7,15.8
4,14.6,17.56,20.7,15.8
