# Feature Engineering

In [1]:
import pandas as pd

In [2]:
# Load the indicator table from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer='data/us_indicators.csv')

In [3]:
# Parse the day/month/year strings in `date` into pandas datetime values.
df['date'] = df['date'].pipe(pd.to_datetime, format='%d/%m/%Y')

In [4]:
# Display the frame to confirm that `date` now renders as parsed dates.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease
...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight


## Feature Creation

### Interaction

In [5]:
# Interaction feature: element-wise product of `cpi` and `policy_rate`.
df['cpi_policy_rate'] = df['cpi'].mul(df['policy_rate'])

In [6]:
# Display the frame to check the newly appended `cpi_policy_rate` column.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease,1.980
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease,2.160
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease,2.160
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease,3.780
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease,4.140
...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight,-1.925
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight,-1.175
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight,0.475
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight,3.850


### Extraction

In [7]:
# Extract the calendar month number from the parsed `date` column.
df['month'] = df['date'].dt.month

In [8]:
# Display the frame to check the extracted `month` column.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,month
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease,1.980,1
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease,2.160,2
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease,2.160,3
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease,3.780,4
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease,4.140,5
...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight,-1.925,2
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight,-1.175,3
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight,0.475,4
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight,3.850,5


### Lag

In [9]:
# Lag feature: the previous row's `money_supply`. The first row has no
# predecessor, so its lag is NaN.
df['money_supply_lag'] = df['money_supply'].shift(periods=1)

In [10]:
# Display the frame to check `money_supply_lag` (NaN in the first row).
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,month,money_supply_lag
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease,1.980,1,
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease,2.160,2,147.07
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease,2.160,3,145.49
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease,3.780,4,140.02
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease,4.140,5,138.88
...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight,-1.925,2,721.62
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight,-1.175,3,718.27
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight,0.475,4,711.58
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight,3.850,5,695.69


### Change

In [11]:
# Change feature: difference between each `cpi` value and the one in the
# previous row. The first row is NaN because nothing precedes it.
df['cpi_change'] = df['cpi'].diff(periods=1)

In [12]:
# Display the frame to check `cpi_change` (NaN in the first row).
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,month,money_supply_lag,cpi_change
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease,1.980,1,,
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease,2.160,2,147.07,0.12
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease,2.160,3,145.49,0.00
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease,3.780,4,140.02,1.08
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease,4.140,5,138.88,0.24
...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight,-1.925,2,721.62,0.34
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight,-1.175,3,718.27,0.30
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight,0.475,4,711.58,0.66
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight,3.850,5,695.69,1.35


In [13]:
# Relative change of `cpi` versus the previous row: (current - previous) / previous.
# NOTE(review): `cpi` takes negative and near-zero values in this data, so the
# ratio can flip sign or blow up (e.g. -0.77 -> -0.47 is a rise, yet the rate is
# about -0.39). Confirm this feature is meaningful before using it.
df['cpi_change_rate'] = df['cpi'].pct_change(periods=1)

In [14]:
# Display the frame to compare `cpi_change_rate` with `cpi_change`.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,month,money_supply_lag,cpi_change,cpi_change_rate
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease,1.980,1,,,
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease,2.160,2,147.07,0.12,0.090909
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease,2.160,3,145.49,0.00,0.000000
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease,3.780,4,140.02,1.08,0.750000
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease,4.140,5,138.88,0.24,0.095238
...,...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight,-1.925,2,721.62,0.34,-0.306306
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight,-1.175,3,718.27,0.30,-0.389610
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight,0.475,4,711.58,0.66,-1.404255
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight,3.850,5,695.69,1.35,7.105263


### Rolling

In [15]:
# Rolling statistics of `cpi` over a 10-row window. Rows that do not yet have a
# full window behind them come out as NaN.
cpi_window = df['cpi'].rolling(window=10)
df['cpi_rolling_mean'] = cpi_window.mean()
df['cpi_rolling_std'] = cpi_window.std()

In [16]:
# Show the first 20 rows: the rolling columns stay NaN until a full 10-row
# window is available.
df.head(20)

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,month,money_supply_lag,cpi_change,cpi_change_rate,cpi_rolling_mean,cpi_rolling_std
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease,1.98,1,,,,,
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease,2.16,2,147.07,0.12,0.090909,,
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease,2.16,3,145.49,0.0,0.0,,
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease,3.78,4,140.02,1.08,0.75,,
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease,4.14,5,138.88,0.24,0.095238,,
5,2001-06-30,2.15,2.5,88.3,139.53,inrange,tight,5.375,6,139.49,-0.61,-0.221014,,
6,2001-07-31,2.03,2.5,87.81,138.87,inrange,tight,5.075,7,139.53,-0.12,-0.055814,,
7,2001-08-31,1.43,2.5,87.32,144.93,inrange,tight,3.575,8,138.87,-0.6,-0.295567,,
8,2001-09-30,1.3,2.5,88.12,144.44,inrange,tight,3.25,9,144.93,-0.13,-0.090909,,
9,2001-10-31,1.32,2.5,88.26,144.56,inrange,tight,3.3,10,144.44,0.02,0.015385,1.771,0.54913


## Encoding

### Label Encoding

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
# Learn the distinct `inflation_target` labels, then overwrite the column with
# their integer codes. The fitted encoder is kept so the codes can be mapped
# back to the original labels later.
label_encoder_inflation = LabelEncoder().fit(df['inflation_target'])
df['inflation_target'] = label_encoder_inflation.transform(df['inflation_target'])

In [19]:
# Learn the distinct `type_of_monetary_policy` labels, then overwrite the column
# with their integer codes. The fitted encoder is kept so the codes can be
# mapped back to the original labels later.
label_encoder_policy = LabelEncoder().fit(df['type_of_monetary_policy'])
df['type_of_monetary_policy'] = label_encoder_policy.transform(df['type_of_monetary_policy'])

In [20]:
# Display the frame to check that both categorical columns now hold integer codes.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,month,money_supply_lag,cpi_change,cpi_change_rate,cpi_rolling_mean,cpi_rolling_std
0,2001-01-31,1.32,1.5,89.24,147.07,0,0,1.980,1,,,,,
1,2001-02-28,1.44,1.5,90.47,145.49,0,0,2.160,2,147.07,0.12,0.090909,,
2,2001-03-31,1.44,1.5,89.33,140.02,0,0,2.160,3,145.49,0.00,0.000000,,
3,2001-04-30,2.52,1.5,87.32,138.88,0,0,3.780,4,140.02,1.08,0.750000,,
4,2001-05-31,2.76,1.5,87.16,139.49,0,0,4.140,5,138.88,0.24,0.095238,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,1,1,-1.925,2,721.62,0.34,-0.306306,-0.114,0.668019
278,2024-03-31,-0.47,2.5,118.11,711.58,1,1,-1.175,3,718.27,0.30,-0.389610,-0.214,0.634931
279,2024-04-30,0.19,2.5,116.80,695.69,1,1,0.475,4,711.58,0.66,-1.404255,-0.218,0.631942
280,2024-05-31,1.54,2.5,117.21,702.68,0,1,3.850,5,695.69,1.35,7.105263,-0.102,0.829495


### One-hot Encoding

In [21]:
# One-hot encode `month`: the column is replaced by one indicator column per
# month value, named with the `m` prefix (m_1, m_2, ...).
# Note: this cell consumes `month`, so it cannot be re-run without re-creating it.
df = pd.get_dummies(df, columns=['month'], prefix={'month': 'm'})

In [22]:
# Display the frame after `month` was replaced by the `m_*` indicator columns.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,money_supply_lag,cpi_change,...,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,2001-01-31,1.32,1.5,89.24,147.07,0,0,1.980,,,...,False,False,False,False,False,False,False,False,False,False
1,2001-02-28,1.44,1.5,90.47,145.49,0,0,2.160,147.07,0.12,...,False,False,False,False,False,False,False,False,False,False
2,2001-03-31,1.44,1.5,89.33,140.02,0,0,2.160,145.49,0.00,...,True,False,False,False,False,False,False,False,False,False
3,2001-04-30,2.52,1.5,87.32,138.88,0,0,3.780,140.02,1.08,...,False,True,False,False,False,False,False,False,False,False
4,2001-05-31,2.76,1.5,87.16,139.49,0,0,4.140,138.88,0.24,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,1,1,-1.925,721.62,0.34,...,False,False,False,False,False,False,False,False,False,False
278,2024-03-31,-0.47,2.5,118.11,711.58,1,1,-1.175,718.27,0.30,...,True,False,False,False,False,False,False,False,False,False
279,2024-04-30,0.19,2.5,116.80,695.69,1,1,0.475,711.58,0.66,...,False,True,False,False,False,False,False,False,False,False
280,2024-05-31,1.54,2.5,117.21,702.68,0,1,3.850,695.69,1.35,...,False,False,True,False,False,False,False,False,False,False


In [23]:
# Show `date` next to the month indicator columns only.
# Select by prefix rather than by substring so that a column which merely
# contains "m_" somewhere in its name is never picked up by accident.
df[['date'] + [col for col in df.columns if col.startswith('m_')]]

Unnamed: 0,date,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,2001-01-31,True,False,False,False,False,False,False,False,False,False,False,False
1,2001-02-28,False,True,False,False,False,False,False,False,False,False,False,False
2,2001-03-31,False,False,True,False,False,False,False,False,False,False,False,False
3,2001-04-30,False,False,False,True,False,False,False,False,False,False,False,False
4,2001-05-31,False,False,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,False,True,False,False,False,False,False,False,False,False,False,False
278,2024-03-31,False,False,True,False,False,False,False,False,False,False,False,False
279,2024-04-30,False,False,False,True,False,False,False,False,False,False,False,False
280,2024-05-31,False,False,False,False,True,False,False,False,False,False,False,False


## Transformation

### Normalization

In [24]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
# Summary statistics of `neer` before scaling, for comparison with the scaled column.
df['neer'].describe()

count    282.000000
mean     103.934220
std       11.502054
min       85.840000
25%       94.317500
50%      102.730000
75%      114.322500
max      126.920000
Name: neer, dtype: float64

In [26]:
# Fit a min-max scaler on `neer`, then overwrite the column with the rescaled
# values. The fitted scaler is kept so the transform can be reused or inverted.
neer_scaler = MinMaxScaler().fit(df[['neer']])
df['neer'] = neer_scaler.transform(df[['neer']])

In [27]:
# Display the frame to check the rescaled `neer` column.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,money_supply_lag,cpi_change,...,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,2001-01-31,1.32,1.5,0.082765,147.07,0,0,1.980,,,...,False,False,False,False,False,False,False,False,False,False
1,2001-02-28,1.44,1.5,0.112707,145.49,0,0,2.160,147.07,0.12,...,False,False,False,False,False,False,False,False,False,False
2,2001-03-31,1.44,1.5,0.084956,140.02,0,0,2.160,145.49,0.00,...,True,False,False,False,False,False,False,False,False,False
3,2001-04-30,2.52,1.5,0.036027,138.88,0,0,3.780,140.02,1.08,...,False,True,False,False,False,False,False,False,False,False
4,2001-05-31,2.76,1.5,0.032132,139.49,0,0,4.140,138.88,0.24,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,0.795278,718.27,1,1,-1.925,721.62,0.34,...,False,False,False,False,False,False,False,False,False,False
278,2024-03-31,-0.47,2.5,0.785540,711.58,1,1,-1.175,718.27,0.30,...,True,False,False,False,False,False,False,False,False,False
279,2024-04-30,0.19,2.5,0.753651,695.69,1,1,0.475,711.58,0.66,...,False,True,False,False,False,False,False,False,False,False
280,2024-05-31,1.54,2.5,0.763632,702.68,0,1,3.850,695.69,1.35,...,False,False,True,False,False,False,False,False,False,False


In [28]:
# Summary statistics of `neer` after scaling: min and max should now be 0 and 1.
df['neer'].describe()

count    282.000000
mean       0.440463
std        0.279992
min        0.000000
25%        0.206366
50%        0.411149
75%        0.693342
max        1.000000
Name: neer, dtype: float64

### Standardization

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
# Summary statistics of `money_supply` before standardization, for comparison.
df['money_supply'].describe()

count    282.000000
mean     432.494929
std      202.217172
min      138.870000
25%      237.987500
50%      470.675000
75%      613.550000
max      765.730000
Name: money_supply, dtype: float64

In [31]:
# Fit a standard scaler on `money_supply`, then overwrite the column with its
# z-scores. The fitted scaler is kept so the transform can be reused or inverted.
# NOTE(review): `money_supply_lag` was derived earlier from the raw series and is
# not rescaled here, so the two columns end up in different units.
money_supply_scaler = StandardScaler().fit(df[['money_supply']])
df['money_supply'] = money_supply_scaler.transform(df[['money_supply']])

In [32]:
# Display the frame to check the standardized `money_supply` column.
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,money_supply_lag,cpi_change,...,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,2001-01-31,1.32,1.5,0.082765,-1.413986,0,0,1.980,,,...,False,False,False,False,False,False,False,False,False,False
1,2001-02-28,1.44,1.5,0.112707,-1.421814,0,0,2.160,147.07,0.12,...,False,False,False,False,False,False,False,False,False,False
2,2001-03-31,1.44,1.5,0.084956,-1.448912,0,0,2.160,145.49,0.00,...,True,False,False,False,False,False,False,False,False,False
3,2001-04-30,2.52,1.5,0.036027,-1.454560,0,0,3.780,140.02,1.08,...,False,True,False,False,False,False,False,False,False,False
4,2001-05-31,2.76,1.5,0.032132,-1.451538,0,0,4.140,138.88,0.24,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,0.795278,1.415721,1,1,-1.925,721.62,0.34,...,False,False,False,False,False,False,False,False,False,False
278,2024-03-31,-0.47,2.5,0.785540,1.382579,1,1,-1.175,718.27,0.30,...,True,False,False,False,False,False,False,False,False,False
279,2024-04-30,0.19,2.5,0.753651,1.303860,1,1,0.475,711.58,0.66,...,False,True,False,False,False,False,False,False,False,False
280,2024-05-31,1.54,2.5,0.763632,1.338489,0,1,3.850,695.69,1.35,...,False,False,True,False,False,False,False,False,False,False


In [33]:
# Summary statistics after standardization: the mean should be ~0 and the std ~1.
# describe() reports the sample standard deviation (ddof=1), so it comes out
# slightly above 1.
df['money_supply'].describe()

count    2.820000e+02
mean    -1.007862e-16
std      1.001778e+00
min     -1.454609e+00
25%     -9.635839e-01
50%      1.891429e-01
75%      8.969414e-01
max      1.650837e+00
Name: money_supply, dtype: float64

## Dimensionality Reduction

In [34]:
from sklearn.decomposition import PCA

In [35]:
# Build the PCA input table.
# - `date` is removed because PCA only accepts numeric input.
# - The lag, change and rolling columns are removed because their leading rows
#   are NaN, which PCA does not accept.
features = df.drop(
    ['date', 'money_supply_lag', 'cpi_change', 'cpi_change_rate', 'cpi_rolling_mean', 'cpi_rolling_std'],
    axis=1,
)

In [36]:
# Inspect the feature table that will be passed to PCA.
features

Unnamed: 0,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy,cpi_policy_rate,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,1.32,1.5,0.082765,-1.413986,0,0,1.980,True,False,False,False,False,False,False,False,False,False,False,False
1,1.44,1.5,0.112707,-1.421814,0,0,2.160,False,True,False,False,False,False,False,False,False,False,False,False
2,1.44,1.5,0.084956,-1.448912,0,0,2.160,False,False,True,False,False,False,False,False,False,False,False,False
3,2.52,1.5,0.036027,-1.454560,0,0,3.780,False,False,False,True,False,False,False,False,False,False,False,False
4,2.76,1.5,0.032132,-1.451538,0,0,4.140,False,False,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,-0.77,2.5,0.795278,1.415721,1,1,-1.925,False,True,False,False,False,False,False,False,False,False,False,False
278,-0.47,2.5,0.785540,1.382579,1,1,-1.175,False,False,True,False,False,False,False,False,False,False,False,False
279,0.19,2.5,0.753651,1.303860,1,1,0.475,False,False,False,True,False,False,False,False,False,False,False,False
280,1.54,2.5,0.763632,1.338489,0,1,3.850,False,False,False,False,True,False,False,False,False,False,False,False


In [37]:
# Fit a 2-component PCA on the feature table and project every row onto it.
# NOTE(review): the inputs are on mixed scales (raw cpi / policy_rate /
# cpi_policy_rate next to min-max-scaled neer, standardized money_supply and
# True/False month indicators). PCA is variance-driven, so the wide-range
# columns will dominate the components; consider standardizing all inputs
# first, and confirm the intent.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(features)

In [38]:
# Inspect the raw PCA scores: one row per observation, one column per component.
pca_data

array([[-3.03893716e+00, -1.30593762e-01],
       [-2.83608544e+00, -5.50576112e-02],
       [-2.83395669e+00, -6.28266350e-02],
       [-1.00383584e+00,  6.00403020e-01],
       [-5.97294441e-01,  7.48839824e-01],
       [ 5.78805442e-01, -4.90444042e-01],
       [ 2.61395477e-01, -5.45898286e-01],
       [-1.33042873e+00, -8.04692301e-01],
       [-1.67515074e+00, -8.64017937e-01],
       [-1.62225918e+00, -8.54426701e-01],
       [-2.57284053e+00, -1.01868077e+00],
       [-3.41026659e+00, -1.00176408e+00],
       [-3.35062997e+00, -8.16384618e-01],
       [-4.39425733e+00, -1.06464959e+00],
       [-3.87335472e+00, -9.36815696e-01],
       [-4.15550872e+00, -1.00329545e+00],
       [-4.65544883e+00, -1.12168344e+00],
       [-4.39525995e+00, -1.05730560e+00],
       [-4.65547216e+00, -1.12289153e+00],
       [-4.39506763e+00, -1.05906659e+00],
       [-4.15310545e+00, -1.01477929e+00],
       [-1.85657205e+00, -4.40179896e-01],
       [-2.73403765e+00, -2.85727878e-01],
       [-2.

In [39]:
# Wrap the PCA scores in a DataFrame that carries the same row index as
# `features`. The scores are later joined back to `df` by index, so tying them
# to the source rows explicitly keeps that join correct even if rows are ever
# filtered out before PCA. Columns keep the default integer labels 0 and 1.
pca_df = pd.DataFrame(data=pca_data, index=features.index)

In [40]:
# Inspect the PCA scores as a DataFrame; the component columns are labelled 0 and 1.
pca_df

Unnamed: 0,0,1
0,-3.038937,-0.130594
1,-2.836085,-0.055058
2,-2.833957,-0.062827
3,-1.003836,0.600403
4,-0.597294,0.748840
...,...,...
277,-7.328408,-1.123824
278,-6.531222,-0.999705
279,-4.776793,-0.726224
280,-1.206550,-0.108411


In [41]:
# Put the observation dates side by side with the two principal-component
# columns (rows are matched on the index), then display the result.
features_df = pd.concat(objs=[df.loc[:, ['date']], pca_df], axis='columns')
features_df

Unnamed: 0,date,0,1
0,2001-01-31,-3.038937,-0.130594
1,2001-02-28,-2.836085,-0.055058
2,2001-03-31,-2.833957,-0.062827
3,2001-04-30,-1.003836,0.600403
4,2001-05-31,-0.597294,0.748840
...,...,...,...
277,2024-02-29,-7.328408,-1.123824
278,2024-03-31,-6.531222,-0.999705
279,2024-04-30,-4.776793,-0.726224
280,2024-05-31,-1.206550,-0.108411
