# Module 5: Regime Prediction with Machine Learning - Part 2

In this part we will prepare the dataset for the recession forecasting problem. We will clean the data and perform feature engineering on the raw data to make it ready for prediction step.

&nbsp;&nbsp;1. [Data Cleaning](#1)

&nbsp;&nbsp;2. [Data Processing](#2)



In [1]:
import pandas as pd
import numpy as np

## 1. Data Cleaning <a id="1"></a>

- Remove features that have more than 10 missing values.
- Forward fill missing values 

In [2]:
def remove_variables(df,n):
    # if a variable has more than 'n' NaN values remove it.
    dropped_cols = []
    for col in df.columns:
        if df[col].isna().sum() > n:
            dropped_cols.append(col)
            df.drop(col, axis=1, inplace=True)
    return df,dropped_cols

In [3]:
df_macro = pd.read_csv('data/macro_raw.csv')[:-1] #remove last row

missing_num = 10
df_clean, dropped_cols = remove_variables(df_macro,missing_num)

df_clean.fillna(method='ffill', inplace=True)  # forward fill last month and missing values in between
df_clean.rename(columns={'sasdate': 'Date'}, inplace=True)
df_clean.head()

Unnamed: 0,Date,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,DDURRG3M086SBEA,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,MZMSL,DTCOLNVHFNM,DTCTHFNM,INVEST
0,Transform:,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
1,1/1/1959,2437.296,2288.8,17.302,292258.8329,18235.77392,22.625,23.4581,22.1904,32.4078,...,56.918,17.791,11.358,2.13,2.45,2.04,274.9,6476.0,12298.0,84.2043
2,2/1/1959,2446.902,2297.0,17.482,294429.5453,18369.56308,23.0681,23.7747,22.3827,32.6455,...,56.951,17.798,11.375,2.14,2.46,2.05,276.0,6476.0,12298.0,83.528
3,3/1/1959,2462.689,2314.0,17.647,293425.3813,18523.05762,23.4004,23.9186,22.4925,32.6455,...,57.022,17.785,11.395,2.15,2.45,2.07,277.4,6508.0,12349.0,81.6405
4,4/1/1959,2478.744,2330.3,17.584,299331.6505,18534.466,23.8989,24.2641,22.8221,33.1606,...,57.08,17.796,11.436,2.16,2.47,2.08,278.1,6620.0,12484.0,81.8099


## 2. Data Processing <a id="2"></a>

- Convert the features into stationary form by applying the necessary transformations as stated in the __[appendix]( https://s3.amazonaws.com/files.fred.stlouisfed.org/fred-md/Appendix_Tables_Update.pdf)__ by authors.

- Add 1,3,6,9,12 months lags of the features 

In [4]:
import pandas as pd
import numpy as np

class MacroDataProcess:
    # Stationarity transofrmation
    # Add lag of the features

    def __init__(self,macro_data):
        self.data =macro_data
        self.transformation_codes = None

    def transform(self, df_col, code):
        """
        Transforms each column of dataframe (df_col) according to code
        Transformations for each code are shown in appendix

        Parameters
        ----------
        df_col: pandas dataframe column

        code: int or float
        """
        if code == 1:
            df_col.apply(lambda x: x)
            return df_col
        elif code == 2:
            df_col = df_col.diff()
            return df_col
        elif code == 3:
            df_col = df_col.diff(periods=2)
            return df_col
        elif code == 4:
            df_col = df_col.apply(np.log)
            return df_col
        elif code == 5:
            df_col = df_col.apply(np.log)
            df_col = df_col.diff(periods=2)
            return df_col
        elif code == 6:
            df_col = df_col.apply(np.log)
            df_col = df_col.diff(periods=2)
            return df_col
        elif code == 7:
            df_col = df_col.pct_change()
            df_col = df_col.diff()
            return df_col

    def stationarity(self):
        """
        Clean macro dataset and perform necessary changes
        """
        # Keep transformation codes for each variable in a dictionary 
        transformation_codes = {}
        df_tmp=pd.DataFrame(columns = self.data.columns)
        for col in self.data.columns:
            df_tmp[col]= self.data[col].iloc[1:] 
            transformation_codes[col] = self.data[col].iloc[0]
        df_tmp['Date'] = pd.to_datetime(df_tmp['Date'])

        self.data=df_tmp
        self.tansformation_codes = transformation_codes
        # Make each feature stationary 
        data_transformed: DataFrame = pd.DataFrame(columns=self.data.columns)
        for col in self.data.columns:
            if col == 'Date':
                data_transformed[col] = self.data[col]
            else:
                data_transformed[col] = self.transform(self.data[col], transformation_codes[col])
        self.data = data_transformed

    def add_lag(self,lag_values):
        for col in self.data.drop(['Date'], axis=1):
            for n in lag_values:
                self.data['{} {}M lag'.format(col, n)] = self.data[col].shift(n).ffill().values
        self.data.dropna(axis=0, inplace=True)
        return self.data

In [5]:
df = MacroDataProcess(macro_data = df_clean)
df.stationarity()
lag_values = [1,3,6,9,12]
df_process= df.add_lag(lag_values)
df_process.to_csv('./data/macro_processed.csv',index=False)
print('MacroFeatures shape:', df_process.shape)
print('Start date: '+ str(df_process.Date.iloc[0])+' End date: '+str(df_process.Date.iloc[-1]))

MacroFeatures shape: (727, 709)
Start date: 1960-03-01 00:00:00 End date: 2020-09-01 00:00:00


In [6]:
df_process

Unnamed: 0,Date,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,DTCTHFNM 1M lag,DTCTHFNM 3M lag,DTCTHFNM 6M lag,DTCTHFNM 9M lag,DTCTHFNM 12M lag,INVEST 1M lag,INVEST 3M lag,INVEST 6M lag,INVEST 9M lag,INVEST 12M lag
15,1960-03-01,0.003055,0.001836,0.018399,-0.013609,0.002594,-0.017954,-0.009113,-0.004585,-0.010302,...,0.006099,0.018233,0.034111,0.034793,0.004138,-0.037926,-0.000688,-0.027018,-0.038791,-0.030921
16,1960-04-01,0.005336,0.004539,0.029454,-0.018185,0.024801,-0.016979,-0.001142,0.000000,0.008034,...,0.012437,0.012450,0.024419,0.042800,0.015011,-0.053379,-0.005654,-0.014082,-0.021701,-0.020784
17,1960-05-01,0.005829,0.006069,-0.005034,-0.021753,0.010857,-0.009104,0.007978,0.008012,0.012579,...,0.018840,0.006099,0.016845,0.041298,0.023766,-0.025753,-0.037926,-0.013266,-0.017855,-0.011197
18,1960-06-01,0.003192,0.002199,-0.020636,-0.022982,-0.016411,-0.013793,-0.005724,-0.004601,-0.001143,...,0.022225,0.012437,0.018233,0.034111,0.034793,0.002181,-0.053379,-0.000688,-0.027018,-0.038791
19,1960-07-01,0.002621,0.001656,0.001809,0.001741,-0.011463,-0.016129,-0.017175,-0.017250,-0.016038,...,0.027577,0.018840,0.012450,0.024419,0.042800,-0.004001,-0.025753,-0.005654,-0.014082,-0.021701
20,1960-08-01,0.000208,-0.000663,0.001097,-0.008128,-0.006685,-0.004642,-0.008070,-0.005783,-0.006888,...,0.031706,0.022225,0.006099,0.016845,0.041298,0.018194,0.002181,-0.037926,-0.013266,-0.017855
21,1960-09-01,-0.000350,-0.002278,0.003826,0.014828,0.002479,-0.011681,-0.010445,-0.005817,-0.002311,...,0.026114,0.027577,0.012437,0.018233,0.034111,0.026257,-0.004001,-0.053379,-0.000688,-0.027018
22,1960-10-01,0.004228,0.003437,0.009600,-0.002937,0.007831,-0.011695,-0.002319,-0.001162,0.005741,...,0.016761,0.031706,0.018840,0.012450,0.024419,0.013662,0.018194,-0.025753,-0.005654,-0.014082
23,1960-11-01,-0.001340,-0.002616,0.001363,-0.028858,-0.005629,-0.015394,-0.004677,-0.007027,-0.008137,...,0.004461,0.026114,0.022225,0.006099,0.016845,0.037561,0.026257,0.002181,-0.037926,-0.013266
24,1960-12-01,-0.008864,-0.011642,-0.015813,-0.002003,-0.021726,-0.033496,-0.024662,-0.024691,-0.025523,...,0.001635,0.016761,0.027577,0.012437,0.018233,0.040770,0.013662,-0.004001,-0.053379,-0.000688
