# Importing the dataset

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.datasets import make_classification
from sklearn import svm
from sklearn.decomposition import PCA
import statsmodels.api as sm

# Load the raw macro data and discard two columns in the same step:
# "Regime" stores the recession indicator as a string, and "Unnamed: 0" is
# presumably a row index written out when the CSV was saved (TODO confirm).
df = pd.read_csv('./Macrodata_raw.csv').drop(columns=["Regime", "Unnamed: 0"])

FileNotFoundError: [Errno 2] No such file or directory: './Macrodata_raw.csv'

In [None]:
# Move the dependent variable to the left-most position of the table:
# pop() removes the column from df, insert() puts it back at position 0.
target_name = "Regime in 0 = Normal & 1 = Recession"
first_column = df.pop(target_name)
df.insert(loc=0, column=target_name, value=first_column)

In [None]:
# Predictor table used for the collinearity checks: every column of df
# except the dependent variable.
variables_tested = df.drop(columns=["Regime in 0 = Normal & 1 = Recession"])
variables_tested.head()

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor (StandardScaler defaults: zero mean, unit
# variance per column).
# The scaler returns a bare NumPy array, so the column labels and the row index
# of `variables_tested` are re-attached explicitly. Without them the scaled
# table is labelled 0..n-1, which makes the VIF table and the pairplot
# unreadable and detaches the rows from the index of the target series.
sample = pd.DataFrame(
    StandardScaler().fit_transform(variables_tested),
    columns=variables_tested.columns,
    index=variables_tested.index,
)
sample.head()

# Method 1 : VIF check

In [None]:
# Positions (within the scaled predictor table) of the columns that are
# removed because they create a bias in the VIF.
vif_excluded_positions = [
    32, 33, 18, 22, 6, 8, 1, 7, 11, 26, 27, 36, 38, 19, 20, 5, 0, 10, 13,
    12, 4, 2, 9, 24, 14, 25, 35, 16, 17, 23, 28, 29, 30, 31, 37,
]
# Translate the positions to labels through the column index instead of
# passing a sliced DataFrame as the labels: same columns are dropped, but the
# intent is explicit and no intermediate copy of the data is built.
# NOTE: the drop is in place and positional, so this cell must be run exactly
# once after the scaling cell.
sample.drop(columns=sample.columns[vif_excluded_positions], inplace=True)

In [None]:
# Design matrix for the VIF check: the remaining scaled predictors plus a
# constant column.
M_Output = add_constant(sample)

# One variance inflation factor per column of the design matrix (the constant
# included), keyed by column label.
n_design_columns = len(M_Output.columns)
vif_values = [
    variance_inflation_factor(M_Output, position)
    for position in range(n_design_columns)
]
VIF_list = pd.Series(vif_values, index=M_Output.columns)

# Display the factors from largest to smallest.
VIF_list.sort_values(ascending=False)

In [None]:
# Grid of pairwise scatter plots (with per-variable distributions on the
# diagonal) for every column of the design matrix.
sns.pairplot(data=M_Output)
plt.show()
# Release the current figure once it has been displayed.
plt.close()

In [None]:
# Method 1 inputs: the VIF-filtered design matrix and the recession label.
y = df["Regime in 0 = Normal & 1 = Recession"]
X = M_Output

# Train/Test split 

In [None]:
from sklearn.model_selection import train_test_split

# 60 % of the rows go to the training set; the split is stratified on the
# label and seeded so that it is reproducible.
split_parts = train_test_split(
    X,
    y,
    train_size=.6,
    stratify=y,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

# DataFrame.size is the number of cells (rows x columns), not the row count.
for feature_table in (X_train, X_test):
    print(feature_table.size)

In [None]:
# Row counts of the two partitions.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print('Observations in train data:', n_train_rows)
print('Observations in test data:', n_test_rows)

In [None]:
# Table used for the multiple logistic regression: training predictors with
# the training label appended as the last column.
Data_for_MLR = pd.concat((X_train, y_train), axis="columns")
Data_for_MLR

In [None]:
# Readable labels for the regression table, assigned purely by position.
# NOTE(review): the order must match the columns that survived the VIF
# filter (constant first, label last) - confirm if the dropped positions
# ever change.
Data_for_MLR.columns = [
    'constant',
    'ISRATIO',
    'DTCTHFNM',
    'EXUSUK',
    "SP500",
    "NASDAQ",
    "GOLDBAR",
    "P/E",
    "Dividend Yield",
    "Recession_Index",
]

In [None]:
# Separate the label from the predictors of the training table.
ytrain = Data_for_MLR["Recession_Index"]
Xtrain = Data_for_MLR.drop(columns="Recession_Index")

# Multiple logistic regression

In [None]:
# Fit the logistic regression of the recession label on the VIF-selected
# predictors, then print the full summary followed by the coefficients.
logit_model = sm.Logit(ytrain, Xtrain)
log_reg = logit_model.fit()
print(log_reg.summary())
print(log_reg.params)

# Odds ratio

In [None]:
# Exponentiated coefficients: the odds ratio associated with each regressor.
fitted_coefficients = log_reg.params
ExpBeta = np.exp(fitted_coefficients)
ExpBeta

# Method 2 : PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Number of principal components kept for each group of indicators.
n_pca_components = 8

# The predictors are split into two groups; each group is reduced separately.
non_financial_columns = [
    "UNRATE", "USGOOD", "USTPU", "PAYEMS", "RPI", "INDPRO", "HOUST", "PERMIT",
    "DPCERA3M086SBEA", "AMTMNO", "AMTMTI", "AMDMUO", "ACOGNO", "BUSINV",
    "ISRATIO",
]
financial_columns = [
    "M1SL", "M2SL", "TOTRESNS", "BUSLOANS", "REALLN", "DTCTHFNM", "FEDFUNDS",
    "TB6MS", "GS5", "GS10", "EXSZUS", "EXCAUS", "EXUSUK", "WPSFD49207",
    "WPSID61", "CPIAUCSL", "SP500", "NASDAQ", "GOLDBAR", "P/E",
    "Dividend Yield",
]
Non_Financial_idx = variables_tested[non_financial_columns]
Financial_idx = variables_tested[financial_columns]

# PCA centres its input but does not rescale it, so each group is standardised
# first; otherwise the components are driven by whichever series has the
# largest numeric magnitude rather than by shared variation.
# The component tables get explicit, distinct column names and keep the
# original row index, so concatenating the two tables does not produce
# duplicate column labels and the rows stay aligned with the target series.
pca = PCA(n_components=n_pca_components)
PCA_NFI = pd.DataFrame(
    pca.fit_transform(StandardScaler().fit_transform(Non_Financial_idx)),
    columns=["NFI_PC{}".format(k) for k in range(1, n_pca_components + 1)],
    index=Non_Financial_idx.index,
)

# A fresh PCA object is used for the financial group; after this cell `pca`
# refers to the financial-group fit.
pca = PCA(n_components=n_pca_components)
PCA_FI = pd.DataFrame(
    pca.fit_transform(StandardScaler().fit_transform(Financial_idx)),
    columns=["FI_PC{}".format(k) for k in range(1, n_pca_components + 1)],
    index=Financial_idx.index,
)

In [None]:
from sklearn.model_selection import train_test_split

# Method 2 design matrix: financial and non-financial principal components
# side by side, plus an explicit constant column. statsmodels' Logit does not
# add an intercept on its own, and without one the fitted model is forced to
# predict a probability of 0.5 when all components are zero, whereas the VIF
# model above is fitted with a constant.
X2 = add_constant(pd.concat([PCA_FI, PCA_NFI], axis=1))
y2 = df["Regime in 0 = Normal & 1 = Recession"]

# Same split settings as Method 1 (60 % train, stratified, seed 0) so the two
# models are evaluated on comparable partitions.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, random_state=0, train_size=.6, stratify=y2
)

# Training table for the second regression (replaces the Method 1 table).
Data_for_MLR = pd.concat([X2_train, y2_train], axis=1)

In [None]:
import statsmodels.api as sm

# Split the PCA training table into label and regressors, then fit the
# second logistic regression.
label_column = "Regime in 0 = Normal & 1 = Recession"
y2train = Data_for_MLR[label_column]
X2train = Data_for_MLR.drop(columns=label_column)
log_reg2 = sm.Logit(y2train, X2train).fit()

In [None]:
# Summary of the PCA-based model, followed by its exponentiated coefficients
# (odds ratios).
print(log_reg2.summary())
pca_coefficients = log_reg2.params
ExpBeta2 = np.exp(pca_coefficients)
ExpBeta2

# ROC curves

In [None]:
# `metrics` was never imported in this notebook, so roc_curve is imported
# here directly.
from sklearn.metrics import roc_curve

# Predicted recession probabilities on the held-out rows of each method, and
# the corresponding ROC points (the thresholds returned by roc_curve are not
# needed for the plot).
Y_pred_prob = log_reg.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, Y_pred_prob)
Y2_pred_prob = log_reg2.predict(X2_test)
fpr2, tpr2, _ = roc_curve(y2_test, Y2_pred_prob)

# Draw both ROC curves on the same axes for a direct comparison.
plt.plot(fpr, tpr, 'r', label="VIF method")
plt.plot(fpr2, tpr2, 'b', label="PCA method")
plt.legend(loc="lower right")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
