# Libraries

In [119]:
import fbprophet
from fbprophet import Prophet
from fbprophet.plot import add_changepoints_to_plot
from scipy.stats import boxcox
import pandas as pd
import numpy as np
from datetime import date
from datetime import time
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy import stats
from scipy.stats import ttest_1samp
import math
import plotly 
import plotly.graph_objs as go
import plotly.express as px
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# Preparing the Data for ML

In [13]:
# Economic estimators

In [2]:
yc_cci = pd.read_csv('/Users/vladimirautier/Documents/GitHub/Week9/Project-Week-9-Final-Project/your-project/Final Project/Modified:Cleaned Data/yield_curve_cci_mre.csv')

In [3]:
yc_cci.head()

Unnamed: 0,Date,10 Year Treasury Yield,3 Month Treasury Yield,3 Month Treasury Yield (Bond Equivalent Basis),Spread,Rec_prob,NBER_Rec,Year,CCI Value
0,1970-12-31,6.39,4.87,4.99918,1.39082,0.399079,0.0,1970,98.74387
1,1971-01-31,6.24,4.44,4.552764,1.687236,0.37792,0.0,1971,98.92701
2,1971-02-28,6.11,3.7,3.786806,2.323194,0.324102,0.0,1971,99.06342
3,1971-03-31,5.7,3.38,3.456476,2.243524,0.247934,0.0,1971,99.16283
4,1971-04-30,5.83,3.86,3.952173,1.877827,0.167723,0.0,1971,99.24342


In [4]:
# Using the Spread as the main Value

In [5]:
yc_cci.drop(['10 Year Treasury Yield','3 Month Treasury Yield','3 Month Treasury Yield (Bond Equivalent Basis)','Rec_prob','Year'],axis=1,inplace=True)

In [6]:
yc_cci.isna().sum()

Date         0
Spread       0
NBER_Rec     0
CCI Value    0
dtype: int64

In [7]:
yc_cci.head()

Unnamed: 0,Date,Spread,NBER_Rec,CCI Value
0,1970-12-31,1.39082,0.0,98.74387
1,1971-01-31,1.687236,0.0,98.92701
2,1971-02-28,2.323194,0.0,99.06342
3,1971-03-31,2.243524,0.0,99.16283
4,1971-04-30,1.877827,0.0,99.24342


In [8]:
# Ordering columns

yc_cci = yc_cci[['Date', 'CCI Value', 'Spread', 'NBER_Rec']]

In [9]:
# Changing the types

yc_cci['Date'] = pd.to_datetime(yc_cci['Date'])

In [10]:
yc_cci.head()

Unnamed: 0,Date,CCI Value,Spread,NBER_Rec
0,1970-12-31,98.74387,1.39082,0.0
1,1971-01-31,98.92701,1.687236,0.0
2,1971-02-28,99.06342,2.323194,0.0
3,1971-03-31,99.16283,2.243524,0.0
4,1971-04-30,99.24342,1.877827,0.0


In [11]:
# Month resampling

yc_cci = yc_cci.resample('M', on='Date').mean()

In [12]:
yc_cci.head()

Unnamed: 0_level_0,CCI Value,Spread,NBER_Rec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1970-12-31,98.74387,1.39082,0.0
1971-01-31,98.92701,1.687236,0.0
1971-02-28,99.06342,2.323194,0.0
1971-03-31,99.16283,2.243524,0.0
1971-04-30,99.24342,1.877827,0.0


In [29]:
# Copper price

In [30]:
copper = pd.read_csv('/Users/vladimirautier/Documents/GitHub/Week9/Project-Week-9-Final-Project/your-project/Final Project/Modified:Cleaned Data/copper_ready_ML.csv')

In [31]:
copper.head()

Unnamed: 0,Date,Value of Copper in Dollars per Tonne,ds,y
0,1959-07-02,645.772,1959-07-02,645.772
1,1959-07-06,641.364,1959-07-06,641.364
2,1959-07-07,621.528,1959-07-07,621.528
3,1959-07-08,627.2584,1959-07-08,627.2584
4,1959-07-09,608.304,1959-07-09,608.304


In [32]:
#Cleaning the unwanted columns

copper.drop(['Value of Copper in Dollars per Tonne','ds'],axis=1,inplace=True)

In [33]:
copper.head()

Unnamed: 0,Date,y
0,1959-07-02,645.772
1,1959-07-06,641.364
2,1959-07-07,621.528
3,1959-07-08,627.2584
4,1959-07-09,608.304


In [34]:
#Renaming column

copper = copper.rename({'y':'Value of Copper'}, axis=1)

In [35]:
copper.head()

Unnamed: 0,Date,Value of Copper
0,1959-07-02,645.772
1,1959-07-06,641.364
2,1959-07-07,621.528
3,1959-07-08,627.2584
4,1959-07-09,608.304


In [38]:
copper['Date'] = pd.to_datetime(copper['Date']) 

In [39]:
copper = copper.reset_index()

In [40]:
# Month resampling

copper = copper.resample('M', on='Date').mean()

In [51]:
#Merging cleaned DF

original_values = pd.merge(yc_cci,copper,left_index=True, right_index=True)

In [52]:
original_values.head()

Unnamed: 0_level_0,CCI Value,Spread,NBER_Rec,level_0,index,Value of Copper
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970-12-31,98.74387,1.39082,0.0,2851.5,2851.5,1029.989309
1971-01-31,98.92701,1.687236,0.0,2872.5,2872.5,1000.5609
1971-02-28,99.06342,2.323194,0.0,2891.5,2891.5,1034.4474
1971-03-31,99.16283,2.243524,0.0,2912.0,2912.0,1160.549739
1971-04-30,99.24342,1.877827,0.0,2934.0,2934.0,1240.684076


In [53]:
# Dropping  unwanted columns

original_values.drop(['level_0','index'],axis=1,inplace=True)

In [54]:
# Ordering columns

original_values = original_values[['CCI Value','Spread','Value of Copper','NBER_Rec']]

In [55]:
original_values.head()

Unnamed: 0_level_0,CCI Value,Spread,Value of Copper,NBER_Rec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1970-12-31,98.74387,1.39082,1029.989309,0.0
1971-01-31,98.92701,1.687236,1000.5609,0.0
1971-02-28,99.06342,2.323194,1034.4474,0.0
1971-03-31,99.16283,2.243524,1160.549739,0.0
1971-04-30,99.24342,1.877827,1240.684076,0.0


# Using Random Forest Algorithm

In [62]:
#Splitting the data

yt = original_values.NBER_Rec
Xt= original_values.drop('NBER_Rec', axis=1)

In [63]:
train_Xt, test_Xt, train_yt, test_yt = train_test_split(Xt, yt, test_size=0.2, stratify=yt)

In [64]:
#RANDOM FOREST ALGO

In [65]:
model_rf = RandomForestClassifier()

In [66]:
model_rf.fit(train_Xt,train_yt)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [67]:
rf_predictions = model_rf.predict(test_Xt) # Random forest predictions

In [68]:
rf_probs = model_rf.predict_proba(test_Xt)[:, 1]

In [69]:
# Anlysing the efficiency of the algorithm

In [70]:
#AUC-SCORE

In [71]:
roc_value = roc_auc_score(test_yt, rf_probs)

In [72]:
roc_value

0.9041895604395604

In [73]:
#Confusion Matrix & Accuracy

In [78]:
y_pred_t = model_rf.predict(test_Xt)

print(confusion_matrix(test_yt, y_pred_t))
print(accuracy_score(test_yt, y_pred_t))

[[104   0]
 [  8   6]]
0.9322033898305084


In [79]:
print(rf_predictions)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
# I will now use this fitted Random Forest Algorithm and use it on the forecast values Dataframe to predict recession

In [80]:
forecast = pd.read_csv('/Users/vladimirautier/Documents/GitHub/Week9/Project-Week-9-Final-Project/your-project/Final Project/Modified:Cleaned Data/forecast_values.csv')

In [81]:
forecast.head()

Unnamed: 0,Date,CCI forecast value,Spread forecast value,Copper forecast value
0,2019-12-31,102.061426,1.897287,5762.266288
1,2020-01-31,101.988899,1.931208,5764.090938
2,2020-02-29,101.946537,1.959907,5869.08871
3,2020-03-31,102.048541,1.943971,5841.285785
4,2020-04-30,102.121244,2.028687,5835.889038


In [82]:
forecast['Date'] = pd.to_datetime(forecast['Date']) 

In [83]:
forecast.set_index('Date', inplace=True)

In [84]:
forecast.head()

Unnamed: 0_level_0,CCI forecast value,Spread forecast value,Copper forecast value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-12-31,102.061426,1.897287,5762.266288
2020-01-31,101.988899,1.931208,5764.090938
2020-02-29,101.946537,1.959907,5869.08871
2020-03-31,102.048541,1.943971,5841.285785
2020-04-30,102.121244,2.028687,5835.889038


In [85]:
rf_predictions = model_rf.predict(forecast)

In [86]:
rf_probs = model_rf.predict_proba(forecast)[:, 1]

In [87]:
print(rf_predictions)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [88]:
type(rf_predictions)

numpy.ndarray

In [89]:
new_series = pd.Series(rf_predictions)

In [90]:
forecast['NBER_Rec'] = rf_predictions

In [91]:
forecast

Unnamed: 0_level_0,CCI forecast value,Spread forecast value,Copper forecast value,NBER_Rec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,102.061426,1.897287,5762.266288,0.0
2020-01-31,101.988899,1.931208,5764.090938,0.0
2020-02-29,101.946537,1.959907,5869.08871,0.0
2020-03-31,102.048541,1.943971,5841.285785,0.0
2020-04-30,102.121244,2.028687,5835.889038,0.0
2020-05-31,102.169202,2.096228,5795.636618,0.0
2020-06-30,102.163057,2.084937,5708.449673,0.0
2020-07-31,102.133214,2.01841,5749.931522,0.0
2020-08-31,102.133983,1.956485,5701.377373,0.0
2020-09-30,102.156798,1.960727,5668.041225,0.0


In [92]:
# Here we have the predicted economic situation regarding the chosen estimators
# NOOOO RECESSION IN THE YEAR TO COME

# Looking at all the economic estimators without copper

In [96]:
# We have to take into account that this has been done using Year resampling in oder to keep the GDP(which evolves yearly).

In [94]:
eco_indicators = pd.read_csv('/Users/vladimirautier/Documents/GitHub/Week9/Project-Week-9-Final-Project/your-project/Final Project/Modified:Cleaned Data/df_gdp_yield_cci_yre.csv')

In [95]:
eco_indicators.head()

Unnamed: 0,Date,CCI Value,GDP per capita growth (annual %),index,10 Year Treasury Yield,3 Month Treasury Yield,3 Month Treasury Yield (Bond Equivalent Basis),Spread,Rec_prob,NBER_Rec,Year
0,1970-12-31,98.74387,1.552273,125.5,7.348333,6.391667,6.588419,0.759914,0.344521,1.0,1970
1,1971-12-31,99.425517,2.188471,137.5,6.159167,4.3325,4.442091,1.717076,0.168818,0.0,1971
2,1972-12-31,100.458448,3.620346,149.5,6.21,4.0725,4.172884,2.037116,0.056955,0.0,1972
3,1973-12-31,98.640602,4.452205,161.5,6.8425,7.031667,7.261206,-0.418706,0.040342,1.0,1973
4,1974-12-31,98.089943,0.050785,173.5,7.5575,7.83,8.099749,-0.542249,0.410346,1.0,1974


In [98]:
#Droping columns (rec_prob is in % and I decide to drop it as well) to keep essential ones

eco_indicators.drop(['index','10 Year Treasury Yield','3 Month Treasury Yield','3 Month Treasury Yield (Bond Equivalent Basis)','Year','Rec_prob'],axis=1,inplace=True)

In [99]:
eco_indicators.head()

Unnamed: 0,Date,CCI Value,GDP per capita growth (annual %),Spread,NBER_Rec
0,1970-12-31,98.74387,1.552273,0.759914,1.0
1,1971-12-31,99.425517,2.188471,1.717076,0.0
2,1972-12-31,100.458448,3.620346,2.037116,0.0
3,1973-12-31,98.640602,4.452205,-0.418706,1.0
4,1974-12-31,98.089943,0.050785,-0.542249,1.0


In [100]:
eco_indicators['Date'] =  pd.to_datetime(eco_indicators['Date'])

In [101]:
eco_indicators.set_index('Date', inplace=True)

In [102]:
eco_indicators.head()

Unnamed: 0_level_0,CCI Value,GDP per capita growth (annual %),Spread,NBER_Rec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1970-12-31,98.74387,1.552273,0.759914,1.0
1971-12-31,99.425517,2.188471,1.717076,0.0
1972-12-31,100.458448,3.620346,2.037116,0.0
1973-12-31,98.640602,4.452205,-0.418706,1.0
1974-12-31,98.089943,0.050785,-0.542249,1.0


In [103]:
#Splitting the data

y = eco_indicators.NBER_Rec
X= eco_indicators.drop('NBER_Rec', axis=1)

In [104]:
#Training the data

import sklearn
from sklearn.model_selection import train_test_split

In [105]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2,stratify=y)

In [106]:
#RANDOM FOREST ALGO

In [107]:
model_rf = RandomForestClassifier()

In [108]:
model_rf.fit(train_X,train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [109]:
rf_predictions = model_rf.predict(test_X)

In [110]:
rf_probs = model_rf.predict_proba(test_X)[:, 1]

In [111]:
# Anlysing the efficiency of the algorithm

In [112]:
#AUC-SCORE

In [113]:
roc_value = roc_auc_score(test_y, rf_probs)

In [114]:
roc_value

0.8125

In [115]:
#CONFUSION MATRIX & ACCURACY

In [116]:
y_pred_CM = model_rf.predict(test_X)

print(confusion_matrix(test_y, y_pred_CM))
print(accuracy_score(test_y, y_pred_CM))

[[7 1]
 [1 1]]
0.8


In [117]:
#CROSS-VALIDATION

In [120]:
cv_results = cross_validate(model_rf, X, y, cv=3)

In [121]:
sorted(cv_results.keys())

['fit_time', 'score_time', 'test_score']

In [122]:
cv_results['test_score']

array([0.82352941, 0.8125    , 0.6875    ])

# Looking at CCI & Yield (Spread) without copper

In [135]:
## We have to take into account that this has been done using month resampling because here I frop GDP(which evolves yearly).

In [136]:
yield_cci = pd.read_csv('/Users/vladimirautier/Documents/GitHub/Week9/Project-Week-9-Final-Project/your-project/Final Project/Modified:Cleaned Data/yield_curve_cci_mre.csv')

In [137]:
yield_cci.head()

Unnamed: 0,Date,10 Year Treasury Yield,3 Month Treasury Yield,3 Month Treasury Yield (Bond Equivalent Basis),Spread,Rec_prob,NBER_Rec,Year,CCI Value
0,1970-12-31,6.39,4.87,4.99918,1.39082,0.399079,0.0,1970,98.74387
1,1971-01-31,6.24,4.44,4.552764,1.687236,0.37792,0.0,1971,98.92701
2,1971-02-28,6.11,3.7,3.786806,2.323194,0.324102,0.0,1971,99.06342
3,1971-03-31,5.7,3.38,3.456476,2.243524,0.247934,0.0,1971,99.16283
4,1971-04-30,5.83,3.86,3.952173,1.877827,0.167723,0.0,1971,99.24342


In [138]:
##Droping columns (rec_prob is in % and I decide to drop it as well) to keep essential ones

yield_cci.drop(['10 Year Treasury Yield','3 Month Treasury Yield','3 Month Treasury Yield (Bond Equivalent Basis)','Year','Rec_prob'],axis=1,inplace=True)

In [139]:
yield_cci.head()

Unnamed: 0,Date,Spread,NBER_Rec,CCI Value
0,1970-12-31,1.39082,0.0,98.74387
1,1971-01-31,1.687236,0.0,98.92701
2,1971-02-28,2.323194,0.0,99.06342
3,1971-03-31,2.243524,0.0,99.16283
4,1971-04-30,1.877827,0.0,99.24342


In [140]:
# Ordering columns

yield_cci = yield_cci[['Date','Spread','CCI Value','NBER_Rec']]

In [141]:
yield_cci.head()

Unnamed: 0,Date,Spread,CCI Value,NBER_Rec
0,1970-12-31,1.39082,98.74387,0.0
1,1971-01-31,1.687236,98.92701,0.0
2,1971-02-28,2.323194,99.06342,0.0
3,1971-03-31,2.243524,99.16283,0.0
4,1971-04-30,1.877827,99.24342,0.0


In [142]:
yield_cci['Date'] =  pd.to_datetime(yield_cci['Date'])

In [143]:
yield_cci.set_index('Date', inplace=True)

In [144]:
yield_cci.head()

Unnamed: 0_level_0,Spread,CCI Value,NBER_Rec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1970-12-31,1.39082,98.74387,0.0
1971-01-31,1.687236,98.92701,0.0
1971-02-28,2.323194,99.06342,0.0
1971-03-31,2.243524,99.16283,0.0
1971-04-30,1.877827,99.24342,0.0


In [145]:
#Splitting the data

y1 = yield_cci.NBER_Rec
X1= yield_cci.drop('NBER_Rec', axis=1)

In [146]:
train_X1, test_X1, train_y1, test_y1 = train_test_split(X1, y1, test_size=0.2,stratify=y1)

In [147]:
#RANDOM FOREST ALGO

In [148]:
model_rf.fit(train_X1,train_y1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [149]:
rf_predictions = model_rf.predict(test_X1)

In [150]:
rf_probs = model_rf.predict_proba(test_X1)[:, 1]

In [151]:
#AUC-SCORE

In [152]:
roc_value = roc_auc_score(test_y1, rf_probs)

In [153]:
roc_value

0.8901098901098901

In [154]:
#CONFUSION MATRIX & ACCURACY

In [155]:
y_pred_CM = model_rf.predict(test_X1)

print(confusion_matrix(test_y1, y_pred_CM))
print(accuracy_score(test_y1, y_pred_CM))

[[100   4]
 [  9   5]]
0.8898305084745762


In [156]:
#CROSS-VALIDATION

In [157]:
cv_results = cross_validate(model_rf, X1, y1, cv=3)

In [158]:
sorted(cv_results.keys())

['fit_time', 'score_time', 'test_score']

In [159]:
cv_results['test_score']

array([0.8622449 , 0.90816327, 0.79081633])