In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import precision_recall_curve, \
    plot_precision_recall_curve, average_precision_score
from sklearn.metrics import plot_roc_curve
import sys
import warnings
from matplotlib.pyplot import figure
warnings.filterwarnings("ignore")

# Location of the cleaned economic dataset, relative to the notebook's working directory.
file_path = "./Economic_data_clean_20200801.xlsx"

# NOTE: despite the variable name, the data is read from an Excel workbook.
csv_data = pd.read_excel(file_path)
# Re-store EARN_DOWN as half-precision floats.
csv_data['EARN_DOWN'] = csv_data['EARN_DOWN'].astype(np.float16)
#complete_data = complete_data.reset_index()
#complete_data = complete_data.drop(['index'], axis=1)
# Lag horizons (in rows) and the columns to lag; both are passed to
# create_lag_variables further down in this cell.
lags = [10, 30]
cols_to_lag = ['CDX_HY', 'CDX_IG']

def create_lag_variables(df, lags, cols):
    """Return a copy of ``df`` with lagged versions of selected columns appended.

    For every lag ``t`` in ``lags`` and every column name ``col`` in ``cols`` a
    new column named ``'<col> (t-<t>)'`` is added, holding ``df[col].shift(t)``.
    The first ``t`` rows of each new column are therefore NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read the source columns from. It is not modified.
    lags : iterable of int
        Row offsets to shift by.
    cols : iterable of str
        Names of the columns in ``df`` to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original columns followed by the lag
        columns, ordered by lag first and column second.
    """
    # Read from the arguments (not from notebook globals) so the function
    # works on whatever frame and column list the caller passes in.
    lagged = {
        '{} (t-{})'.format(col, t): df[col].shift(t)
        for t in lags
        for col in cols
    }
    return df.assign(**lagged)
# NOTE(review): the returned frame is not assigned to anything, so csv_data
# itself does not gain any lag columns from this call.
create_lag_variables(csv_data, lags, cols_to_lag)

def wavg(group):
    """Linearly weighted column-wise average of a window of rows.

    Row ``k`` (1-based, in the order the rows appear in ``group``) receives
    weight ``k``, so later rows count more. Each column's weighted sum is
    divided by the total weight ``1 + 2 + ... + n``.

    Parameters
    ----------
    group : pandas.DataFrame
        Window of rows to average. Weights are applied by row position, so
        the index labels (and whether the index is named) do not matter.
        The frame is not modified.

    Returns
    -------
    pandas.Series
        One weighted average per column, indexed by column name.
    """
    n_rows = group.shape[0]
    # Positional weights 1..n. Multiplying along axis=0 with a plain array
    # avoids both the per-row Python loop and the reset_index/drop('index')
    # step, which fails when the incoming index has a name.
    weights = np.arange(1, n_rows + 1)
    weighted = group.mul(weights, axis=0)
    return weighted.sum(axis=0) / weights.sum()

def DataPreprocess(raw_data, back_rows, forward_rows, is_weighted):
    """Turn a dated frame into rolling-window features plus aligned label frames.

    The 'Dates' column is dropped first. For every position ``i`` from
    ``back_rows`` up to (but excluding) ``n_rows - forward_rows + 1``, the
    ``back_rows`` rows immediately before ``i`` are collapsed into one feature
    row: via ``wavg`` when ``is_weighted`` is true, otherwise via a plain
    column mean. Labels are taken from the rows starting at
    ``back_rows + forward_rows - 1`` through the end of the frame, so the
    feature frame and each label frame have the same number of rows.

    NOTE(review): labels are selected by fixed column position (8, 13 and 12,
    counted after 'Dates' is removed) -- presumably the HY, IG and HY-spread
    columns; verify against the actual column order of the input.

    Returns
    -------
    tuple
        ``(training_data, HY_labels, IG_labels, HY_spread_labels)``; the three
        label items are single-column DataFrames with a fresh 0..n-1 index.
    """
    features = raw_data.drop(['Dates'], axis=1)
    n_rows = features.shape[0]
    first_pos = back_rows
    stop_pos = n_rows - forward_rows + 1

    def _summarise(window):
        # Collapse one look-back window into a single row of features.
        if is_weighted:
            return wavg(window)
        return window.mean(axis=0)

    summaries = [
        _summarise(features.iloc[pos - back_rows:pos, :])
        for pos in range(first_pos, stop_pos)
    ]
    # Each summary is a Series; stack them as columns, then flip to rows.
    training_data = pd.concat(summaries, axis=1).T

    label_start = first_pos + forward_rows - 1

    def _label_frame(col_pos):
        # Single positional column from label_start onward, re-indexed from 0.
        picked = features.iloc[label_start:n_rows, [col_pos]]
        return picked.reset_index().drop(['index'], axis=1)

    HY_labels = _label_frame(8)
    IG_labels = _label_frame(13)
    HY_spread_labels = _label_frame(12)
    return training_data, HY_labels, IG_labels, HY_spread_labels


In [2]:
print("Preprocessing data...")

# Direction targets: True when the next row's level is above the current row's.
# The last row compares against the NaN produced by shift(-1), which is False.
csv_data['CDX_HY_UpNextDay'] = csv_data['CDX_HY'].shift(-1) > csv_data['CDX_HY']
csv_data['CDX_IG_UpNextDay'] = csv_data['CDX_IG'].shift(-1) > csv_data['CDX_IG']

# Momentum = relative gap between the 10D and 30D averages: (10D - 30D) / 30D.
# The parentheses matter: without them `/` binds tighter than `-`, the ratio
# 30D/30D collapses to 1 and the column degenerates to `10D_AVG - 1`.
csv_data['CDX_HY_momentum'] = (csv_data['CDX_HY_10D_AVG'] - csv_data['CDX_HY_30D_AVG']) / csv_data['CDX_HY_30D_AVG']
csv_data['CDX_IG_momentum'] = (csv_data['CDX_IG_10D_AVG'] - csv_data['CDX_IG_30D_AVG']) / csv_data['CDX_IG_30D_AVG']

# Keep only rows with no missing values in any column.
complete_data = csv_data.dropna()
# Second name for the same frame (an alias, not a copy).
complete_data_bool = complete_data

Preprocessing data...


In [3]:
# Collapse each pair of consecutive rows into one weighted feature row and
# collect the label frames aligned with them.
training_data, HY_labels, IG_labels, HY_spread_labels = DataPreprocess(
    complete_data, 2, 2, True
)

# Standardise every column to zero mean / unit variance. complete_data is
# rebound here: from this point on it names the scaled frame.
complete_data = pd.DataFrame(
    scale(training_data, axis=0, with_mean=True, with_std=True, copy=True),
    columns=training_data.columns.values,
)

# Feature matrix: everything except the two direction-target columns.
X = complete_data.drop(columns=['CDX_HY_UpNextDay', 'CDX_IG_UpNextDay'])

# Per-index feature sets, each leaving out that index's own level column.
X_HY = X.drop(columns=['CDX_HY'])
X_IG = X.drop(columns=['CDX_IG'])

In [20]:
# Column dtypes and non-null counts of the scaled feature frame.
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1963 entries, 0 to 1962
Data columns (total 42 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   10YR_UST                           1963 non-null   float64
 1   10_2_Curve                         1963 non-null   float64
 2   10_30_Curve                        1963 non-null   float64
 3   10_5_Curve                         1963 non-null   float64
 4   1YR_SWAP                           1963 non-null   float64
 5   2YR_UST                            1963 non-null   float64
 6   30YR_UST                           1963 non-null   float64
 7   5YR_UST                            1963 non-null   float64
 8   CDX_HY                             1963 non-null   float64
 9   CDX_HY_10D_AVG                     1963 non-null   float64
 10  CDX_HY_30D_AVG                     1963 non-null   float64
 11  CDX_HY_5D_AVG                      1963 non-null   float

In [19]:
# Column dtypes and non-null counts of the unscaled frame (the alias taken
# right after dropna, before windowing and scaling).
complete_data_bool.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1966 entries, 29 to 1994
Data columns (total 43 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Dates                              1966 non-null   datetime64[ns]
 1   10YR_UST                           1966 non-null   float64       
 2   10_2_Curve                         1966 non-null   float64       
 3   10_30_Curve                        1966 non-null   float64       
 4   10_5_Curve                         1966 non-null   float64       
 5   1YR_SWAP                           1966 non-null   float64       
 6   2YR_UST                            1966 non-null   float64       
 7   30YR_UST                           1966 non-null   float64       
 8   5YR_UST                            1966 non-null   float64       
 9   CDX_HY                             1966 non-null   float64       
 10  CDX_HY_10D_AVG                     

In [None]:
# Split your data
# NOTE(review): Y_HY and Y_IG are not assigned in any cell visible above this
# one; on a fresh kernel this cell needs them defined first -- confirm which
# label frames they are meant to be.
print("Splitting Test and Training Data...")
# Fixed seed so the 75/25 partition (and every metric derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42
X_HY_train, X_HY_test, Y_HY_train, Y_HY_test = train_test_split(X_HY, Y_HY, test_size=.25, random_state=SPLIT_SEED)
X_IG_train, X_IG_test, Y_IG_train, Y_IG_test = train_test_split(X_IG, Y_IG, test_size=.25, random_state=SPLIT_SEED)

In [None]:
# Encode Target
print("Encoding target data...")
# A single LabelEncoder instance is shared by both targets. Each fit_transform
# call refits it, so after this cell lab_enc holds the classes of Y_IG_train only.
lab_enc = preprocessing.LabelEncoder()
Y_HY_train_encoded = lab_enc.fit_transform(Y_HY_train)
Y_IG_train_encoded = lab_enc.fit_transform(Y_IG_train)

In [None]:
# Fit one AdaBoost classifier (library-default settings) per index on the
# encoded training targets.
print("Training the HY models...")
AB_model_HY = AdaBoostClassifier().fit(
    X=X_HY_train,
    y=Y_HY_train_encoded,
)
print("Training the IG models...")
AB_model_IG = AdaBoostClassifier().fit(
    X=X_IG_train,
    y=Y_IG_train_encoded,
)

In [None]:
# AdaBoost (HY) feature importances, smallest first so the largest bar ends
# up at the top of the horizontal chart.
feat_impt = (
    pd.DataFrame(
        AB_model_HY.feature_importances_,
        index=X_HY_train.columns,
        columns=['Feature Importance'],
    )
    .sort_values('Feature Importance', ascending=True)
)
feat_impt.plot(kind='barh', figsize=(10, 12))

In [None]:
# AdaBoost (IG) feature importances, smallest first so the largest bar ends
# up at the top of the horizontal chart.
feat_impt = (
    pd.DataFrame(
        AB_model_IG.feature_importances_,
        index=X_IG_train.columns,
        columns=['Feature Importance'],
    )
    .sort_values('Feature Importance', ascending=True)
)
feat_impt.plot(kind='barh', figsize=(10, 12))

In [None]:
# One random forest per index, both with library-default hyper-parameters.
clf_IG_forrest, clf_HY_forrest = RandomForestClassifier(), RandomForestClassifier()

# Deprecated: kept (empty) because the feature-plot functions still loop over it.
drop_list = []

# Fit each forest on its encoded training target, flattened to 1-D.
print("Fitting IG Random Forrest Model...")
clf_IG_forrest.fit(X_IG_train, np.ravel(Y_IG_train_encoded))

print("Fitting HY Random Forrest Model...")
clf_HY_forrest.fit(X_HY_train, np.ravel(Y_HY_train_encoded))


In [None]:
# Print IG Features
def print_IG_features():
    """Draw a horizontal bar chart of the IG random forest's feature importances.

    Feature names come from ``X_IG_train.columns`` with every entry of
    ``drop_list`` removed; bars are ordered by ascending importance. Reads the
    notebook-level names ``X_IG_train``, ``drop_list`` and ``clf_IG_forrest``
    and shows the figure; nothing is returned.
    """
    names = list(np.array(X_IG_train.columns))
    for dropped in drop_list:
        names.remove(dropped)
    names = np.array(names)

    figure(num=None, figsize=(11, 8), dpi=80, facecolor='w', edgecolor='k')

    scores = clf_IG_forrest.feature_importances_
    order = np.argsort(scores)
    # Bar centres at 0.5, 1.5, ... so each tick sits in the middle of its bar.
    positions = 0.5 + np.arange(len(names))

    pl.barh(positions, scores[order], align='center')
    pl.yticks(positions, names[order])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance for CDX.IG")
    pl.show()

print_IG_features()

In [None]:
# Print HY Features
def print_HY_features():
    """Draw a horizontal bar chart of the HY random forest's feature importances.

    Feature names are taken from ``X_HY_train.columns`` -- the frame the HY
    forest was fitted on -- so every bar is labelled with the feature it
    actually belongs to. Entries of ``drop_list`` are removed from the names
    first; bars are ordered by ascending importance. Reads the notebook-level
    names ``X_HY_train``, ``drop_list`` and ``clf_HY_forrest`` and shows the
    figure; nothing is returned.
    """
    # Must be the HY training columns: the HY and IG feature sets drop
    # different columns, so IG names would mislabel the HY importances.
    features_HY = list(np.array(X_HY_train.columns))

    for item in drop_list:
        features_HY.remove(item)

    features_HY = np.array(features_HY)
    figure(num=None, figsize=(11, 8), dpi=80, facecolor='w', edgecolor='k')
    importances = clf_HY_forrest.feature_importances_
    sorted_idx = np.argsort(importances)
    # Bar centres at 0.5, 1.5, ... so each tick sits in the middle of its bar.
    padding = np.arange(len(features_HY)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features_HY[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance for CDX.HY")
    pl.show()

print_HY_features()

In [None]:
# Display the held-out IG labels produced by the train/test split.
Y_IG_test

In [None]:
# AB_model_IG = AdaBoostClassifier().fit(X_IG_train, Y_IG_train_encoded)

# NOTE(review): lab_enc is refitted on the test labels here; if the test split
# does not contain the same set of classes as the training split, the integer
# codes will not line up with the ones the models were trained on -- confirm.
Y_HY_test_encoded = lab_enc.fit_transform(Y_HY_test)
Y_IG_test_encoded = lab_enc.fit_transform(Y_IG_test)

Y_IG_pred = clf_IG_forrest.predict(X_IG_test)
# NOTE(review): plot_roc_curve was removed in newer scikit-learn releases in
# favour of RocCurveDisplay.from_estimator -- check the installed version.
plot_roc_curve(clf_IG_forrest, X_IG_test, Y_IG_test_encoded)
# Score the AUC from the positive-class probabilities and the encoded labels,
# i.e. the same inputs the plotted curve is built from. Hard 0/1 predictions
# would reduce the curve to a single operating point. roc_auc_score is reached
# through the already-imported `metrics` module.
Y_IG_score = clf_IG_forrest.predict_proba(X_IG_test)[:, 1]
area_roc = metrics.roc_auc_score(Y_IG_test_encoded, Y_IG_score)
# The evaluated model is the IG random forest, so label the plot accordingly.
plt.title("{}, area under ROC: {:.2%}".format('Random Forest', area_roc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')