# Credit Spread Prediction – Data Collection
This notebook downloads and cleans macro-financial data from FRED and Yahoo Finance.

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
from fredapi import Fred

In [3]:
# Read the FRED API key from the environment so the credential is never stored
# in (or committed with) the notebook. Set it before launching Jupyter, e.g.
#   export FRED_API_KEY=<your key>
# NOTE(review): a key that was previously hardcoded in this cell should be
# treated as leaked and rotated.
fred_api_key = os.environ.get("FRED_API_KEY")
if not fred_api_key:
    raise RuntimeError(
        "FRED_API_KEY is not set; export it before running this notebook."
    )
fred = Fred(api_key=fred_api_key)

# Corporate yields
aaa_yield = fred.get_series('AAA')     # Moody's AAA Corporate Bond Yield
baa_yield = fred.get_series('BAA')     # Moody's BAA Corporate Bond Yield


In [4]:
# 10-year constant-maturity Treasury yield (FRED series GS10)
treasury_10y = fred.get_series('GS10')

# Preview the first rows of both corporate yield series
print(aaa_yield.head(), baa_yield.head(), sep='\n')

# Put the three series side by side, aligned on their index ...
data = pd.DataFrame({
    'AAA Yield': aaa_yield,
    'BAA Yield': baa_yield,
    '10Y Treasury': treasury_10y,
})
# ... and keep only the dates on which all three have a value
data = data.dropna()




1919-01-01    5.35
1919-02-01    5.35
1919-03-01    5.39
1919-04-01    5.44
1919-05-01    5.39
dtype: float64
1919-01-01    7.12
1919-02-01    7.20
1919-03-01    7.15
1919-04-01    7.23
1919-05-01    7.09
dtype: float64


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Expected input: `df`, a long-format panel with one row per bond-date.
# NOTE(review): `df` is not created anywhere in the visible part of this
# notebook (the data-collection cells build `data`), so it must be defined
# before this cell is run.
# Columns example:
# ['spread', 'rf_2y', 'rf_10y', 'vix', 'cdx', 'rating', 'sector', 'maturity',
#  'duration', 'leverage', 'date', 'bond_id', ...]

# --- Configuration -----------------------------------------------------------
max_lag = 5
train_end = '2018-12-31'   # last date (inclusive) of the training window
val_end   = '2021-12-31'   # last date (inclusive) of the validation window
random_seed = 42           # pinned so row/column subsampling is reproducible

lag_cols = [f'spread_lag_{lag}' for lag in range(1, max_lag + 1)]
raw_cols_num = [
    'spread', 'rf_2y', 'rf_10y', 'vix', 'cdx',
    'maturity', 'duration', 'leverage',
]
feature_cols_num = raw_cols_num + lag_cols
feature_cols_cat = ['rating', 'sector']
feature_cols = feature_cols_num + feature_cols_cat

# Fail early, with a readable message, if an input column is absent.
missing_cols = sorted(set(raw_cols_num + feature_cols_cat + ['date']) - set(df.columns))
if missing_cols:
    raise KeyError(f"df is missing required columns: {missing_cols}")

# --- Feature engineering -----------------------------------------------------
df = df.sort_values('date')

# Per-bond view of the spread; shift() below therefore never crosses bonds.
spread_by_bond = df.groupby('bond_id')['spread']

# Lagged spreads: value of the same bond `lag` rows earlier.
for lag, lag_col in enumerate(lag_cols, start=1):
    df[lag_col] = spread_by_bond.shift(lag)

# Target: change in spread to the bond's next row (a next-day change only if
# each bond has one row per day with no gaps -- TODO confirm).
# NOTE(review): for the last row of each bond on or before a split date, the
# target is computed from an observation that falls in the following split.
df['target'] = spread_by_bond.shift(-1) - df['spread']

# Drop rows with NaNs in the model inputs or the target only. A bare dropna()
# would also discard rows because of NaNs in columns the model never reads.
df = df.dropna(subset=feature_cols + ['target'])

# --- Train/val/test split by date --------------------------------------------
train = df[df['date'] <= train_end]
val   = df[(df['date'] > train_end) & (df['date'] <= val_end)]
test  = df[df['date'] > val_end]

if train.empty or val.empty:
    raise ValueError(
        "Train or validation split is empty; check the 'date' range of df "
        f"against train_end={train_end!r} and val_end={val_end!r}."
    )

X_train, y_train = train[feature_cols], train['target']
X_val, y_val = val[feature_cols], val['target']
X_test, y_test = test[feature_cols], test['target']   # held out; not scored here

# --- Model -------------------------------------------------------------------
# Preprocess: numeric columns pass through unchanged, categoricals are one-hot
# encoded; categories unseen during fit are ignored at predict time.
preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', feature_cols_num),
        ('cat', OneHotEncoder(handle_unknown='ignore'), feature_cols_cat)
    ]
)

model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=random_seed
)

pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', model)
])

pipeline.fit(X_train, y_train)

# --- Validation --------------------------------------------------------------
pred_val = pipeline.predict(X_val)
mae_val = mean_absolute_error(y_val, pred_val)
print("Validation MAE:", mae_val)
