In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from functools import reduce


In [2]:
import mathsModels as mvm

In [3]:
def download_sp500_d():
    """Download daily ^GSPC bars with yfinance and save them as CSV.

    Requests 1-day bars starting 2000-01-01 with no end date and writes the
    returned frame to ``../data/sp500_daily_data.csv``. Returns nothing.

    NOTE(review): ``load_data`` reads ``../data/sp500_daily.csv``, which is a
    different file name from the one written here -- confirm which is intended.
    """
    daily_bars = yf.download("^GSPC", start="2000-01-01", end=None, interval="1d")
    daily_bars.to_csv("../data/sp500_daily_data.csv")

In [4]:
def download_sp500_m():
    """Download 1-minute ^GSPC bars with yfinance and save them as CSV.

    Requests the ``7d`` period at ``1m`` interval and writes the returned
    frame to ``../data/sp500_intraday_1m.csv``. Returns nothing.
    """
    minute_bars = yf.download("^GSPC", interval="1m", period="7d")
    minute_bars.to_csv("../data/sp500_intraday_1m.csv")

In [5]:
def load_csv(file_path, date_column, rename_columns):
    """Read a CSV, parse one column as datetimes, then rename columns.

    Parameters
    ----------
    file_path : path of the CSV file handed to ``pd.read_csv``.
    date_column : name of the column (as it appears in the file, i.e. before
        renaming) to convert with ``pd.to_datetime``.
    rename_columns : mapping ``{old_name: new_name}`` applied after parsing.

    Returns
    -------
    pandas.DataFrame with the parsed date column and the renamed columns.
    """
    frame = pd.read_csv(file_path)
    parsed_dates = pd.to_datetime(frame[date_column])
    frame[date_column] = parsed_dates
    # The rename happens last, so date_column must be the original header.
    return frame.rename(columns=rename_columns)

In [6]:
def load_data():
    """Load every raw CSV from ``../data`` into module-level DataFrames.

    Each file goes through ``load_csv``: its date column is parsed and renamed
    to ``Date`` (the S&P 500 file already uses ``Date``), and the value column
    is renamed where a mapping is given. The results are bound to the globals
    declared below. Returns nothing.
    """
    global inflation, cpi_df, treasury_yield_df
    global daily_volume_df, spx_volume_df
    global sp500_daily_df, sp500_intraday_df
    global gdp, mortage_rate, unemployement, fed_fund_rate

    obs_col = "observation_date"
    trade_col = "Trade Date"

    inflation = load_csv(
        "../data/Inflation.csv", obs_col,
        {obs_col: "Date", "T10YIE": "Inflation"},
    )
    cpi_df = load_csv(
        "../data/CPI.csv", obs_col,
        {obs_col: "Date", "CPIAUCSL": "CPI"},
    )
    daily_volume_df = load_csv(
        "../data/daily_volume_2024_1.csv", trade_col,
        {trade_col: "Date", "Volume": "Daily_Volume"},
    )
    spx_volume_df = load_csv(
        "../data/daily_volume_SPX_2020-01-01_2024-12-18.csv", trade_col,
        {trade_col: "Date", "Volume": "SPX_Volume"},
    )
    treasury_yield_df = load_csv(
        "../data/Market Yield.csv", obs_col,
        {obs_col: "Date", "DGS10": "Treasury_Yield"},
    )
    sp500_daily_df = load_csv(
        "../data/sp500_daily.csv", "Date",
        {"Adj Close": "SP500_Adj_Close"},
    )
    # No value-column rename for GDP: the file's own header is kept.
    gdp = load_csv("../data/GDP.csv", obs_col, {obs_col: "Date"})
    unemployement = load_csv(
        "../data/UNRATE.csv", obs_col,
        {obs_col: "Date", "UNRATE": "unemployement"},
    )
    mortage_rate = load_csv(
        "../data/MORTGAGE30US.csv", obs_col,
        {obs_col: "Date", "MORTGAGE": "mortage"},
    )
    fed_fund_rate = load_csv(
        "../data/Federal_Funds_Effective_Rate.csv", obs_col,
        {obs_col: "Date", "FEDFUNDS": "fed_fund_rate"},
    )

    # Intraday loading is disabled, so sp500_intraday_df is declared global
    # above but never assigned here:
    #sp500_intraday_df = load_csv("../data/sp500_minute.csv", "Datetime", {"Adj Close": "SP500_Intraday_Adj_Close"})

In [7]:
def data_cleaning():
    """Convert the ``Date`` column of every loaded frame to datetime, in place.

    The frames are the module-level DataFrames created by ``load_data``. Only
    a column is reassigned on each existing object, so no global name is
    rebound here. Returns nothing.
    """
    # Intraday aggregation (per-date mean of SP500_Intraday_Adj_Close) is
    # disabled together with the intraday load, so no intraday frame is touched.
    loaded_frames = (
        sp500_daily_df,
        daily_volume_df,
        spx_volume_df,
        gdp,
        inflation,
        cpi_df,
        treasury_yield_df,
        unemployement,
        mortage_rate,
        fed_fund_rate,
    )
    for frame in loaded_frames:
        frame["Date"] = pd.to_datetime(frame["Date"])



In [8]:
def data_filling(df, date_column, value_column):
    """Expand a frame to daily frequency and forward-fill one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column`` and ``value_column``. It is not
        modified; a new frame is returned.
    date_column : str
        Column used as a temporary index so the frame can be re-sampled with
        ``asfreq('D')``.
    value_column : str
        Column whose gaps are filled with the last preceding value.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day between the first and last date, with
        ``date_column`` restored as a regular column. Only ``value_column`` is
        forward-filled; any other column keeps NaN on the inserted days.
    """
    daily = df.set_index(date_column).asfreq('D')
    # .ffill() replaces fillna(method='ffill'): the `method` keyword of fillna
    # is deprecated (FutureWarning since pandas 2.1).
    daily[value_column] = daily[value_column].ffill()
    return daily.reset_index()

In [9]:
def fill():
    """Replace five module-level frames with their ``data_filling`` result.

    Each listed global is passed to ``data_filling`` with ``"Date"`` as the
    date column and the paired name as the value column, and the global is
    rebound to the returned frame. Returns nothing.
    """
    module_vars = globals()
    fill_plan = (
        ("cpi_df", "CPI"),
        ("gdp", "GDP"),
        ("fed_fund_rate", "fed_fund_rate"),
        ("unemployement", "unemployement"),
        ("mortage_rate", "mortage"),
    )
    for frame_name, value_column in fill_plan:
        module_vars[frame_name] = data_filling(module_vars[frame_name], "Date", value_column)

    

In [10]:
def data_merging():
    """Inner-join the loaded frames on ``Date`` and store the result globally.

    Calls ``data_cleaning()`` first, then merges the frames left to right with
    an inner join on ``Date``, drops every row that contains a NaN, and binds
    the result to the global ``cleaned_df``. The two volume frames are not
    part of the merge. Returns nothing.
    """
    global cleaned_df

    data_cleaning()

    def join_on_date(left, right):
        # Inner join: only dates present in both frames survive.
        return left.merge(right, on="Date", how="inner")

    frames_to_join = [
        inflation,
        cpi_df,
        treasury_yield_df,
        sp500_daily_df,
        gdp,
        mortage_rate,
        unemployement,
        fed_fund_rate,
    ]
    joined = reduce(join_on_date, frames_to_join)
    cleaned_df = joined.dropna()

In [11]:
def rename_fields():
    """Rename two legacy column names on the global ``cleaned_df``.

    Columns absent from the frame are ignored by ``rename``, so this is a
    no-op when neither name exists.

    NOTE(review): ``load_data`` already produces ``Inflation`` and
    ``Treasury_Yield``, and ``dataPreprocessing`` does not call this
    function -- confirm whether it is still needed.
    """
    global cleaned_df
    legacy_to_current = {
        "10Y_Breakeven_Inflation_Rate": "Inflation",
        "10Y_Treasury_Yield": "Market_Yield",
    }
    cleaned_df = cleaned_df.rename(columns=legacy_to_current)
    # Template for further renames:
    #cleaned_df = cleaned_df.rename(columns={"Old_Name": "New_Name"})


In [12]:
def add_volatility():
    """Add range-based volatility, its next-row value and log returns.

    Writes three columns onto the global ``cleaned_df`` in place:

    * ``volatility``         -- ``(High - Low) / Low * 100``
    * ``volatility_forcast`` -- the following row's ``volatility`` (NaN on the
      last row)
    * ``returns``            -- ``log(Close / previous Close)`` (NaN on the
      first row)
    """
    global cleaned_df
    high = cleaned_df['High']
    low = cleaned_df['Low']
    close = cleaned_df['Close']

    range_pct = (high - low) / low * 100
    cleaned_df['volatility'] = range_pct
    # shift(-1) pulls the next row's value up: the target is one step ahead.
    cleaned_df['volatility_forcast'] = range_pct.shift(-1)
    cleaned_df['returns'] = np.log(close / close.shift(1))



In [13]:
def add_models_data():
    """Append the ``mathsModels`` volatility estimates to the global ``cleaned_df``.

    Columns written in place, in this order: ``EWMA_VM``, ``GARCH_VM``,
    ``EGARCH_VM``, ``RogersSatchell_VM``, ``garman_klass``, ``parkinson``,
    ``yang_zhang``. The return-based models receive ``returns`` with NaN
    replaced by 0; the range-based models receive the raw OHLC arrays.
    ``ARCH_VM`` is also called, but its result is not stored.
    """
    def filled_returns():
        # Built anew for every model call, exactly one array per call.
        return cleaned_df['returns'].fillna(0).values

    def column(name):
        return cleaned_df[name].values

    # --- return-based models -------------------------------------------
    ewma_lambda = 0.94
    cleaned_df['EWMA_VM'] = mvm.EWMA_VM(filled_returns(), ewma_lambda)

    garch_omega, garch_alpha, garch_beta = 0.0001, 0.05, 0.8
    cleaned_df['GARCH_VM'] = mvm.GARCH_VM(
        filled_returns(), garch_omega, garch_alpha, garch_beta
    )

    egarch_omega, egarch_alpha, egarch_beta, egarch_gamma = 0.0001, 0.05, 0.9, -0.1
    cleaned_df['EGARCH_VM'] = mvm.EGARCH_VM(
        filled_returns(), egarch_omega, egarch_alpha, egarch_beta, egarch_gamma
    )

    # --- range-based models (note the differing argument orders) --------
    cleaned_df['RogersSatchell_VM'] = mvm.RogersSatchell_VM(
        column('High'), column('Low'), column('Open'), column('Close')
    )
    cleaned_df['garman_klass'] = mvm.GarmanKlass_VM(
        column('Open'), column('Close'), column('High'), column('Low')
    )
    cleaned_df['parkinson'] = mvm.parkinson_VM(column('High'), column('Low'))
    cleaned_df['yang_zhang'] = mvm.Yang_Zhang_VM(
        column('Open'), column('Close'), column('High'), column('Low')
    )

    # --- ARCH -------------------------------------------------------------
    # Called on the returns without the NaN-to-0 replacement; the result is
    # currently discarded (the column assignment is disabled).
    arch_lags = 5
    mvm.ARCH_VM(cleaned_df['returns'].values, lags=arch_lags)



In [14]:
def dataPreprocessing():
    """Run the whole pipeline and export the result to ``../data/dataV.csv``.

    Steps, in order: load the CSVs, parse dates, forward-fill the selected
    series to daily frequency, merge everything on ``Date``, add
    volatility/returns columns, add the model-based columns. The global
    ``cleaned_df`` is then stripped of rows containing NaN, rounded to 4
    decimals and written to CSV without the index.
    """
    global cleaned_df

    # Stage 1: build the merged frame (data_merging binds cleaned_df).
    load_data()
    data_cleaning()
    fill()
    data_merging()

    # Stage 2: derived features, written onto cleaned_df.
    add_volatility()
    add_models_data()

    # Stage 3: final tidy-up and export.
    cleaned_df = cleaned_df.dropna().round(4)
    cleaned_df.to_csv("../data/dataV.csv", index=False)


In [15]:
# Run the full pipeline; this reads the CSVs under ../data and writes ../data/dataV.csv.
dataPreprocessing()

In [16]:
# Display the processed frame produced by dataPreprocessing() as the cell output.
cleaned_df


Unnamed: 0,Date,Inflation,CPI,Treasury_Yield,Open,High,Low,Close,SP500_Adj_Close,Volume,...,volatility,volatility_forcast,returns,EWMA_VM,GARCH_VM,EGARCH_VM,RogersSatchell_VM,garman_klass,parkinson,yang_zhang
1,2003-01-03,1.62,182.600,4.05,909.0300,911.2500,903.0700,908.5900,908.5900,1130800000,...,0.9058,2.5512,-0.0005,0.0116,0.0002,0.0182,0.0069,0.0064,0.0054,0.0090
2,2003-01-06,1.63,182.600,4.09,908.5900,931.7700,908.5900,929.0100,929.0100,1435900000,...,2.5512,1.1827,0.0222,0.0112,0.0003,0.0267,0.0086,0.0112,0.0151,0.0252
3,2003-01-07,1.62,182.600,4.04,929.0100,930.8100,919.9300,922.9300,922.9300,1545200000,...,1.1827,1.6085,-0.0066,0.0122,0.0003,0.0376,0.0070,0.0072,0.0071,0.0118
4,2003-01-08,1.71,182.600,4.00,922.9300,922.9300,908.3200,909.9300,909.9300,1467600000,...,1.6085,2.0199,-0.0142,0.0119,0.0004,0.0512,0.0053,0.0070,0.0096,0.0160
5,2003-01-09,1.78,182.600,4.19,909.9300,928.3100,909.9300,927.5700,927.5700,1560300000,...,2.0199,1.6597,0.0192,0.0121,0.0004,0.0675,0.0040,0.0076,0.0120,0.0200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5404,2024-06-24,2.22,313.049,4.25,5459.5801,5490.6602,5447.5898,5447.8701,5447.8701,3696750000,...,0.7906,0.4832,-0.0031,0.0055,0.0005,0.8196,0.0067,0.0054,0.0047,0.0079
5405,2024-06-25,2.22,313.049,4.23,5460.7300,5472.8799,5446.5601,5469.2998,5469.2998,3591960000,...,0.4832,0.5736,0.0039,0.0054,0.0005,0.8196,0.0035,0.0033,0.0029,0.0054
5406,2024-06-26,2.26,313.049,4.32,5460.7100,5483.1401,5451.8701,5477.8999,5477.8999,3563920000,...,0.5736,0.4256,0.0016,0.0053,0.0005,0.8196,0.0034,0.0035,0.0034,0.0059
5407,2024-06-27,2.26,313.049,4.29,5473.5898,5490.8101,5467.5400,5482.8701,5482.8701,3589530000,...,0.4256,1.3304,0.0009,0.0052,0.0005,0.8196,0.0028,0.0028,0.0026,0.0043
