In [71]:
import numpy as np
import pandas as pd
import yfinance as yf
from functools import reduce


In [72]:
import sys
sys.path.append("..") 
from models import mathsModels as mvm


In [73]:
def download_sp500_d():   
    symbol = "^GSPC" 
    start_date = "2000-01-01"
    end_date = None
    output_file = "../data/sp500_daily_data.csv"

    data = yf.download(symbol, start=start_date, end=end_date, interval="1d")
    data.to_csv(output_file)

In [74]:
def download_sp500_m():   
    symbol = "^GSPC" 
    interval = "1m"
    period = "7d"
    output_file = "../data/sp500_intraday_1m.csv"

    data = yf.download(symbol, interval=interval, period=period)
    data.to_csv(output_file)

In [75]:
def load_csv(file_path, date_column, rename_columns):
    df = pd.read_csv(file_path)
    df[date_column] = pd.to_datetime(df[date_column])
    df = df.rename(columns=rename_columns)
    return df

In [76]:
def load_data():
    global inflation, cpi_df, daily_volume_df, spx_volume_df, treasury_yield_df, sp500_daily_df, sp500_intraday_df, gdp, mortage_rate, unemployement, fed_fund_rate
    inflation = load_csv("../../data/Inflation.csv", "observation_date", {"observation_date": "Date", "T10YIE": "Inflation"})
    cpi_df = load_csv("../../data/CPI.csv", "observation_date", {"observation_date": "Date", "CPIAUCSL": "CPI"})
    daily_volume_df = load_csv("../../data/daily_volume_2024_1.csv", "Trade Date", {"Trade Date": "Date", "Volume": "Daily_Volume"})
    spx_volume_df = load_csv("../../data/daily_volume_SPX_2020-01-01_2024-12-18.csv", "Trade Date", {"Trade Date": "Date", "Volume": "SPX_Volume"})
    treasury_yield_df = load_csv("../../data/Market Yield.csv", "observation_date", {"observation_date": "Date", "DGS10": "Treasury_Yield"})
    sp500_daily_df = load_csv("../../data/sp500_daily.csv", "Date", {"Adj Close": "SP500_Adj_Close"})
    gdp=load_csv("../../data/GDP.csv", "observation_date", {"observation_date": "Date"})
    unemployement=load_csv("../../data/UNRATE.csv", "observation_date", {"observation_date": "Date", "UNRATE":"unemployement"})
    mortage_rate=load_csv("../../data/MORTGAGE30US.csv", "observation_date", {"observation_date": "Date", "MORTGAGE":"mortage"})
    fed_fund_rate=load_csv("../../data/Federal_Funds_Effective_Rate.csv", "observation_date", {"observation_date": "Date", "FEDFUNDS":"fed_fund_rate"})

    #sp500_intraday_df = load_csv("../../data/sp500_minute.csv", "Datetime", {"Adj Close": "SP500_Intraday_Adj_Close"})

In [77]:
def data_cleaning():
    global sp500_intraday_agg, sp500_daily_df, daily_volume_df, spx_volume_df, gdp, inflation, cpi_df, treasury_yield_df, mortage_rate, unemployement, fed_fund_rate
    #sp500_intraday_df["Date"] = sp500_intraday_df["Datetime"].dt.date
    #sp500_intraday_agg = sp500_intraday_df.groupby("Date")["SP500_Intraday_Adj_Close"].mean().reset_index()
    #sp500_intraday_agg["Date"] = pd.to_datetime(sp500_intraday_agg["Date"])

    sp500_daily_df["Date"] = pd.to_datetime(sp500_daily_df["Date"])
    daily_volume_df["Date"] = pd.to_datetime(daily_volume_df["Date"])
    spx_volume_df["Date"] = pd.to_datetime(spx_volume_df["Date"])
    gdp["Date"] = pd.to_datetime(gdp["Date"])
    inflation["Date"] = pd.to_datetime(inflation["Date"])
    cpi_df["Date"] = pd.to_datetime(cpi_df["Date"])
    treasury_yield_df["Date"] = pd.to_datetime(treasury_yield_df["Date"])
    unemployement["Date"] = pd.to_datetime(unemployement["Date"])
    mortage_rate["Date"] = pd.to_datetime(mortage_rate["Date"])
    fed_fund_rate["Date"] = pd.to_datetime(fed_fund_rate["Date"])



In [78]:
def data_filling(df, date_column, value_column):
    df = df.set_index(date_column)
    df = df.asfreq('D')
    df[value_column] = df[value_column].fillna(method='ffill')
    return df.reset_index()

In [79]:
def fill():
    global cpi_df, gdp, mortage_rate, unemployement, fed_fund_rate
    cpi_df=data_filling(cpi_df, "Date", "CPI")
    gdp=data_filling(gdp, "Date", "GDP")
    fed_fund_rate=data_filling(fed_fund_rate, "Date", "fed_fund_rate")
    unemployement=data_filling(unemployement, "Date", "unemployement")
    mortage_rate=data_filling(mortage_rate, "Date", "mortage")

    

In [80]:
def data_merging():
    data_cleaning()
    #all_dfs = [breakeven_df, cpi_df, treasury_yield_df, sp500_daily_df, sp500_intraday_agg, daily_volume_df, spx_volume_df]
    #all_dfs = [breakeven_df, cpi_df, treasury_yield_df, sp500_daily_df, daily_volume_df, spx_volume_df]
    all_dfs = [inflation, cpi_df, treasury_yield_df, sp500_daily_df, gdp, mortage_rate, unemployement, fed_fund_rate]
    merged_df = reduce(lambda left, right: pd.merge(left, right, on="Date", how="inner"), all_dfs)

    global cleaned_df
    cleaned_df = merged_df.dropna()

In [81]:
def rename_fields():
    global cleaned_df
    cleaned_df = cleaned_df.rename(columns={"10Y_Breakeven_Inflation_Rate": "Inflation"})
    cleaned_df = cleaned_df.rename(columns={"10Y_Treasury_Yield": "Market_Yield"})
    #cleaned_df = cleaned_df.rename(columns={"Ancien_Nom": "Nouveau_Nom"})


In [82]:
def add_volatility():
    global cleaned_df
    cleaned_df['volatility'] = (cleaned_df['High'] - cleaned_df['Low']) / cleaned_df['Low'] * 100
    cleaned_df['volatility_forcast'] = cleaned_df['volatility'].shift(-1)
    # plus returns
    #cleaned_df['returns'] = np.log(cleaned_df['Close'] / cleaned_df['Close'].shift(1))
    cleaned_df['returns'] = cleaned_df['Close'] - cleaned_df['Close'].shift(1)




In [83]:
def add_models_data():
    lambda_ = 0.94
    cleaned_df['EWMA_VM'] = mvm.EWMA_VM(cleaned_df['returns'].fillna(0).values, lambda_)

    omega, alpha, beta = 0.0001, 0.05, 0.8
    cleaned_df['GARCH_VM'] = mvm.GARCH_VM(cleaned_df['returns'].fillna(0).values, omega, alpha, beta)

    omega, alpha, beta, gamma = 0.0001, 0.05, 0.9, -0.1
    cleaned_df['EGARCH_VM'] = mvm.EGARCH_VM(cleaned_df['returns'].fillna(0).values, omega, alpha, beta, gamma)

    cleaned_df['RogersSatchell_VM'] = mvm.RogersSatchell_VM(cleaned_df['High'].values,cleaned_df['Low'].values,cleaned_df['Open'].values,cleaned_df['Close'].values)


    cleaned_df['garman_klass'] = mvm.GarmanKlass_VM(
    cleaned_df['Open'].values,
    cleaned_df['Close'].values,
    cleaned_df['High'].values,
    cleaned_df['Low'].values
    )

    cleaned_df['parkinson'] = mvm.parkinson_VM(
        cleaned_df['High'].values,
        cleaned_df['Low'].values
    )

    cleaned_df['yang_zhang'] = mvm.Yang_Zhang_VM(
        cleaned_df['Open'].values,
        cleaned_df['Close'].values,
        cleaned_df['High'].values,
        cleaned_df['Low'].values
    )



    
    # ARCH volatility
    lags = 5
    arch_volatility = mvm.ARCH_VM(cleaned_df['returns'].values, lags=lags)
    #cleaned_df['arch_volatility'] = np.concatenate((np.full(lags, np.nan), arch_volatility))



In [84]:
def norm():
    global cleaned_df, df_normalized

    df_normalized = (cleaned_df - cleaned_df.min()) / (cleaned_df.max() - cleaned_df.min())

In [85]:
def dataPreprocessing():
    global cleaned_df, df_normalized
    load_data()
    data_cleaning()
    fill()
    data_merging()
    add_volatility()
    add_models_data()
    norm()
    #df_normalized['move'] = (df_normalized['Close'] > df_normalized['Close'].shift(1)).astype(int)
    df_normalized['move'] = (df_normalized['Close'].shift(-1) > df_normalized['Close']).astype(int)

    df_normalized=df_normalized.dropna()
    df_normalized = df_normalized.round(4)
    df_normalized.to_csv("../../data/dataV.csv", index=False)
    #cleaned_df=cleaned_df.dropna()
    #cleaned_df = cleaned_df.round(4)
    #cleaned_df.to_csv("../../data/dataV.csv", index=False)


In [86]:
dataPreprocessing()

  df_normalized = (cleaned_df - cleaned_df.min()) / (cleaned_df.max() - cleaned_df.min())


In [87]:
df_normalized


Unnamed: 0,Date,Inflation,CPI,Treasury_Yield,Open,High,Low,Close,SP500_Adj_Close,Volume,...,volatility_forcast,returns,EWMA_VM,GARCH_VM,EGARCH_VM,RogersSatchell_VM,garman_klass,parkinson,yang_zhang,move
1,0.000127,0.530201,0.0,0.744726,0.047659,0.044731,0.049179,0.04824,0.04824,0.069794,...,0.211467,0.58431,0.146978,0.047215,0.696998,0.085844,0.071783,0.070277,0.054586,1
2,0.000509,0.533557,0.0,0.753165,0.047568,0.048981,0.050328,0.052485,0.052485,0.097281,...,0.09116,0.621878,0.141267,0.037729,0.50212,0.107998,0.135956,0.220622,0.174552,0
3,0.000637,0.530201,0.0,0.742616,0.051804,0.048783,0.052688,0.051221,0.051221,0.107127,...,0.128589,0.574153,0.139468,0.0319,0.356275,0.086944,0.083285,0.095749,0.074911,0
4,0.000764,0.560403,0.0,0.734177,0.050542,0.04715,0.050271,0.048519,0.048519,0.100136,...,0.164762,0.561691,0.134322,0.025633,0.284605,0.06641,0.080619,0.134779,0.106056,1
5,0.000892,0.583893,0.0,0.774262,0.047846,0.048265,0.050606,0.052186,0.052186,0.108488,...,0.13309,0.616871,0.13058,0.021176,0.251777,0.049889,0.087794,0.172344,0.13603,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5404,0.999108,0.731544,0.996296,0.78692,0.991617,0.99317,0.995061,0.99186,0.99186,0.300958,...,0.029669,0.554937,0.170373,0.010327,0.0,0.083362,0.059119,0.059663,0.046516,1
5405,0.999236,0.731544,0.996296,0.7827,0.991856,0.989487,0.994847,0.996314,0.996314,0.291517,...,0.03761,0.623696,0.166129,0.009403,0.0,0.043778,0.030969,0.031271,0.027511,1
5406,0.999363,0.744966,0.996296,0.801688,0.991852,0.991612,0.995952,0.998102,0.998102,0.288991,...,0.024603,0.600591,0.163462,0.009417,0.0,0.042616,0.034581,0.039623,0.0317,1
5407,0.999491,0.744966,0.996296,0.795359,0.994524,0.993201,0.999213,0.999135,0.999135,0.291299,...,0.104142,0.594054,0.157843,0.007803,0.0,0.034535,0.024997,0.025938,0.019745,0


In [88]:
def smaller_df():
    df_s = df_normalized[['Date', 'Open', 'High', 'Low', 'Close', 'SP500_Adj_Close','move', 'Volume', 'volatility', 'volatility_forcast', 'returns', 'GARCH_VM']]

    df_s['High']=abs(df_s['Open'] - df_s['High'])
    df_s['Low']=abs(df_s['Open'] - df_s['Low'])
    df_s['Close']=abs(df_s['Open'] - df_s['Close'])


    df_s.to_csv("../../data/dataS.csv", index=False)

smaller_df()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['High']=abs(df_s['Open'] - df_s['High'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['Low']=abs(df_s['Open'] - df_s['Low'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['Close']=abs(df_s['Open'] - df_s['Close'])
