In [13]:
import sys
import os

# Treat the parent of the current working directory as the project root and
# put it on sys.path (once), presumably so the `scripts.utils` import below
# resolves. NOTE(review): this depends on the kernel's working directory, so
# it assumes the notebook is run from a folder one level below the root.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

import yfinance as yf
import pandas as pd
from scripts.utils import load_config
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Build the path to <project_root>/config/config.yaml and load it with the
# project's load_config helper. Later cells index the result like a nested
# mapping (e.g. config['yfinance']).
config_path = os.path.join(project_root, 'config', 'config.yaml')
config = load_config(config_path)

In [12]:
# Pull the yfinance download settings (tickers and date window) out of the
# loaded config, looking up the 'yfinance' section once.
yf_settings = config['yfinance']
core_tickers = yf_settings['core_tickers']
start_date = yf_settings['start_date']
end_date = yf_settings['end_date']

In [22]:
# Now let's start to bring in the data we need with the variables we just created.
def core_tickers_data(tickers, start, end):
    """Download close price and volume for each ticker.

    Args:
        tickers: Iterable of ticker symbols understood by ``yf.download``.
        start: Start of the download window, passed through to ``yf.download``.
        end: End of the download window, passed through to ``yf.download``.

    Returns:
        dict mapping each successfully downloaded ticker to a DataFrame
        holding its ``Close`` and ``Volume`` columns. Tickers whose download
        raised or came back empty are reported on stdout and left out.
    """
    data = {}
    for ticker in tickers:
        try:
            stock_data = yf.download(ticker, start=start, end=end)[['Close', 'Volume']]
        except Exception as e:
            print(f" Error downloading data for {ticker}: {e}")
            continue
        # yfinance generally signals a failed download by returning an empty
        # frame rather than raising, so the except branch alone would miss it
        # and the ticker would silently contribute no rows downstream.
        if stock_data.empty:
            print(f" Error downloading data for {ticker}: no data returned")
            continue
        data[ticker] = stock_data
    return data

def prep_time_series_df(data):
    """Stack per-ticker frames into a single long-format DataFrame.

    Args:
        data: dict mapping ticker symbol -> DataFrame for that ticker.

    Returns:
        One DataFrame containing every input row, with a ``Ticker`` column
        naming the source ticker and the original index moved into a regular
        column. The input frames are left untouched.
    """
    # assign() works on a copy, so the caller's frames are never modified.
    labelled = [frame.assign(Ticker=symbol) for symbol, frame in data.items()]
    return pd.concat(labelled).reset_index()

def check_missing_vals(df):
    """Count missing values per column, keeping only columns that have any.

    Args:
        df: DataFrame to inspect.

    Returns:
        Series indexed by column name with the null count of every column
        that has at least one missing value; empty when nothing is missing.
    """
    null_counts = df.isna().sum()
    return null_counts.loc[null_counts.gt(0)]

core_data = core_tickers_data(core_tickers, start_date, end_date)

# Combine the per-ticker frames, parse the dates and index by date in one
# chain so re-running this cell always rebuilds `df` from `core_data`.
df = (
    prep_time_series_df(core_data)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

df_missing = check_missing_vals(df)
print("Missing values before filling methods:", df_missing)

# Let's save this dataframe to a csv file in case we need it later.
# to_csv does not create missing parent directories, so make sure data/ exists.
os.makedirs(os.path.join(project_root, 'data'), exist_ok=True)
df.to_csv(os.path.join(project_root, 'data', 'core_stock_data.csv'), index = True)

# Great, no missing values in our data so far.  Let's take a look.
# Cap the sample size so a short download window does not raise.
df.sample(min(25, len(df)))


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Missing values before filling methods: Series([], dtype: int64)





Unnamed: 0_level_0,Close,Volume,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-25,88.206497,19600000,GOOGL
2021-12-28,146.447998,18624000,GOOG
2021-11-18,149.838501,35608000,GOOGL
2019-06-13,48.537498,86698400,AAPL
2019-10-22,88.286499,42234000,AMZN
2021-10-22,166.777496,62782000,AMZN
2024-05-24,189.979996,36294600,AAPL
2021-06-07,125.900002,71057600,AAPL
2022-10-13,99.709999,32812200,GOOG
2023-04-03,104.360001,25035400,GOOGL


In [31]:
# So far so good.  Now let's start to set up and read in our exogenous data.
def dl_exogenous_data(start, end, exo_tickers=None):
    """Download closing prices for a set of exogenous market series.

    Args:
        start: Start of the download window, passed through to ``yf.download``.
        end: End of the download window, passed through to ``yf.download``.
        exo_tickers: Optional mapping of column label -> ticker symbol. When
            omitted, the built-in set of rates, commodities, FX pairs,
            indices and ETFs defined below is used.

    Returns:
        DataFrame with one ``Close`` column per successfully downloaded
        series (named by its label) and the index moved into a regular
        column. Series whose download raised or came back empty are reported
        on stdout and left out.
    """
    if exo_tickers is None:
        exo_tickers = {
            'Interest Rates 10yr' : '^TNX',
            'Gold' : 'GC=F',
            'Copper' : 'HG=F',
            'Platinum' : 'PL=F',
            'Silver' : 'SI=F',
            'Crude Oil' : 'CL=F',
            'Natural Gas' : 'NG=F',
            'Corn' : 'ZC=F',
            'Wheat' : 'ZW=F',
            'VIX' : '^VIX',
            # NOTE(review): the recorded 'EURUSD=X' values (~1.0-1.2) look
            # like dollars per euro, while 'JPY=X' (~110-150) looks like yen
            # per dollar; the two labels read as if both were quoted the same
            # way. Confirm the intended orientation before modelling.
            'USD / EUR' : 'EURUSD=X',
            'USD / JPY' : 'JPY=X',
            'S&P 500' : '^GSPC',
            'Nasdaq 100' : '^NDX',
            'Dow Jones IndAvg' : '^DJI',
            # NOTE(review): the recorded values for this column (~105-195)
            # look like a share price; on Yahoo Finance 'CCI' appears to be
            # the Crown Castle Inc. equity, not a consumer-confidence index.
            # Confirm before using it as a sentiment feature.
            'Consumer Conf Index' : 'CCI',
            'Vanguard Total World Stock ETF' : 'VT',
            'US Treasury Bond ETF' : 'GOVT',
        }
    exo_data = {}
    for name, ticker in exo_tickers.items():
        try:
            close = yf.download(ticker, start=start, end=end)['Close']
        except Exception as e:
            print(f"Error downloading data for {name} ({ticker}): {e}")
            continue
        # yfinance generally signals a failed download by returning an empty
        # frame rather than raising; keeping it would add an all-NaN column
        # that the later fill step cannot repair.
        if close.empty:
            print(f"Error downloading data for {name} ({ticker}): no data returned")
            continue
        exo_data[name] = close
    # The per-series indexes are aligned by the DataFrame constructor; gaps
    # where a series has no value for a date show up as NaN.
    return pd.DataFrame(exo_data).reset_index()

# Download the exogenous series over the same date window as the core tickers.
exo_df = dl_exogenous_data(start_date, end_date)

# Almost all of the exogenous series contained missing values. The helper
# below forward-fills and then back-fills them, so we need to be careful
# about how the filled values are used.
def fill_missing_vals(df):
    """Fill gaps in ``df`` in place: forward-fill, then back-fill the rest.

    Forward-fill carries the last known value down each column; the
    back-fill pass then covers any leading gaps that had no earlier value.

    Args:
        df: DataFrame to fill. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # DataFrame.ffill/bfill replace fillna(method=...), which is deprecated
    # in recent pandas releases.
    df.ffill(inplace=True)
    # NOTE(review): back-filling copies later observations into earlier rows;
    # for time-series modelling that can leak future information. Confirm
    # this is acceptable for the leading gaps.
    df.bfill(inplace=True)

    return df

exo_df = fill_missing_vals(exo_df)
print(exo_df.isnull().sum())

# Just to be sure let's check for missing values again; an empty Series here
# means nothing is left to fill.
missing_exo_data = check_missing_vals(exo_df)
# This check runs after the fill, so the label says "after" (it used to say "before").
print("Missing values in exogenous data after filling methods:", missing_exo_data)

# Great, let's save this data to another csv file for quick reference if we should need it.
# to_csv does not create missing parent directories, so make sure data/ exists.
# NOTE(review): index=True also writes the positional row index as an unnamed
# first column; confirm whether readers of this file expect it.
os.makedirs(os.path.join(project_root, 'data'), exist_ok=True)
exo_df.to_csv(os.path.join(project_root, 'data', 'exogenous_data.csv'), index = True)

# Let's take a quick look at the output.
# Only the last expression of a cell is displayed, so the bare head() and
# shape expressions that used to sit here were dropped. The sample size is
# capped so a short download window does not raise.
exo_df.sample(min(15, len(exo_df)))

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Date                              0
Interest Rates 10yr               0
Gold                              0
Copper                            0
Platinum                          0
Silver                            0
Crude Oil                         0
Natural Gas                       0
Corn                              0
Wheat                             0
VIX                               0
USD / EUR                         0
USD / JPY                         0
S&P 500                           0
Nasdaq 100                        0
Dow Jones IndAvg                  0
Consumer Conf Index               0
Vanguard Total World Stock ETF    0
US Treasury Bond ETF              0
dtype: int64
Missing values in exogenous data before filling methods: Series([], dtype: int64)


Unnamed: 0,Date,Interest Rates 10yr,Gold,Copper,Platinum,Silver,Crude Oil,Natural Gas,Corn,Wheat,VIX,USD / EUR,USD / JPY,S&P 500,Nasdaq 100,Dow Jones IndAvg,Consumer Conf Index,Vanguard Total World Stock ETF,US Treasury Bond ETF
1286,2023-12-06,4.121,2030.5,3.7165,889.700012,23.889,69.379997,2.569,464.75,615.75,12.97,1.079366,147.184998,4549.339844,15788.049805,36054.429688,117.089996,98.449997,22.690001
714,2021-09-27,1.484,1750.0,4.2935,982.599976,22.657,75.449997,5.706,539.5,722.25,18.76,1.172305,110.764,4443.109863,15204.820312,34869.371094,177.25,104.830002,26.58
901,2022-06-15,3.395,1815.300049,4.1675,925.400024,21.402,115.309998,7.42,774.0,1050.0,29.620001,1.044059,135.294006,3789.98999,11593.769531,30668.529297,160.25,86.599998,23.49
651,2021-06-30,1.443,1770.800049,4.2965,1070.5,26.165001,73.470001,3.65,720.0,671.5,15.83,1.190193,110.535004,4297.5,14554.799805,34502.511719,195.100006,103.610001,26.6
932,2022-07-28,2.681,1750.300049,3.475,871.900024,19.827,96.419998,8.134,615.0,817.0,22.33,1.020929,136.110992,4072.429932,12717.870117,32529.630859,181.210007,90.209999,24.25
1009,2022-11-14,3.865,1773.599976,3.855,1042.300049,22.091999,85.870003,5.933,657.25,818.5,23.73,1.032855,139.552994,3957.25,11700.94043,33536.699219,135.110001,88.029999,22.59
1137,2023-05-11,3.397,2014.699951,3.6975,1108.099976,24.254999,70.870003,2.19,632.5,614.25,16.93,1.098406,134.087006,4130.620117,13389.780273,33309.511719,116.690002,92.800003,23.48
1329,2024-02-05,4.164,2025.699951,3.772,895.900024,22.334999,72.779999,2.082,442.75,590.25,13.67,1.078004,148.501007,4942.810059,17613.039062,38380.121094,105.589996,103.970001,22.690001
108,2019-05-31,2.142,1305.800049,2.646,792.599976,14.53,53.5,2.454,427.0,503.0,18.709999,1.113115,109.357002,2752.060059,7127.959961,24815.039062,130.009995,71.190002,25.65
911,2022-06-29,3.093,1813.699951,3.779,910.799988,20.667999,109.779999,6.498,770.25,915.5,28.16,1.052355,136.046997,3818.830078,11658.259766,31029.310547,167.309998,85.989998,23.799999
