<a href="https://colab.research.google.com/github/harperd17/energy_volatility_prediction/blob/main/notebooks/Data%20Ingestion/Futures_Data_Ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal: Ingest futures data using Yahoo Finance

In [None]:
!pip install yfinance
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
from datetime import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Switch the working directory to the shared futures data folder; the CSV
# written later in this notebook uses a relative file name.
%cd /content/drive/Shareddrives/Data606_Energy/data/futures

/content/drive/Shareddrives/Data606_Energy/data/futures


In [None]:
# Ticker symbol -> readable contract name. The keys are the tickers passed to
# yf.download; iteration order (BZ, NG, CL, HO) fixes the order of the
# per-symbol result lists built later.
futures_symbols = {
  'BZ=F': 'Brent Crude Oil',
  'NG=F': 'Natural Gas',
  'CL=F': 'Light Sweet Crude Oil',
  'HO=F': 'Heating Oil',
}

In [None]:
def get_futures_summaries(data,column_addition_string,start_year=2000,end_year=2022):
  """Summarise price movement and volume for every calendar month.

  The input frame is only read; no helper columns are added to it.

  Parameters
  ----------
  data : pandas.DataFrame
      Rows indexed by a DatetimeIndex, with 'Open', 'High', 'Low', 'Close'
      and 'Volume' columns.
  column_addition_string : str
      Suffix appended to every statistic name in the result (for example
      ' Daily CL=F'), so summaries from several sources can be merged.
  start_year, end_year : int, optional
      Half-open range of years ``[start_year, end_year)`` to report. The
      defaults (2000, 2022) reproduce the previously fixed month grid.

  Returns
  -------
  dict
      'Date' maps to a list of first-of-month ``datetime`` objects, one per
      month in the requested years. Every other key ('Mean Movement',
      'Std Movement', 'CV Movement', 'Mean Volume', 'Std Volume' and
      'CV Volume', each followed by ``column_addition_string``) maps to a
      list of the same length. Months with no rows in ``data`` yield NaN.
  """
  # Read the calendar parts straight from the index instead of writing
  # 'year'/'month' columns into the caller's frame.
  row_years = data.index.year
  row_months = data.index.month

  dates = []
  mean_movement = []
  std_movement = []
  cv_movement = []
  mean_volume = []
  std_volume = []
  cv_volume = []
  for year in range(start_year,end_year):
    for month in range(1,13):
      dates.append(datetime(year=year,month=month,day=1))
      month_rows = data[(row_years==year)&(row_months==month)]
      # high-low range of each row, expressed as a fraction of its open
      movement = (month_rows['High']-month_rows['Low'])/month_rows['Open']
      close = month_rows['Close']
      volume = month_rows['Volume']
      mean_movement.append(movement.mean())
      std_movement.append(movement.std())
      # 'CV Movement' is the coefficient of variation of the Close price
      cv_movement.append(close.std()/close.mean())
      volume_mean = volume.mean()
      volume_std = volume.std()
      mean_volume.append(volume_mean)
      std_volume.append(volume_std)
      cv_volume.append(volume_std/volume_mean)
  return {
    'Date':dates,
    'Mean Movement'+column_addition_string:mean_movement,
    'Std Movement'+column_addition_string:std_movement,
    'CV Movement'+column_addition_string:cv_movement,
    'Mean Volume'+column_addition_string:mean_volume,
    'Std Volume'+column_addition_string:std_volume,
    'CV Volume'+column_addition_string:cv_volume,
  }

In [None]:
# Per-symbol results, appended in the iteration order of futures_symbols:
#   symbol_summaries    - monthly summary frame (daily + weekly statistics)
#   symbols_daily_data  - raw daily download
#   symbols_weekly_data - raw weekly download
symbol_summaries = []
symbols_daily_data = []
symbols_weekly_data = []
download_window = {'start': '2000-01-01', 'end': '2021-01-01'}
for symbol in futures_symbols:
  # daily bars and their monthly summaries
  daily_data = yf.download(tickers=symbol, interval='1d', **download_window)
  symbols_daily_data.append(daily_data)
  daily_summaries = pd.DataFrame(
    get_futures_summaries(daily_data.copy(), f' Daily {symbol}')
  )
  # weekly bars and their monthly summaries
  weekly_data = yf.download(tickers=symbol, interval='1wk', **download_window)
  symbols_weekly_data.append(weekly_data)
  weekly_summaries = pd.DataFrame(
    get_futures_summaries(weekly_data.copy(), f' Weekly {symbol}')
  )
  # (a monthly-interval summary was tried here previously and is disabled)
  # put both sets of statistics side by side, keyed on the month
  all_time_frames = daily_summaries.merge(weekly_summaries, on='Date', how='outer')
  symbol_summaries.append(all_time_frames)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:
# Combine the per-symbol summaries into one wide frame keyed on 'Date'.
# The result starts as None and is tested explicitly, rather than probing for
# the name inside try/except: a bare except would also swallow a failing merge
# and silently restart the result from the current summary.
grand_data_frame = None
for symbol_summary in symbol_summaries:
  if grand_data_frame is None:
    # the first summary seeds the result; copy so the list entry stays untouched
    grand_data_frame = symbol_summary.copy()
  else:
    grand_data_frame = pd.merge(grand_data_frame, symbol_summary,on='Date',how='outer')
if grand_data_frame is None:
  # nothing to merge: fail here with a clear message instead of later on
  raise ValueError('symbol_summaries is empty; run the download cell first')

In [None]:
#grand_data_frame.to_csv('futures_data.csv',index=False)

In [None]:
# Build a copy of the monthly summaries with rolling-standardized columns added.
# For many of these contracts the average volume increases as time goes on, so
# each month is measured against a rolling baseline to show whether it was
# above or below the normal amount.
data_standardized = grand_data_frame.copy()
period = 12  # rolling window length in monthly rows (21 is a commented-out alternative)
for col in data_standardized.columns:
  is_mean_col = 'Mean' in col
  is_std_col = 'Std' in col
  if not (is_mean_col or is_std_col):
    continue
  values = data_standardized[col]
  window = values.rolling(period)
  # distance of this month from the rolling average of the same column
  deviation = values - window.mean()
  if is_mean_col:
    # monthly means are scaled by the rolling average of the matching monthly
    # standard-deviation column (price movement or volume)
    matching_std = data_standardized[col.replace('Mean','Std')]
    data_standardized[col+' Standardized'] = deviation/matching_std.rolling(period).mean()
  if is_std_col:
    # monthly standard deviations are scaled by their own rolling standard
    # deviation: is the variation higher or lower than in the recent months?
    data_standardized[col+' Standardized'] = deviation/window.std()

In [None]:
# Rolling coefficient-of-variation style features built from each contract's
# daily statistics, over the past `period` monthly rows.
# The tickers come from futures_symbols rather than a repeated literal list,
# so this cell stays in step with the symbols that were downloaded.
# NOTE(review): the numerator is the rolling std of the monthly 'Std ...'
# column while the denominator is the rolling mean of the monthly 'Mean ...'
# column - confirm this is the intended definition.
for ticker in futures_symbols:
  for measure in ('Movement', 'Volume'):
    monthly_std = data_standardized['Std '+measure+' Daily '+ticker]
    monthly_mean = data_standardized['Mean '+measure+' Daily '+ticker]
    data_standardized['CV '+measure+' Rolling '+ticker] = monthly_std.rolling(period).std()/monthly_mean.rolling(period).mean()

In [None]:
# Add a calendar 'Year' column taken from 'Date' and give the rows a plain
# positional index.
data_standardized = data_standardized.assign(
  Year=pd.DatetimeIndex(data_standardized['Date']).year
).reset_index(drop=True)

In [None]:
# Show only the rows where the HO=F weekly mean-volume standardization is not null.
data_standardized[data_standardized['Mean Volume Weekly HO=F Standardized'].notna()]

Unnamed: 0,Date,Mean Movement Daily BZ=F,Std Movement Daily BZ=F,CV Movement Daily BZ=F,Mean Volume Daily BZ=F,Std Volume Daily BZ=F,CV Volume Daily BZ=F,Mean Movement Weekly BZ=F,Std Movement Weekly BZ=F,CV Movement Weekly BZ=F,Mean Volume Weekly BZ=F,Std Volume Weekly BZ=F,CV Volume Weekly BZ=F,Mean Movement Daily NG=F,Std Movement Daily NG=F,CV Movement Daily NG=F,Mean Volume Daily NG=F,Std Volume Daily NG=F,CV Volume Daily NG=F,Mean Movement Weekly NG=F,Std Movement Weekly NG=F,CV Movement Weekly NG=F,Mean Volume Weekly NG=F,Std Volume Weekly NG=F,CV Volume Weekly NG=F,Mean Movement Daily CL=F,Std Movement Daily CL=F,CV Movement Daily CL=F,Mean Volume Daily CL=F,Std Volume Daily CL=F,CV Volume Daily CL=F,Mean Movement Weekly CL=F,Std Movement Weekly CL=F,CV Movement Weekly CL=F,Mean Volume Weekly CL=F,Std Volume Weekly CL=F,CV Volume Weekly CL=F,Mean Movement Daily HO=F,Std Movement Daily HO=F,CV Movement Daily HO=F,...,Std Movement Daily BZ=F Standardized,Mean Volume Daily BZ=F Standardized,Std Volume Daily BZ=F Standardized,Mean Movement Weekly BZ=F Standardized,Std Movement Weekly BZ=F Standardized,Mean Volume Weekly BZ=F Standardized,Std Volume Weekly BZ=F Standardized,Mean Movement Daily NG=F Standardized,Std Movement Daily NG=F Standardized,Mean Volume Daily NG=F Standardized,Std Volume Daily NG=F Standardized,Mean Movement Weekly NG=F Standardized,Std Movement Weekly NG=F Standardized,Mean Volume Weekly NG=F Standardized,Std Volume Weekly NG=F Standardized,Mean Movement Daily CL=F Standardized,Std Movement Daily CL=F Standardized,Mean Volume Daily CL=F Standardized,Std Volume Daily CL=F Standardized,Mean Movement Weekly CL=F Standardized,Std Movement Weekly CL=F Standardized,Mean Volume Weekly CL=F Standardized,Std Volume Weekly CL=F Standardized,Mean Movement Daily HO=F Standardized,Std Movement Daily HO=F Standardized,Mean Volume Daily HO=F Standardized,Std Volume Daily HO=F Standardized,Mean Movement Weekly HO=F Standardized,Std Movement Weekly HO=F 
Standardized,Mean Volume Weekly HO=F Standardized,Std Volume Weekly HO=F Standardized,CV Movement Rolling BZ=F,CV Volume Rolling BZ=F,CV Movement Rolling NG=F,CV Volume Rolling NG=F,CV Movement Rolling CL=F,CV Volume Rolling CL=F,CV Movement Rolling HO=F,CV Volume Rolling HO=F,Year
19,2001-08-01,,,,,,,,,,,,,0.059086,0.037053,0.110806,28764.173913,13327.493090,0.463337,0.160392,0.045092,0.140393,141053.50,33481.046484,0.237364,0.022945,0.007454,0.019192,63650.608696,20279.500314,0.318607,0.052301,0.014410,0.022641,313907.75,52410.533937,0.166962,0.024485,0.010145,0.023905,...,,,,,,,,0.390747,1.375126,-0.067545,1.186496,1.123163,1.019606,-0.042027,0.155795,-0.674466,-1.757786,-0.149666,0.178118,-0.942170,-0.732343,-0.106627,-0.244433,-0.586980,-0.767849,-0.799095,-0.264297,-0.472114,-0.443604,-1.029627,-0.299533,,,0.202735,0.101312,0.083037,0.046731,0.149257,0.063979,2001
20,2001-09-01,,,,,,,,,,,,,0.049034,0.014263,0.097745,11657.562500,12657.299370,1.085759,0.173100,0.066637,0.090462,46630.25,35713.108820,0.765879,0.041976,0.028461,0.099033,57879.875000,39237.013019,0.677904,0.105206,0.045475,0.098937,231519.50,154561.701756,0.667597,0.045281,0.033923,0.098301,...,,,,,,,,-0.099474,-0.933936,-1.602024,0.950303,1.101479,2.132695,-2.764385,0.272378,0.807094,2.850148,-0.314214,2.779845,1.011855,1.114661,-1.116325,2.019221,0.738368,2.426444,-0.931222,2.355037,0.822834,0.611981,-1.880284,1.414147,,,0.190543,0.109559,0.171665,0.099690,0.223412,0.091814,2001
21,2001-10-01,,,,,,,,,,,,,0.073091,0.035400,0.128645,34235.173913,9063.167797,0.264733,0.156749,0.018203,0.154624,165166.60,32849.677796,0.198888,0.034608,0.014579,0.023623,59992.565217,13940.306058,0.232367,0.083796,0.027432,0.043084,294679.00,24704.667403,0.083836,0.029945,0.013052,0.025175,...,,,,,,,,0.765425,1.036153,0.719529,-0.173972,0.609589,-1.246042,1.120960,0.087789,0.216005,0.251050,-0.209317,-0.993381,0.234381,0.218342,-0.144637,-1.061380,-0.233972,-0.362094,-0.124311,-0.024313,-0.447018,-0.792168,-0.028108,-0.661667,,,0.189093,0.107187,0.170342,0.105343,0.222265,0.091987,2001
22,2001-11-01,,,,,,,,,,,,,0.062255,0.049405,0.080852,38516.200000,13327.050199,0.346012,0.149842,0.065416,0.045833,182975.00,53834.025275,0.294215,0.051309,0.025210,0.064814,75565.200000,22747.772913,0.301035,0.159496,0.047548,0.091681,354434.50,91597.911132,0.258434,0.040854,0.020574,0.063779,...,,,,,,,,0.240578,1.808993,1.182083,1.249404,0.335416,1.582541,1.828648,1.943015,1.187669,1.719612,0.484452,0.252151,2.878183,1.438835,0.673178,0.657533,0.366347,0.531539,0.491184,0.272706,1.231938,1.020789,0.690399,1.200196,,,0.218040,0.109603,0.181141,0.103326,0.210233,0.083865,2001
23,2001-12-01,,,,,,,,,,,,,0.056872,0.016785,0.045744,37613.421053,12342.079288,0.328130,0.137405,0.038718,0.095216,156763.80,53715.641467,0.342653,0.042692,0.017662,0.045715,66750.210526,12385.790822,0.185554,0.091258,0.028046,0.052227,297331.20,86632.197771,0.291366,0.042618,0.018220,0.049682,...,,,,,,,,0.101511,-0.782868,0.912684,0.785039,0.156687,0.080913,0.713096,1.603579,0.601697,0.467302,0.049881,-1.082637,0.302577,0.365928,-0.212095,0.653160,0.508343,0.226362,0.893842,-0.944084,0.410079,-0.234344,0.908807,0.925448,,,0.231116,0.106336,0.183162,0.108486,0.212428,0.087731,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,2020-08-01,0.021017,0.010759,0.012491,19330.571429,6433.643848,0.332822,0.053421,0.021772,0.021148,100036.60,22648.133815,0.226398,0.061618,0.029310,0.073873,168853.333333,71737.727286,0.424852,0.136889,0.059287,0.069120,868549.40,250815.085238,0.288775,0.027870,0.009730,0.015516,320039.619048,101492.458689,0.317125,0.062440,0.032502,0.029560,1657913.40,359430.050939,0.216797,0.027799,0.009219,0.015229,...,-0.569684,-1.006928,-1.121160,-1.270053,-0.519447,-1.004142,-1.749077,0.655319,1.622442,-0.215535,0.890251,0.692618,1.437401,-0.051974,0.220170,-0.390500,-0.333272,-1.086507,-0.946959,-0.811066,-0.318377,-1.419433,-0.782702,-0.868869,-0.622042,-0.462926,-1.165743,-1.057398,-0.266559,-0.146221,-0.087171,0.520027,0.166986,0.144565,0.050446,3.877514,0.193862,0.310148,0.126905,2020
248,2020-09-01,0.036000,0.015052,0.033515,24164.900000,10413.706105,0.430943,0.083304,0.025274,0.044090,115735.50,17941.539259,0.155022,0.076724,0.039517,0.098334,155994.904762,61496.913157,0.394224,0.241400,0.098746,0.076156,699669.25,101614.179606,0.145232,0.041494,0.017420,0.040685,334529.047619,135006.123227,0.403571,0.095133,0.025979,0.052613,1572811.25,334965.595555,0.212973,0.035868,0.014710,0.034315,...,-0.377055,-0.581645,-0.252548,-0.616969,-0.462203,-0.647532,-1.598983,1.152160,2.125443,-0.381027,-0.239914,2.661035,2.465543,-0.726597,-1.447731,-0.245804,-0.303443,-0.922525,-0.521265,-0.609865,-0.338246,-1.339721,-0.854443,-0.435890,-0.221867,0.052497,0.368694,-0.695412,-0.516243,-0.122522,-0.721322,0.523158,0.156467,0.170842,0.050778,3.856324,0.202738,0.305275,0.126287,2020
249,2020-10-01,0.034115,0.015596,0.042095,27713.409091,8467.969778,0.305555,0.081452,0.038960,0.062616,133952.50,29150.878609,0.217621,0.059341,0.022259,0.085236,146478.000000,48105.948079,0.328418,0.150104,0.021276,0.095221,725726.25,210659.071032,0.290273,0.041227,0.015943,0.041531,349353.045455,110098.973945,0.315151,0.088140,0.039440,0.060258,1712724.25,398059.784657,0.232413,0.036664,0.013493,0.030842,...,-0.381255,-0.228988,-0.585619,-0.676715,-0.229053,-0.229933,-0.873898,0.193029,0.168433,-0.476883,-1.475668,0.491137,-0.871420,-0.518114,-0.243700,-0.257516,-0.312368,-0.784213,-0.723401,-0.656762,-0.311694,-1.033034,-0.661439,-0.438862,-0.387644,0.047816,0.087377,-0.810649,0.033399,0.074576,-0.018160,0.509074,0.162434,0.161917,0.056998,3.794344,0.211822,0.284176,0.127004,2020
250,2020-11-01,0.035609,0.020428,0.066995,33261.000000,10925.961905,0.328492,0.093750,0.034289,0.088955,156019.60,41860.358513,0.268302,0.048923,0.016697,0.059058,132847.263158,43621.619466,0.328359,0.135922,0.037263,0.062144,662161.80,227303.923553,0.343276,0.043654,0.025020,0.061891,363700.157895,126527.347412,0.347889,0.106384,0.050961,0.089720,1729085.00,458496.468977,0.265167,0.038975,0.021779,0.062394,...,-0.224428,0.226537,-0.189847,-0.493892,-0.374457,0.209801,-0.214972,-0.353500,-0.495375,-0.627578,-1.577418,0.047020,-0.123875,-0.700796,-0.050296,-0.243207,-0.281748,-0.655830,-0.541839,-0.567060,-0.290972,-0.918694,-0.504094,-0.362033,0.179219,0.351692,0.016309,-0.135714,0.008162,0.279844,0.484486,0.488686,0.149895,0.155317,0.066868,3.706326,0.218629,0.267074,0.126563,2020


In [None]:
# Write the full feature table to the current working directory. The row index
# is written as the first (unnamed) column of the file.
data_standardized.to_csv('all_futures_data.csv', index=True)

# A quick look at the raw data

In [None]:
# Candlestick chart of one contract's daily prices, with traded volume as bars
# on the other y-axis.
# symbols_daily_data is filled in the iteration order of futures_symbols, so
# the contract is looked up by ticker instead of by a bare list position.
plot_symbol = 'CL=F'
df = symbols_daily_data[list(futures_symbols).index(plot_symbol)]

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
  go.Candlestick(
    x=df.index,
    open=df['Open'],
    high=df['High'],
    low=df['Low'],
    close=df['Close'],
    name='Price',
  ),
  secondary_y=True,
)
fig.add_trace(
  go.Bar(x=df.index, y=df['Volume'], name='Volume'),
  secondary_y=False,
)
# label the figure so it can be read on its own
fig.update_layout(title_text=futures_symbols[plot_symbol]+' ('+plot_symbol+'): daily prices and volume')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Volume', secondary_y=False)
fig.update_yaxes(title_text='Price', secondary_y=True)

fig.show()