In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
from pandas.tseries.offsets import BDay
import re
import csv
import os

import warnings
warnings.filterwarnings("ignore")


List of all Settlements archives

In [4]:
# Names of every file in the Settlements archive folder (order as returned by the OS)
lst = os.listdir('data/Settlements')
lst[:10]

['Corn_19_1_2018_ags_settlements.txt',
 'Wheat_2_6_2017_ags_settlements.txt',
 '13_11_2017_ags_settlements.txt',
 'Soybean Oil_26_9_2017_ags_settlements.txt',
 '21_8_2018_eonly_settlements.txt',
 'Soybean Oil_24_6_2019_ags_settlements.txt',
 'Wheat_17_4_2019_ags_settlements.txt',
 '7_11_2016_ags_settlements.txt',
 '16_7_2018_ags_settlements.txt',
 'Soybeans_13_12_2016_ags_settlements.txt']

Getting the first entry in each file name to identify agricultural contracts

In [5]:
# One row per archive file; the columns are the '_'-separated pieces of its name
archs = pd.DataFrame([file_name.split('_') for file_name in lst])
list(set(archs[0]))[:10]

['30-Year T-Note',
 '9',
 'Corn',
 '19',
 '13',
 '21',
 'S&P',
 '2',
 'SnP',
 'Soybean Oil']

Identifying agricultural (ags) contracts, separating them by asset, and sorting each asset's files by date

In [6]:
def _sort_by_file_date(frame):
    """Return ``frame`` ordered chronologically by the date in the file name.

    Columns 1, 2 and 3 hold day, month and year as *strings* (they come from
    ``str.split``), so sorting them directly is lexicographic ('10' < '2')
    and does not give date order.  They are cast to int for the ordering
    only; the returned frame keeps its original string values and index
    labels (which are positions into ``lst``).
    """
    date_parts = frame[[3, 2, 1]].astype(int)
    return frame.loc[date_parts.sort_values(by=[3, 2, 1]).index]


# File names start with the asset name, capitalised or lower-case
archs_corn = archs[archs[0].isin(['Corn', 'corn'])]
archs_wheat = archs[archs[0].isin(['Wheat', 'wheat'])]
archs_soybeans = archs[archs[0].isin(['Soybeans', 'soybeans'])]

arches_corn = _sort_by_file_date(archs_corn)
arches_wheat = _sort_by_file_date(archs_wheat)
arches_soybeans = _sort_by_file_date(archs_soybeans)

print("# of contracts corn: ",arches_corn.shape[0])
print("# of contracts wheat: ",arches_wheat.shape[0])
print("# of contracts soybeans: ",arches_soybeans.shape[0])

#first Corn contract
lst[arches_corn.index[0]]

# of contracts corn:  457
# of contracts wheat:  424
# of contracts soybeans:  440


'corn_11_10_2016_settlements.txt'

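Creating a dict whose keys are the contract months in the Settlements file format (e.g. `JAN16`) and whose values are the contract expirations in datetime format. Agricultural products expire on the 14th of every month, or the previous business day if that date falls on a weekend/holiday. Note: the code below computes this as the 15th minus one business day, and pandas `BDay` only skips weekends, so exchange holidays are not adjusted for.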

In [7]:

# Month codes exactly as they are used for the keys below (note 'JLY' for July)
mon = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JLY', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
dummy_dict = dict(enumerate(mon, start=1))

# Key: month code + two-digit year, e.g. 'JAN16'.
# Value: the last business day strictly before the 15th of that month.
exp_dict = {}
for year in range(16, 21):
    for month, code in enumerate(mon, start=1):
        exp_dict[code + str(year)] = datetime(2000 + year, month, 15) - BDay(1)

exp_dict['JAN16']

Timestamp('2016-01-14 00:00:00')

In [8]:
def _parse_tick_price(raw):
    """Convert a quoted settlement price such as ``"345'4"`` to a float.

    The text before the apostrophe is multiplied by 10 and the text after it
    is added (``"345'4" -> 3454.0``); an empty left part counts as 0.  Any
    value that is not a string of that shape (``None``, ``'----'``, ...) is
    returned unchanged so that later ``dropna``/``astype`` calls decide what
    to do with it.
    """
    if not isinstance(raw, str):
        return raw
    parts = raw.split("'")
    if len(parts) < 2:
        return raw
    try:
        return 10 * float(parts[0] or 0) + float(parts[1])
    except ValueError:
        return raw


def _read_settlement_file(path, cont_name, doc_date):
    """Parse one settlement file into ``{'Call': {...}, 'Put': {...}}``.

    Each inner dict maps 0..4 (the first five option sections of that type
    whose header line starts with ``cont_name``) to a DataFrame with columns
    ``strike, settle, oi, expiration, future, date``, or to ``''`` when the
    section's contract month is not among the futures rows of the file.

    Raises whatever the parsing steps raise (missing columns, fewer than five
    sections, non-numeric prices, unknown contract month, ...); the caller
    treats any exception as "skip this file".
    """
    with open(path) as f:
        doc = f.readlines()

    # Row positions of the option-section header lines of the lead contract
    header_pos = [pos for pos, line in enumerate(doc) if line.startswith(cont_name)]

    # Tokenise each line on whitespace (and on commas, as before) into a table
    text = [re.sub(r"\s+", ",", line.strip()).split(',') for line in doc]
    df = pd.DataFrame(text)[[0, 1, 2, 3, 4, 8, 10]].copy()
    df['date'] = doc_date

    # Column 8 (yesterday's settlement prices): quoted prices become floats,
    # everything else is left untouched.  dtype=object keeps None as None.
    df[8] = pd.Series([_parse_tick_price(value) for value in df[8]],
                      index=df.index, dtype=object)

    # Futures prices for the day: rows 1-5 of the file, indexed by contract month
    futs = df.iloc[1:6][[0, 8]].set_index(0)

    first_tokens = df[0].tolist()
    n_rows = len(first_tokens)
    section_kind = df[4]
    sections = (
        ('Call', [pos for pos in header_pos if section_kind.iloc[pos] == 'CALL']),
        ('Put', [pos for pos in header_pos if section_kind.iloc[pos] == 'PUT']),
    )

    date_dict = {}
    for kind, positions in sections:
        vol_dict = {}
        # Look at the 5 most recent option contracts; a file with fewer than
        # five sections raises IndexError here and is skipped by the caller.
        for i in range(5):
            start = positions[i]
            exp = str(df[1].iloc[start])
            if exp not in futs.index:
                # No futures price for this contract month: discard
                vol_dict[i] = ''
                continue

            # Strike rows are the run of rows after the header whose first
            # token is numeric; stop at the end of the file instead of
            # indexing past it.
            last_ind = start
            while last_ind + 1 < n_rows and first_tokens[last_ind + 1].isnumeric():
                last_ind += 1

            # last_ind is the position of the final strike row, so the slice
            # must end at last_ind + 1 (an exclusive end of last_ind would
            # silently drop the last strike).  Rows with missing cells go.
            block = df.iloc[start + 1:last_ind + 1].dropna().copy()
            block['expiration'] = exp_dict[exp]
            block['future'] = float(futs.loc[exp, 8])
            block = block[[0, 8, 10, 'expiration', 'future', 'date']]
            block = block.rename(columns={0: 'strike', 8: 'settle', 10: 'oi'})
            vol_dict[i] = block.astype({'strike': 'float', 'settle': 'float', 'oi': 'int'})
        date_dict[kind] = vol_dict
    return date_dict


def scrape_settlements(asset):
    """Parse every settlement file of one asset into nested dictionaries.

    Parameters
    ----------
    asset : str
        ``'Corn'`` or ``'Wheat'``; any other value selects soybeans.

    Returns
    -------
    dict
        Maps the date taken from each file name (``datetime``) to
        ``{'Call': {0..4: DataFrame or ''}, 'Put': {0..4: DataFrame or ''}}``.
        Files that fail to parse are left out; their index into ``lst`` and
        the error are printed.

    Notes
    -----
    Reads the module-level ``arches_corn`` / ``arches_wheat`` /
    ``arches_soybeans`` frames, the ``lst`` file-name list and ``exp_dict``.
    """
    # Archive table and lead option contract prefix for the requested asset
    if asset == 'Corn':
        arches, cont_name = arches_corn, 'PY'
    elif asset == 'Wheat':
        arches, cont_name = arches_wheat, 'WZ'
    else:
        arches, cont_name = arches_soybeans, 'CZO'

    # Date of each settlement file, from the day/month/year in its name
    dates_list = [
        datetime(int(year), int(month), int(day))
        for day, month, year in zip(arches[1], arches[2], arches[3])
    ]

    asset_full_dict = {}
    for arch_ind, arch in enumerate(tqdm(arches.index)):
        doc_date = dates_list[arch_ind]
        try:
            asset_full_dict[doc_date] = _read_settlement_file(
                'data/Settlements/' + lst[arch], cont_name, doc_date)
        except Exception as err:
            # Best effort: report the file's index and the reason, then skip
            # it.  Not a bare except, so Ctrl-C still stops the run.
            print(arch, repr(err))
    return asset_full_dict



In [9]:
# Parse all corn settlement files; the result has one entry per file that parsed, keyed by file date.
corn_dict = scrape_settlements('Corn')

100%|██████████| 457/457 [19:21<00:00,  2.54s/it]


In [10]:
# Parse all wheat settlement files; numbers printed between progress updates are
# indices (into `lst`) of files that raised during parsing and were skipped.
wheat_dict = scrape_settlements('Wheat')

 86%|████████▋ | 366/424 [18:38<01:40,  1.73s/it]

727


 89%|████████▉ | 377/424 [18:55<01:16,  1.62s/it]

1504


100%|██████████| 424/424 [20:05<00:00,  2.84s/it]


In [11]:
# Parse all soybean settlement files; numbers printed between progress updates are
# indices (into `lst`) of files that raised during parsing and were skipped.
soybeans_dict = scrape_settlements('Soybeans')

 86%|████████▋ | 380/440 [18:56<02:29,  2.49s/it]

1698


 89%|████████▊ | 390/440 [19:18<01:57,  2.36s/it]

951


100%|██████████| 440/440 [21:02<00:00,  2.87s/it]


In [13]:
import pickle

# Persist each asset's parsed results so the slow scraping cells need not be re-run
for pickle_path, parsed in (
    ('data/corn_dict.pickle', corn_dict),
    ('data/wheat_dict.pickle', wheat_dict),
    ('data/soybeans_dict.pickle', soybeans_dict),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(parsed, handle)