# Daily Index Scraper

Each file under the daily index is associated with multiple filings for a broad range of companies. 

Link: https://www.sec.gov/Archives/edgar/daily-index/

In [None]:
# import some libraries
import requests
from bs4 import BeautifulSoup
import urllib

# creating function for joining urls together
def make_url(base, components):
    """Join a base URL and path components using single '/' separators.

    Args:
        base: Base URL string. A trailing '/' is tolerated and does not
            produce a doubled slash once a component is appended.
        components: Iterable of path segments appended in order. Each one
            is converted with ``str`` and has leading slashes removed.

    Returns:
        The combined URL. When ``components`` is empty, ``base`` is
        returned unchanged. A trailing '/' on the last component is kept.
    """
    url = base
    for comp in components:
        # Strip the slashes that meet at the join so exactly one remains;
        # the original produced '...daily-index//2022' for a base ending in '/'.
        url = '{}/{}'.format(url.rstrip('/'), str(comp).lstrip('/'))

    return url

# Extra HTTP headers passed along with each request made below
headers = {
    'user-agent': 'William',
    'Host': 'www.sec.gov',
}

# Root of the daily index, plus sample path pieces for trying out make_url
base_url = "https://www.sec.gov/Archives/edgar/daily-index/"
components = ['2022', 'QTR3', 'company.20220701.idx']

# Left as the final expression so the notebook displays the joined url
make_url(base_url, components)

### Now let's build the url for each year and quarter
We can pass '2022' and 'index.json' as the components to fetch the directory listing for that year first.

In [None]:
# Year whose daily-index directory is crawled; defined once so every url
# built below stays in sync
year = '2022'

# Building the year url
year_url = make_url(base_url, [year, 'index.json'])

# Saving the content from a GET request of the year url.
# The timeout keeps a stalled connection from hanging the notebook.
content = requests.get(year_url, headers=headers, timeout=30)

# Raise a clear HTTP error on a 4xx/5xx response instead of letting the
# JSON decoding below fail on an error page
content.raise_for_status()

# Convert the content to JSON format
decode = content.json()

# Each entry of the year listing is treated as a quarter directory
for item in decode['directory']['item']:

    print('-'*100)
    print('Pulling url for quarter: {}'.format(item['name']))

    qtr_url = make_url(base_url, [year, item['name'], 'index.json'])

    print(qtr_url)

    # Fetch the listing of that quarter, with the same status check as above
    file_content = requests.get(qtr_url, headers=headers, timeout=30)
    file_content.raise_for_status()
    decode_file = file_content.json()

    print('-'*100)
    print('Pulling files')

    # Print the full url of every file listed under the quarter
    for file in decode_file['directory']['item']:
        file_url = make_url(base_url, [year, item['name'], file['name']])
        print(file_url)

### Now let's make a request to one of those files
This will allow us to see what each file looks like.

In [105]:
# File name variable
file_name = "master.20220728.idx"
# Example url from the listing above (single '/' between path segments)
file_url = f"https://www.sec.gov/Archives/edgar/daily-index/2022/QTR3/{file_name}"

# make a request for the file url; the timeout avoids hanging on a stalled
# connection
file_response = requests.get(file_url, headers=headers, timeout=30)

# Stop here on a 4xx/5xx response so an error page is never saved to disk
# as if it were the index file
file_response.raise_for_status()
file_content = file_response.content

# writing the raw bytes to a local file named after the index file
with open(file_name, 'wb') as f:
    f.write(file_content)

In [112]:
# reading the content of the text file we just created
with open(file_name, 'rb') as f:
    byte_data = f.read()

# time to decode the byte data
data = byte_data.decode('utf-8')

# separating data sections: everything before the dashed line is dropped,
# the rows to parse come after it
data = data.split('--------------------------------------------------------------------------------')

# Without the dashed separator there is nothing to parse; fail with a clear
# message rather than an IndexError further down
if len(data) < 2:
    raise ValueError('{} does not contain the dashed separator line'.format(file_name))

# beginning cleaning process of the data
data_format = data[1:]
data_format = data_format[0].splitlines()
# drop the first line (presumably the empty remainder of the separator line)
data_format = data_format[1:]
# NOTE: master_data is initialised here but not filled anywhere in this cell
master_data = []

# looping through the data list
for item in data_format:
    item_list = item.split('|')

    # Rows are read as five '|'-separated fields; skip blank or malformed
    # lines instead of crashing on a missing field
    if len(item_list) < 5:
        continue

    CIK, company_name, form_type, date_issued = item_list[:4]
    file_url = "https://www.sec.gov/Archives/" + item_list[4]

    # fetching all forms that are 10-Q or 8-K
    if form_type in ('10-Q', '8-K'):
        print(company_name + '  -----  ' + form_type)
        print(file_url)

CORE LABORATORIES N V  -----  10-Q
https://www.sec.gov/Archives/edgar/data/1000229/0000950170-22-013393.txt
TUCSON ELECTRIC POWER CO  -----  10-Q
https://www.sec.gov/Archives/edgar/data/100122/0000100122-22-000031.txt
DUKE REALTY Ltd PARTNERSHIP/  -----  8-K
https://www.sec.gov/Archives/edgar/data/1003410/0000783280-22-000049.txt
SOUTHERN Co GAS  -----  10-Q
https://www.sec.gov/Archives/edgar/data/1004155/0000092122-22-000039.txt
SOUTHERN Co GAS  -----  8-K
https://www.sec.gov/Archives/edgar/data/1004155/0000092122-22-000041.txt
OCEANFIRST FINANCIAL CORP  -----  8-K
https://www.sec.gov/Archives/edgar/data/1004702/0001004702-22-000109.txt
PG&E Corp  -----  10-Q
https://www.sec.gov/Archives/edgar/data/1004980/0001004980-22-000112.txt
PG&E Corp  -----  8-K
https://www.sec.gov/Archives/edgar/data/1004980/0001004980-22-000111.txt
COLUMBUS MCKINNON CORP  -----  10-Q
https://www.sec.gov/Archives/edgar/data/1005229/0001005229-22-000240.txt
COLUMBUS MCKINNON CORP  -----  8-K
https://www.sec.gov