In [1]:
import requests
import pandas as pd
import os 
import numpy as np

## This code file mainly does two things:

1. Download the masterIndex files from SEC EDGAR. The 'Full and Quarterly Indexes' page of EDGAR can be found here:

    https://www.sec.gov/Archives/edgar/full-index/

    We need to go to the folder for each specific year and quarter and download the masterIndex file from it. 
    The time frame for this project is from 2018 Q1 to 2022 Q4.

2. Download the 10Q filings based on masterIndexes:
    
    Each masterIndex row has the following attributes: CIK, company name, form type, filing date, and the path of the filing. Therefore, each row whose form type is 10-Q corresponds to one 10-Q .txt file. 

    We write two nested loops: the outer loop goes through each masterIndex file, and the inner loop goes through each 10-Q row of that file, builds the download link of the filing, and downloads it.

In the end, I stored the downloaded raw 10-Q .txt files in the folder '10Q': 7320 files, occupying roughly 101GB. 
(The download took roughly 2.5 hours, and I had to switch to a different drive halfway through in order to store everything.)


## Download master Index files

In [2]:
# Request headers sent with every master-index download (one entry per line;
# the values and their order are unchanged).
heads = {
    'Host': 'www.sec.gov',
    'Connection': 'close',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}

def download_master(year, down_direct=r"D:\NLP-HW1\masterIndexes", timeout=60):
    """Download the four quarterly EDGAR ``master.idx`` files for one year.

    Each file is saved as ``master{year}QTR{qtr}.idx`` inside ``down_direct``.

    Parameters
    ----------
    year : int
        Calendar year whose QTR1..QTR4 index files are fetched.
    down_direct : str, optional
        Local folder that receives the files. It is created if it does not
        exist. The default is the folder the notebook was originally run with.
    timeout : float, optional
        Seconds to wait on the server before giving up, so that a stalled
        connection raises instead of blocking the notebook forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status for any quarter.
    """
    # Create the target folder up front; open() below would otherwise fail
    # on a machine where the folder has not been made by hand.
    os.makedirs(down_direct, exist_ok=True)

    for qtr in range(1, 5):
        url = f"https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{qtr}/master.idx"
        response = requests.get(url, headers=heads, timeout=timeout)
        print(url)
        response.raise_for_status()

        # os.path.join picks the right separator instead of mixing '\' and '/'.
        target_path = os.path.join(down_direct, f'master{year}QTR{qtr}.idx')
        with open(target_path, 'wb') as f:
            f.write(response.content)

# Fetch the quarterly master indexes for every year of the study window,
# 2018 through 2022 (both ends included).
start_year, end_year = 2018, 2022
for i in range(start_year, end_year + 1):
    download_master(i)

https://www.sec.gov/Archives/edgar/full-index/2018/QTR1/master.idx
https://www.sec.gov/Archives/edgar/full-index/2018/QTR2/master.idx
https://www.sec.gov/Archives/edgar/full-index/2018/QTR3/master.idx
https://www.sec.gov/Archives/edgar/full-index/2018/QTR4/master.idx
https://www.sec.gov/Archives/edgar/full-index/2019/QTR1/master.idx
https://www.sec.gov/Archives/edgar/full-index/2019/QTR2/master.idx
https://www.sec.gov/Archives/edgar/full-index/2019/QTR3/master.idx
https://www.sec.gov/Archives/edgar/full-index/2019/QTR4/master.idx
https://www.sec.gov/Archives/edgar/full-index/2020/QTR1/master.idx
https://www.sec.gov/Archives/edgar/full-index/2020/QTR2/master.idx
https://www.sec.gov/Archives/edgar/full-index/2020/QTR3/master.idx
https://www.sec.gov/Archives/edgar/full-index/2020/QTR4/master.idx
https://www.sec.gov/Archives/edgar/full-index/2021/QTR1/master.idx
https://www.sec.gov/Archives/edgar/full-index/2021/QTR2/master.idx
https://www.sec.gov/Archives/edgar/full-index/2021/QTR3/master

## Download 10 Q for SP500

In [4]:
# read in the sp500_cik file, which is obtained from the sp500_crsp.ipynb;
# the column is renamed so it matches the 'CIK' column of the master index
sp500_cik = pd.read_csv('sp500_ciks.csv').rename(columns={"cik": "CIK"})

# get the master_list: keep only the downloaded .idx files (os.listdir also
# returns stray entries such as .ipynb_checkpoints) and sort them, because
# os.listdir makes no promise about order
master_list = sorted(
    name for name in os.listdir('masterIndexes') if name.endswith('.idx')
)

In [47]:
# A User-Agent header has to accompany these requests. To use your own value,
# search the web for "My User Agent" and paste the result as the dictionary value.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'

# function to download files based on url, write it to a directory, return a local_filename
def download_file(url, directory, local_filename, timeout=60):
    """Stream ``url`` to ``directory/local_filename`` and return ``local_filename``.

    The body is first written to ``<local_filename>.part`` and only renamed to
    its final name once the whole response has been received, so an
    interrupted download never leaves a truncated file that looks complete.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    directory : str
        Folder to store the file in; created if it does not exist.
    local_filename : str
        Name of the file inside ``directory``.
    timeout : float, optional
        Seconds to wait on the server before giving up, so that a stalled
        connection raises instead of blocking forever.

    Returns
    -------
    str
        ``local_filename``, unchanged.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Make directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    final_path = os.path.join(directory, local_filename)
    partial_path = final_path + '.part'

    try:
        with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Promote the finished download; os.replace overwrites an existing
        # file of the same name, matching the original 'wb' behaviour.
        os.replace(partial_path, final_path)
    except BaseException:
        # Also covers a manual kernel interrupt: remove the leftover and re-raise.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    return local_filename

 
# outer loop: loop through the master_list, and read the masterIndexes of each file into a dataframe
for master_file_name in master_list:
    master_path = os.path.join('masterIndexes', master_file_name)

    # latin-1 maps every possible byte to a character, so a stray non-UTF-8
    # byte (e.g. in a company name) cannot abort the run; pure-ASCII content
    # decodes exactly as before.
    with open(master_path, 'r', encoding='latin-1') as f:
        # Find the line index where the table starts (where 'CIK' is found)
        start_line = next((i for i, line in enumerate(f) if 'CIK' in line), None)
    if start_line is None:
        raise ValueError("no 'CIK' header line found in " + master_path)

    # Read the table into a dataframe; +2 skips the header row itself and the
    # line that follows it, so the first parsed line is the first data row
    df = pd.read_csv(master_path, delimiter='|', skiprows=start_line+2,
                     names=['CIK', 'Company Name', 'Form Type', 'Date Filed', 'Filename'],
                     encoding='latin-1')
    df = df.dropna()
    # filter for Form Type == 10-Q
    df = df[df['Form Type'] == '10-Q']
    # keep only the companies listed in sp500_cik
    df = pd.merge(df, sp500_cik, how='inner', on='CIK')

    # update the filename: turn the relative archive path into a full URL
    df['Filename'] = 'https://www.sec.gov/Archives/' + df['Filename']
    # If sp500_cik lists a CIK more than once, the merge repeats the filing.
    # The first matching row already decided the local file name, so keeping
    # only that row produces the same files without re-downloading.
    df = df.drop_duplicates(subset='Filename')

    # local Directory to store the downloaded txt files
    # (master_file_name[:-4] strips the 4-character '.idx' extension)
    directory = os.path.join(r"D:\NLP-HW1\10Q", master_file_name[:-4] + '_10Q')

    # inner loop: walk the rows once and download each 10Q.
    # Columns are taken by position exactly as before: 0 is CIK, 3 is
    # 'Date Filed', and -2 is the second-to-last column -- presumably the
    # ticker coming from sp500_ciks.csv; verify against that file's layout.
    for cik, ticker, date_filed, link in zip(df.iloc[:, 0], df.iloc[:, -2],
                                             df.iloc[:, 3], df['Filename']):
        # name each file to be a combination of 'cik-ticker-date', to facilitate later stock returns download based on date
        filename = str(cik) + '-' + str(ticker) + '-' + str(date_filed) + '.txt'
        # Call the download file function
        download_file(link, directory, filename)
        print("finish dowloading :" + link)



finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1001250/0001104659-18-006059.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1002047/0001564590-18-002876.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/100493/0000100493-18-000017.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1013237/0001437749-18-000456.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1024478/0001024478-18-000006.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1037038/0001037038-18-000005.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1048695/0001048695-18-000005.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1048911/0001564590-18-006410.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/106040/0000106040-18-000010.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/10795/0000010795-18-000008.txt
finish dowloadinghttps://www.sec.gov/Archives/edgar/data/1090872/0001090872-18-000004.txt
finish dowload

In the end, we have 20 subfolders under the '10Q' folder, one per quarter, which store the 10-Q files filed by the S&P 500 companies in each quarter. 

We have a total of 7320 files, fewer than the expected 10,000 but still a fairly decent amount, occupying 101GB.