In [1]:
# Importing built-in libraries 
import re
import os
from time import gmtime, strftime
from datetime import datetime, timedelta
import unicodedata

# Importing libraries you may need to install
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm

In [2]:
# from wikipedia download current SP 500 list with symbol/ticker and CIK
cwd=os.getcwd()
res=requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# if requests fail, show the failure
if res.status_code != 200:
    text='Request to list SP 500 companies fails with error code' + str(res.status_code)
    print(text)

In [3]:
# sptable is the table with SP500 company list
soup= bs.BeautifulSoup(res.text,'lxml')
tables=soup.find_all('table')
sptable = pd.read_html(str(tables[0]), header=0)[0]
sptable.to_csv('sp500_companylist.csv',index=False)

In [4]:
# CIK in sptable is an integer which does not have preceding zeros, need to convert to string with preceding zeros
print('CIK data type:',type(sptable['CIK'][0]))
sptable['CIK']=[str(cik).zfill(10) for cik in sptable['CIK']]
print('CIK data type after modification:',type(sptable['CIK'][0]))

CIK data type: <class 'numpy.int64'>
CIK data type after modification: <class 'str'>


In [5]:
# the folder that you want to save the 10k and 10q reports for SP500 companies (need to change to your own folder)
pathname_10k = '/Users/oliviali/Downloads/10K'
pathname_10q = '/Users/oliviali/Downloads/10Q'

In [6]:
def WriteLogFile(log_file_name, text):
    
    '''
    Helper function.
    Writes a log file with all notes and
    error messages from a scraping "session".
    
    Parameters
    ----------
    log_file_name : str
        Name of the log file (should be a .txt file).
    text : str
        Text to write to the log file.
        
    Returns
    -------
    None.
    
    '''
    
    with open(log_file_name, "a") as log_file:
        log_file.write(text)

    return

In [7]:
def Scrape10K(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name):
    
    '''
    Scrapes all 10-Ks and 10-K405s for a particular 
    CIK from EDGAR.
    
    Parameters
    ----------
    browse_url_base : str
        Base URL for browsing EDGAR.
    filing_url_base : str
        Base URL for filings listings on EDGAR.
    doc_url_base : str
        Base URL for one filing's document tables
        page on EDGAR.
    cik : str
        Central Index Key.
    log_file_name : str
        Name of the log file (should be a .txt file).
        
    Returns
    -------
    None.
    
    '''
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(cik)
    except OSError:
        print("Already scraped CIK", cik)
        return
    
    # If we haven't, go into the directory for that CIK
    os.chdir(cik)
    
    print('Scraping CIK', cik)
    
    # Request list of 10-K filings
    res = requests.get(browse_url_base % cik)
    
    # If the request failed, log the failure and exit
    if res.status_code != 200:
        os.chdir('..')
        os.rmdir(cik) # remove empty dir
        text = "Request failed with error code " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base % cik) + '\n'
        WriteLogFile(log_file_name, text)
        return

    # If the request doesn't fail, continue...
    
    # Parse the response HTML using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

    # Extract all tables from the response
    html_tables = soup.find_all('table')
    
    # Check that the table we're looking for exists
    # If it doesn't, exit
    if len(html_tables)<3:
        os.chdir('..')
        return
    
    # Parse the Filings table
    filings_table = pd.read_html(str(html_tables[2]), header=0)[0]
    filings_table['Filings'] = [str(x) for x in filings_table['Filings']]

    # Get 10-K, 10-K/A, 10-K405 and 10-KT document filings
    filings_table = filings_table[(filings_table['Filings'] == '10-K')|(filings_table['Filings'] == '10-K405')|\
                                 (filings_table['Filings'] == '10-K/A')|(filings_table['Filings'] == '10-KT')]

    # If filings table doesn't have any
    # 10-Ks or 10-K405s, exit
    if len(filings_table)==0:
        os.chdir('..')
        return
    
    # Get accession number for each 10-K and 10-K405 filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]

    # Iterate through each filing and 
    # scrape the corresponding document...
    for index, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        docs_page = requests.get(filing_url_base % (cik, acc_no))
        
        # If request fails, log the failure
        # and skip to the next filing
        if docs_page.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base % (cik, acc_no)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue

        # If request succeeds, keep going...
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables)==0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        # Get the 10-K,10-K405, 10-KT and 10-K/A entries for the filing
        docs_table = docs_table[(docs_table['Type'] == '10-K')|(docs_table['Type'] == '10-K405')|\
                                (docs_table['Type'] == '10-K/A')|(docs_table['Type'] == '10-KT')]
        
        # If there aren't any 10-K or 10-K405 entries,
        # skip to the next filing
        if len(docs_table)==0:
            continue
        # If there are 10-K or 10-K405 or 10-K/A entries,
        # grab the first document
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = str(docs_table['Document']).split(' ')[0]   # required for recent submissions
        
        # If that first entry is unavailable,
        # log the failure and exit
        if str(docname) == 'nan':
            os.chdir('..')
            text = 'File with CIK: %s and Acc_No: %s is unavailable' % (cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue       
        
        # If it is available, continue...
        
        # Request the file
        file = requests.get(doc_url_base % (cik, acc_no.replace('-', ''), docname))
        
        # If the request fails, log the failure and exit
        if file.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base % (cik, acc_no.replace('-', ''), docname)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue
        
        # If it succeeds, keep going...
        
        # Save the file in appropriate format: if it is a modified report, _a is noted; otherwise _r is noted
        if docs_table['Type']=='10-K/A':
            if '.txt' in docname:
        # Save text as TXT
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'a'+'.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
        # Save text as HTML
                date = str(row['Filing Date'])
                filename = cik + '_' + date  + '_'+'a'+'.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        elif docs_table['Type']=='10-KT':
            if '.txt' in docname:
        # Save text as TXT
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'t'+'.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
        # Save text as HTML
                date = str(row['Filing Date'])
                filename = cik + '_' + date  + '_'+'t'+'.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        else:
            if '.txt' in docname:
        # Save text as TXT
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'r'+'.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
        # Save text as HTML
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'r'+'.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        
    # Move back to the main 10-K directory
    os.chdir('..')
        
    return

In [8]:
def Scrape10Q(browse_url_base, filing_url_base, doc_url_base, cik, log_file_name):
    
    '''
    Scrapes all 10-Qs for a particular CIK from EDGAR.
    
    Parameters
    ----------
    browse_url_base : str
        Base URL for browsing EDGAR.
    filing_url_base : str
        Base URL for filings listings on EDGAR.
    doc_url_base : str
        Base URL for one filing's document tables
        page on EDGAR.
    cik : str
        Central Index Key.
    log_file_name : str
        Name of the log file (should be a .txt file).
        
    Returns
    -------
    None.
    
    '''
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(cik)
    except OSError:
        print("Already scraped CIK", cik)
        return
    
    # If we haven't, go into the directory for that CIK
    os.chdir(cik)
    
    print('Scraping CIK', cik)
    
    # Request list of 10-Q filings
    res = requests.get(browse_url_base % cik)
    
    # If the request failed, log the failure and exit
    if res.status_code != 200:
        os.chdir('..')
        os.rmdir(cik) # remove empty dir
        text = "Request failed with error code " + str(res.status_code) + \
               "\nFailed URL: " + (browse_url_base % cik) + '\n'
        WriteLogFile(log_file_name, text)
        return
    
    # If the request doesn't fail, continue...

    # Parse the response HTML using BeautifulSoup
    soup = bs.BeautifulSoup(res.text, "lxml")

    # Extract all tables from the response
    html_tables = soup.find_all('table')
    
    # Check that the table we're looking for exists
    # If it doesn't, exit
    if len(html_tables)<3:
        print("table too short")
        os.chdir('..')
        return
    
    # Parse the Filings table
    filings_table = pd.read_html(str(html_tables[2]), header=0)[0]
    filings_table['Filings'] = [str(x) for x in filings_table['Filings']]

    # Get 10-Q， 10-QT and 10-Q/A document filings
    filings_table = filings_table[(filings_table['Filings'] == '10-Q')|(filings_table['Filings'] == '10-Q/A')\
                                 |(filings_table['Filings'] == '10-QT')]

    # If filings table doesn't have any
    # 10-Ks or 10-K405s, exit
    if len(filings_table)==0:
        os.chdir('..')
        return
    
    # Get accession number for each 10-K and 10-K405 filing
    filings_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in filings_table['Description']]

    # Iterate through each filing and 
    # scrape the corresponding document...
    for index, row in filings_table.iterrows():
        
        # Get the accession number for the filing
        acc_no = str(row['Acc_No'])
        
        # Navigate to the page for the filing
        docs_page = requests.get(filing_url_base % (cik, acc_no))
        
        # If request fails, log the failure
        # and skip to the next filing    
        if docs_page.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(docs_page.status_code) + \
                   "\nFailed URL: " + (filing_url_base % (cik, acc_no)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue
            
        # If request succeeds, keep going...
        
        # Parse the table of documents for the filing
        docs_page_soup = bs.BeautifulSoup(docs_page.text, 'lxml')
        docs_html_tables = docs_page_soup.find_all('table')
        if len(docs_html_tables)==0:
            continue
        docs_table = pd.read_html(str(docs_html_tables[0]), header=0)[0]
        docs_table['Type'] = [str(x) for x in docs_table['Type']]
        
        # Get the 10-Q entries for the filing
        docs_table = docs_table[(docs_table['Type'] == '10-Q')|(docs_table['Type'] == '10-Q/A')|\
                               (docs_table['Type'] == '10-QT')]
        
        # If there aren't any 10-K or 10-K405 entries,
        # skip to the next filing
        if len(docs_table)==0:
            continue
        # If there are 10-K or 10-K405 entries,
        # grab the first document
        elif len(docs_table)>0:
            docs_table = docs_table.iloc[0]
        
        docname = str(docs_table['Document']).split(' ')[0]   # required for recent submissions
        
        # If that first entry is unavailable,
        # log the failure and exit
        if str(docname) == 'nan':
            os.chdir('..')
            text = 'File with CIK: %s and Acc_No: %s is unavailable' % (cik, acc_no) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue       
        
        # If it is available, continue...
        
        # Request the file
        file = requests.get(doc_url_base % (cik, acc_no.replace('-', ''), docname))
        
        # If the request fails, log the failure and exit
        if file.status_code != 200:
            os.chdir('..')
            text = "Request failed with error code " + str(file.status_code) + \
                   "\nFailed URL: " + (doc_url_base % (cik, acc_no.replace('-', ''), docname)) + '\n'
            WriteLogFile(log_file_name, text)
            os.chdir(cik)
            continue
            
        # If it succeeds, keep going...
        
        # Save the file in appropriate format
        if docs_table['Type']=='10-Q/A':
            if '.txt' in docname:
        # Save text as TXT
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'a'+'.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
        # Save text as HTML
                date = str(row['Filing Date'])
                filename = cik + '_' + date  + '_'+'a'+'.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        elif docs_table['Type']=='10-QT':
            if '.txt' in docname:
        # Save text as TXT
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'t'+'.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
        # Save text as HTML
                date = str(row['Filing Date'])
                filename = cik + '_' + date  + '_'+'t'+'.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        else:
            if '.txt' in docname:
        # Save text as TXT
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'r'+'.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
        # Save text as HTML
                date = str(row['Filing Date'])
                filename = cik + '_' + date + '_'+'r'+'.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        
    # Move back to the main 10-Q directory
    os.chdir('..')
        
    return

In [None]:
# Run the function to scrape 10-Ks

# Define parameters
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-K'
filing_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

# Set correct directory
os.chdir(pathname_10k)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
with open(log_file_name, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for cik in tqdm(sptable['CIK']):
    Scrape10K(browse_url_base=browse_url_base_10k, 
          filing_url_base=filing_url_base_10k, 
          doc_url_base=doc_url_base_10k, 
          cik=cik,
          log_file_name=log_file_name)

 62%|██████▏   | 312/505 [00:00<00:00, 1235.77it/s]

Already scraped CIK 0000066740
Already scraped CIK 0000001800
Already scraped CIK 0001551152
Already scraped CIK 0000815094
Already scraped CIK 0001467373
Already scraped CIK 0000718877
Already scraped CIK 0000796343
Already scraped CIK 0000002488
Already scraped CIK 0001158449
Already scraped CIK 0000874761
Already scraped CIK 0000004977
Already scraped CIK 0001090872
Already scraped CIK 0000002969
Already scraped CIK 0001086222
Already scraped CIK 0000766421
Already scraped CIK 0000915913
Already scraped CIK 0001035443
Already scraped CIK 0000899866
Already scraped CIK 0001097149
Already scraped CIK 0001579241
Already scraped CIK 0001101215
Already scraped CIK 0000352541
Already scraped CIK 0000899051
Already scraped CIK 0001652044
Already scraped CIK 0001652044
Already scraped CIK 0000764180
Already scraped CIK 0001018724
Already scraped CIK 0001748790
Already scraped CIK 0001002910
Already scraped CIK 0000006201
Already scraped CIK 0000004904
Already scraped CIK 0000004962
Already 

Already scraped CIK 0000865752
Already scraped CIK 0001059556
Already scraped CIK 0000895421
Already scraped CIK 0001285785
Already scraped CIK 0000068505
Already scraped CIK 0001408198
Already scraped CIK 0001623613
Already scraped CIK 0001120193
Already scraped CIK 0001021860
Already scraped CIK 0001002047
Already scraped CIK 0001065280
Already scraped CIK 0000814453
Already scraped CIK 0001164727
Already scraped CIK 0001564708
Already scraped CIK 0001564708
Already scraped CIK 0000753308
Already scraped CIK 0001492633
Already scraped CIK 0000320187
Already scraped CIK 0001111711
Already scraped CIK 0000072207
Already scraped CIK 0000072333
Already scraped CIK 0000702165
Already scraped CIK 0000073124
Already scraped CIK 0001133421
Already scraped CIK 0000849399
Already scraped CIK 0001513761
Already scraped CIK 0001013871
Already scraped CIK 0000073309
Already scraped CIK 0001045810
Already scraped CIK 0000906163
Already scraped CIK 0000898173
Already scraped CIK 0000797468
Already 

 87%|████████▋ | 440/505 [00:05<00:00, 80.73it/s]  

Scraping CIK 0000096943


 87%|████████▋ | 441/505 [00:27<07:09,  6.71s/it]

Scraping CIK 0000097476


 88%|████████▊ | 442/505 [00:49<11:47, 11.24s/it]

Scraping CIK 0000217346


 88%|████████▊ | 443/505 [01:14<15:58, 15.45s/it]

Scraping CIK 0000097745


 88%|████████▊ | 444/505 [01:41<19:16, 18.96s/it]

Scraping CIK 0000098246


 88%|████████▊ | 445/505 [02:03<19:41, 19.69s/it]

Scraping CIK 0000109198


 88%|████████▊ | 446/505 [02:21<19:04, 19.40s/it]

Scraping CIK 0000916365


 89%|████████▊ | 447/505 [02:41<18:45, 19.41s/it]

Scraping CIK 0001466258


 89%|████████▊ | 448/505 [02:52<16:03, 16.90s/it]

Scraping CIK 0001260221


 89%|████████▉ | 449/505 [03:07<15:15, 16.34s/it]

Scraping CIK 0000086312


 89%|████████▉ | 450/505 [03:32<17:17, 18.86s/it]

Scraping CIK 0000092230


 89%|████████▉ | 451/505 [03:54<18:00, 20.02s/it]

Scraping CIK 0001418091


 90%|████████▉ | 452/505 [04:02<14:19, 16.22s/it]

Scraping CIK 0000100493


 90%|████████▉ | 453/505 [04:26<16:04, 18.55s/it]

Scraping CIK 0000074208


 90%|████████▉ | 454/505 [04:51<17:31, 20.61s/it]

Scraping CIK 0001403568


 90%|█████████ | 455/505 [05:23<19:57, 23.96s/it]

Scraping CIK 0000036104


 90%|█████████ | 456/505 [05:43<18:37, 22.81s/it]

Scraping CIK 0001336917


 90%|█████████ | 457/505 [06:00<16:53, 21.11s/it]

Already scraped CIK 0001336917
Scraping CIK 0000100885


 91%|█████████ | 459/505 [06:26<14:16, 18.63s/it]

Scraping CIK 0000100517


 91%|█████████ | 460/505 [06:49<15:01, 20.02s/it]

Scraping CIK 0000731766


 91%|█████████▏| 461/505 [07:11<15:02, 20.52s/it]

Scraping CIK 0001090727


 91%|█████████▏| 462/505 [07:30<14:27, 20.17s/it]

Scraping CIK 0001067701


 92%|█████████▏| 463/505 [07:55<15:03, 21.52s/it]

Scraping CIK 0000352915


 92%|█████████▏| 464/505 [08:19<15:16, 22.35s/it]

Scraping CIK 0000005513


 92%|█████████▏| 465/505 [08:42<15:04, 22.61s/it]

Scraping CIK 0000103379


 92%|█████████▏| 466/505 [09:07<15:01, 23.11s/it]

Scraping CIK 0001035002


 92%|█████████▏| 467/505 [09:28<14:19, 22.61s/it]

Scraping CIK 0000203527


 93%|█████████▎| 468/505 [09:55<14:39, 23.77s/it]

Scraping CIK 0000740260


 93%|█████████▎| 469/505 [10:24<15:13, 25.38s/it]

Scraping CIK 0001014473


 93%|█████████▎| 470/505 [10:45<14:07, 24.20s/it]

Scraping CIK 0001442145


 93%|█████████▎| 471/505 [10:56<11:29, 20.28s/it]

Scraping CIK 0000732712


 93%|█████████▎| 472/505 [11:19<11:36, 21.12s/it]

Scraping CIK 0000875320


 94%|█████████▎| 473/505 [11:40<11:15, 21.11s/it]

Scraping CIK 0001339947


 94%|█████████▍| 474/505 [11:56<09:58, 19.31s/it]

Scraping CIK 0001403161


 94%|█████████▍| 475/505 [12:10<08:59, 17.98s/it]

Scraping CIK 0000899689


 94%|█████████▍| 476/505 [12:47<11:26, 23.69s/it]

Scraping CIK 0001396009


 94%|█████████▍| 477/505 [13:00<09:31, 20.42s/it]

Scraping CIK 0000011544


 95%|█████████▍| 478/505 [13:20<09:04, 20.16s/it]

Scraping CIK 0000943452


 95%|█████████▍| 479/505 [13:43<09:11, 21.21s/it]

Scraping CIK 0000104169


 95%|█████████▌| 480/505 [14:04<08:48, 21.13s/it]

Scraping CIK 0001618921


 95%|█████████▌| 481/505 [14:10<06:35, 16.48s/it]

Scraping CIK 0001001039


 95%|█████████▌| 482/505 [14:33<07:03, 18.43s/it]

Scraping CIK 0000823768


 96%|█████████▌| 483/505 [14:56<07:15, 19.79s/it]

Scraping CIK 0001000697


 96%|█████████▌| 484/505 [15:17<07:04, 20.21s/it]

Scraping CIK 0000783325


 96%|█████████▌| 485/505 [15:42<07:12, 21.62s/it]

Scraping CIK 0000072971


 96%|█████████▌| 486/505 [16:04<06:55, 21.85s/it]

Scraping CIK 0000766704


 96%|█████████▋| 487/505 [16:33<07:09, 23.86s/it]

Scraping CIK 0000105770


 97%|█████████▋| 488/505 [16:54<06:33, 23.13s/it]

Scraping CIK 0000106040


 97%|█████████▋| 489/505 [17:16<06:01, 22.60s/it]

Scraping CIK 0001365135


 97%|█████████▋| 490/505 [17:30<05:01, 20.11s/it]

Scraping CIK 0001636023


 97%|█████████▋| 491/505 [17:34<03:32, 15.16s/it]

Scraping CIK 0000106535


 97%|█████████▋| 492/505 [17:56<03:43, 17.19s/it]

Scraping CIK 0000106640


 98%|█████████▊| 493/505 [18:17<03:40, 18.37s/it]

Scraping CIK 0000107263


 98%|█████████▊| 494/505 [18:41<03:43, 20.29s/it]

Scraping CIK 0001140536


 98%|█████████▊| 495/505 [19:05<03:33, 21.33s/it]

Scraping CIK 0001174922


 98%|█████████▊| 496/505 [19:36<03:38, 24.23s/it]

Scraping CIK 0000072903


 98%|█████████▊| 497/505 [20:04<03:22, 25.25s/it]

Scraping CIK 0000108772


 99%|█████████▊| 498/505 [20:31<03:00, 25.79s/it]

Scraping CIK 0000743988


 99%|█████████▉| 499/505 [20:51<02:24, 24.13s/it]

Scraping CIK 0001524472


 99%|█████████▉| 500/505 [21:00<01:37, 19.49s/it]

Scraping CIK 0001041061


 99%|█████████▉| 501/505 [21:21<01:20, 20.10s/it]

Scraping CIK 0000877212


In [None]:
# Run the function to scrape 10-Qs

# Define parameters
browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-Q&count=1000'
filing_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base_10q = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

# Set correct directory (fill this out yourself!)
os.chdir(pathname_10q)

# Initialize log file
# (log file name = the time we initiate scraping session)
time = strftime("%Y-%m-%d %Hh%Mm%Ss", gmtime())
log_file_name = 'log '+time+'.txt'
log_file = open(log_file_name, 'a')
log_file.close()

# Iterate over CIKs and scrape 10-Ks
for cik in tqdm(sptable['CIK']):
    Scrape10Q(browse_url_base=browse_url_base_10q, 
          filing_url_base=filing_url_base_10q, 
          doc_url_base=doc_url_base_10q, 
          cik=cik,
          log_file_name=log_file_name)