
# SEC Scraper for a single company

### Important resources
- SEC Documentation: https://www.sec.gov/os/accessing-edgar-data
- YouTube tutorial: https://www.youtube.com/playlist?list=PLcFcktZ0wnNl5X7Qn1JM4jhrIOBsNj1qa


In [55]:
# import some libraries
import requests
from bs4 import BeautifulSoup

# Root of the EDGAR archive tree that the company directories hang off
base_url = "https://www.sec.gov/Archives/edgar/data/"

# CIK number identifying the filer -- here, GOOGL
cik_num = "1652044"

# JSON directory listing for that company: <base><cik>/index.json
filing_url = f"{base_url}{cik_num}/index.json"
filing_url

'https://www.sec.gov/Archives/edgar/data/1652044/index.json'

In [59]:
# Extra HTTP headers sent along with every request to sec.gov.
# NOTE(review): the SEC's EDGAR access guidelines appear to ask for a
# User-Agent that names the requester and includes a contact e-mail address;
# a bare 'William' may be rejected -- confirm and substitute your own details.
headers = {'user-agent':'William',
          'Host':'www.sec.gov'}

# Fetch the company's directory listing. The timeout keeps the cell from
# hanging forever if the server never answers (requests has no default timeout).
content = requests.get(filing_url, headers=headers, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx reply, rather than failing later
# with a confusing JSON decode error or KeyError.
content.raise_for_status()

# Parse the JSON body into a dictionary
decode = content.json()

# Pick one filing out of the listing. The index 1 (the second entry) is
# hard-coded; which filing that is depends on the order EDGAR returns.
filing_number = decode['directory']['item'][1]['name']

# URL of the JSON directory listing for that single filing
number_url = base_url + cik_num + "/" + filing_number + r"/index.json"
number_url

'https://www.sec.gov/Archives/edgar/data/1652044/000120919122045299/index.json'

### Repeat the process

We have now constructed the URL of a specific filing's directory.
We need to make another GET request to list the files in that directory.

In [60]:
# Fetch the directory listing of the selected filing. The timeout keeps the
# cell from hanging forever if the server never answers.
content = requests.get(number_url, headers=headers, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx reply, rather than failing later
# with a confusing JSON decode error or KeyError.
content.raise_for_status()

# Parse the JSON body into a dictionary
document = content.json()

# Walk every file listed under the filing directory and print both its
# listing entry and the direct URL it can be downloaded from.
for doc in document['directory']['item']:
    # Skip entries whose 'type' is 'image2.gif' (presumably image files --
    # the 'type' field looks like an icon file name; confirm against EDGAR).
    if doc['type'] != 'image2.gif':
        print(doc)
        doc_name = doc['name']
        document_url = base_url + cik_num + "/" + filing_number + "/" + doc_name
        print(document_url)

{'last-modified': '2022-08-10 18:18:15', 'name': '0001209191-22-045299-index-headers.html', 'type': 'text.gif', 'size': ''}
https://www.sec.gov/Archives/edgar/data/1652044/000120919122045299/0001209191-22-045299-index-headers.html
{'last-modified': '2022-08-10 18:18:15', 'name': '0001209191-22-045299-index.html', 'type': 'text.gif', 'size': ''}
https://www.sec.gov/Archives/edgar/data/1652044/000120919122045299/0001209191-22-045299-index.html
{'last-modified': '2022-08-10 18:18:15', 'name': '0001209191-22-045299.txt', 'type': 'text.gif', 'size': ''}
https://www.sec.gov/Archives/edgar/data/1652044/000120919122045299/0001209191-22-045299.txt
{'last-modified': '2022-08-10 18:18:15', 'name': 'doc4.xml', 'type': 'text.gif', 'size': '8252'}
https://www.sec.gov/Archives/edgar/data/1652044/000120919122045299/doc4.xml
