# SEC EDGAR
- SEC EDGAR is a database of all filings made by publicly traded companies in the United States
- Explore the website first: https://www.sec.gov/edgar/searchedgar/companysearch.html

# Part 1
- For a given company (CIK), get the URLs of all their filings

In [2]:
import time

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Sent as the User-Agent header on every request made in this notebook.
# Replace the email address below with your own GWU email id before running.
headers = {
    "User-Agent": "George Washington University, vineetb@gwu.edu",
}

In [4]:
# Root of the archival filing data; the company CIK is appended to this URL
baseurl = 'https://www.sec.gov/Archives/edgar/data/'

# CIK of the example company (Apple)
ciknum = '320193'

In [5]:
#the url for the list of all filings made by Apple
list_of_filings_url=baseurl+ciknum+"/index.json"

# timeout keeps a stalled connection from hanging the kernel indefinitely
content=requests.get(list_of_filings_url, headers=headers, timeout=30)
# Fail here with the HTTP status instead of with an opaque JSON decoding
# error on the next line when the server answers with an error page
content.raise_for_status()
parse_content=content.json()

#parse_content is a nested dictionary; the list of filings sits under
#parse_content['directory']['item']
parse_content

{'directory': {'item': [{'last-modified': '2023-04-18 18:31:39',
    'name': '000032019323000057',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-17 18:31:10',
    'name': '000032019323000055',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-13 17:18:05',
    'name': '000195824423000454',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-10 18:30:24',
    'name': '000032019323000053',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-06 17:48:09',
    'name': '000195824423000440',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-04 18:34:40',
    'name': '000032019323000051',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-04 18:33:48',
    'name': '000032019323000050',
    'type': 'folder.gif',
    'size': ''},
   {'last-modified': '2023-04-04 18:32:43',
    'name': '000032019323000049',
    'type': 'folder.gif',
    'size': ''},
   {'last-m

In [6]:
# Top-level keys of the parsed JSON response
parse_content.keys()

dict_keys(['directory'])

In [7]:
# Number of top-level keys in the parsed JSON response
len(parse_content)

1

- The value for the "directory" key is another dictionary whose first key is "item"
- The "item" value is a list of filings with various information for each filing
- We just need the name of the filing number to get the data for that filing

In [9]:
# Keys of the nested 'directory' dictionary
parse_content['directory'].keys()

dict_keys(['item', 'name', 'parent-dir'])

In [10]:
# Number of entries in the 'item' list
len(parse_content['directory']['item'])

1975

In [11]:
# Check which container type holds the entries under 'item'
type(parse_content['directory']['item'])

list

In [16]:
# Look at the first entry of the 'item' list
parse_content['directory']['item'][0]

{'last-modified': '2023-04-18 18:31:39',
 'name': '000032019323000057',
 'type': 'folder.gif',
 'size': ''}

In [19]:
# Pull out only the 'name' field of the first entry
parse_content['directory']['item'][0]['name']

'000032019323000057'

In [20]:
# Work with a single filing (index 0) for now; looping over all of them comes later
filenum = parse_content['directory']['item'][0]['name']
# Each filing has its own folder under the company's CIK, with its own index.json
indiv_filing_url = f"{baseurl}{ciknum}/{filenum}/index.json"
indiv_filing_url

'https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/index.json'

In [23]:
# Fetch the index of the single filing selected above.
# timeout keeps a stalled connection from hanging the kernel indefinitely
indiv_content=requests.get(indiv_filing_url, headers=headers, timeout=30)
# Surface an HTTP error as such, rather than as a JSON decoding failure below
indiv_content.raise_for_status()
parse_indiv_content=indiv_content.json()
parse_indiv_content

{'directory': {'item': [{'last-modified': '2023-04-18 18:31:39',
    'name': '0000320193-23-000057-index-headers.html',
    'type': 'text.gif',
    'size': ''},
   {'last-modified': '2023-04-18 18:31:39',
    'name': '0000320193-23-000057-index.html',
    'type': 'text.gif',
    'size': ''},
   {'last-modified': '2023-04-18 18:31:39',
    'name': '0000320193-23-000057.txt',
    'type': 'text.gif',
    'size': ''},
   {'last-modified': '2023-04-18 18:31:39',
    'name': 'wf-form4_168185708431590.xml',
    'type': 'text.gif',
    'size': '14591'}],
  'name': '/Archives/edgar/data/320193/000032019323000057',
  'parent-dir': '/Archives/edgar/data/320193'}}

In [34]:
# Print the full URL of every document listed in the selected filing's index
filing_root = f"{baseurl}{ciknum}/{filenum}/"
for filing in parse_indiv_content['directory']['item']:
    doc_name = filing['name']
    doc_url = filing_root + doc_name
    print(doc_url)

https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057-index-headers.html
https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057-index.html
https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057.txt
https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/wf-form4_168185708431590.xml


In [33]:

# Show the current value of doc_url
doc_url

'https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057-index-headers.html'

In [10]:
# Collect the URL of every document in every filing of one company.
headers={'User-Agent' : 'George Washington University, vineetb@gwu.edu'}
baseurl="https://www.sec.gov/Archives/edgar/data/"

#use one example company, Apple's CIK
ciknum="320193"
list_of_filings_url=baseurl+ciknum+"/index.json"

# Pause before each per-filing request so the crawl does not hammer the server
REQUEST_DELAY_SECONDS=1
# Give up on a stalled connection instead of hanging the kernel
REQUEST_TIMEOUT_SECONDS=30

allfiling_URLs=[]
# A single Session reuses the underlying connection across all requests below
with requests.Session() as session:
    session.headers.update(headers)

    content=session.get(list_of_filings_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Stop with the HTTP status instead of an opaque JSON decoding error
    content.raise_for_status()
    parse_content=content.json()

    #parse_content['directory']['item'] holds one entry per filing
    for filing_dir in parse_content['directory']['item']:
        filenum=filing_dir['name']
        filing_base_url=baseurl+ciknum+"/"+filenum+"/"
        indiv_filing_url=filing_base_url+"index.json"
        time.sleep(REQUEST_DELAY_SECONDS)
        indiv_content=session.get(indiv_filing_url, timeout=REQUEST_TIMEOUT_SECONDS)
        indiv_content.raise_for_status()
        parse_indiv_content=indiv_content.json()

        # One URL per document listed in this filing's index
        allfiling_URLs.extend(
            filing_base_url+filing['name']
            for filing in parse_indiv_content['directory']['item']
        )

In [11]:
# Display the list of document URLs collected by the loop above
allfiling_URLs

['https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057-index-headers.html',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057-index.html',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/0000320193-23-000057.txt',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000057/wf-form4_168185708431590.xml',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000055/0000320193-23-000055-index-headers.html',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000055/0000320193-23-000055-index.html',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000055/0000320193-23-000055.txt',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000055/wf-form4_168177065948283.xml',
 'https://www.sec.gov/Archives/edgar/data/320193/000195824423000454/0001958244-23-000454-index-headers.html',
 'https://www.sec.gov/Archives/edgar/data/320193/000195824423000454/0001958244

# Part 2
- For a given day or year/quarter, get the URLs of all filings
- Filter by filing type (say, keep only the 10-Ks)

In [12]:
# The url here is the daily index (for all the filings on a given day) or the full index for all the filings in a year/quarter
# We will start with the daily index to keep it small and simple

# NOTE: baseurl is reassigned here and now points at the daily-index folder
baseurl=r"https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/"
ext="index.json"
# timeout keeps a stalled connection from hanging the kernel indefinitely
content=requests.get(baseurl+ext, headers=headers, timeout=30)
# Surface an HTTP error as such, rather than as a JSON decoding failure below
content.raise_for_status()
parse_content=content.json()
parse_content

{'directory': {'item': [{'last-modified': '04/01/2020 10:07:41 PM',
    'name': 'company.20200401.idx',
    'type': 'file',
    'href': 'company.20200401.idx',
    'size': '688 KB'},
   {'last-modified': '04/02/2020 10:06:40 PM',
    'name': 'company.20200402.idx',
    'type': 'file',
    'href': 'company.20200402.idx',
    'size': '881 KB'},
   {'last-modified': '04/03/2020 10:06:28 PM',
    'name': 'company.20200403.idx',
    'type': 'file',
    'href': 'company.20200403.idx',
    'size': '909 KB'},
   {'last-modified': '04/06/2020 10:05:43 PM',
    'name': 'company.20200406.idx',
    'type': 'file',
    'href': 'company.20200406.idx',
    'size': '438 KB'},
   {'last-modified': '04/07/2020 10:06:01 PM',
    'name': 'company.20200407.idx',
    'type': 'file',
    'href': 'company.20200407.idx',
    'size': '497 KB'},
   {'last-modified': '04/08/2020 10:06:19 PM',
    'name': 'company.20200408.idx',
    'type': 'file',
    'href': 'company.20200408.idx',
    'size': '339 KB'},
   {'la

In [15]:
# Look at the first entry of the quarter's directory listing
parse_content['directory']['item'][0]

{'last-modified': '04/01/2020 10:07:41 PM',
 'name': 'company.20200401.idx',
 'type': 'file',
 'href': 'company.20200401.idx',
 'size': '688 KB'}

In [17]:
# Collect the URL of every file in the 2020QTR2 folder, and separately
# the subset whose file name contains "master"
list_of_files = []
masteridx_files = []
for file in parse_content['directory']['item']:
    fileurl = f"{baseurl}{file['name']}"
    list_of_files.append(fileurl)
    if "master" not in file['name']:
        continue
    masteridx_files.append(fileurl)
masteridx_files

['https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200401.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200402.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200403.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200406.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200407.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200408.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200409.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200410.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200413.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200414.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200415.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR2/master.20200416.idx',
 'https://www.sec.gov/Archiv

In [21]:
# Download the first master index file from the list built above.
# timeout keeps a stalled connection from hanging the kernel indefinitely
masteridx=requests.get(masteridx_files[0], headers=headers, timeout=30)
# Without this check the body of an HTTP error response would be parsed
# further down as if it were index data
masteridx.raise_for_status()
masteridx.text

"Description:           Daily Index of EDGAR Dissemination Feed\nLast Data Received:    Apr  1, 2020\nComments:              webmaster@sec.gov\nAnonymous FTP:         ftp://ftp.sec.gov/edgar/\n \nCIK|Company Name|Form Type|Date Filed|File Name\n--------------------------------------------------------------------------------\n1000229|CORE LABORATORIES N V|SC 13G/A|20200401|edgar/data/1000229/0000869178-20-000110.txt\n1000230|OPTICAL CABLE CORP|8-K|20200401|edgar/data/1000230/0001437749-20-006737.txt\n1000275|ROYAL BANK OF CANADA|424B2|20200401|edgar/data/1000275/0001140361-20-007714.txt\n1000275|ROYAL BANK OF CANADA|FWP|20200401|edgar/data/1000275/0001140361-20-007642.txt\n1000275|ROYAL BANK OF CANADA|FWP|20200401|edgar/data/1000275/0001140361-20-007660.txt\n1000275|ROYAL BANK OF CANADA|FWP|20200401|edgar/data/1000275/0001140361-20-007662.txt\n1000275|ROYAL BANK OF CANADA|FWP|20200401|edgar/data/1000275/0001140361-20-007664.txt\n1000275|ROYAL BANK OF CANADA|FWP|20200401|edgar/data/10002

In [24]:
# Offset of the first "CIK" in the index text; this assumes the first
# occurrence is the start of the column-header line.
# str.index raises ValueError when the marker is absent, whereas str.find
# would return -1 and a later slice from that offset would silently succeed.
cik_start=masteridx.text.index("CIK")
cik_start

190

In [36]:
# Keep everything from the column-header line onwards
data=masteridx.text[cik_start:]
datarows=data.split("\n")
# datarows[0] is the pipe-delimited header line
column_names=datarows[0].split('|')
# datarows[1] is the dashed separator line, so the records start at index 2.
# Blank lines (such as the empty string split() leaves after a trailing
# newline) are dropped; otherwise each becomes a row of missing values.
actualdata=[row for row in datarows[2:] if row.strip()]
# Passing columns= here also yields a correctly labelled empty frame when
# the index contains no records
dailyfilings=pd.DataFrame([row.split('|') for row in actualdata], columns=column_names)
dailyfilings[dailyfilings['Form Type']=="10-K"]

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,File Name
507,1161582,"GROWLIFE, INC.",10-K,20200401,edgar/data/1161582/0001654954-20-003682.txt
1174,1394638,CREATIVE LEARNING Corp,10-K,20200401,edgar/data/1394638/0001731122-20-000338.txt
1409,1446159,Predictive Oncology Inc.,10-K,20200401,edgar/data/1446159/0001171843-20-002217.txt
1725,1522767,MARIMED INC.,10-K,20200401,edgar/data/1522767/0001493152-20-005546.txt
1728,1524025,"TILLY'S, INC.",10-K,20200401,edgar/data/1524025/0001628280-20-004436.txt
1976,1565228,"Vislink Technologies, Inc.",10-K,20200401,edgar/data/1565228/0001493152-20-005537.txt
1988,1567503,"TurnKey Capital, Inc.",10-K,20200401,edgar/data/1567503/0001553350-20-000289.txt
2077,1584549,"Village Farms International, Inc.",10-K,20200401,edgar/data/1584549/0001193125-20-094235.txt
2260,1630176,GREY CLOAK TECH INC.,10-K,20200401,edgar/data/1630176/0001520138-20-000130.txt
2412,1654672,"Pineapple Express, Inc.",10-K,20200401,edgar/data/1654672/0001493152-20-005517.txt


In [34]:
# First line of the sliced index text: the pipe-delimited column names
datarows[0]

'CIK|Company Name|Form Type|Date Filed|File Name'

#### Task: Create a pandas dataframe that contains the above information (CIK, Name, Form Type, Date, URL) for all filings in 2020Q2

- Keep only the 10-K filings