In [1]:
#required library

import pandas as pd 
import requests
from bs4 import BeautifulSoup

In [3]:
#define the base url needed to create the file url
base_url = r"https://www.sec.gov"

#convert a normal url to a document url 
normal_url = r"https://www.sec.gov/Archives/edgar/data/1265107/0001265107-19-000004.txt"
normal_url = normal_url.replace('-','').replace('.txt','/index.json')

# define a url that leads to a 10k document landing page
documents_url = r"https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/index.json"

#request the url and decode it
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
content = requests.get(documents_url, headers=headers).json()

for file in content['directory']['item']:
    
    #grab the filing summary and create a new url leading to the file so we can download it
    if file['name']=='FilingSummary.xml':
        xml_summary = base_url + content['directory']['name'] + '/' + file['name']
        
        print('-'*100)
        print('File Name: '+ file['name'])
        print('File Path: '+ xml_summary)

----------------------------------------------------------------------------------------------------
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/FilingSummary.xml


In [4]:
#define a new base url that represents the filing folder. This will come in handy when we will download the reports.
base_url = xml_summary.replace('FilingSummary.xml','')

#request and parse the content
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
content = requests.get(xml_summary, headers=headers).content
soup = BeautifulSoup(content, 'lxml')

#find the 'myreports' tag becasue this contains all the individual reports submitted
reports = soup.find('myreports')

# I want a list to store all the individual components of the report, so create the master list.
master_reports = []

# loop through each report in the 'myreports' tag but avoid the last one as this will cause an error.
for report in reports.find_all('report')[:-1]:

    # let's create a dictionary to store all the different parts we need.
    report_dict = {}
    report_dict['name_short'] = report.shortname.text
    report_dict['name_long'] = report.longname.text
    report_dict['position'] = report.position.text
    report_dict['category'] = report.menucategory.text
    report_dict['url'] = base_url + report.htmlfilename.text
    
    # append the dictionary to the master list.
    master_reports.append(report_dict)

    #print the info to the user
    print('-'*100)
    print(base_url+report.htmlfilename.text)
    print(report.longname.text)
    print(report.shortname.text)
    print(report.menucategory.text)
    print(report.position.text)

----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R1.htm
0001000 - Document - Document and Entity Information
Document and Entity Information
Cover
1
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R2.htm
1001000 - Statement - Consolidated Balance Sheets
Consolidated Balance Sheets
Statements
2
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R3.htm
1001501 - Statement - Consolidated Balance Sheets (Parenthetical)
Consolidated Balance Sheets (Parenthetical)
Statements
3
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/

In [6]:
#create the list to hold the statements url 
statements_url=[]

for report_dict in master_reports:
    
    #define the statements we want to look for 
    item_1 = r"Consolidated Balance Sheets"
    item_2 = r"Consolidated Statements of Operations and Comprehensive Income (Loss)"
    item_3 = r"Consolidated Statements of Cash Flows"
    item_4 = r"Consolidated Statements of Stockholder's (Deficit) Equity"
    
    #store the item in a list
    report_list = [item_1, item_2, item_3, item_4]
    
    #if the short name can be found in the report list
    if report_dict['name_short'] in report_list:
        
        #print some info and store it in the statements_url
        print('-'*100)
        print(report_dict['name_short'])
        print(report_dict['url'])
        
        statements_url.append(report_dict['url'])

----------------------------------------------------------------------------------------------------
Consolidated Balance Sheets
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R2.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Operations and Comprehensive Income (Loss)
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R4.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Cash Flows
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R5.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Stockholder's (Deficit) Equity
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R6.htm


In [9]:
#let's assume we want all the statements in a single data set
statements_data = []

#loop through each statements url
for statement in statements_url:
    
    #define a dictionary that will store the different parts of the statement
    statement_data = {}
    statement_data['headers'] = []
    statement_data['sections'] = []
    statement_data['data'] = []
    
    # request the statement file content
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    content = requests.get(statement, headers=headers).content
    report_soup = BeautifulSoup(content, 'html')

    # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
    for index, row in enumerate(report_soup.table.find_all('tr')):
        
        # first let's get all the elements.
        cols = row.find_all('td')
        
        # if it's a regular row and not a section or a table header
        if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0): 
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)
            
        # if it's a regular row and a section but not a table header
        elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)
            
        # finally if it's not any of those it must be a header
        elif (len(row.find_all('th')) != 0):            
            hed_row = [ele.text.strip() for ele in row.find_all('th')]
            statement_data['headers'].append(hed_row)
            
        else:            
            print('We encountered an error.')

    # append it to the master list.
    statements_data.append(statement_data)  
    

In [11]:
# Grab the proper components
income_header =  statements_data[1]['headers'][1]
income_data = statements_data[1]['data']

# Put the data in a DataFrame
income_df = pd.DataFrame(income_data)

# Display
print('-'*100)
print('Before Reindexing')
print('-'*100)
display(income_df.head())

# Define the Index column, rename it, and we need to make sure to drop the old column once we reindex.
income_df.index = income_df[0]
income_df.index.name = 'Category'
income_df = income_df.drop(0, axis = 1)

# Display
print('-'*100)
print('Before Regex')
print('-'*100)
display(income_df.head())

# Get rid of the '$', '(', ')', and convert the '' to NaNs.
income_df = income_df.replace('[\$,)]','', regex=True )\
                     .replace( '[(]','-', regex=True)\
                     .replace( '', 'NaN', regex=True)

# Display
print('-'*100)
print('Before type conversion')
print('-'*100)
display(income_df.head())

# everything is a string, so let's convert all the data to a float.
income_df = income_df.astype(float)

# Change the column headers
income_df.columns = income_header

# Display
print('-'*100)
print('Final Product')
print('-'*100)

# show the df
income_df

# drop the data in a CSV file if need be.
income_df.to_csv('income_state.csv')

----------------------------------------------------------------------------------------------------
Before Reindexing
----------------------------------------------------------------------------------------------------


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Net revenue,"$ 134,436","$ 137,156","$ 135,013","$ 133,753","$ 133,546","$ 138,211","$ 140,498","$ 141,200","$ 540,358","$ 553,455","$ 570,372"
1,Cost of services,,,,,,,,,128939,119193,115236
2,"Selling, general and administrative, including...",,,,,,,,,118940,155902,114152
3,Radio conversion costs,,,,,,,,,0,450,18422
4,"Amortization of subscriber accounts, deferred ...",,,,,,,,,211639,236788,246753


----------------------------------------------------------------------------------------------------
Before Regex
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Net revenue,"$ 134,436","$ 137,156","$ 135,013","$ 133,753","$ 133,546","$ 138,211","$ 140,498","$ 141,200","$ 540,358","$ 553,455","$ 570,372"
Cost of services,,,,,,,,,128939,119193,115236
"Selling, general and administrative, including stock-based and long-term incentive compensation",,,,,,,,,118940,155902,114152
Radio conversion costs,,,,,,,,,0,450,18422
"Amortization of subscriber accounts, deferred contract acquisition costs and other intangible assets",,,,,,,,,211639,236788,246753


----------------------------------------------------------------------------------------------------
Before type conversion
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Net revenue,134436.0,137156.0,135013.0,133753.0,133546.0,138211.0,140498.0,141200.0,540358,553455,570372
Cost of services,,,,,,,,,128939,119193,115236
"Selling, general and administrative, including stock-based and long-term incentive compensation",,,,,,,,,118940,155902,114152
Radio conversion costs,,,,,,,,,0,450,18422
"Amortization of subscriber accounts, deferred contract acquisition costs and other intangible assets",,,,,,,,,211639,236788,246753


----------------------------------------------------------------------------------------------------
Final Product
----------------------------------------------------------------------------------------------------
