In [1]:
%reset -f
import requests
import urllib.request
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import math
from re import search
from re import sub


#####################################################################
################### Attributions ####################################
# Kevin Markham's Data School https://www.youtube.com/watch?v=zXif_9RVadI
# Theodore Petrou's Pandas Cookbook
# Wes McKinney's Python for Data Analysis
# Alvin Zuyin Zheng's "Python and Web Data Extraction: Introduction"
# Beautiful Soup Documentation 
# Scraping EDGAR with Python by Rasha Ashraf
# Sigma Coding Scraping SEC XBRL Documents Parts 1-4 
##################################################################
###############################################################################

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Set the url to EDGAR's company.idx file for the first quarter of 2020. The company.idx file will be read in 
# as a fixed-width text file, the header will be removed, and then the program can iterate through a table
# which will contain the company name, the CIK, and the url with a link to the .txt file of the company filing. 

url = 'https://www.sec.gov/Archives/edgar/full-index/2020/QTR1/company.idx'
urllib.request.urlretrieve(url, 'c:/web_scraping/company.idx')

('c:/web_scraping/company.idx', <http.client.HTTPMessage at 0x1d087463700>)

In [3]:
# Read in the fixed-width company.idx
df = pd.read_fwf('company.idx', colspecs=[
                 (0, 61), (62, 73), (74, 85), (86, 97), (98, 142)])


# Rows 0 through 6 are not needed since they are a header, row 7 is variable names, and row 8 is dashes that 
# should be gotten rid of.
df.head(15)

Unnamed: 0,Description: Master Index of EDGAR Dissemination Fe,d by Compan,Name,Unnamed: 3,Unnamed: 4
0,"Last Data Received: March 31, 2020",,,,
1,Comments: webmaster@sec.gov,,,,
2,Anonymous FTP: ftp://ftp.sec.gov/edgar/,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,Company Name,Form Type,CIK,Date Filed,File Name
8,----------------------------------------------...,-----------,-----------,-----------,-------------------------------------------
9,&VEST Domestic Fund II LP,D,1800903,2020-01-27,edgar/data/1800903/0001800903-20-000001.txt


In [4]:
df = df.iloc[7:]

df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,Description: Master Index of EDGAR Dissemination Fe,d by Compan,Name,Unnamed: 3,Unnamed: 4
0,Company Name,Form Type,CIK,Date Filed,File Name
1,----------------------------------------------...,-----------,-----------,-----------,-------------------------------------------
2,&VEST Domestic Fund II LP,D,1800903,2020-01-27,edgar/data/1800903/0001800903-20-000001.txt
3,&VEST Offshore Fund II L.P.,D,1800902,2020-01-27,edgar/data/1800902/0001800902-20-000001.txt
4,"&vest Domestic Fund II KPIV, L.P.",D,1802417,2020-02-06,edgar/data/1802417/0001802417-20-000001.txt


In [5]:
df = df.drop([1])

# Now the DataFrame is in the format that is needed.
df.head()


Unnamed: 0,Description: Master Index of EDGAR Dissemination Fe,d by Compan,Name,Unnamed: 3,Unnamed: 4
0,Company Name,Form Type,CIK,Date Filed,File Name
2,&VEST Domestic Fund II LP,D,1800903,2020-01-27,edgar/data/1800903/0001800903-20-000001.txt
3,&VEST Offshore Fund II L.P.,D,1800902,2020-01-27,edgar/data/1800902/0001800902-20-000001.txt
4,"&vest Domestic Fund II KPIV, L.P.",D,1802417,2020-02-06,edgar/data/1802417/0001802417-20-000001.txt
5,"024 Pharma, Inc.",8-K/A,1307969,2020-02-20,edgar/data/1307969/0001683168-20-000541.txt


In [6]:
# Reset the indices 
df.reset_index(inplace=True, drop=True)
df.head(15)


Unnamed: 0,Description: Master Index of EDGAR Dissemination Fe,d by Compan,Name,Unnamed: 3,Unnamed: 4
0,Company Name,Form Type,CIK,Date Filed,File Name
1,&VEST Domestic Fund II LP,D,1800903,2020-01-27,edgar/data/1800903/0001800903-20-000001.txt
2,&VEST Offshore Fund II L.P.,D,1800902,2020-01-27,edgar/data/1800902/0001800902-20-000001.txt
3,"&vest Domestic Fund II KPIV, L.P.",D,1802417,2020-02-06,edgar/data/1802417/0001802417-20-000001.txt
4,"024 Pharma, Inc.",8-K/A,1307969,2020-02-20,edgar/data/1307969/0001683168-20-000541.txt
5,1 800 FLOWERS COM INC,10-Q,1084869,2020-02-07,edgar/data/1084869/0001437749-20-002005.txt
6,1 800 FLOWERS COM INC,4,1084869,2020-02-28,edgar/data/1084869/0001437749-20-003844.txt
7,1 800 FLOWERS COM INC,8-K,1084869,2020-01-30,edgar/data/1084869/0001157523-20-000125.txt
8,1 800 FLOWERS COM INC,8-K,1084869,2020-02-18,edgar/data/1084869/0001157523-20-000215.txt
9,1 800 FLOWERS COM INC,SC 13G,1084869,2020-02-14,edgar/data/1084869/0001398344-20-003415.txt


In [7]:
# Make the variable names equal to the contents of row 0, then remove row 0. 
df.columns = df.iloc[0]
df = df.drop([0])
df.head()

Unnamed: 0,Company Name,Form Type,CIK,Date Filed,File Name
1,&VEST Domestic Fund II LP,D,1800903,2020-01-27,edgar/data/1800903/0001800903-20-000001.txt
2,&VEST Offshore Fund II L.P.,D,1800902,2020-01-27,edgar/data/1800902/0001800902-20-000001.txt
3,"&vest Domestic Fund II KPIV, L.P.",D,1802417,2020-02-06,edgar/data/1802417/0001802417-20-000001.txt
4,"024 Pharma, Inc.",8-K/A,1307969,2020-02-20,edgar/data/1307969/0001683168-20-000541.txt
5,1 800 FLOWERS COM INC,10-Q,1084869,2020-02-07,edgar/data/1084869/0001437749-20-002005.txt


In [8]:
# Create a boolean series to identify 10-K's, which are the annual reports. 
bool = df['Form Type'] == '10-K'



In [9]:
df[bool].head()


Unnamed: 0,Company Name,Form Type,CIK,Date Filed,File Name
33,"10x Genomics, Inc.",10-K,1770787,2020-02-27,edgar/data/1770787/0001193125-20-052640.txt
143,"1347 Property Insurance Holdings, Inc.",10-K,1591890,2020-03-30,edgar/data/1591890/0001493152-20-005206.txt
250,1847 Holdings LLC,10-K,1599407,2020-03-30,edgar/data/1599407/0001213900-20-007912.txt
260,"1895 Bancorp of Wisconsin, Inc.",10-K,1751692,2020-03-30,edgar/data/1751692/0001564590-20-014188.txt
290,1Life Healthcare Inc,10-K,1404123,2020-03-27,edgar/data/1404123/0001564590-20-013666.txt


In [10]:
# Retain only 10-K's in the dataset. 
df= df[bool]
df.reset_index(inplace=True, drop=True)
df.head(50)

Unnamed: 0,Company Name,Form Type,CIK,Date Filed,File Name
0,"10x Genomics, Inc.",10-K,1770787,2020-02-27,edgar/data/1770787/0001193125-20-052640.txt
1,"1347 Property Insurance Holdings, Inc.",10-K,1591890,2020-03-30,edgar/data/1591890/0001493152-20-005206.txt
2,1847 Holdings LLC,10-K,1599407,2020-03-30,edgar/data/1599407/0001213900-20-007912.txt
3,"1895 Bancorp of Wisconsin, Inc.",10-K,1751692,2020-03-30,edgar/data/1751692/0001564590-20-014188.txt
4,1Life Healthcare Inc,10-K,1404123,2020-03-27,edgar/data/1404123/0001564590-20-013666.txt
5,1ST CONSTITUTION BANCORP,10-K,1141807,2020-03-16,edgar/data/1141807/0001141807-20-000005.txt
6,1ST SOURCE CORP,10-K,34782,2020-02-20,edgar/data/34782/0000034782-20-000035.txt
7,1st FRANKLIN FINANCIAL CORP,10-K,38723,2020-03-30,edgar/data/38723/0001376474-20-000072.txt
8,"20/20 Global, Inc.",10-K,1763329,2020-03-30,edgar/data/1763329/0001445866-20-000291.txt
9,"22nd Century Group, Inc.",10-K,1347858,2020-03-11,edgar/data/1347858/0001104659-20-031934.txt


In [11]:
# web address for FCCY https://www.sec.gov/Archives/edgar/data/1141807/0001141807-20-000005.txt, so you can see 
# the CIK without leading zeros goes before the url with the CIK and leading zeros.

# Set a string equal to the start of all url's. 
begin = 'https://www.sec.gov/Archives/'

# Create an empty dataframe.
pl0 = pd.DataFrame()

# Initialize an empty list to track exceptions from the loop. 
exceptions=[]

In [12]:
# Run the loop for the first 10 firms 
for i in range(0, 10):

    try:
        test = begin + df['File Name'].loc[i]
        first_word_name= df['Company Name'].loc[i].split(" ")[0]
        first_word_name= first_word_name.lower()
        print(first_word_name)
        print(i, test)

        u= requests.get(test)


        # The html5lib seemed like the only parser that would pull in the entire xbrl data.
        # The html and lxml seemed like they ignored half of the text file, but I can't explain this. 
        soup = BeautifulSoup(u.text, 'html5lib')

        results = soup.find_all(['table'])

        # Below looks for the tables that could be the income statement. First, the titles are converted to lowercase 
        # and then single spaces, double spaces, and new line characteres are removed. Note that due to the substantial 
        # amount of variation in income statement titles in the 10-K's, I created a series of "if" statements to try
        # an pick up the various titles I saw from manually inspecting annual reports. 
        
        table_list = []
        for i, j in enumerate(results):
            j = str(j)
            j= j.lower()
            j= j.replace(" ", "")
            j= j.replace("  ", "")
            j= j.replace("/n", "")
            if search("consolidatedstatementsofincome",  j):
                table_list.append(i)
            if search("consolidatedstatementsofoperations",  j):
                table_list.append(i)
            if search("consolidatedstatementsofcomprehensive",  j):
                table_list.append(i)
            if search("statementofincome",  j):
                table_list.append(i)
            if search("statementsofincome",  j):
                table_list.append(i)
            if search("statementofoperations",  j):
                table_list.append(i)
            if search("statementsofoperations",  j):
                table_list.append(i)
            if search("conslidatedstatementofincome",  j):
                table_list.append(i)
            if search("consolidatedstatementofoperations",  j):
                table_list.append(i)
            if search("consolidatedstatementofcomprehensive",  j):
                table_list.append(i)
            if search("consolidatedstatementofearnings", j):
                table_list.append(i)
            if search("consolidatedstatementsofearnings", j):
                table_list.append(i)
            if search("consolidatedstatementsofloss", j):
                table_list.append(i)
            if search("consolidatedstatementofloss", j):
                table_list.append(i)

        # The loop below contains scaffolding with print statements that show the results the loop is returning. Even though
        # this makes the notebook larger, I thought it was helpful to include this for clarity. Note that some firms use
        # NetIncomeLoss as the Net Income tag whereas others use ProfitLoss. 
        
        tds = []
        pulled = []
        sub_list = []
        for i, j in enumerate(table_list):
            print(j)
            temp_table = results[j]
            temp_results = temp_table.find_all(['tr'])
            for x, y in enumerate(temp_results):
                y_str = str(y)
                # Modification if one were searching for earnings per share (i.e., EPS) -> 
                # if search("'defref_us-gaap_EarningsPerShareBasic',",  y_str):
                if search("'defref_us-gaap_NetIncomeLoss',",  y_str):
                    print('condition met', i, j, x, y, len(y))
                    sub_list.append(x)
                    pulled.append(y)
                    td = y.find_all('td')
                    tds.append(td)
                # Modifcation if one were searching for earnings per share (i.e., EPS) -> 
                # if search("'defref_us-gaap_EarningsPerShareBasicAndDiluted',",  y_str):
                if search("'defref_us-gaap_ProfitLoss',",  y_str):
                    print('condition met', i, j, x, y, len(y))
                    sub_list.append(x)
                    pulled.append(y)
                    td = y.find_all('td')
                    tds.append(td)
                    
        # For firms that are reporting three years for their income statement, there will be a lenfth of 4 based on the above. 
        # The first hit will be for the XBRL tag searched for, here NetIncomeLoss or ProfitLoss, and the next three for each 
        # of the three reported years. However, if the firm was only in operation for two years, the length would be 3, etc. 
        # The "if" statements below are trying to appropriately deal with the length of time the firm has been in operations
        # with a maximum of 3 years reported and a minimum of a single year being reported. 
        
        td_keeper = None
        for i, j in enumerate(tds):
            print(i, len(j) == 4)
            if len(j) == 4:
                td_keeper = i
            if (td_keeper == None) & (len(j) == 3):
                td_keeper = i
            if (td_keeper == None) & (len(j) == 2):
                td_keeper = i
            if (td_keeper == None) & (len(j) == 1):
                td_keeper = i

        balances= tds[td_keeper]


        url= test
        start = 'https://www.sec.gov/Archives/edgar/data/'
        # remove above then keep everything until / and that will be the CIK.
        url2 = url.replace(start, '')
        url2
        sep = '/'
        # Split on seperator one time, and keep the fist element.

        cik = url2.split(sep)[0]
        cik

        if len(balances)==4:
            df_temp= pd.DataFrame(data=[[cik, 'defref_us-gaap_NetIncomeLoss', balances[-3], balances[-2], balances[-1]]], columns=['cik', 'xbrl_tag', 'cy', 'l_cy', 'l2_cy'])
            pl0= pl0.append(df_temp)
            del df_temp
        if len(balances)==3:
            df_temp= pd.DataFrame(data=[[cik, 'defref_us-gaap_NetIncomeLoss', balances[-2], balances[-1], 'null']], columns=['cik', 'xbrl_tag', 'cy', 'l_cy', 'l2_cy'])
            pl0= pl0.append(df_temp)
            del df_temp
        if len(balances)==2:
            df_temp= pd.DataFrame(data=[[cik, 'defref_us-gaap_NetIncomeLoss', balances[-1], 'null', 'null']], columns=['cik', 'xbrl_tag', 'cy', 'l_cy', 'l2_cy'])
            pl0= pl0.append(df_temp)
            del df_temp
        if (len(balances)==0) | (len(balances) >4):
            df_temp= pd.DataFrame(data=[[cik, 'defref_us-gaap_NetIncomeLoss', 'null', 'null', 'null']], columns=['cik', 'xbrl_tag', 'cy', 'l_cy', 'l2_cy'])
            pl0= pl0.append(df_temp)
            del df_temp

    except Exception:
        continue



10x
0 https://www.sec.gov/Archives/edgar/data/1770787/0001193125-20-052640.txt
151
151
183
183
541
541
1077
condition met 6 1077 19 <tr class="rou">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="top.Show.showAR( this, 'defref_us-gaap_NetIncomeLoss', window );">Net loss</a></td>
<td class="num">(31,251)<span></span>
</td>
<td class="num">(112,485)<span></span>
</td>
<td class="num">(18,762)<span></span>
</td>
</tr> 9
1077
condition met 7 1077 19 <tr class="rou">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="top.Show.showAR( this, 'defref_us-gaap_NetIncomeLoss', window );">Net loss</a></td>
<td class="num">(31,251)<span></span>
</td>
<td class="num">(112,485)<span></span>
</td>
<td class="num">(18,762)<span></span>
</td>
</tr> 9
1124
1124
1124
1124
1203
1203
1235
1235
1235
1235
1465
1465
1555
1555
1644
1644
1644
1644
0 True
1 True
1347
1 https://www.sec.gov/Archives/edgar

107
107
108
108
110
110
492
492
635
condition met 8 635 9 <tr class="rou">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="top.Show.showAR( this, 'defref_us-gaap_NetIncomeLoss', window );">Net Income</a></td>
<td class="nump">$ 3,244<span></span>
</td>
<td class="nump">$ 3,623<span></span>
</td>
<td class="nump">$ 3,370<span></span>
</td>
<td class="nump">$ 3,397<span></span>
</td>
<td class="nump">$ 3,313<span></span>
</td>
<td class="nump">$ 4,011<span></span>
</td>
<td class="nump">$ 1,871<span></span>
</td>
<td class="nump">$ 2,853<span></span>
</td>
<td class="nump">$ 574<span></span>
</td>
<td class="nump">$ 2,486<span></span>
</td>
<td class="nump">$ 1,919<span></span>
</td>
<td class="nump">$ 1,949<span></span>
</td>
<td class="nump">13,634<span></span>
</td>
<td class="nump">12,048<span></span>
</td>
<td class="nump">6,928<span></span>
</td>
</tr> 33
condition met 8 635 23 <tr class="rou">
<td class="pl" style="border-bo

7
7
310
310
335
condition met 4 335 22 <tr class="reu">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="top.Show.showAR( this, 'defref_us-gaap_ProfitLoss', window );">Net income</a></td>
<td class="nump">$ 4,803<span></span>
</td>
<td class="nump">$ 61,202<span></span>
</td>
</tr> 7
335
condition met 5 335 22 <tr class="reu">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="top.Show.showAR( this, 'defref_us-gaap_ProfitLoss', window );">Net income</a></td>
<td class="nump">$ 4,803<span></span>
</td>
<td class="nump">$ 61,202<span></span>
</td>
</tr> 7
0 False
1 False
22nd
9 https://www.sec.gov/Archives/edgar/data/1347858/0001104659-20-031934.txt
100
100
107
107
155
155
409
condition met 6 409 28 <tr class="reu">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="top.Show.showAR( this, 'defref_us-gaap_NetIncomeLoss', window

In [13]:
# Examine the dataframe created from the loops. Note that "cy" is the current year's net income, "l_cy" is lagged net income,
# and "l2_cy" is the second lag of net income. For a firm that filed in 2020 and reported 2019, 2018, and 2017 on the 
# income statement "cy" is the 2019 Net Income and "l2_cy" is the 2017 Net Income. 

pl0

Unnamed: 0,cik,xbrl_tag,cy,l_cy,l2_cy
0,1770787,defref_us-gaap_NetIncomeLoss,"[(31,251), [], \n]","[(112,485), [], \n]","[(18,762), [], \n]"
0,1591890,defref_us-gaap_NetIncomeLoss,"[311, [], \n]","[804, [], \n]",
0,1599407,defref_us-gaap_NetIncomeLoss,"[(3,381,423), [], \n]","[(1,541,873), [], \n]",
0,1751692,defref_us-gaap_NetIncomeLoss,"[$ 449, [], \n]","[$ (19), [], \n]",
0,1404123,defref_us-gaap_NetIncomeLoss,"[$ (52,554), [], \n]","[$ (44,415), [], \n]","[$ (30,797), [], \n]"
0,1141807,defref_us-gaap_NetIncomeLoss,"[$ 13,634, [], \n]","[$ 12,048, [], \n]","[$ 6,928, [], \n]"
0,34782,defref_us-gaap_NetIncomeLoss,"[$ 92,015, [], \n]","[$ 82,414, [], \n]","[$ 68,051, [], \n]"
0,38723,defref_us-gaap_NetIncomeLoss,"[$ 13,348,373, [], \n]","[$ 17,340,931, [], \n]","[$ 14,905,754, [], \n]"
0,1763329,defref_us-gaap_NetIncomeLoss,"[$ 4,803, [], \n]","[$ 61,202, [], \n]",
0,1347858,defref_us-gaap_NetIncomeLoss,"[(26,558,544), [], \n]","[(7,966,911), [], \n]","[(13,029,117), [], \n]"


In [14]:
pl0['cy']

0        [(31,251), [], \n]
0             [311, [], \n]
0     [(3,381,423), [], \n]
0           [$ 449, [], \n]
0      [$ (52,554), [], \n]
0        [$ 13,634, [], \n]
0        [$ 92,015, [], \n]
0    [$ 13,348,373, [], \n]
0         [$ 4,803, [], \n]
0    [(26,558,544), [], \n]
Name: cy, dtype: object

In [15]:
# The output above looks like a list object, but let's take a closer look at the first and second observations in the 
# dataset. 

pl0['cy'].iloc[0]

<td class="num">(31,251)<span></span>
</td>

In [16]:
pl0['cy'].iloc[1]

<td class="nump">311<span></span>
</td>

In [17]:
# You can see above that the class is sometimes "num" and sometimes "nump" in the underlying 10-K. Below retains the Net Income
# and discards the rest. 
pl0['cy']= pl0['cy'].apply(lambda x: str(x).replace('<td class="nump">', '').split('<span>')[0])
pl0['cy']= pl0['cy'].apply(lambda x: str(x).replace('<td class="num">', '').split('<span>')[0])


pl0['l_cy']= pl0['l_cy'].apply(lambda x: str(x).replace('<td class="nump">', '').split('<span>')[0])
pl0['l_cy']= pl0['l_cy'].apply(lambda x: str(x).replace('<td class="num">', '').split('<span>')[0])

pl0['l2_cy']= pl0['l2_cy'].apply(lambda x: str(x).replace('<td class="nump">', '').split('<span>')[0])
pl0['l2_cy']= pl0['l2_cy'].apply(lambda x: str(x).replace('<td class="num">', '').split('<span>')[0])

In [18]:
pl0

Unnamed: 0,cik,xbrl_tag,cy,l_cy,l2_cy
0,1770787,defref_us-gaap_NetIncomeLoss,"(31,251)","(112,485)","(18,762)"
0,1591890,defref_us-gaap_NetIncomeLoss,311,804,
0,1599407,defref_us-gaap_NetIncomeLoss,"(3,381,423)","(1,541,873)",
0,1751692,defref_us-gaap_NetIncomeLoss,$ 449,$ (19),
0,1404123,defref_us-gaap_NetIncomeLoss,"$ (52,554)","$ (44,415)","$ (30,797)"
0,1141807,defref_us-gaap_NetIncomeLoss,"$ 13,634","$ 12,048","$ 6,928"
0,34782,defref_us-gaap_NetIncomeLoss,"$ 92,015","$ 82,414","$ 68,051"
0,38723,defref_us-gaap_NetIncomeLoss,"$ 13,348,373","$ 17,340,931","$ 14,905,754"
0,1763329,defref_us-gaap_NetIncomeLoss,"$ 4,803","$ 61,202",
0,1347858,defref_us-gaap_NetIncomeLoss,"(26,558,544)","(7,966,911)","(13,029,117)"


In [19]:
'''
############################################################
Discussion: 
############################################################

1) No exceptions noted reconciling the above output to the actual 10-K's. 

2) When I ran this on the first 100 CIK's, I ended up getting data for around 88 of them. The above code was built from 
looking at the exceptions in the exceptions list, identifying what tripped the code, and modifying the loop for future runs. 

3) It appears that there is variation in how the tag is applied to Net Income Attributable to Stockholders vs. Non-Controlling
Interests. See CIK's 1599407 and 1404123 for examples. 

4) Some firms have financial statements that contain lengths > 4 because they also show monthly or quarterly breakdowns in 
addition to the annual numbers. Examples include CIK's 1158449, 1420565, 1423689, and 824142. Refer to "td_keeper" in block [12]
to see that I am keeping 4 or less and the appliacable discussion. 

5) Some firms appear to not have XBRL tagged documents. I can't explain this, but it appears that CIK's 1366928, 1539816, and
1775098 are examples of this. 

6) CIK 1514281 had five tags for the thee-year income statement and not 4. This could be an isolated incident so I didn't 
code around it, but it is something to be aware of. 
'''

'\n############################################################\nDiscussion: \n############################################################\n\n1) No exceptions noted reconciling the above output to the actual 10-K\'s. \n\n2) When I ran this on the first 100 CIK\'s, I ended up getting data for around 88 of them. The above code was built from \nlooking at the exceptions in the exceptions list, identifying what tripped the code, and modifying the loop for future runs. \n\n3) It appears that there is variation in how the tag is applied to Net Income Attributable to Stockholders vs. Non-Controlling\nInterests. See CIK\'s 1599407 and 1404123 for examples. \n\n4) Some firms have financial statements that contain lengths > 4 because they also show monthly or quarterly breakdowns in \naddition to the annual numbers. Examples include CIK\'s 1158449, 1420565, 1423689, and 824142. Refer to "td_keeper" in block [12]\nto see that I am keeping 4 or less and the appliacable discussion. \n\n5) Some fir

In [20]:
'''
#######################
Remaining Issues 
#######################

1) Pulling the "units" of the income statement to make reported numbers comparable. Some firms report in dollars, thousands
of dollars, millions, etc. 

2) Capturing the SIC code for each firm. I wasn't able to think of a good way to do this. 

3) Efficiency. My loops are slow. It seems like there should be a way to find the income statements faster, but this is
the best I could come up with. When I was scraping full income statements and not using the XBRL tags, that approach took
over a full week for the loop to run. 

4) Modifying the code to capture multiple financial statement variables on each pass. 

5) Conditioning on the QTR of the EDGAR fillings to perhaps create the applicable years rather than the "cy", "l_cy" convention
currently used. 

'''

'\n#######################\nRemaining Issues \n#######################\n\n1) Pulling the "units" of the income statement to make reported numbers comparable. Some firms report in dollars, thousands\nof dollars, millions, etc. \n\n2) Capturing the SIC code for each firm. I wasn\'t able to think of a good way to do this. \n\n3) Efficiency. My loops are slow. It seems like there should be a way to find the income statements faster, but this is\nthe best I could come up with. When I was scraping full income statements and not using the XBRL tags, that approach took\nover a full week for the loop to run. \n\n4) Modifying the code to capture multiple financial statement variables on each pass. \n\n5) Conditioning on the QTR of the EDGAR fillings to perhaps create the applicable years rather than the "cy", "l_cy" convention\ncurrently used. \n\n'