## Note 4/15/2022: Attempted to read the concordance file and use its xpaths to extract columns. However, the team weighed in and decided to use other methods to extract the data.

# Ideas
* what if we try using a 2015 schema to parse a 2020 file?
* This might be crazy, what if we created a function to get the full xml tree?


# Environment Preparation
Install necessary libraries and make our directories

In [1]:
!pip install xmltodict
!pip install irsx
!mkdir xml_files
!export IRSX_CACHE_DIRECTORY=/content/xml_files

You should consider upgrading via the '/Users/adriankwoo/PycharmProjects/foundation_grantee/venv/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/adriankwoo/PycharmProjects/foundation_grantee/venv/bin/python -m pip install --upgrade pip' command.[0m
mkdir: xml_files: File exists


# Get Files from IRS
Here we will download all of the xml files that are currently available on the IRS' website.

In [1]:
import json
import xmltodict
import requests 
import csv
import re
import os
import xml.etree.ElementTree as ET
import pandas as pd

# function to download all of the zip files for a particular year
def get_xml_files(year):
    """Download the IRS Form 990 XML zip archives published for ``year``.

    The IRS downloads page is fetched once. Archives are then probed in
    sequence (``..._1.zip``, ``..._2.zip``, ...) and the function stops as
    soon as the next expected URL is not listed on that page. Each archive is
    written to the current working directory as ``{year}_{count}_990.zip``;
    archives that already exist on disk are skipped.

    NOTE: downloading is currently capped at the first archive of the year
    (testing limit) -- see the ``count > 1`` guard below.

    Args:
        year: filing year used to build the archive URLs and file names.

    Raises:
        requests.HTTPError: if the downloads page or an archive request
            returns an HTTP error status.
    """
    download_page = requests.get("https://www.irs.gov/charities-non-profits/form-990-series-downloads")
    # Fail loudly instead of silently "finding no links" in an error page.
    download_page.raise_for_status()
    listing = download_page.text

    # the starting file count is always 1
    count = 1
    while True:
        url = f"https://apps.irs.gov/pub/epostcard/990/xml/{year}/download990xml_{year}_{count}.zip"
        # if the link is not listed we have downloaded every available
        # archive for that year
        if url not in listing:
            return

        file_name = f"{year}_{count}_990.zip"
        print(f"Processing: {file_name}")
        if os.path.isfile(file_name):
            print("Already downloaded")
            count += 1
            continue
        if count > 1:
            print("AW: download only 1 file for testing purpose")
            return

        # Only open the connection once we know the file is needed, and close
        # it when done.
        with requests.get(url, stream=True) as response:
            # Do not save an HTML error body under a .zip name.
            response.raise_for_status()
            # Write to a temporary name first so an interrupted download is
            # never mistaken for a finished archive on the next run.
            partial_name = file_name + ".part"
            with open(partial_name, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    fd.write(chunk)
        os.replace(partial_name, file_name)
        count += 1



## Getting XML metadata file
A metadata (concordance) file is available for us to use. Unfortunately, it is no longer maintained, which will affect all returns after 2015 that need to be parsed.


In [2]:
## Download Meta Data from Github
!curl https://raw.githubusercontent.com/Nonprofit-Open-Data-Collective/irs-efile-master-concordance-file/master/efiler_master_concordance.csv > efiler_master_concordance.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
curl: (60) SSL: no alternative certificate subject name matches target host name 'raw.githubusercontent.com'
More details here: https://curl.se/docs/sslcerts.html

curl failed to verify the legitimacy of the server and therefore could not
establish a secure connection to it. To learn more about this situation and
how to fix it, please visit the web page mentioned above.


In [381]:
class Build_metadata():
    """Build per-schema-version xpath lookups from the concordance CSV.

    Typical use::

        builder = Build_metadata(['F990', 'SCHED-I'])
        builder.extract_original_metadata()
        builder.build_attr_dict()
        lookup = builder.get_attr_lookup_dict('2013v4.0')

    Attributes:
        tax_form: collection of form names; a concordance row is kept only
            when its form column is ``in`` this collection.
        file_path: path of ``efiler_master_concordance.csv`` in the working
            directory at construction time.
        newest_version: highest version string found in the concordance, or
            ``None`` when nothing was extracted.
        metadata_extracted: ``{version: {variable: [raw_version, xpath]}}``
            exactly as listed in the concordance (``None`` until
            ``extract_original_metadata`` runs).
        attr_version: ``{variable: sorted list of versions listing it}``.
        attr_lookup_dict: ``{version: {variable: [version_used, xpath]}}``
            where gaps are filled from another version of the same variable.
        full_attr_list: variable names in first-seen order.
        version: stored as given; not used by this class.
    """

    # Column positions read from each concordance row.
    _COL_ATTR_NAME = 0
    _COL_FORM = 4
    _COL_XPATH = 10
    _COL_VERSIONS = 11
    # Picks well-formed versions such as "2013v3.0" out of messy cells.
    _VERSION_PATTERN = re.compile(r"\d{4}v\d\.\d")

    def __init__(self, tax_form, version = None):
        self.tax_form = tax_form
        self.file_path = os.getcwd() + '/' + 'efiler_master_concordance.csv'
        self.newest_version = None
        self.metadata_extracted = None
        self.attr_version = {}
        self.version = version
        self.attr_lookup_dict = {}
        self.full_attr_list = []

    def extract_original_metadata(self):
        """Read the concordance CSV and index its xpaths by schema version.

        Populates ``metadata_extracted``, ``attr_version``,
        ``full_attr_list`` and ``newest_version``. The header row, blank rows
        and rows too short to hold the versions column are skipped. Prints
        the newest version found (nothing is printed when none is found).
        """
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(self.file_path, 'r', newline='') as f:
            rows = list(csv.reader(f,
                                   delimiter=',',
                                   quotechar='"',
                                   lineterminator='\n',
                                   quoting=csv.QUOTE_MINIMAL))

        meta_data = {}
        # Set mirror of full_attr_list: O(1) membership while the list keeps
        # first-seen order.
        known_attrs = set(self.full_attr_list)

        for split_line in rows[1:]:
            if len(split_line) <= self._COL_VERSIONS:
                continue
            ## Filter only forms that are needed
            if split_line[self._COL_FORM] not in self.tax_form:
                continue

            attr_name = split_line[self._COL_ATTR_NAME]
            if attr_name not in known_attrs:
                known_attrs.add(attr_name)
                self.full_attr_list.append(attr_name)

            # we need a leading . to make a valid relative xpath, remove the
            # /Return/ root, and add the namespace prefix to all elements
            xpath = '.' + split_line[self._COL_XPATH].replace('/Return/', '/').replace('/', '/xmlns:')

            # Registered even when no valid version follows, so later lookups
            # can tell "known variable without versions" from "unknown".
            versions_for_attr = self.attr_version.setdefault(attr_name, [])
            for raw_version in split_line[self._COL_VERSIONS].split(';'):
                raw_version = raw_version.strip(' ').lower()
                for clean_version in self._VERSION_PATTERN.findall(raw_version):
                    meta_data.setdefault(clean_version, {})[attr_name] = [raw_version, xpath]
                    versions_for_attr.append(clean_version)

        # Keep versions in ascending order; get_attr_lookup_dict relies on it.
        self.metadata_extracted = dict(sorted(meta_data.items(), key=lambda x: x[0]))
        self.newest_version = max(meta_data) if meta_data else None
        if self.newest_version is not None:
            print(self.newest_version)

        ## dedupe + sort versions: {'attr': [version1, version2, ...]}
        for k, v in self.attr_version.items():
            self.attr_version[k] = sorted(set(v))

    def get_metadata_extracted(self):
        """Return ``{version: {variable: [raw_version, xpath]}}`` as extracted."""
        return self.metadata_extracted

    def build_each_version_lookup(self, version):
        """Build the complete variable lookup for one schema version.

        Starts from the variables the concordance lists for ``version`` and,
        for every other known variable, borrows the xpath from the first
        later version that lists it, or from its latest version when no later
        one exists. Variables with no valid version at all are skipped.

        Args:
            version: schema version string such as ``'2013v3.0'``. A version
                absent from the concordance starts from an empty table.

        Returns:
            The whole ``attr_lookup_dict`` (all versions built so far).
        """
        # Copy: filling gaps must not write borrowed entries back into
        # metadata_extracted.
        lookup = dict(self.metadata_extracted.get(version, {}))

        for attr in self.full_attr_list:
            versions = self.attr_version[attr]
            ##AW: Skip attribute when concordance file does not have version at all
            if not versions or attr in lookup:
                continue

            later_version = next((v for v in versions if v > version), None)
            next_version = later_version or versions[-1]
            next_version_xpath = self.metadata_extracted[next_version][attr][1]
            lookup[attr] = [next_version, next_version_xpath]

        self.attr_lookup_dict[version] = lookup
        return self.attr_lookup_dict

    def build_attr_dict(self):
        """Build the lookup for every version present in the concordance."""
        for version in self.metadata_extracted:
            self.build_each_version_lookup(version)

    def get_attr_lookup_dict(self, in_version):
        """Return ``{variable: [version_used, xpath]}`` for ``in_version``.

        When ``in_version`` is not in the concordance, the next later known
        version is used, or the newest known version when none is later. The
        lookup is built on demand if it has not been built yet. Prints which
        version is being used.

        Args:
            in_version: schema version string, e.g. a return's
                ``returnVersion`` attribute.
        """
        ## todo: build logic to ensure in_version has proper format
        resolved = in_version
        if in_version not in self.metadata_extracted:
            print(f"Can't find version: {in_version}")
            # metadata_extracted keys are in ascending order, so the first
            # greater key is the closest later version.
            later_version = next((v for v in self.metadata_extracted if v > in_version), None)
            resolved = later_version or self.newest_version or in_version
        print(f"using version: {resolved}")

        if resolved not in self.attr_lookup_dict:
            self.build_each_version_lookup(resolved)
        return self.attr_lookup_dict[resolved]

In [382]:
# Keep only the concordance rows whose form is 'F990' or 'SCHED-I'.
build_metadata = Build_metadata(['F990', 'SCHED-I'])

In [383]:
# Parse the concordance CSV; this prints the newest schema version it finds.
build_metadata.extract_original_metadata()

2015v3.0


In [384]:
# Build the variable -> xpath lookup for every schema version that was extracted.
build_metadata.build_attr_dict()

In [420]:
# Fetch the {variable: [version, xpath]} lookup for schema version 2012v3.0.
attr_lookup_metadata = build_metadata.get_attr_lookup_dict('2012v3.0')

using version: 2012v3.0


In [421]:
len(attr_lookup_metadata)

2747

In [422]:
attr_lookup_metadata

{'F9_03_PC_ACTOTHACTCOD': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:ActivityOther/xmlns:ActivityCode'],
 'F9_04_PC_FINASTMTATTA': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:FinancialStmtAttached'],
 'F9_04_PC_FOREIGACTIVI': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:ForeignActivities'],
 'F9_06_PZ_FOREIGOFFICE': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990EZ/xmlns:ForeignOffice'],
 'F9_07_PC_AVHOPEWEREEL': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:Form990PartVIISectionA/xmlns:AverageHoursPerWeekRelated'],
 'F9_07_PC_NABUBUNALIIN1': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:Form990PartVIISectionA/xmlns:NameBusiness/xmlns:BusinessNameLine1'],
 'F9_07_PC_NABUBUNALIIN2': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:Form990PartVIISectionA/xmlns:NameBusiness/xmlns:BusinessNameLine2'],
 'F9_12_PC_FSABFSBOTH': ['2012v3.0',
  './xmlns:ReturnData/xmlns:IRS990/xmlns:FSAuditedBasis/xmlns:FinancialStatementBoth'],
 'F9_12_PC_FSAB

## It's our data and we want it now
With the files downloaded and the xpaths in hand, we can start parsing the XML files and extracting our values.

In [413]:
def get_data_elements(input_dlr_list=None, *, limit=100):
    """Extract concordance variables from XML returns in ``./xml_files``.

    For each file the root element's ``returnVersion`` attribute selects the
    xpath lookup from the module-level ``build_metadata`` object, and every
    xpath that matches an element contributes that element's text.

    Args:
        input_dlr_list: file names (relative to ``./xml_files``) to parse.
            When empty or omitted, the directory listing is used instead.
        limit: maximum number of files to process (first ``limit`` entries);
            ``None`` processes all of them.

    Returns:
        ``{file_name: {variable_name: element_text}}``; variables whose xpath
        matches nothing in a file are left out of that file's dict.
    """
    xml_dir = "./xml_files"
    dir_list = input_dlr_list if input_dlr_list else os.listdir(xml_dir)

    # All elements live in the IRS default namespace, so every xpath step is
    # written with the "xmlns" prefix mapped here.
    namespaces = {"xmlns": "http://www.irs.gov/efile"}

    results = {}
    for file in dir_list[:limit]:
        root = ET.parse(os.path.join(xml_dir, file)).getroot()
        # get the xpaths to use for this return's schema version
        metadata_dictionary = build_metadata.get_attr_lookup_dict(root.attrib['returnVersion'])

        extracted = {}
        for key, value in metadata_dictionary.items():
            # value is [version, xpath]; search the tree once per variable
            element = root.find(value[1], namespaces)
            if element is not None:
                extracted[key] = element.text
        results[file] = extracted
    return results


In [406]:
def clean_up_directory():
    """Delete the ``./xml_files`` directory together with everything in it.

    Raises:
        FileNotFoundError: if ``./xml_files`` does not exist.
    """
    # Local import keeps this cell self-contained in the notebook.
    import shutil

    # os.remove only works on files; a directory tree needs rmtree.
    shutil.rmtree('./xml_files')

# Putting it all together
This section is about putting all of the functions that we have created together and extracting all of the years of data and putting them into a data format that we will use to join and refine the data.

## Adrian Testing

In [28]:
## Adrian testing: download the IRS archives for the selected filing years.

# Full list of years, kept for reference:
#years = [2015,2016,2017,2018,2019,2020,2021]
years = [2020]
# years = [2019]
for year in years:
    # Fetches this year's zip archive(s) from the IRS site.
    get_xml_files(year)
    
# unzip all of the zip files into the xml_files folder
# !unzip -q -o '*.zip' -d ./xml_files

ConnectionError: HTTPSConnectionPool(host='www.irs.gov', port=443): Max retries exceeded with url: /charities-non-profits/form-990-series-downloads (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fdd84344048>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
# !unzip -q -o '*.zip' -d ./xml_files


3 archives were successfully processed.


In [None]:
# get_xml_metadata('2014v5.0')

In [409]:
# AW: a return whose schema version is present in the concordance file
# (the recorded run reports "using version: 2013v4.0" with no fallback).
results = get_data_elements([
#                                 '201502549349100300_public.xml',
                             '201502549349300000_public.xml'
                            ])

using version: 2013v4.0
2013v4.0


In [410]:
# Print each file name followed by the dict of values extracted from it.
for k,v in results.items():
    print(k, v)

201502549349300000_public.xml {'F9_12_PC_ACCTCOMREV': 'false', 'F9_10_PC_ACCTSPAYABLEBOY': '168', 'F9_10_PC_ACCTSPAYABLEEOY': '9051', 'F9_10_PC_ACCTSRECEIVEEOY': '0', 'F9_04_PC_ACTPARTNER': 'false', 'F9_09_PC_ADANDPROMOPSE': '1280', 'F9_09_PC_ADANDPROMOTE': '1280', 'F9_00_PC_SUBORDINCLUDED': 'false', 'F9_08_PC_OTHERCONTRIBREV': '811995', 'F9_09_PC_OTHEREXPMAG': '2484', 'F9_09_PC_OTHEREXPPSE': '149867', 'F9_09_PC_OTHEREXPTE': '149867', 'F9_06_PC_DISCLOSEINT': 'false', 'F9_05_PC_COMPLYBACKUP': 'true', 'F9_09_PC_BENEFITSPAIDTE': '0', 'F9_04_PC_BUSTRANFAM': 'false', 'F9_04_PC_BUSTRANENT': 'false', 'F9_04_PC_BUSTRANOFFICER': 'false', 'F9_10_PC_CASHNOINTRESTBOY': '200916', 'F9_10_PC_CASHNOINTRESTEOY': '195731', 'F9_06_PC_CHNGDOCS': 'false', 'F9_04_PC_COLLECARTRT': 'false', 'F9_09_PC_COMPCUROFFICERFR': '11194', 'F9_09_PC_COMPCUROFFICERMAG': '11194', 'F9_09_PC_COMPCUROFFICERPSE': '89554', 'F9_09_PC_COMPCUROFFICERTE': '111942', 'F9_09_PC_COMPDISQUALTE': '0', 'F9_07_PC_OFDIR1A5': 'false', 'F9_06

In [414]:
# AW: returns whose schema versions are missing from the concordance file
# (the recorded run cannot find 2018v3.0 / 2017v2.3 and falls back to 2015v3.0).
results = get_data_elements(['201900089349301540_public.xml', '201900079349100050_public.xml'])

Can't find version: 2018v3.0
using version: 2015v3.0
Can't find version: 2017v2.3
using version: 2015v3.0


In [415]:
# Print each file name followed by the dict of values extracted from it.
for k,v in results.items():
    print(k, v)

201900089349301540_public.xml {'F9_12_PC_ACCTCOMREV': 'false', 'F9_04_PC_ACTPARTNER': 'false', 'F9_09_PC_ADANDPROMOPSE': '90', 'F9_09_PC_ADANDPROMOTE': '90', 'F9_08_PC_OTHERCONTRIBREV': '35067', 'F9_09_PC_OTHEREXPPSE': '17773', 'F9_09_PC_OTHEREXPTE': '17773', 'F9_04_PC_BUSTRANFAM': 'false', 'F9_04_PC_BUSTRANENT': 'true', 'F9_04_PC_BUSTRANOFFICER': 'true', 'F9_10_PC_CASHNOINTRESTBOY': '175671', 'F9_10_PC_CASHNOINTRESTEOY': '199590', 'F9_06_PC_CHNGDOCS': 'false', 'F9_04_PC_COLLECARTRT': 'false', 'F9_07_PC_OFDIR1A5': 'false', 'F9_06_PC_COMPPROCCEO': 'false', 'F9_06_PC_COMPPROCOTH': 'false', 'F9_06_PC_CONFINTPOL': 'false', 'F9_04_PC_CONSEREASEME': 'false', 'F9_04_PC_CONAUDFINSTM': 'false', 'F9_04_PC_CREDITCOUNSE': 'false', 'F9_06_PC_DECSUBAPPROV': 'false', 'F9_04_PC_ARTCONT': 'false', 'F9_04_PC_NONCASHCONT': 'false', 'F9_06_PC_DELMAJDUTY': 'false', 'F9_04_PC_DESINSSECIND': 'true', 'F9_04_PC_OWNDISRGDENT': 'false', 'F9_06_PC_DOCRTNPOL': 'false', 'F9_04_PC_DONOADVIFUND': 'false', 'F9_06_PC_M

In [103]:
# AW: without versons
# results = get_data_elements(['202100139349100300_public.xml', '202100079349100000_public.xml'], ['F990', 'SCHED-I'])

In [104]:
# for k,v in results.items():
#     print(k, v)

In [416]:
# AW: a return whose schema version is missing from the concordance file
# (the recorded run cannot find 2019v5.1 and falls back to 2015v3.0).
results = get_data_elements(['202100079349100000_public.xml'])

Can't find version: 2019v5.1
using version: 2015v3.0


In [417]:
# Print one "variable : value" line per value extracted from this return.
for k,v in results['202100079349100000_public.xml'].items():
    print(k, ":", v)

F9_01_PF_ANREEXADNEIN : 0
F9_01_PF_ARECPDCHRTBL : 0
F9_01_PF_ARECPREXPNSS : 9246
F9_01_PF_ARECRREXPNSS : 8917
F9_01_PF_AREEROEXPENS : -394
F9_01_PF_ANREEXNEININ : 0
F9_01_PF_AREOEREXPNSS : 65
F9_01_PF_ARESBNRIND : X
F9_01_PF_ARETEDCHRTBL : 0
F9_01_PF_ARETENIINCM : 0
F9_01_PF_ARETEREXPNSS : 9311
F9_01_PF_ARETNIINCOME : 0
F9_01_PF_ANREEXTOREEX : 8917
F9_01_PF_ARETOEDCHRTB : 0
F9_01_PF_ARETOENIINCM : 0
F9_01_PF_ARETOEREXPNS : 65
F9_03_PF_CINAFBEROEXP : -394
F9_03_PF_CINAFBSUBTOT : 9325
F9_03_PF_CINAFBTNABOY : 9719
F9_03_PF_CINAFBTNAEOY : 9325
F9_11_PF_DIAMDIASADDJ : 125
F9_11_PF_DIAMDIBEADDJ : 125
F9_11_PF_DIAMDIBEDEED : 125
F9_11_PF_DIAMMIINREET : 125
F9_06_PF_ETBOIIIIETAX : 0
F9_06_PF_ETBOIISATAX : 0
F9_06_PF_ETBOIITBOIIN : 0
F9_06_PF_ETBOIITUSECT1 : 0
F9_00_PF_ASSEOYOYY : 9325
F9_02_PF_BASHCABOOYY : 1487
F9_02_PF_BASHCAEOOYY : 1093
F9_02_PF_BSOABOY : 7205
F9_02_PF_BSOAEOY : 7205
F9_02_PF_BSSATCIBOY : 1027
F9_02_PF_BSSATCIEOY : 1027
F9_02_PF_BSTABOY : 9719
F9_02_PF_BSTAEOY : 9325
F9_02_

In [418]:
# AW: originally labelled "without versions", but the recorded run prints only
# "using version: 2012v3.0" with no "Can't find version" message, so this
# return's version appears to be present in the concordance file.
results = get_data_elements(['201502969349301405_public.xml'])

using version: 2012v3.0


In [419]:
# Print one "variable : value" line per value extracted from this return.
for k,v in results['201502969349301405_public.xml'].items():
    print(k, ":", v)

F9_04_PC_FINASTMTATTA : true
F9_04_PC_FOREIGACTIVI : true
F9_12_PC_FSABFSCONSOL : X
F9_04_PC_FUNDRAACTIVI : false
F9_04_PC_GAMINGAMING : false
F9_04_PC_GRANTSTOIVID : false
F9_04_PC_GRANTOORORGA : true
F9_04_PC_HOSPITALOSPI : true
F9_04_PC_INDAUDFINSTM : false
F9_04_PC_LOBBYIACTIVI : true
F9_04_PC_MOTHKTTOINND : false
F9_04_PC_MOTHKTTOORRG : false
F9_04_PC_POLITIACTIVI : false
F9_04_PC_PROFESFUNDRA : false
F9_04_PC_REPOFIN4FOOT : false
F9_04_PC_REPINVOTHSEC : false
F9_04_PC_REPLANBLDEQU : true
F9_04_PC_REPOOTHEASSE : true
F9_04_PC_REPOOTHELIAB : true
F9_04_PC_REPPRORELINV : false
F9_01_PC_RELEEXCYY : -4171168
F9_01_PC_RELEEXPRYEEA : -3989420
F9_04_PC_SCHEBREQREQU : true
F9_04_PC_SCHEJREQREQU : true
F9_04_PC_SCHOOLCHOOL : false
F9_04_PC_TAXEEXEMBOND : false
F9_04_PC_TERMPERMENDO : true
F9_03_PC_TOOTPRSEEXXP : 358682
F9_03_PC_TOOTPRSEGRRN : 22170
F9_03_PC_TOOTPRSEREEV : 323020
F9_04_PC_ACCOUNRECEIV : 
        
F9_00_PC_PRINCIPALUSCITY : BRONX
F9_00_PC_PRINCIPALUSSTATE : NY
F9_00_PC_PRINC

In [11]:
%ls xml_files/201502969349301405_public.xml

xml_files/201502969349301405_public.xml


In [1]:
# a#years = [2015,2016,2017,2018,2019,2020,2021]
# years = [2019]
# for year in years:
#   get_xml_files(year)
#   # results = get_data_elements()
#   # print(results)
#   #clean_up_directory()
# # unzip all of the zip files into the xml_files folder
# !unzip -q -o '*.zip' -d ./xml_files
# for year in years:
#   results = get_data_elements()
#   # print(results)
#   df = pd.DataFrame(data=results)
#   df.head()
#   with open(f'./{year}_results.json','w') as f:
#     f.write(json.dumps(results))

In [None]:
# !rm -rf ./xml_files
# !mkdir ./xml_files
# NOTE(review): in the visible notebook `df` is only assigned inside
# commented-out code, so this cell relies on state from an earlier session.
df.head()

Unnamed: 0,F9_00_HD_FILEREIN
0,363192151
1,831080523
2,591377498
3,383544723
4,900533069


## End Result
[{"F9_00_HD_FILEREIN": "352222596"}, {"F9_00_HD_FILEREIN": "272348616"}, {"F9_00_HD_FILEREIN": "111310340"}, {"F9_00_HD_FILEREIN": "541833299"}, {"F9_00_HD_FILEREIN": "840730973"}, {"F9_00_HD_FILEREIN": "274984752"}, {"F9_00_HD_FILEREIN": "510151095"}, {"F9_00_HD_FILEREIN": "810350430"}, {"F9_00_HD_FILEREIN": "273413890"}, {"F9_00_HD_FILEREIN": "236411295"}, {"F9_00_HD_FILEREIN": "581917328"}, {"F9_00_HD_FILEREIN": "520715089"}, {"F9_00_HD_FILEREIN": "710951453"}, {"F9_00_HD_FILEREIN": "113679500"}, {"F9_00_HD_FILEREIN": "237424374"}, {"F9_00_HD_FILEREIN": "752717838"}, {"F9_00_HD_FILEREIN": "450406158"}, {"F9_00_HD_FILEREIN": "061051588"}, {"F9_00_HD_FILEREIN": "160613575"}, {"F9_00_HD_FILEREIN": "900448658"}, {"F9_00_HD_FILEREIN": "832328780"}, {"F9_00_HD_FILEREIN": "363977640"}, {"F9_00_HD_FILEREIN": "463001851"}, {"F9_00_HD_FILEREIN": "831836736"}, {"F9_00_HD_FILEREIN": "043182537"}, {"F9_00_HD_FILEREIN": "042160716"}, {"F9_00_HD_FILEREIN": "391460399"}, {"F9_00_HD_FILEREIN": "356065637"}, {"F9_00_HD_FILEREIN": "741601060"}, {"F9_00_HD_FILEREIN": "576033523"}, {"F9_00_HD_FILEREIN": "237098710"}, {"F9_00_HD_FILEREIN": "475305938"}, {"F9_00_HD_FILEREIN": "870238633"}, {"F9_00_HD_FILEREIN": "541564906"}, {"F9_00_HD_FILEREIN": "814552155"}, {"F9_00_HD_FILEREIN": "341840443"}, {"F9_00_HD_FILEREIN": "134078531"}, {"F9_00_HD_FILEREIN": "251845550"}, {"F9_00_HD_FILEREIN": "680437840"}, {"F9_00_HD_FILEREIN": "473586125"}, {"F9_00_HD_FILEREIN": "841588173"}, {"F9_00_HD_FILEREIN": "591689369"}, {"F9_00_HD_FILEREIN": "716057382"}, {"F9_00_HD_FILEREIN": "204249277"}, {"F9_00_HD_FILEREIN": "461086177"}, {"F9_00_HD_FILEREIN": "330216607"}, {"F9_00_HD_FILEREIN": "580558285"}, {"F9_00_HD_FILEREIN": "930245665"}, {"F9_00_HD_FILEREIN": "741948396"}, {"F9_00_HD_FILEREIN": "141706748"}, {"F9_00_HD_FILEREIN": "581902569"}, {"F9_00_HD_FILEREIN": "262704632"}, {"F9_00_HD_FILEREIN": "261904746"}, {"F9_00_HD_FILEREIN": "460964040"}, {"F9_00_HD_FILEREIN": "263721539"}, 
{"F9_00_HD_FILEREIN": "812951319"}, {"F9_00_HD_FILEREIN": "223426417"}, {"F9_00_HD_FILEREIN": "541197325"}, {"F9_00_HD_FILEREIN": "310884250"}, {"F9_00_HD_FILEREIN": "208527502"}, {"F9_00_HD_FILEREIN": "581986546"}, {"F9_00_HD_FILEREIN": "951648184"}, {"F9_00_HD_FILEREIN": "473752645"}, {"F9_00_HD_FILEREIN": "930977878"}, {"F9_00_HD_FILEREIN": "880277449"}, {"F9_00_HD_FILEREIN": "396035684"}, {"F9_00_HD_FILEREIN": "421363581"}, {"F9_00_HD_FILEREIN": "570726213"}, {"F9_00_HD_FILEREIN": "461161895"}, {"F9_00_HD_FILEREIN": "800481815"}, {"F9_00_HD_FILEREIN": "454535664"}, {"F9_00_HD_FILEREIN": "270370317"}, {"F9_00_HD_FILEREIN": "384013791"}, {"F9_00_HD_FILEREIN": "416032698"}, {"F9_00_HD_FILEREIN": "420924381"}, {"F9_00_HD_FILEREIN": "472158694"}, {"F9_00_HD_FILEREIN": "237267645"}, {"F9_00_HD_FILEREIN": "721192862"}, {"F9_00_HD_FILEREIN": "134156877"}, {"F9_00_HD_FILEREIN": "954405467"}, {"F9_00_HD_FILEREIN": "200531481"}, {"F9_00_HD_FILEREIN": "203979613"}, {"F9_00_HD_FILEREIN": "042748995"}, {"F9_00_HD_FILEREIN": "226058144"}, {"F9_00_HD_FILEREIN": "825480444"}, {"F9_00_HD_FILEREIN": "431857630"}, {"F9_00_HD_FILEREIN": "830995556"}, {"F9_00_HD_FILEREIN": "630920064"}, {"F9_00_HD_FILEREIN": "264263977"}, {"F9_00_HD_FILEREIN": "822181243"}, {"F9_00_HD_FILEREIN": "464684942"}, {"F9_00_HD_FILEREIN": "461351210"}, {"F9_00_HD_FILEREIN": "521007762"}, {"F9_00_HD_FILEREIN": "366124610"}, {"F9_00_HD_FILEREIN": "814136522"}, {"F9_00_HD_FILEREIN": "204789447"}, {"F9_00_HD_FILEREIN": "631106495"}, {"F9_00_HD_FILEREIN": "273047161"}, {"F9_00_HD_FILEREIN": "810813160"}, {"F9_00_HD_FILEREIN": "470467467"}]

Basically only one field is found in 100 documents. We will need to figure out something else.

# IRSx


In [None]:
# Smoke test: parse one hard-coded filing with IRSx and dump it to JSON.
from irsx.xmlrunner import XMLRunner
from irsx.settings import INDEX_DIRECTORY
# Fetch the 2019 filing index with the irsx_index command-line tool.
!irsx_index --verbose --year=2019
INDEX_2019= os.path.join(INDEX_DIRECTORY, 'index_2019.csv')
# NOTE(review): np_2019 is loaded here but not used again in this cell.
np_2019 = pd.read_csv(INDEX_2019)
xml_runner = XMLRunner()
parsed_filing = xml_runner.run_filing('201923179349304472')
# List the schedules present in this filing.
schedule_list = parsed_filing.list_schedules()
print(schedule_list)
business_name = ''
business_ein = ''
# Name and EIN are only filled in when the filing has a return header.
if 'ReturnHeader990x' in schedule_list:
    
    # NOTE(review): outputdata is created but never used in this cell.
    outputdata = {}
    # read the filer's name and EIN from part I of the parsed return header
    parsed_sked = parsed_filing.get_parsed_sked('ReturnHeader990x')
    business_name = parsed_sked[0]['schedule_parts']['returnheader990x_part_i']['BsnssNm_BsnssNmLn1Txt']
    business_ein = parsed_sked[0]['schedule_parts']['returnheader990x_part_i']['Flr_EIN']

# Save the filing's raw dict so its structure can be inspected offline.
with open('./201923179349304472.json', 'w') as f:
  f.write(json.dumps(parsed_filing.get_raw_irs_dict()))

Getting index file for year: 2019 remote=https://s3.amazonaws.com/irs-form-990/index_2019.csv local=/usr/local/lib/python3.7/dist-packages/irsx/CSV/index_2019.csv
Beginning streaming download of https://s3.amazonaws.com/irs-form-990/index_2019.csv
Total file size: 50.48 MB
Download completed to /usr/local/lib/python3.7/dist-packages/irsx/CSV/index_2019.csv in 0:00:03.960710
['ReturnHeader990x', 'IRS990', 'IRS990ScheduleA', 'IRS990ScheduleB', 'IRS990ScheduleD', 'IRS990ScheduleG', 'IRS990ScheduleL', 'IRS990ScheduleO']


In [None]:
from irsx.settings import INDEX_DIRECTORY
import json
import xmltodict
import requests 
import csv
import re
import os
import xml.etree.ElementTree as ET
import pandas as pd
from irsx.xmlrunner import XMLRunner

# Fetch the 2019 filing index with the irsx_index command-line tool.
!irsx_index --verbose --year=2019
# One runner instance is reused for every filing parsed below.
xml_runner = XMLRunner()
# Path of the 2019 index CSV inside the IRSx index directory.
INDEX_2019= os.path.join(INDEX_DIRECTORY, 'index_2019.csv')

def get_grantee_info(rows, business_name, business_ein):
    """Flatten the grant groups of a parsed filing into one record per grant.

    Two groups are read from ``rows`` when present:
    ``'PFGrntOrCntrbtnPdDrYr'`` (records get ``Paid='True'``) and
    ``'PFGrntOrCntrApprvFrFt'`` (records get ``Future Pay='True'``). Only
    rows that carry the group's ``*_RcpntPrsnNm`` recipient field are kept.

    Args:
        rows: mapping of group name -> list of row dicts.
        business_name: foundation name copied into every record.
        business_ein: foundation EIN copied into every record.

    Returns:
        A list of independent dicts with the keys ``ein``,
        ``Foundation Name``, ``Grantee``, ``City``, ``State``, ``Purpose``,
        ``Amount``, ``Paid`` and ``Future Pay``. Fields missing from a row
        are reported as ``None``.
    """
    output = []
    output.extend(_grant_records(
        rows, 'PFGrntOrCntrbtnPdDrYr', 'GrntOrCntrbtnPdDrYr',
        business_name, business_ein, paid=True))
    output.extend(_grant_records(
        rows, 'PFGrntOrCntrApprvFrFt', 'GrntOrCntrApprvFrFt',
        business_name, business_ein, paid=False))
    return output


def _grant_records(rows, group, prefix, business_name, business_ein, *, paid):
    """Return one new record dict per row of ``rows[group]`` naming a recipient."""
    records = []
    # Each group is checked with its OWN recipient key; the future-grant group
    # was previously tested against the paid-grant key.
    recipient_key = f'{prefix}_RcpntPrsnNm'
    for row in rows.get(group, []):
        if recipient_key not in row:
            continue
        # A fresh dict per grant: appending one shared dict made every record
        # in the result show the values of the last grant processed.
        records.append({
            'ein': business_ein,
            'Foundation Name': business_name,
            'Grantee': row[recipient_key],
            # .get: one absent optional field should not discard the grant
            'City': row.get('RcpntUSAddrss_CtyNm'),
            'State': row.get('RcpntUSAddrss_SttAbbrvtnCd'),
            'Purpose': row.get(f'{prefix}_GrntOrCntrbtnPrpsTxt'),
            'Amount': row.get(f'{prefix}_Amt'),
            'Paid': 'True' if paid else 'False',
            'Future Pay': 'False' if paid else 'True',
        })
    return records

# Walk every filing listed in the 2019 index and collect grantee records from
# the ones that contain an IRS990PF schedule.
np_2019 = pd.read_csv(INDEX_2019)
forms = []
# `object_id` rather than `id`, so the builtin id() is not shadowed.
for object_id in np_2019['OBJECT_ID']:
    try:
        parsed_filing = xml_runner.run_filing(object_id)
        schedule_list = parsed_filing.list_schedules()
        business_name = ''
        business_ein = ''
        if 'ReturnHeader990x' in schedule_list:
            # read the filer's name and EIN from part I of the return header
            parsed_sked = parsed_filing.get_parsed_sked('ReturnHeader990x')
            header_part = parsed_sked[0]['schedule_parts']['returnheader990x_part_i']
            business_name = header_part['BsnssNm_BsnssNmLn1Txt']
            business_ein = header_part['Flr_EIN']
        if 'IRS990PF' in schedule_list:
            results = get_grantee_info(parsed_filing.get_parsed_sked('IRS990PF')[0]['groups'],business_name,business_ein)
            # one list of grant records per filing
            forms.append(results)
    except Exception as err:
        # Best effort: skip filings that fail to parse, but say which one and
        # why. `except Exception` (not a bare except) lets Ctrl-C / kernel
        # interrupt stop this long-running loop.
        print(f"Skipping filing {object_id}: {err!r}")

with open('./forms_990.json', 'w') as f:
    f.write(json.dumps(forms))


aaaaaaa


# PDF text extraction
Just putting the start of what might be a backup plan here. Hopefully we don't have to use it, because it will be a little more intensive than just XML parsing.

In [None]:
!wget https://apps.irs.gov/pub/epostcard/990/2021/01/download990pdf_01_2021_prefixes_01-11.zip

In [None]:
#!mkdir pdfs
#!unzip download990pdf_01_2021_prefixes_01-11.zip -d ./pdfs