# Dump salaries from 990PF in OR (continuing from part 1)

In [82]:
import unicodecsv as csv
from irsx.xmlrunner import XMLRunner
import pandas as pd

In [76]:
# Disabled one-off step, kept for reference: it filters orefilers.csv down to
# RETURN_TYPE == "990PF" rows and writes them to orefilers_pf.csv.
# NOTE(review): the next cell reads orefilers_pf.csv, so this presumably has to
# have been run at least once -- confirm the file exists before running on.
# filers = pd.read_csv("orefilers.csv")
# pf = filers["RETURN_TYPE"] == "990PF"
# reader = filers[pf]
# reader.to_csv('orefilers_pf.csv',index=False)

In [83]:
# Load every row of the part 1 output into memory; the file is short enough for that.
# Each entry of file_rows is a dict keyed by the csv's column names.
file_rows = []
with open('orefilers_pf.csv', 'rb') as infile:
    reader = csv.DictReader(infile)
    # NOTE(review): no RETURN_TYPE filter is applied here -- the input file is
    # presumably already limited to 990PF filers; confirm against part 1.
    file_rows.extend(reader)
        

In [84]:
# Columns of the output csv, in the order they will be written
headers = ["period", "ein", "object_id", "taxpayer_name", "name", "title", "avg_hours", "comp_amt"]

# Output file, opened in binary mode; it stays open until it is closed in a later cell
outfilename = "pf_comp_1.csv"
outfile = open(outfilename, 'wb')

# DictWriter over the output file; extrasaction='ignore' drops any dict keys
# that are not listed in headers instead of raising
dw = csv.DictWriter(outfile, headers, extrasaction='ignore')
dw.writeheader()

In [85]:
# get an XMLRunner -- this is what actually does the parsing
# (the extraction loop calls xml_runner.run_filing(object_id) once per input row)
xml_runner = XMLRunner()

## Figure out what to extract

Data from each repeating group should go to its own file, otherwise it won't make sense.

To figure out what to capture, I started by looking at schedule J: http://www.irsx.info/#IRS990ScheduleJ
Then I went to the table details and picked the rows I wanted from the repeating group:
http://www.irsx.info/metadata/groups/SkdJRltdOrgOffcrTrstKyEmpl.html

Note that it's common for director/employee names in schedule J to get listed as businessname.

Also note that IRSx checks to see if a file has been downloaded before fetching it. Running this the first time will be slow if the filings aren't already downloaded, but much faster if they've already been downloaded.

-------NOTE--------
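Edited to get compensation from EZ partIV, not sked j. (Note: the loop below reads the `PFCmpnstnHghstPdEmpl` repeating group from the 990PF, so treat the schedule J / EZ references in this section as background on how the group was chosen.)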


In [90]:
# Maximum number of input rows to process in one run; set to None for no cap.
# The loop used to hard-code 99 (DEMO_MAX was defined but never read), so 99
# is kept here as the effective limit.
DEMO_MAX = 99
num_rows = 0


def _write_pf_comp_rows(row, runner, writer):
    """Parse one filing and write one csv row per PFCmpnstnHghstPdEmpl entry.

    Args:
        row: dict for one line of the input csv; the OBJECT_ID, EIN,
            TAX_PERIOD_x and TAXPAYER_NAME keys are read.
        runner: the XMLRunner used to fetch and parse the filing.
        writer: the DictWriter that receives the output rows.

    Returns:
        The number of rows written (0 whenever the filing is skipped).
    """
    this_object_id = row['OBJECT_ID']
    print(this_object_id)
    parsed_filing = runner.run_filing(this_object_id)

    # if it somehow busted, just note it and move on
    if not parsed_filing:
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))
        return 0

    if 'IRS990PF' not in parsed_filing.list_schedules():
        print("No 990PF in filing %s, skipping" % this_object_id)
        return 0

    # some schedules can appear multiple times; only the first parsed
    # IRS990PF entry is used here
    parsed_main = parsed_filing.get_parsed_sked('IRS990PF')[0]

    # now use the table name we looked up -- that's where we find it
    try:
        # repeating groups are returned as an array of dicts
        employee_groups = parsed_main['groups']['PFCmpnstnHghstPdEmpl']
    except KeyError:
        print("No PFCmpnstnHghstPdEmpl found in %s skipping" % this_object_id)
        return 0

    # values shared by every output row of this filing, taken from the input csv
    filing_fields = {
        'period': row['TAX_PERIOD_x'],
        'ein': row['EIN'],
        'object_id': row['OBJECT_ID'],
        'taxpayer_name': row['TAXPAYER_NAME'],
    }

    rows_written = 0
    for employee_group in employee_groups:
        # the remaining columns ("name", "title", "avg_hours", "comp_amt")
        # come from the repeating group; .get() leaves a missing field as None
        outputdata = dict(filing_fields)
        outputdata['name'] = employee_group.get('CmpnstnHghstPdEmpl_PrsnNm')
        outputdata['title'] = employee_group.get('CmpnstnHghstPdEmpl_TtlTxt')
        outputdata['avg_hours'] = employee_group.get('CmpnstnHghstPdEmpl_AvrgHrsPrWkDvtdTPsRt')
        outputdata['comp_amt'] = employee_group.get('CmpnstnHghstPdEmpl_CmpnstnAmt')

        writer.writerow(outputdata)
        print("filing %s" % this_object_id)
        rows_written += 1

    return rows_written


for num_rows, row in enumerate(file_rows, start=1):
    _write_pf_comp_rows(row, xml_runner, dw)

    # Don't run endlessly during a demo. This check sits after the helper call
    # so that a skipped filing can no longer bypass it (the old `continue`
    # statements jumped straight past the limit and the progress message).
    if DEMO_MAX is not None and num_rows >= DEMO_MAX:
        break
    if num_rows % 100 == 0:
        print("Processed %s filings" % num_rows)

201703139349302875
No 990EZ in filing 201703139349302875, skipping
201843129349302544
Filing version 2017v2.3 isn't supported for this operation
No 990EZ in filing 201843129349302544, skipping
201743139349302804
No 990EZ in filing 201743139349302804, skipping
201843129349302194
Filing version 2017v2.3 isn't supported for this operation
No 990EZ in filing 201843129349302194, skipping
201713139349302526
No 990EZ in filing 201713139349302526, skipping
201833129349302398
Filing version 2017v2.3 isn't supported for this operation
No 990EZ in filing 201833129349302398, skipping
201733149349302708
No 990EZ in filing 201733149349302708, skipping
201803129349302655
Filing version 2017v2.3 isn't supported for this operation
No 990EZ in filing 201803129349302655, skipping
201743139349302594
No 990EZ in filing 201743139349302594, skipping
201843129349302299
Filing version 2017v2.3 isn't supported for this operation
No 990EZ in filing 201843129349302299, skipping
201703139349302525
No 990EZ in fili

Filing version 2017v2.2 isn't supported for this operation
No 990EZ in filing 201812829349300536, skipping


In [91]:
# close the outfile so the rows written through dw are on disk before reading them back
outfile.close()

# Load the csv we just wrote so it can be inspected in the notebook.
# NOTE(review): the name says sked_j, but the extraction loop pulls from the
# 990PF PFCmpnstnHghstPdEmpl group -- consider renaming if nothing else uses it.
sked_j_ore_efilers = pd.read_csv(outfilename)
# NOTE(review): this displays the whole frame; .head() may be preferable for large outputs
sked_j_ore_efilers

Unnamed: 0,period,ein,object_id,taxpayer_name,name,title,avg_hours,comp_amt
