# Dump salaries from 990EZ in OR (continuing from part 1)

In [37]:
import unicodecsv as csv
from irsx.xmlrunner import XMLRunner
import pandas as pd

In [2]:
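# Disabled pandas alternative: pre-filter the part 1 output down to 990EZ
# filers and save them to a separate file (orefilers_ez.csv).
# filers = pd.read_csv("orefilers.csv")
# ez = filers["RETURN_TYPE"] == "990EZ"
# reader = filers[ez]
# reader.to_csv('orefilers_ez.csv',index=False)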

In [38]:
# Collect the 990EZ rows from the part 1 output (orefilers.csv).
# The file is short, so every matching row is simply held in memory.
file_rows = []
with open('orefilers.csv', 'rb') as infile:
    reader = csv.DictReader(infile)
    # keep a row only when its RETURN_TYPE column is exactly "990EZ"
    file_rows.extend(row for row in reader if row['RETURN_TYPE'] == "990EZ")
        


In [39]:
# column names, in the order they will appear in the output file
headers = ["ein", "object_id", "exec_comp"]

# where the results go; the handle stays open so that later cells can
# write through `dw`, and it is closed in the final cell of the notebook
outfilename = "ez_comp_1.csv"
outfile = open(outfilename, 'wb')

# dict-based writer; keys that are not listed in `headers` are dropped
# silently instead of raising
dw = csv.DictWriter(outfile, fieldnames=headers, extrasaction='ignore')
dw.writeheader()

In [40]:
# get an XMLRunner -- this is what actually does the parsing.
# One instance is created here and reused for every filing in the loop below
# (via xml_runner.run_filing).
xml_runner = XMLRunner()

## Figure out what to extract

Data from each repeating group should go to its own file, otherwise it won't make sense.

To figure out what to capture, I started by looking at schedule J: http://www.irsx.info/#IRS990ScheduleJ
Then I went to the table details and picked the rows I wanted from the repeating group:
http://www.irsx.info/metadata/groups/SkdJRltdOrgOffcrTrstKyEmpl.html

Note that it's common for director/employee names in schedule J to get listed as businessname.

Also note that IRSx checks to see if a file has been downloaded before fetching it. Running this the first time will be slow if the filings aren't already downloaded, but much faster if they've already been downloaded.

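**NOTE:** This notebook was later edited to pull compensation from Form 990-EZ Part IV (the `EZOffcrDrctrTrstEmpl` repeating group of `IRS990EZ`), not from Schedule J. The Schedule J links above are kept only as an example of how to look up a repeating group and its fields.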


In [41]:
DEMO_MAX = 40       # row cap, applied only when DEMO_MODE is True
DEMO_MODE = False   # set to True to stop after DEMO_MAX rows (quick demo run)
num_rows = 0


def _to_int(value):
    """Return ``value`` as an int, treating a missing or empty value as 0.

    The parsed groups hand back amounts via ``dict.get``, which yields None
    when a field is absent from the filing; calling ``int(None)`` would raise
    TypeError and abort the whole run.
    """
    if not value:
        return 0
    return int(value)


def _total_exec_comp(employee_groups):
    """Sum the compensation reported across the rows of the officer group.

    For every row this adds:
      * ``CmpnstnAmt``              -- only when it is greater than zero
      * ``EmplyBnftPrgrmAmt``       -- when present
      * ``ExpnsAccntOthrAllwncAmt`` -- when present

    ``employee_groups`` is the list of dicts found under
    ``parsed_main['groups']['EZOffcrDrctrTrstEmpl']``.  Other keys that show
    up in those dicts (e.g. ``PrsnNm``, ``TtlTxt``, ``AvrgHrsPrWkDvtdTPsRt``)
    are not used here.
    """
    total = 0
    for employee_group in employee_groups:
        comp = _to_int(employee_group.get('CmpnstnAmt'))
        if comp > 0:
            total += comp
        total += _to_int(employee_group.get('EmplyBnftPrgrmAmt'))
        total += _to_int(employee_group.get('ExpnsAccntOthrAllwncAmt'))
    return total


# Walk every 990EZ row from part 1, parse the filing and write one output row
# (ein, object_id, exec_comp) per filing that has an officer/director group.
for row in file_rows:
    num_rows += 1
    this_object_id = row['OBJECT_ID']
    parsed_filing = xml_runner.run_filing(this_object_id)

    if not parsed_filing:
        # if it somehow busted, just note it and move on
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))
    elif 'IRS990EZ' in parsed_filing.list_schedules():
        # some schedules can appear multiple times, but the main form only
        # appears once, so we grab the first one
        parsed_main = parsed_filing.get_parsed_sked('IRS990EZ')[0]

        try:
            # repeating groups are returned as an array of dicts
            employee_groups = parsed_main['groups']['EZOffcrDrctrTrstEmpl']
        except KeyError:
            # no officer/director group in this filing: nothing to write
            print("No EZOffcrDrctrTrstEmpl found in %s skipping" % this_object_id)
        else:
            exec_comp = _total_exec_comp(employee_groups)
            # keys match the headers given to the DictWriter
            outputdata = {
                'ein': row['EIN'],
                'object_id': row['OBJECT_ID'],
                'exec_comp': exec_comp,
            }
            dw.writerow(outputdata)

    # Progress is reported for every row, including skipped ones, so the
    # counter message is never silently missed.
    if num_rows % 100 == 0:
        print("Processed %s filings" % num_rows)

    # Don't run endlessly during a demo
    if DEMO_MODE and num_rows > DEMO_MAX:
        break

Processed 100 filings
Processed 200 filings
Processed 300 filings
Processed 400 filings
Processed 500 filings
Processed 600 filings
Processed 700 filings
Processed 800 filings
Processed 900 filings
Processed 1000 filings
Processed 1100 filings
Processed 1200 filings
Processed 1300 filings
Processed 1400 filings
Processed 1500 filings
Processed 1600 filings
Processed 1700 filings
Processed 1800 filings
Processed 1900 filings


In [42]:
# close the outfile (opened in the writer-setup cell) so that all rows
# written through `dw` are flushed to disk
outfile.close()

# optional sanity check, left disabled: load the result back with pandas
# sked_j_ore_efilers = pd.read_csv(outfilename)
# sked_j_ore_efilers.head()