# Dump salaries from schedule J in Portland, OR (continuing from part 1)

In [2]:
import unicodecsv as csv
from irsx.xmlrunner import XMLRunner
import pandas as pd

In [3]:
# Load every row of part 1's output (orgs with e-filings) into memory.
# The file is short, so holding all of it in a list is fine.
file_rows = []
# The file is opened in binary mode for the unicodecsv reader.
with open('orefilers.csv', 'rb') as infile:
    # DictReader yields one dict per CSV row, keyed by the header line.
    file_rows.extend(csv.DictReader(infile))
        

In [9]:
# Column names, in the order they will appear in the output file.
headers = [
    "period",
    "ein",
    "object_id",
    "taxpayer_name",
    "name",
    "business_name1",
    "business_name2",
    "title",
    "org_comp",
    "related_comp",
    "other_comp",
]

# Output file; it stays open across cells and is closed once the loop has run.
outfilename = "comp_1.csv"
outfile = open(outfilename, 'wb')

# DictWriter keyed on the headers above; extrasaction='ignore' drops any
# dict keys that are not listed in the headers instead of raising.
dw = csv.DictWriter(outfile, headers, extrasaction='ignore')
dw.writeheader()

In [10]:
# Get an XMLRunner -- this is what actually does the parsing.
# A single instance is created here and reused for every filing in the loop below.
xml_runner = XMLRunner()

## Figure out what to extract

Data from each repeating group should go to its own file, otherwise it won't make sense.

To figure out what to capture, I started by looking at schedule J: http://www.irsx.info/#IRS990ScheduleJ
Then I went to the table details and picked the rows I wanted from the repeating group:
http://www.irsx.info/metadata/groups/SkdJRltdOrgOffcrTrstKyEmpl.html

Note that it's common for director/employee names in schedule J to get listed as businessname.

Also note that IRSx checks to see if a file has been downloaded before fetching it. Running this the first time will be slow if the filings aren't already downloaded, but much faster if they've already been downloaded.

**NOTE:** This notebook has since been edited to pull compensation from Form 990 Part VII (the `Frm990PrtVIISctnA` repeating group), not from Schedule J.


In [11]:
# Cap on the number of filings to process during a demo. It only takes effect
# if the commented-out guard near the bottom of the loop is re-enabled.
DEMO_MAX = 10
num_rows = 0

# Output column -> key inside each Frm990PrtVIISctnA (Form 990 Part VII,
# Section A) row. The output columns match the headers given to the DictWriter.
PART_VII_FIELDS = {
    'name': 'PrsnNm',
    'business_name1': 'BsnssNmLn1Txt',
    'business_name2': 'BsnssNmLn2Txt',
    'title': 'TtlTxt',
    'org_comp': 'RprtblCmpFrmOrgAmt',
    'related_comp': 'RprtblCmpFrmRltdOrgAmt',
    'other_comp': 'OthrCmpnstnAmt',
}

for row in file_rows:
    num_rows += 1
    this_object_id = row['OBJECT_ID']
    parsed_filing = xml_runner.run_filing(this_object_id)

    if not parsed_filing:
        # If it somehow busted, just note it and move on to the next filing.
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))
    elif 'IRS990' not in parsed_filing.list_schedules():
        print("No 990 in filing %s, skipping" % this_object_id)
    else:
        # Values shared by every output row of this filing, taken from the input csv.
        filing_fields = {
            'period': row['TAX_PERIOD_x'],
            'ein': row['EIN'],
            'object_id': row['OBJECT_ID'],
            'taxpayer_name': row['TAXPAYER_NAME'],
        }

        # Some schedules can appear multiple times, but IRS990 only appears once,
        # so we grab the first one.
        parsed_main = parsed_filing.get_parsed_sked('IRS990')[0]

        # Now use the table name we looked up -- that's where we find it.
        try:
            # Repeating groups are returned as an array of dicts.
            employee_groups = parsed_main['groups']['Frm990PrtVIISctnA']
        except KeyError:
            print("No Frm990PrtVIISctnA found in %s skipping" % this_object_id)
            employee_groups = []

        # Write one output row per person listed in Part VII Section A.
        for employee_group in employee_groups:
            # Build a fresh dict per row so nothing is shared between rows.
            outputdata = dict(filing_fields)
            for column, field in PART_VII_FIELDS.items():
                # .get() leaves the column empty when the filing omits the field.
                outputdata[column] = employee_group.get(field)
            dw.writerow(outputdata)

    # Don't run endlessly during a demo: uncomment to stop after DEMO_MAX filings.
    # if num_rows > DEMO_MAX:
    #     break

    # Skipped filings fall through to here as well (no `continue` above), so
    # the progress message is printed for every 100th filing, parsed or not.
    if num_rows % 100 == 0:
        print("Processed %s filings" % num_rows)

No 990 in filing 201713199349102316, skipping
No 990 in filing 201832689349100608, skipping
Processed 100 filings
No 990 in filing 201713189349100846, skipping
No 990 in filing 201810269349100601, skipping
No 990 in filing 201733189349100873, skipping
No 990 in filing 201821319349102262, skipping
No 990 in filing 201823109349100732, skipping
No 990 in filing 201812569349100761, skipping
Processed 200 filings
No 990 in filing 201703149349100630, skipping
No 990 in filing 201802679349100410, skipping
No 990 in filing 201812349349100716, skipping
No 990 in filing 201801349349102400, skipping
No 990 in filing 201832129349100143, skipping
No 990 in filing 201811999349100706, skipping
No 990 in filing 201841359349103919, skipping
No 990 in filing 201833199349104508, skipping
Processed 300 filings
No 990 in filing 201811319349101371, skipping
No 990 in filing 201831349349104508, skipping
No 990 in filing 201831069349100803, skipping
No 990 in filing 201842969349100319, skipping
No 990 in fili

No 990 in filing 201713189349206406, skipping
No 990 in filing 201842749349100109, skipping
No 990 in filing 201841999349100409, skipping
No 990 in filing 201820179349100402, skipping
Processed 2500 filings
No 990 in filing 201831279349200963, skipping
No 990 in filing 201843109349100034, skipping
No 990 in filing 201801319349101880, skipping
No 990 in filing 201733149349200003, skipping
No 990 in filing 201831039349100128, skipping
No 990 in filing 201821359349101692, skipping
Processed 2600 filings
No 990 in filing 201832189349100218, skipping
No 990 in filing 201821019349100717, skipping
No 990 in filing 201841909349100714, skipping
No 990 in filing 201811299349202046, skipping
No 990 in filing 201832199349200743, skipping
No 990 in filing 201811309349101756, skipping
No 990 in filing 201703189349103300, skipping
No 990 in filing 201843069349100434, skipping
No 990 in filing 201733199349207373, skipping
No 990 in filing 201841289349101654, skipping
Processed 2700 filings
No 990 in f

No 990 in filing 201831149349200333, skipping
No 990 in filing 201830549349200538, skipping
No 990 in filing 201842269349200334, skipping
No 990 in filing 201820459349200922, skipping
No 990 in filing 201802859349200420, skipping
No 990 in filing 201803099349200740, skipping
No 990 in filing 201802889349201385, skipping
No 990 in filing 201812179349200401, skipping
No 990 in filing 201810339349200731, skipping
No 990 in filing 201812549349200241, skipping
Processed 3400 filings
No 990 in filing 201840589349200009, skipping
No 990 in filing 201831949349200533, skipping
No 990 in filing 201713149349202311, skipping
No 990 in filing 201822489349200317, skipping
No 990 in filing 201831349349206118, skipping
No 990 in filing 201821359349202522, skipping
No 990 in filing 201832569349200738, skipping
No 990 in filing 201812829349200951, skipping
No 990 in filing 201821359349206322, skipping
No 990 in filing 201713339349200521, skipping
No 990 in filing 201812929349200616, skipping
No 990 in f

No 990 in filing 201801509349200020, skipping
No 990 in filing 201841449349200209, skipping
No 990 in filing 201820869349200112, skipping
No 990 in filing 201703269349200205, skipping
No 990 in filing 201822219349200952, skipping
No 990 in filing 201741609349200309, skipping
No 990 in filing 201801659349200545, skipping
No 990 in filing 201842269349201229, skipping
No 990 in filing 201821359349203387, skipping
No 990 in filing 201713149349202076, skipping
No 990 in filing 201811309349201141, skipping
No 990 in filing 201833009349200003, skipping
No 990 in filing 201822529349200352, skipping
No 990 in filing 201822559349200757, skipping
No 990 in filing 201733189349201313, skipping
No 990 in filing 201840309349200859, skipping
No 990 in filing 201822709349200047, skipping
No 990 in filing 201802219349100515, skipping
No 990 in filing 201842219349100614, skipping
No 990 in filing 201812489349100216, skipping
No 990 in filing 201820089349200762, skipping
No 990 in filing 20174319934920302

No 990 in filing 201832549349200048, skipping
No 990 in filing 201841349349101879, skipping
No 990 in filing 201733339349200013, skipping
No 990 in filing 201831319349200988, skipping
No 990 in filing 201840469349201214, skipping
No 990 in filing 201821359349204047, skipping
No 990 in filing 201821359349206142, skipping
No 990 in filing 201801919349200120, skipping
No 990 in filing 201831989349201063, skipping
No 990 in filing 201743179349201034, skipping
No 990 in filing 201811349349103801, skipping
No 990 in filing 201821359349207052, skipping
No 990 in filing 201723179349202442, skipping
No 990 in filing 201800229349200225, skipping
No 990 in filing 201803119349201765, skipping
No 990 in filing 201821349349204147, skipping
No 990 in filing 201733199349204478, skipping
No 990 in filing 201801349349203185, skipping
No 990 in filing 201743479349200029, skipping
No 990 in filing 201841349349202884, skipping
No 990 in filing 201801699349200745, skipping
No 990 in filing 20183135934920403

No 990 in filing 201743349349200504, skipping
No 990 in filing 201823129349200447, skipping
No 990 in filing 201821849349200737, skipping
No 990 in filing 201743139349202044, skipping
No 990 in filing 201831979349200538, skipping
No 990 in filing 201800799349200230, skipping
No 990 in filing 201713129349201961, skipping
No 990 in filing 201842839349200549, skipping
No 990 in filing 201832439349200128, skipping
No 990 in filing 201743469349200319, skipping
No 990 in filing 201823129349201437, skipping
No 990 in filing 201832259349200638, skipping
No 990 in filing 201822579349200957, skipping
No 990 in filing 201813129349202016, skipping
No 990 in filing 201842679349100354, skipping
No 990 in filing 201733149349201478, skipping
No 990 in filing 201813139349201801, skipping
No 990 in filing 201821909349200217, skipping
No 990 in filing 201743199349207029, skipping
No 990 in filing 201823039349201067, skipping
No 990 in filing 201841279349101389, skipping
No 990 in filing 20180143934920013

No 990 in filing 201802579349200805, skipping
No 990 in filing 201811729349200231, skipping
No 990 in filing 201703179349202145, skipping
No 990 in filing 201801309349200400, skipping
No 990 in filing 201801299349202090, skipping
No 990 in filing 201830239349200418, skipping
No 990 in filing 201800169349100105, skipping
No 990 in filing 201811109349200606, skipping
No 990 in filing 201842269349201069, skipping
No 990 in filing 201841099349200224, skipping
Processed 4400 filings
No 990 in filing 201820949349200727, skipping
No 990 in filing 201840459349200029, skipping
No 990 in filing 201820459349200117, skipping
No 990 in filing 201832769349200203, skipping
No 990 in filing 201830829349100613, skipping
No 990 in filing 201733249349200323, skipping
No 990 in filing 201822279349200797, skipping
No 990 in filing 201743049349201259, skipping
No 990 in filing 201800479349200415, skipping
No 990 in filing 201800729349200200, skipping
No 990 in filing 201812229349200111, skipping
No 990 in f

No 990 in filing 201810479349200521, skipping
No 990 in filing 201801169349200340, skipping
No 990 in filing 201703159349200620, skipping
No 990 in filing 201822559349200862, skipping
No 990 in filing 201703139349201980, skipping
No 990 in filing 201823199349208702, skipping
No 990 in filing 201832199349200448, skipping
No 990 in filing 201802199349200145, skipping
No 990 in filing 201811359349205291, skipping
No 990 in filing 201801369349200320, skipping
No 990 in filing 201803059349201130, skipping
No 990 in filing 201801709349200100, skipping
No 990 in filing 201822719349200312, skipping
No 990 in filing 201800109349200720, skipping
No 990 in filing 201841349349205294, skipping
No 990 in filing 201840159349200809, skipping
No 990 in filing 201830169349200238, skipping
No 990 in filing 201823029349201067, skipping
No 990 in filing 201801529349200215, skipping
No 990 in filing 201840689349200709, skipping
No 990 in filing 201821439349100612, skipping
No 990 in filing 20182157934920023

No 990 in filing 201733149349201303, skipping
No 990 in filing 201833129349201058, skipping
No 990 in filing 201830529349100308, skipping
No 990 in filing 201830749349200803, skipping
No 990 in filing 201800679349200235, skipping
No 990 in filing 201833199349201163, skipping
No 990 in filing 201842839349200314, skipping
No 990 in filing 201833199349103538, skipping
No 990 in filing 201801499349200125, skipping
No 990 in filing 201821209349100332, skipping
No 990 in filing 201840949349200619, skipping
No 990 in filing 201821349349202457, skipping
No 990 in filing 201843119349201284, skipping
No 990 in filing 201843119349201054, skipping
No 990 in filing 201840159349200509, skipping
No 990 in filing 201801309349201310, skipping
No 990 in filing 201800929349200630, skipping
No 990 in filing 201821369349200022, skipping
No 990 in filing 201723149349201062, skipping
No 990 in filing 201843069349201309, skipping
No 990 in filing 201801569349200115, skipping
No 990 in filing 20180151934920014

No 990 in filing 201821209349100017, skipping
No 990 in filing 201841359349206619, skipping
No 990 in filing 201713189349200131, skipping
No 990 in filing 201821919349200247, skipping
No 990 in filing 201703189349202635, skipping
No 990 in filing 201801349349205590, skipping
No 990 in filing 201801209349200020, skipping
No 990 in filing 201842499349200319, skipping
No 990 in filing 201840469349201259, skipping
No 990 in filing 201743199349204259, skipping
No 990 in filing 201802609349200500, skipping
No 990 in filing 201842849349200039, skipping
No 990 in filing 201743149349202079, skipping
No 990 in filing 201841179349200629, skipping
No 990 in filing 201810689349200026, skipping
No 990 in filing 201801349349205675, skipping
No 990 in filing 201823069349200322, skipping
No 990 in filing 201840649349200604, skipping
No 990 in filing 201743219349200139, skipping
No 990 in filing 201831299349202123, skipping
No 990 in filing 201713179349204076, skipping
No 990 in filing 20181312934920189

No 990 in filing 201831419349200038, skipping
No 990 in filing 201841389349200514, skipping
No 990 in filing 201722869349200877, skipping
No 990 in filing 201821009349200302, skipping
No 990 in filing 201713189349202526, skipping
No 990 in filing 201842749349200144, skipping
No 990 in filing 201821019349200757, skipping
No 990 in filing 201821359349205187, skipping
No 990 in filing 201822479349201007, skipping
No 990 in filing 201810129349200211, skipping
No 990 in filing 201843089349200204, skipping
No 990 in filing 201830539349200333, skipping
No 990 in filing 201841269349200254, skipping
No 990 in filing 201813129349201251, skipping
No 990 in filing 201831359349201253, skipping
No 990 in filing 201812399349200421, skipping
No 990 in filing 201801319349201440, skipping
No 990 in filing 201841379349200714, skipping
No 990 in filing 201733539349200313, skipping
No 990 in filing 201833129349201968, skipping
No 990 in filing 201811179349200781, skipping
No 990 in filing 20184134934920391

No 990 in filing 201703039349100525, skipping
No 990 in filing 201821799349200307, skipping
No 990 in filing 201743189349206164, skipping
No 990 in filing 201713149349202036, skipping
No 990 in filing 201821379349201007, skipping
No 990 in filing 201822069349200217, skipping
No 990 in filing 201711709349200566, skipping
No 990 in filing 201822819349200427, skipping
No 990 in filing 201801359349204005, skipping
No 990 in filing 201841349349202324, skipping
No 990 in filing 201810419349200406, skipping
No 990 in filing 201840419349200009, skipping
No 990 in filing 201841659349200134, skipping
No 990 in filing 201802339349200950, skipping
No 990 in filing 201821349349203222, skipping
No 990 in filing 201802579349200100, skipping
No 990 in filing 201703199349204765, skipping
No 990 in filing 201822339349200027, skipping
No 990 in filing 201713189349205591, skipping
No 990 in filing 201812609349200986, skipping
No 990 in filing 201810779349200106, skipping
No 990 in filing 20184077934920020

No 990 in filing 201811359349203481, skipping
No 990 in filing 201843069349100964, skipping
No 990 in filing 201743189349201389, skipping
No 990 in filing 201812909349200616, skipping
No 990 in filing 201811709349200521, skipping
No 990 in filing 201800589349200030, skipping
No 990 in filing 201820589349200437, skipping
No 990 in filing 201800319349100205, skipping
No 990 in filing 201802479349100115, skipping
No 990 in filing 201811389349200011, skipping
No 990 in filing 201832129349200238, skipping
No 990 in filing 201832269349200433, skipping
No 990 in filing 201802199349200525, skipping
No 990 in filing 201800759349200220, skipping
No 990 in filing 201800369349200735, skipping
No 990 in filing 201821459349200627, skipping
No 990 in filing 201801359349206230, skipping
No 990 in filing 201831359349104108, skipping
No 990 in filing 201820179349200312, skipping
No 990 in filing 201812759349200121, skipping
No 990 in filing 201800459349200990, skipping
No 990 in filing 20182130934920191

No 990 in filing 201741679349100204, skipping
No 990 in filing 201841349349103799, skipping
No 990 in filing 201833199349208558, skipping
No 990 in filing 201810829349200216, skipping
No 990 in filing 201831359349206963, skipping
No 990 in filing 201713129349200701, skipping
No 990 in filing 201802619349200025, skipping
No 990 in filing 201821079349101312, skipping
No 990 in filing 201812979349100711, skipping
No 990 in filing 201842299349200734, skipping
No 990 in filing 201831309349202248, skipping
No 990 in filing 201801319349202755, skipping
No 990 in filing 201801289349201815, skipping
No 990 in filing 201801239349200890, skipping
No 990 in filing 201811289349200506, skipping
No 990 in filing 201842829349100424, skipping
No 990 in filing 201833199349201738, skipping
No 990 in filing 201841649349100024, skipping
No 990 in filing 201811299349101686, skipping
No 990 in filing 201703179349200545, skipping
No 990 in filing 201832579349200713, skipping
No 990 in filing 20173345934920042

No 990 in filing 201832489349100213, skipping
No 990 in filing 201733189349101078, skipping
No 990 in filing 201713199349208611, skipping
No 990 in filing 201833199349207158, skipping
No 990 in filing 201743269349100504, skipping
No 990 in filing 201810789349200856, skipping
No 990 in filing 201801349349205520, skipping
No 990 in filing 201821659349200327, skipping
No 990 in filing 201832009349200768, skipping
No 990 in filing 201832399349200818, skipping
No 990 in filing 201820159349200802, skipping
No 990 in filing 201843099349100339, skipping
No 990 in filing 201840369349200764, skipping
No 990 in filing 201812279349100336, skipping
No 990 in filing 201703189349102545, skipping
No 990 in filing 201713079349201126, skipping
No 990 in filing 201823099349200407, skipping
No 990 in filing 201830449349200033, skipping
No 990 in filing 201713129349200526, skipping
No 990 in filing 201810409349100506, skipping
No 990 in filing 201800459349200970, skipping
No 990 in filing 20184121934920000

No 990 in filing 201733199349207658, skipping
No 990 in filing 201713189349202101, skipping
No 990 in filing 201733149349201643, skipping
No 990 in filing 201801349349201945, skipping
No 990 in filing 201723149349201347, skipping
No 990 in filing 201723179349201767, skipping
No 990 in filing 201723149349202207, skipping
No 990 in filing 201743199349206794, skipping
No 990 in filing 201723429349200612, skipping
No 990 in filing 201723219349200322, skipping
No 990 in filing 201703079349200200, skipping
No 990 in filing 201743199349208044, skipping
No 990 in filing 201732879349200223, skipping
No 990 in filing 201800399349200100, skipping
No 990 in filing 201820409349200427, skipping
No 990 in filing 201830439349200808, skipping
No 990 in filing 201743149349202119, skipping
No 990 in filing 201701709349200135, skipping
No 990 in filing 201743199349202844, skipping
No 990 in filing 201743199349208119, skipping
No 990 in filing 201820369349200542, skipping
No 990 in filing 20170303934920113

In [7]:
# Close the outfile so every row written through the DictWriter is flushed
# before the CSV is read back below.
outfile.close()

# Load the CSV that was just written into a DataFrame and preview the first rows.
# NOTE(review): despite the "sked_j" name, the rows written above come from
# Part VII (Frm990PrtVIISctnA), not Schedule J.
# NOTE(review): read_csv infers dtypes, so all-digit columns such as ein and
# object_id will presumably load as integers -- confirm that is intended.
sked_j_ore_efilers = pd.read_csv(outfilename)
sked_j_ore_efilers.head()

Unnamed: 0,period,ein,object_id,taxpayer_name,name,business_name1,business_name2,title,org_comp,related_comp,other_comp
0,201612,941105628,201703139349302875,KAISER FOUNDATION HOSPITALS,Ramon F Baez,,,Director,0,152591,0
1,201612,941105628,201703139349302875,KAISER FOUNDATION HOSPITALS,Regina M Benjamin MD MBA,,,Director,0,197178,0
2,201612,941105628,201703139349302875,KAISER FOUNDATION HOSPITALS,Jeffrey E Epstein,,,Director,0,206871,0
3,201612,941105628,201703139349302875,KAISER FOUNDATION HOSPITALS,Leslie S Heisz,,,Director,0,215118,0
4,201612,941105628,201703139349302875,KAISER FOUNDATION HOSPITALS,David Hoffmeister,,,Director,0,225809,0
