In [26]:
from stream_extract2 import StreamExtractor
import unicodecsv as csv

# Output stream definitions handed to StreamExtractor.
#
# Each top-level key is a stream name; entries in data_capture_dict select a
# stream through their 'stream_key'. For every stream:
#   'filename' is a prefix -- the recorded run further down shows the year and
#              ".csv" being appended (e.g. employees_detailed_tst2018.csv).
#   'headers'  lists the column names for that stream's file.
#
# NOTE(review): no capture entry maps to the 'year', 'form' or 'source'
# columns; presumably StreamExtractor fills those itself -- TODO confirm.
output_streams = {
    'employees': {
        'filename': 'employees_detailed_tst',  # -> employees_detailed_tstYYYY.csv
        'headers': ["object_id", "name", "business_name1", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "ein"]
    },
    'salaries': {
        'filename': 'filer_comp_tst',  # -> filer_comp_tstYYYY.csv
        'headers': ["year", "ein", "object_id", "form", "source", "compensation", "income", "revenue", "assets", "expenses"]
    },
    'states': {
        'filename': 'filer_states_tst',
        'headers': ["ein", "state"]
    },
    'exempt': {
        'filename': 'filer_exempt_tst',
        'headers': ["ein", "org527", 'org501c', 'org4947', 'org501c3']
    },
    'expenses': {
        'filename': 'filer_expenses_tst',
        # 'func_exp' is captured by the IRS990 'part_ix' entry of
        # data_capture_dict (from TtlFnctnlExpnss_TtlAmt), so it needs a column
        # here. It is appended last so existing column positions do not move.
        'headers': ["ein", "object_id", "prog_exp", "func_exp"]
    },
    'assets': {
        'filename': 'filer_assets_tst',
        'headers': ["ein", "object_id", "assets_pf"]
    }

}


In [27]:
# Capture configuration handed to StreamExtractor.
#
# Layout: form/schedule name -> 'parts' and/or 'groups' -> part or group name
# -> capture entries. Inside each part/group:
#   'stream_key'  names the output stream the row goes to; it must exist as a
#                 key in output_streams.
#   other keys    are source variable names, each mapped to a spec holding the
#                 output column ('header') and, optionally, a 'default' that
#                 is used when the value is missing.
#   'composite'   maps one output column to several source variables (used for
#                 "other compensation" on EZ and PF filers, which includes
#                 benefits and other allowances).
#
# Every header named here has to appear in the matching stream's 'headers'
# list in output_streams for the value to have a column to land in.


def _col(header):
    """Spec for a variable written to column `header`, with no default."""
    return {'header': header}


def _amount(header):
    """Spec for a variable written to column `header`, set to 0 if missing."""
    return {'header': header, 'default': 0}


def _components(*variables):
    """Composite members: each source variable gets its own {'default': 0}."""
    return {variable: {'default': 0} for variable in variables}


data_capture_dict = {
    'ReturnHeader990x': {
        'parts': {
            'returnheader990x_part_i': {
                'stream_key': 'states',
                'ein': _col('ein'),
                'USAddrss_SttAbbrvtnCd': _col('state'),
            },
        },
    },
    'IRS990': {
        'parts': {
            'part_0': {
                'stream_key': 'exempt',
                'ein': _col('ein'),
                'Orgnztn527Ind': _col('org527'),
                'Orgnztn501cInd': _col('org501c'),
                'Orgnztn49471NtPFInd': _col('org4947'),
                'Orgnztn501c3Ind': _col('org501c3'),
            },
            'part_i': {
                'stream_key': 'salaries',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'CYSlrsCmpEmpBnftPdAmt': _amount('compensation'),
                'CYRvnsLssExpnssAmt': _amount('income'),
                'CYTtlRvnAmt': _amount('revenue'),
                'TtlAsstsEOYAmt': _amount('assets'),
                'CYTtlExpnssAmt': _amount('expenses'),
            },
            'part_ix': {
                'stream_key': 'expenses',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'TtlFnctnlExpnss_TtlAmt': _amount('func_exp'),
                'TtlFnctnlExpnss_PrgrmSrvcsAmt': _amount('prog_exp'),
            },
        },
        # From here on, every 'groups' section (990, 990EZ and 990PF) captures
        # per-person compensation rows for the 'employees' stream.
        'groups': {
            'Frm990PrtVIISctnA': {
                'stream_key': 'employees',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'PrsnNm': _col('name'),
                'BsnssNmLn1Txt': _col('business_name1'),
                'TtlTxt': _col('title'),
                'RprtblCmpFrmOrgAmt': _amount('org_comp'),
                'RprtblCmpFrmRltdOrgAmt': _amount('related_comp'),
                'OthrCmpnstnAmt': _amount('other_cmp'),
            },
        },
    },
    'IRS990EZ': {
        'parts': {
            'ez_part_i': {
                'stream_key': 'salaries',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'SlrsOthrCmpEmplBnftAmt': _amount('compensation'),
                'ExcssOrDfctFrYrAmt': _amount('income'),
                'TtlRvnAmt': _amount('revenue'),
                'NtAsstsOrFndBlncsEOYAmt': _amount('assets'),
                'TtlExpnssAmt': _amount('expenses'),
            },
        },
        'groups': {
            'EZOffcrDrctrTrstEmpl': {
                'stream_key': 'employees',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'PrsnNm': _col('name'),
                # NOTE(review): the 990 group uses 'BsnssNmLn1Txt' for the same
                # column -- confirm this variable name is right for the EZ.
                'BsnssNmLn1': _col('business_name1'),
                'TtlTxt': _col('title'),
                'CmpnstnAmt': _amount('org_comp'),
                # Key equals the header; presumably a placeholder so the column
                # always receives the default 0 -- TODO confirm.
                'related_comp': _amount('related_comp'),
                'composite': {
                    'other_cmp': _components(
                        'EmplyBnftPrgrmAmt',
                        'ExpnsAccntOthrAllwncAmt',
                    ),
                },
            },
            'EZCmpnstnHghstPdEmpl': {
                'stream_key': 'employees',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'PrsnNm': _col('name'),
                'TtlTxt': _col('title'),
                'CmpnstnAmt': _amount('org_comp'),
                'related_comp': _amount('related_comp'),
                'composite': {
                    'other_cmp': _components(
                        'EmplyBnftsAmt',
                        'ExpnsAccntAmt',
                    ),
                },
            },
        },
    },
    'IRS990PF': {
        'parts': {
            'pf_part_i': {
                'stream_key': 'salaries',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'CmpOfcrDrTrstRvAndExpnssAmt': _amount('compensation'),
                # Key equals the header; presumably a placeholder so 'income'
                # always receives the default 0 for PF filers -- TODO confirm.
                'income': _amount('income'),
                'TtlRvAndExpnssAmt': _amount('revenue'),
                'TtlExpnssRvAndExpnssAmt': _amount('expenses'),
            },
            'pf_part_ii': {
                'stream_key': 'assets',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'TtlAsstsEOYFMVAmt': _amount('assets_pf'),
            },
        },
        'groups': {
            'PFOffcrDrTrstKyEmpl': {
                'stream_key': 'employees',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'OffcrDrTrstKyEmpl_PrsnNm': _col('name'),
                'OffcrDrTrstKyEmpl_BsnssNmLn1': _col('business_name1'),
                'OffcrDrTrstKyEmpl_TtlTxt': _col('title'),
                'OffcrDrTrstKyEmpl_CmpnstnAmt': _amount('org_comp'),
                'composite': {
                    'other_cmp': _components(
                        'OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt',
                        'OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt',
                    ),
                },
            },
            'PFCmpnstnHghstPdEmpl': {
                'stream_key': 'employees',
                'ein': _col('ein'),
                'object_id': _col('object_id'),
                'CmpnstnHghstPdEmpl_PrsnNm': _col('name'),
                'CmpnstnHghstPdEmpl_TtlTxt': _col('title'),
                'CmpnstnHghstPdEmpl_CmpnstnAmt': _amount('org_comp'),
                'composite': {
                    'other_cmp': _components(
                        'CmpnstnHghstPdEmpl_EmplyBnftsAmt',
                        'CmpnstnHghstPdEmpl_ExpnsAccntAmt',
                    ),
                },
            },
        },
    },
}



In [39]:
# Filing year for this run. The recorded output below shows it appended to
# each stream's filename prefix (e.g. filer_comp_tst2018.csv).
YEAR = 2018  # THIS MUST AGREE WITH OUR OTHER DATA
# Build the extractor from the stream definitions and capture config above.
# In the recorded run this prints one "Initializing output stream ..." line
# per entry in output_streams.
extractor = StreamExtractor(output_streams, data_capture_dict, YEAR)


Initializing output stream employees_detailed_tst2018.csv
Initializing output stream filer_comp_tst2018.csv
Initializing output stream filer_states_tst2018.csv
Initializing output stream filer_exempt_tst2018.csv
Initializing output stream filer_expenses_tst2018.csv
Initializing output stream filer_assets_tst2018.csv


In [38]:
# Run the extractor on one filing, selected by its object id.
this_object_id = 201941309349303759
extractor.run_filing(this_object_id)

In [24]:
# Completion marker: prints "done" when this cell is run.
print("done")

done
