In [1]:
from functools import reduce
from collections import defaultdict
import re
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag as BSTagType
import pickle

In [2]:
TRUNCATE_TO = 99
DISCLOSURE_FILENAME_COL_NAME = 'disclosureId'
RELEVANT_FIELDS = [
    'organizationName',
    'zip',
    'registrantGeneralDescription',
    'clientName',
    'clientZip',
    'clientGeneralDescription',
    'specific_issues',
    'reportYear',
    'reportType',
    'effectiveDate'
]

def row_from_path(path):
    with open(path) as f:
        disclosure = BeautifulSoup(f, 'lxml-xml')

    if len(list(disclosure.children)) == 0: # has XML declaration element
        with open(path) as f:
            next(f) # skip it
            disclosure = BeautifulSoup(f, 'lxml-xml')

    if 'children' in dir(list(disclosure.children)[0]):
        children = list(disclosure.children)[0].children
    else: # has XML-stylesheet declaration element
        children = list(disclosure.children)[1].children # skip it

    return {
        child.name: child.text # TODO: handle fields with non-text values (esp. children)
        for child
        in children
        if type(child) == BSTagType
    }

def collect_fields(paths):
    fields = set([DISCLOSURE_FILENAME_COL_NAME])
    for path in paths:
        for field in row_from_path(path).keys():
            fields.add(field)
    return fields

def add_row(collection, fields, row, filename):
    if not row:
        print('NO DATA ROW')
        return
    for field in fields:
        if field == DISCLOSURE_FILENAME_COL_NAME:
            collection[field].append(filename)
        elif field in row.keys():
            collection[field].append(row[field].replace('\n', ''))
        else:
            collection[field].append(None)

def filename_from_path(path):
    return re.search('\/([^\/]+)\.xml$', str(path))[1] # filename without extension

In [3]:
disclosure_subdirs = Path('../../data/lobbying_disclosures/').glob('*')
disclosure_paths = reduce(
    lambda collection, subdir : collection + list(Path(subdir).glob('*')),
    disclosure_subdirs,
    []
)

trunc_paths = disclosure_paths[:TRUNCATE_TO]
fields = collect_fields(trunc_paths)
disclosures_dict = {field: [] for field in fields}
for path in trunc_paths:
    add_row(
        disclosures_dict,
        fields,
        row_from_path(path),
        filename_from_path(path)
    )

In [4]:
disclosures = pd.DataFrame(disclosures_dict)

In [5]:
with open('disclosures.pkl', 'wb') as file:
    pickle.dump(disclosures, file)


In [6]:
disclosures[:4]

Unnamed: 0,disclosureId,city,clientState,firstName,clientGeneralDescription,affiliatedUrl,signedDate,clientAddress,address2,clientCity,...,reportYear,affiliatedOrgs,registrantGeneralDescription,principal_city,pages,foreignEntities,country,selfSelect,principal_zip,lobbyists
0,300027867,Greensburg,NC,,Motorsport racing,,02/14/2008,"148 Cedar Point Drive, Suite 103",,Mooresville,...,2007,,Government relations and lobbying,,2,,USA,N,,ScottHarshmanYYYYY
1,200041150,Washington,UT,,Media,,07/25/2007,4185 Harrison Blvd.,,Ogden,...,2007,,Lobbyist,,3,,USA,N,,AmyMehlmanPresidentYYYYYYYYYYY
2,200048174,Washington,VA,,science supplier focused on analytical instrum...,,08/08/2007,P.O. Box 117,Suite 1100,Midlothian,...,2007,,Lobbying firm,,3,,USA,N,,"JohnGreenYJulieDammannChief of Staff, Senator ..."
3,200028858,Washington,MA,,Clinical and research medical facility on diab...,,02/13/2007,One Joslin Place,,Boston,...,2007,,Public Affairs Consulting,,2,,USA,N,,Terry R.PeelYCraig A.MeyersYYYY
