In [1]:
from functools import reduce
from collections import defaultdict
import re
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag as BSTagType
import pickle

In [2]:
TRUNCATE_TO = 999
DISCLOSURE_FILENAME_COL_NAME = 'disclosureId'
RELEVANT_FIELDS = [
    'organizationName',
    'zip',
    'registrantGeneralDescription',
    'clientName',
    'clientZip',
    'clientGeneralDescription',
    'specific_issues',
    'reportYear',
    'reportType',
    'effectiveDate'
]

def row_from_path(path):
    with open(path) as f:
        disclosure = BeautifulSoup(f, 'lxml-xml')

    if len(list(disclosure.children)) == 0: # has XML declaration element
        with open(path) as f:
            next(f) # skip it
            disclosure = BeautifulSoup(f, 'lxml-xml')

    if 'children' in dir(list(disclosure.children)[0]):
        children = list(disclosure.children)[0].children
    else: # has XML-stylesheet declaration element
        children = list(disclosure.children)[1].children # skip it

    return {
        child.name: child.text # TODO: handle fields with non-text values (esp. children)
        for child
        in children
        if type(child) == BSTagType
    }

def collect_fields(paths):
    fields = set([DISCLOSURE_FILENAME_COL_NAME])
    for path in paths:
        for field in row_from_path(path).keys():
            fields.add(field)
    return fields

def add_row(collection, fields, row, filename):
    if not row:
        print('NO DATA ROW')
        return
    for field in fields:
        if field == DISCLOSURE_FILENAME_COL_NAME:
            collection[field].append(filename)
        elif field in row.keys():
            collection[field].append(row[field].replace('\n', ''))
        else:
            collection[field].append(None)

def filename_from_path(path):
    return re.search('\/([^\/]+)\.xml$', str(path))[1] # filename without extension

In [3]:
disclosure_subdirs = Path('data/lobbying_disclosures').glob('*')
disclosure_paths = reduce(
    lambda collection, subdir : collection + list(Path(subdir).glob('*')),
    disclosure_subdirs,
    []
)

trunc_paths = disclosure_paths[:TRUNCATE_TO]
fields = collect_fields(trunc_paths)
disclosures_dict = {field: [] for field in fields}
for path in trunc_paths:
    add_row(
        disclosures_dict,
        fields,
        row_from_path(path),
        filename_from_path(path)
    )

NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW
NO DATA ROW


In [7]:
disclosures = pd.DataFrame(disclosures_dict)

In [5]:
with open('disclosures.pkl', 'wb') as file:
    pickle.dump(disclosures, file)


In [8]:
disclosures[:4]

Unnamed: 0,signerEmail,clientZipExt,affiliatedOrgs,foreignEntities,printedName,clientState,registrantGeneralDescription,contactPhone,address1,organizationName,...,senateID,principal_zipext,zip,imported,prefix,prinClientZip,alis,principal_zip,clientAddress,prinClientZipExt
0,,,,,"Scott Harshman, President",NC,Government relations and lobbying,,217 Murdock Way,"Harshman Consulting, LLC",...,319494-0,,15601,N,,,BUDDEF,,"148 Cedar Point Drive, Suite 103",
1,,,,,"Amy R. Mehlman, President",UT,Lobbyist,,"1750 K St., NW Suite 350",Mehlman Capitol Strategies Inc.,...,,,20006,,,,TEC,,4185 Harrison Blvd.,
2,,,,,,VA,Lobbying firm,,"1111 19th Street, NW",Ogilvy Government Relations,...,48827,,20036,,,,HOMBUD,,P.O. Box 117,
3,,,,,,MA,Public Affairs Consulting,,"1317 F Street, NW, Suite 200","Edington, Peel & Associates, Inc.",...,,,20004,,,,BUDINDMED,,One Joslin Place,
