In [2]:
from functools import reduce
from collections import defaultdict
import re
from pathlib import Path
from datetime import datetime

import pickle
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag as BSTagType
import matplotlib.pyplot as plt

In [None]:
# Slice end applied to the list of disclosure paths before parsing;
# None keeps every path, an integer keeps only that many (useful for quick runs).
TRUNCATE_TO = None
# Name of the column that holds each disclosure's file name (later used as the index).
DISCLOSURE_FILENAME_COL_NAME = 'disclosureId'
# NOTE(review): not referenced anywhere in the visible part of this notebook —
# presumably element names of interest in the disclosure XML; confirm before relying on it.
RELEVANT_FIELDS = [
    'organizationName',
    'zip',
    'registrantGeneralDescription',
    'clientName',
    'clientZip',
    'clientGeneralDescription',
    'specific_issues',
    'reportYear',
    'reportType',
    'effectiveDate'
]
# Used as a plain string prefix in f-strings, so the trailing slash is required.
DATA_DIR = '../../data/'

def find(condition, iterable):
    """
        Returns the first item of `iterable` for which `condition(item)` is truthy,
        or None when no item qualifies. Items after the first match are not examined.
    """
    matches = (candidate for candidate in iterable if condition(candidate))
    return next(matches, None)

def child_with_name(name, children):
    """
        Returns the first item in `children` whose `.name` attribute equals `name`,
        or None if there is no such item.
    """
    def has_requested_name(child):
        return child.name == name

    return find(has_requested_name, children)

def elements_among(iterable):
    """
        Returns, in their original order, the items of `iterable` whose type is exactly
        BeautifulSoup's Tag class; every other item is left out.
    """
    elements = []
    for node in iterable:
        if type(node) == BSTagType:
            elements.append(node)
    return elements

def second_level_nodes(path):
    """
        Given the path to a lobbyist disclosure XML file, returns, as BeautifulSoup
        elements, the first level of children within the disclosure node of the form,
        i.e. the first level of the main content of the form.
    """

    with open(path) as handle:
        document = BeautifulSoup(handle, 'lxml-xml')

    if not list(document.children):
        # Nothing was parsed. Per the original author's note this happens when the
        # first line is the XML document declaration, so re-parse from the second line.
        with open(path) as handle:
            next(handle)
            document = BeautifulSoup(handle, 'lxml-xml')

    top_level = list(document.children)

    # If the first top-level node does not expose `children`, it is (per the original
    # note) an XML-stylesheet declaration, and the disclosure node is the one after it.
    disclosure_index = 0 if 'children' in dir(top_level[0]) else 1

    return elements_among(top_level[disclosure_index].children)

def lobbyist_nodes_within(lobbyists_node):
    """
        Returns the child elements of `lobbyists_node` that carry a lobbyist name, i.e.
        whose lobbyistFirstName and lobbyistLastName texts both start with a
        non-whitespace character.

        `lobbyists_node` may be None (no lobbyists element in the form), in which case
        an empty list is returned.

        Raises AttributeError when a child element has no lobbyistFirstName or
        lobbyistLastName element to read `.text` from.
    """
    # Raw strings: '\S' inside a plain string literal is an invalid escape sequence,
    # which newer Python versions warn about (and will eventually reject).
    # NOTE(review): re.match anchors at the start of the text, so a name with leading
    # whitespace is treated as blank — confirm that this is intended.
    return [
        lobbyist_node
        for lobbyist_node
        in ((lobbyists_node and lobbyists_node.children) or [])
        if (
            type(lobbyist_node) == BSTagType
        ) and (
            re.match(r'\S', lobbyist_node.lobbyistFirstName.text) and
            re.match(r'\S', lobbyist_node.lobbyistLastName.text)
        )
    ]

def row_from_path(path):
    """
        Given the path to a lobbyist disclosure XML file, extracts and returns a dict
        describing the filing:

          organization_name, client_name, effective_date
              text of the organizationName / clientName / effectiveDate element,
              or None when that element is absent
          lobbyists
              number of entries selected by lobbyist_nodes_within
          alis
              comma-joined text of the non-blank child elements of the alis element
              ('' when there is none)
    """

    disclosure_children = second_level_nodes(path)

    def text_of(name):
        # Text of the named top-level element, or None when it does not exist.
        node = child_with_name(name, disclosure_children)
        return node and node.text

    lobbyists_node = child_with_name('lobbyists', disclosure_children)
    alis_node = child_with_name('alis', disclosure_children)

    # Raw string: '\s' inside a plain string literal is an invalid escape sequence.
    blank = re.compile(r'^\s*$')
    ali_texts = [
        ali_node.text
        for ali_node
        in elements_among((alis_node and alis_node.children) or [])
        if not blank.match(ali_node.text)
    ]

    return {
        'organization_name': text_of('organizationName'),
        'client_name': text_of('clientName'),
        'effective_date': text_of('effectiveDate'),
        'lobbyists': len(lobbyist_nodes_within(lobbyists_node)),
        'alis': ','.join(ali_texts)
    }

def collect_fields(paths):
    """
        Parses every file in `paths` and returns the set of all field names found in
        the resulting rows, plus the disclosure file name column.
    """
    fields = {DISCLOSURE_FILENAME_COL_NAME}
    for path in paths:
        fields.update(row_from_path(path).keys())
    return fields

def add_row(collection, fields, row, filename):
    """
        Appends one value per field to the column lists in `collection`:
        `filename` for the disclosure file name column, `row[field]` when the row has
        that field, and None otherwise.
    """
    for field in fields:
        if field == DISCLOSURE_FILENAME_COL_NAME:
            value = filename
        else:
            value = row.get(field)  # None when the row lacks this field
        collection[field].append(value)

def filename_from_path(path):
    """
        Given a path to an .xml file, returns the filename without the extension.

        Accepts a string or a Path, with or without a directory part.
        Raises ValueError when the path does not end in '<name>.xml'.
    """
    # Raw string: '\/' and '\.' in a plain literal are invalid escape sequences.
    # '(?:^|/)' also lets a bare 'name.xml' (no directory part) match.
    match = re.search(r'(?:^|/)([^/]+)\.xml$', str(path))
    if match is None:
        raise ValueError(f'not a path to an .xml file: {path}')
    return match[1]

In [None]:
# Collect every file that sits one level below the subdirectories of the disclosures
# folder. Both levels are sorted because glob order depends on the filesystem, which
# would otherwise make a TRUNCATE_TO sample differ from run to run.
disclosure_subdirs = sorted((Path(DATA_DIR) / 'lobbying_disclosures').glob('*'))
disclosure_paths = [
    path
    for subdir in disclosure_subdirs
    for path in sorted(subdir.glob('*'))
]

# A slice end of None keeps the full list.
trunc_paths = disclosure_paths[:TRUNCATE_TO]

# Column order of the table; apart from the file name column these are the keys
# of the dict returned by row_from_path.
fields = [
    DISCLOSURE_FILENAME_COL_NAME,
    'organization_name',
    'client_name',
    'effective_date',
    'lobbyists',
    'alis'
]

disclosures_dict = {field: [] for field in fields}
error_count = 0
for path in trunc_paths:
    try:
        row = row_from_path(path)
        filename = filename_from_path(path)
    except Exception as error:
        # Best effort: a file that cannot be parsed is reported and skipped.
        # `Exception` rather than a bare `except` so that interrupting the kernel
        # (KeyboardInterrupt) still stops the loop.
        error_count += 1
        print(f'error: {path} ({type(error).__name__}: {error})')
    else:
        add_row(disclosures_dict, fields, row, filename)

print(f'Errors: {error_count}')

In [None]:
# Build the table from the per-column lists, then index it by disclosure file name.
disclosures = pd.DataFrame(disclosures_dict)
disclosures = disclosures.set_index(DISCLOSURE_FILENAME_COL_NAME)

In [None]:
# fix typo: one malformed date string ('031' as the day) is corrected by hand
disclosures['effective_date'] = disclosures['effective_date'].replace(
    to_replace='03/031/2008',
    value='03/31/2008'
)

# cast the date strings to datetimes
disclosures['effective_date'] = pd.to_datetime(disclosures['effective_date'])

In [None]:
# trim surrounding whitespace from both name columns
disclosures = disclosures.assign(
    client_name=disclosures['client_name'].str.strip(),
    organization_name=disclosures['organization_name'].str.strip(),
)

In [None]:
# cull inadequate rows: keep a row only if it has a client name, a non-empty
# organization name, and at least one counted lobbyist
disclosures = disclosures[
    disclosures['client_name'].notnull()
    & disclosures['organization_name'].notnull()
    & (disclosures['organization_name'] != '')
    & (disclosures['lobbyists'] != 0)
]

In [None]:
# there are a bunch of fake disclosures left behind by QA
# NOTE(review): this is a case-sensitive substring match, so any organization name
# that merely contains the letters 'QA' is selected too — check the rows shown below.
qa_rows = disclosures.loc[
    lambda frame: frame['organization_name'].str.contains('QA')
]

# distribution of the flagged rows' effective dates between 2007 and 2018
plt.hist(
    qa_rows['effective_date'],
    range=[
        datetime(2007, 1, 1),
        datetime(2018, 1, 1)
    ],
    bins=40,
    color='#301830'
)
plt.title('Erroneous Disclosures Left By Quality Assurers')
plt.savefig('QA_leftovers')

In [None]:
# show the flagged rows so they can be checked by eye before being removed
qa_rows

In [9]:
# number of rows flagged as QA leftovers
len(qa_rows)

39

In [10]:
# cull them: keep only rows whose organization name does not contain 'QA'
disclosures = disclosures.loc[
    lambda frame: ~frame['organization_name'].str.contains('QA')
]

In [11]:
# junk entry (also QA?): drop the rows where client is 'aaaddd' AND organization is
# 'aaa', i.e. keep every row that differs from that pair in at least one column
disclosures = disclosures[
    (disclosures['client_name'] != 'aaaddd') |
    (disclosures['organization_name'] != 'aaa')
]

In [None]:
# persist the cleaned table so later work can start from it without re-parsing the XML
with open(f'{DATA_DIR}disclosures.pkl', 'wb') as file:
    pickle.dump(disclosures, file)

In [None]:
# reload the cleaned table from disk
# NOTE: unpickling can execute arbitrary code — only load a pickle file you created
# yourself or otherwise trust.
with open(f'{DATA_DIR}disclosures.pkl', 'rb') as file:
    disclosures = pickle.load(file)

In [None]:
# number of disclosures in the table loaded from the pickle
len(disclosures)