In [25]:
import csv
import json
import time
import datetime
import pprint
import pyperclip
import math

def robust_eq(x, y):
    """Return True when x and y are equal, counting two float NaNs as equal.

    Ordinary ``==`` is tried first.  If that fails, the pair is still
    accepted when both values are floats and both are NaN (which ``==``
    always reports as unequal).
    """
    if x == y:
        return True
    both_floats = isinstance(x, float) and isinstance(y, float)
    return both_floats and math.isnan(x) and math.isnan(y)

def convert_time(s):
    """Clean a raw date string taken from the CSV.

    Replaces the substring '0217' with '2017' and removes any 'Director'
    text.  Returns the cleaned string, or a float NaN when nothing is left
    after cleaning.
    """
    cleaned = s.replace('0217', '2017').replace('Director', '')
    return cleaned if cleaned else float('nan')
    
def convert_dollars(s):
    """Parse a salary, or a '-'-separated salary range, into ``[low, high]``.

    Each piece has '$', ',' and '*' removed and is then converted with
    ``int``.  A piece that is not an integer becomes NaN; if it is also
    non-empty it is printed so unexpected values can be spotted.

    Returns a two-element list in every case:
      * one piece   -> the same value twice
      * two pieces  -> ``[first, second]``
      * three or more pieces -> prints 'help' and returns ``[nan, nan]``
        so callers never receive ``None``.
    """
    nan = float('nan')
    bounds = []
    # str.split always yields at least one piece, so bounds is never empty.
    for salary in s.split('-'):
        cleaned = salary.replace('$', '').replace(',', '').replace('*', '')
        try:
            bounds.append(int(cleaned))
        except ValueError:
            # Only int() parse failures are expected here; anything else
            # (e.g. KeyboardInterrupt) should propagate.
            if cleaned:
                print(cleaned)
            bounds.append(nan)

    if len(bounds) == 1:
        return [bounds[0], bounds[0]]
    if len(bounds) == 2:
        return bounds

    # More than one '-' in the input: flag it and keep the return shape stable.
    print('help')
    return [nan, nan]

def staffer_exists(row, output, agency_id, position_title):
    """Report whether `row` is already recorded under the given position.

    An existing entry matches when its start and end dates equal the row's
    cleaned dates (compared with robust_eq, so NaN matches NaN) and its
    staffer_id equals the row's staffer_id converted to int.
    """
    recorded = output[agency_id]['positions'][position_title]
    for entry in recorded:
        dates_match = all([
            robust_eq(convert_time(row[field]), entry[field])
            for field in ('start_date', 'end_date')
        ])
        if dates_match and entry['staffer_id'] == int(row['staffer_id']):
            return True
    return False
    

# Load every CSV row up front.  newline='' lets the csv module handle line
# endings itself, as its documentation recommends.
with open('outfile1.csv', 'r', newline='') as f:
    reader = list(csv.DictReader(f))

# Build, in a single pass over the rows:
#   {agency_id: {'agency_name': ..., 'agency_slug': ...,
#                'positions': {position_title: [staffer_entry, ...]}}}
output = {}
for row in reader:
    agency_id = row['agency_id']
    agency = output.setdefault(agency_id, {'positions': {}})

    # Every row of one agency must agree on the agency's name and slug.
    for field in ('agency_name', 'agency_slug'):
        if field in agency:
            assert row[field] == agency[field]
        agency[field] = row[field]

    # A row can list up to three position titles; empty ones are skipped.
    positions = agency['positions']
    position_titles = [row['position_title_1'], row['position_title_2'], row['position_title_3']]
    for position_title in position_titles:
        if not position_title:
            continue
        positions.setdefault(position_title, [])
        # Skip a staffer already recorded under this title with the same dates.
        if not staffer_exists(row, output, agency_id, position_title):
            positions[position_title].append({
                'staffer_id': int(row['staffer_id']),
                'name': row['name'],
                'start_date': convert_time(row['start_date']),
                'end_date': convert_time(row['end_date']),
                'linkedin_url': row['linkedin_url'],
                'grade_level': convert_dollars(row['grade_level']),
            })

# Emit the result as a JavaScript file that declares a `data` constant.
with open('data2.js', 'w') as f:
    f.write('const data = ' + json.dumps(output, indent=4, sort_keys=True))

print('done!')

None
EX
AD
AD
AD
EF
EF
done!


In [22]:
# NaN never compares equal to itself with ==; robust_eq special-cases NaN for this reason.
float('nan') == float('nan')

False