In [1]:
from collections import Counter
import re
import datetime
import os

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# destinations for the pickled train / test splits written at the end
TRAIN_OUTPUT_FILENAME, TEST_OUTPUT_FILENAME = (
    'misconduct_train.pkl',
    'misconduct_test.pkl',
)

In [3]:
def normalize_column_name(string):
    """Return `string` lowercased, with each whitespace character replaced by '_'.

    Every whitespace character is replaced individually, so a run of
    spaces yields a run of underscores. No stripping is performed.
    """
    # raw string: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and SyntaxWarning on newer Python versions)
    return re.sub(r'\s', '_', string).lower()

In [4]:
# load the contractor misconduct records from the CSV export
misconduct = pd.read_csv(
    'data/contractor_misconduct.csv'
)

In [5]:
# standardize column names
# 'Contractor(s)' is re-added under a parenthesis-free name (it ends up as
# the last column) before every column name is normalized.
misconduct = (
    misconduct
    .assign(Contractor_s=misconduct['Contractor(s)'])
    .drop(columns='Contractor(s)')
)

misconduct.columns = list(map(normalize_column_name, misconduct.columns))

In [6]:
# court_type: missing values are put back as the literal 'N/A' (restoring
# improperly parsed values), then the 'Undisclosed/unknown' placeholder is
# cleared to NaN
misconduct['court_type'] = (
    misconduct['court_type']
    .replace(np.nan, 'N/A')
    .replace('Undisclosed/unknown', np.nan)
)
# total_penalties: clear the placeholder as well
# (note the capitalization discrepancy: 'Unknown' here, 'unknown' above)
misconduct['total_penalties'] = misconduct['total_penalties'].replace('Undisclosed/Unknown', np.nan)

In [7]:
# casting
# Assign whole columns with `df[col] = ...` rather than `df.loc[:, col] = ...`:
# since pandas 2.0 the `.loc[:, col]` form first tries to write the new values
# into the existing column in place, so an object-dtype column stays object
# and the conversion is silently lost.
misconduct['total_penalties'] = misconduct['total_penalties'].astype(float)
misconduct['date'] = pd.to_datetime(
    misconduct['date'],
    format='%m/%d/%Y'
)

In [8]:
# add indicator variables
cols_to_dummyize = [
    'misconduct_type', 'court_type', 'enforcement_agency', 'contracting_party'
]

# with `columns=`, get_dummies drops each listed column and appends its
# indicator columns (named '<column>_<value>') after the untouched columns,
# in the order the columns are listed
misconduct = pd.get_dummies(misconduct, columns=cols_to_dummyize)

# the new indicator names embed category values, so normalize names again
misconduct = misconduct.rename(columns=normalize_column_name)

In [9]:
# hold out 20% of the rows as the test set; the fixed random_state keeps
# the split the same on every run
misconduct_train, misconduct_test = train_test_split(misconduct, test_size=0.2, random_state=83)

In [10]:
# Persist both splits as pickles.
# open(..., 'wb') creates the file when it is missing (and truncates it
# otherwise), so no shell `touch` call is needed beforehand; dropping it also
# avoids spawning a shell and keeps the cell portable to non-POSIX systems.
with open(TRAIN_OUTPUT_FILENAME, 'wb') as train_file:
    pickle.dump(misconduct_train, train_file)

with open(TEST_OUTPUT_FILENAME, 'wb') as test_file:
    pickle.dump(misconduct_test, test_file)