In [None]:
import numpy as np
import pandas as pd
import os.path

root_path = os.path.dirname(os.getcwd())

# Load business license data
licenses = pd.read_csv(os.path.join(root_path, "DATA/business_licenses.csv"))

# Load food inspection data
inspections = pd.read_csv(os.path.join(root_path, "DATA/food_inspections.csv"))

# Load violation data
values = pd.read_csv(os.path.join(root_path, "DATA/violation_values.csv"))
counts = pd.read_csv(os.path.join(root_path, "DATA/violation_counts.csv"))

### CALCULATE FEATURES BASED ON FOOD INSPECTION DATA

In [19]:
# Create basis for model_data (risk not included in Chicago repository)
data = inspections[["inspection_date", "license", "inspection_id", "facility_type", "results", "risk"]]

In [20]:
# Merge with violation data
data = pd.merge(data, values, on="inspection_id")
data = pd.merge(data, counts, on="inspection_id")

In [31]:
# Create pass / fail flags
data["pass_flag"] = data.results.map(lambda x: 1 if x == "Pass" else 0)
data["fail_flag"] = data.results.map(lambda x: 1 if x == "Fail" else 0)

In [22]:
# Sort inspections by date
grouped = data.sort_values(by="inspection_date", inplace=True)

# Find previous inspections by shifting each license group
past_data = data.groupby("license").shift(1)

In [23]:
# Add past fails
data["past_fail"] = past_data.fail_flag

# Add past violation counts
data["past_critical"] = past_data.critical_count
data["past_serious"] = past_data.serious_count
data["past_minor"] = past_data.minor_count

In [24]:
# Select past values, rename
past_values = past_data[values.columns].drop("inspection_id", axis=1).add_prefix("p")

# Set default to 0
past_values.fillna(0, inplace=True)

# Add past values
data = data.join(past_values)

In [25]:
# Calculate time since previous inspection
deltas = pd.to_datetime(data.inspection_date) - pd.to_datetime(past_data.inspection_date)

# Add years since previous inspection
data["time_since_last"] = deltas.apply(lambda x: x.days / 365.25)

In [26]:
# Check if first record
data["first_record"] = data.time_since_last.map(lambda x: 1 if pd.isnull(x) else 0)

# Set time since last for first inspections to 2
data.time_since_last.fillna(2, inplace=True)

#
data["pass_flag"], data["fail_flag"] = pass_flags, fail_flags

In [30]:
data

Unnamed: 0,inspection_date,license,inspection_id,facility_type,results,risk,v_1,v_2,v_3,v_4,...,pv_38,pv_39,pv_40,pv_41,pv_42,pv_43,pv_44,pv_70,time_since_last,first_record
56202,2010-01-04T00:00:00,80916,104236,Restaurant,Fail,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56188,2010-01-05T00:00:00,30486,154219,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56189,2010-01-05T00:00:00,1904141,60244,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56190,2010-01-05T00:00:00,68917,114257,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56191,2010-01-05T00:00:00,1801271,67759,Restaurant,Pass w/ Conditions,Risk 1 (High),0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56192,2010-01-05T00:00:00,13541,54213,Grocery Store,Pass,Risk 3 (Low),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56193,2010-01-05T00:00:00,17145,124257,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56201,2010-01-05T00:00:00,43519,67743,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56195,2010-01-05T00:00:00,1879164,70273,Restaurant,Fail,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
56196,2010-01-05T00:00:00,1045634,60243,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1
