In [62]:
import numpy as np
import pandas as pd
import os.path

# All inputs are read relative to the parent of the current working directory
root_path = os.path.dirname(os.getcwd())

# Read, in order: business licenses, food inspections, violation values,
# violation counts. Each CSV is parsed with pandas' default options.
licenses, inspections, values, counts = (
    pd.read_csv(os.path.join(root_path, relative_path))
    for relative_path in (
        "DATA/business_licenses.csv",
        "DATA/food_inspections.csv",
        "DATA/violation_values.csv",
        "DATA/violation_counts.csv",
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


### CALCULATE FEATURES BASED ON FOOD INSPECTION DATA

In [63]:
# Seed the model data with a fixed subset of the inspection columns
# (risk is not included in the Chicago repository)
data = inspections.loc[
    :,
    ["inspection_date", "license", "inspection_id", "facility_type", "results", "risk"],
]

In [65]:
# Attach the violation values, then the violation counts, matching rows on
# inspection_id (default inner join for both merges)
data = data.merge(values, on="inspection_id").merge(counts, on="inspection_id")

In [66]:
# Binary indicators: 1 when the result string is exactly "Pass" / "Fail", else 0.
# Any other outcome (e.g. "Pass w/ Conditions") is 0 in both columns.
pass_flags = data.results.map(lambda outcome: int(outcome == "Pass"))
fail_flags = data.results.map(lambda outcome: int(outcome == "Fail"))
data["pass_flags"] = pass_flags
data["fail_flags"] = fail_flags

In [67]:
# Sort inspections by date.
# A stable sort keeps inspections that share a date in their existing row
# order, so the "previous inspection" picked below does not depend on how an
# unstable sort happens to break ties (one license can have several
# inspections on the same day). Rebinding instead of inplace=True keeps the
# cell free of in-place mutation.
data = data.sort_values(by="inspection_date", kind="mergesort")

# Find previous inspection by shifting inspection_id down one row within each
# license group; the first inspection of a license gets NaN.
data["past_inspection_id"] = data.groupby("license")["inspection_id"].shift(1)

In [70]:
# Add previous violation values.
# Prefixing every column (the key included) renames inspection_id to
# past_inspection_id, which matches the key column built by the groupby/shift.
past_values = values.add_prefix("past_")

# Keep the merged frame: without the assignment the past_* columns are only
# displayed and never become part of `data`. The default inner join is
# retained, so rows with no matching previous inspection are dropped.
data = pd.merge(data, past_values, on="past_inspection_id")

# Preview a few rows instead of rendering the whole frame
data.head(10)

Unnamed: 0,inspection_date,license,inspection_id,facility_type,results,risk,v_1,v_2,v_3,v_4,...,past_v_36,past_v_37,past_v_38,past_v_39,past_v_40,past_v_41,past_v_42,past_v_43,past_v_44,past_v_70
0,2010-01-07T00:00:00,1979756,58235,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2010-01-11T00:00:00,1525,67820,Restaurant,Fail,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-02-01T00:00:00,1846828,68108,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,2010-02-03T00:00:00,1976154,134284,Restaurant,Fail,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010-02-17T00:00:00,1947441,134302,Restaurant,Pass w/ Conditions,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2010-03-09T00:00:00,1998519,78337,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2010-03-16T00:00:00,45044,58378,Restaurant,Pass,Risk 1 (High),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2010-03-25T00:00:00,14616,231280,Restaurant,Pass w/ Conditions,Risk 2 (Medium),0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2010-03-25T00:00:00,14616,78384,Restaurant,Pass,Risk 2 (Medium),0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,2010-03-25T00:00:00,14616,74360,Restaurant,Pass,Risk 3 (Low),0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Add previous violation counts.
# The prefix must be "past_" (not "p") so that inspection_id becomes
# past_inspection_id and can be matched against the previous-inspection key.
past_counts = counts.add_prefix("past_")

# Merge the prefixed counts themselves. The earlier version merged past_values
# with right_on="inspection_id", a column that no longer exists after
# prefixing, and never used past_counts at all. The default inner join is
# kept, so rows with no matching previous inspection are dropped.
data = pd.merge(data, past_counts, on="past_inspection_id")

# Preview a few rows instead of rendering the whole frame
data.head(10)

In [52]:
# Column labels of violation_matrix with "past_" prepended.
# NOTE(review): violation_matrix is not defined in the visible cells - confirm
# where it comes from before relying on a fresh-kernel run.
past_columns = [
    "past_" + column_name
    for column_name in violation_matrix.columns
]

In [54]:
# Display a copy of violation_matrix whose column labels carry a "past_" prefix
# (the original frame is left untouched)
violation_matrix.rename(columns=lambda column_name: f"past_{column_name}")

Unnamed: 0,past_v_1,past_v_2,past_v_3,past_v_4,past_v_5,past_v_6,past_v_7,past_v_8,past_v_9,past_v_10,...,past_v_41,past_v_42,past_v_43,past_v_44,past_v_45,past_v_70,past_critical_count,past_serious_count,past_minor_count,past_inspection_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2176589
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,2176568
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2176564
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2176561
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2176518
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2176507
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,2176510
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2176498
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,2176491
9,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,5.0,2176484
