### CALCULATE FEATURES BASED ON FOOD INSPECTION DATA

In [1]:
import numpy as np
import pandas as pd
import os.path

root_path = os.path.dirname(os.getcwd())

# Load food inspection data
inspections = pd.read_csv(os.path.join(root_path, "DATA/food_inspections.csv"))

# Create basis for model_data (risk not included in Chicago repository)
data = inspections[["inspection_date", "license", "inspection_id", "facility_type", "results", "risk"]]

In [2]:
# Load violation data
values = pd.read_csv(os.path.join(root_path, "DATA/violation_values.csv"))
counts = pd.read_csv(os.path.join(root_path, "DATA/violation_counts.csv"))

# Merge with violation data
data = pd.merge(data, values, on="inspection_id")
data = pd.merge(data, counts, on="inspection_id")

In [3]:
# Create pass / fail flags
data["pass_flag"] = data.results.map(lambda x: 1 if x == "Pass" else 0)
data["fail_flag"] = data.results.map(lambda x: 1 if x == "Fail" else 0)

In [4]:
# Sort inspections by date
grouped = data.sort_values(by="inspection_date", inplace=True)

# Find previous inspections by shifting each license group
past_data = data.groupby("license").shift(1)

In [5]:
# Add past fails
data["past_fail"] = past_data.fail_flag

# Add past violation counts
data["past_critical"] = past_data.critical_count
data["past_serious"] = past_data.serious_count
data["past_minor"] = past_data.minor_count

In [6]:
# Select past violation values, remove inspection_id
past_values = past_data[values.columns].drop("inspection_id", axis=1).add_prefix("p")

# Set violation values to 0 for first inspections
past_values.fillna(0, inplace=True)

# Add past values to model data
data = data.join(past_values)

In [7]:
# Calculate time since previous inspection
deltas = pd.to_datetime(data.inspection_date) - pd.to_datetime(past_data.inspection_date)

# Add years since previous inspection
data["time_since_last"] = deltas.apply(lambda x: x.days / 365.25)

In [8]:
# Check if first record
data["first_record"] = data.time_since_last.map(lambda x: 1 if pd.isnull(x) else 0)

# Set time since last for first inspections to 2
data.time_since_last.fillna(2, inplace=True)

In [9]:
# REMEMBER TO INCLUDE PRIORITY FEATURE

### CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA

In [10]:
# Load business license data
licenses = pd.read_csv(os.path.join(root_path, "DATA/business_licenses.csv"))

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Match by license number and when inspection_date
# falls wrt business license renewal cycle
def get_business_id(inspection):
    matches = licenses.loc[licenses.license_number == inspection.license]
    matches = matches.loc[
        (matches.license_start_date <= inspection.inspection_date) &
        (matches.expiration_date > inspection.inspection_date)
    ]
    if len(matches) > 1:
        return matches.iloc[0].id
    return np.nan

# Sort licenses to allow selection of first match
licenses.sort_values("license_start_date", inplace=True)

# 5 mins
data["business_id"] = data.apply(get_business_id, axis=1)

In [71]:
licenses.iloc[56390].expiration_date

'2013-08-15T00:00:00'

In [74]:
licenses.license_id

0           76522
1           84172
2          119268
3          168920
4          205258
5          205263
6          264943
7          275807
8          286129
9          310105
10        1104543
11        1112535
12        1115264
13        1115778
14        1121240
15        1140501
16        1140502
17        1172795
18        1201283
19        1202098
20        1205853
21        1215056
22        1307168
23        1307340
24        1309805
25        1309915
26        1313198
27        1324330
28        1334887
29        1452052
           ...   
939911    2173086
939912    2599046
939913    2596660
939914    2591409
939915    2597937
939916    2608754
939917    2595264
939918    2591931
939919    2578507
939920    2593105
939921    2596678
939922    2591925
939923    2603124
939924    2608726
939925    2593706
939926    2608668
939927    2608729
939928    2596499
939929    2597755
939930    1851303
939931    2495344
939932    2607656
939933    2603731
939934    2591471
939935    

### ATTACH KDE DATA

### ATTACH WEATHER DATA

In [11]:
# Load weather data
weather = pd.read_csv(os.path.join(root_path, "DATA/weather.csv"))

# Merge weather data with model data
data = pd.merge(data, weather, on="inspection_id")