### CALCULATE FEATURES BASED ON FOOD INSPECTION DATA

In [3]:
import numpy as np
import pandas as pd
import os.path

# Data files are resolved relative to the parent of the current working directory
root_path = os.path.dirname(os.getcwd())

# Read the raw food inspection records
inspections = pd.read_csv(os.path.join(root_path, "DATA/food_inspections.csv"))

# Start the model table from an independent copy of the identifying columns
data = inspections[["inspection_id", "license", "inspection_date", "facility_type"]].copy()

In [4]:
# Create pass / fail flags: 1 when the inspection result equals the label, else 0
# (missing results compare unequal and therefore yield 0)
data["pass_flag"] = inspections["results"].eq("Pass").astype(int)
data["fail_flag"] = inspections["results"].eq("Fail").astype(int)

# Create risk flags from the risk category column.
# The labels below are risk levels, not inspection outcomes, so they must be
# compared against `risk`; comparing them against `results` can only match if
# an outcome happened to carry one of these labels.
# NOTE(review): assumes the inspections table exposes the category as `risk` - confirm.
data["risk_1"] = inspections["risk"].eq("Risk 1 (High)").astype(int)
data["risk_2"] = inspections["risk"].eq("Risk 2 (Medium)").astype(int)
data["risk_3"] = inspections["risk"].eq("Risk 3 (Low)").astype(int)

In [5]:
# Read the per-inspection violation values and violation counts
values = pd.read_csv(os.path.join(root_path, "DATA/violation_values.csv"))
counts = pd.read_csv(os.path.join(root_path, "DATA/violation_counts.csv"))

# Left-join both violation tables onto the model data by inspection id,
# then replace every missing entry in the combined table with 0
data = (
    data
    .merge(values, on="inspection_id", how="left")
    .merge(counts, on="inspection_id", how="left")
    .fillna(0)
)

In [6]:
# Sort inspections by date so that, within each license, rows run in date order.
# sort_values returns the sorted frame, which is bound back to `data`
# (the in-place form returns None and leaves nothing useful to capture).
# NOTE(review): inspection_date has not been parsed with to_datetime at this
# point, so the order is chronological only if its stored form sorts
# chronologically (e.g. ISO 8601 strings) - confirm.
data = data.sort_values(by="inspection_date")

# Find previous inspections by shifting each license group down one row;
# the first row of every license gets missing values
past_data = data.groupby("license").shift(1)

In [7]:
# Attach the previous inspection's fail flag and violation counts to each row
# (values align on the shared row index)
data = data.assign(
    past_fail=past_data["fail_flag"],
    past_critical=past_data["critical_count"],
    past_serious=past_data["serious_count"],
    past_minor=past_data["minor_count"],
)

In [8]:
# Previous-inspection violation values: take the violation value columns,
# discard the previous inspection's id, prefix every column name with "p",
# and use 0 wherever there is no previous value (first inspections)
past_values = (
    past_data.loc[:, values.columns]
    .drop(columns="inspection_id")
    .add_prefix("p")
    .fillna(0)
)

# Append the prefixed columns to the model data (aligned on the row index)
data = data.join(past_values)

In [9]:
# Elapsed time between each inspection and the previous one for the same license
deltas = pd.to_datetime(data["inspection_date"]) - pd.to_datetime(past_data["inspection_date"])

# Express the gap in years (whole days divided by 365.25); stays missing
# where there is no previous inspection
data["time_since_last"] = deltas.dt.days / 365.25

In [10]:
# Flag first records: a missing time_since_last means no previous inspection
data["first_record"] = data["time_since_last"].isnull().astype(int)

# Set time since last for first inspections to 2 (years).
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column accessed from the frame operates on an intermediate object and
# is not guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
data["time_since_last"] = data["time_since_last"].fillna(2)

### CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA

In [11]:
# Read the business license records
licenses = pd.read_csv(
    os.path.join(root_path, "DATA/business_licenses.csv")
)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# Business licenses have numbers on end preventing simple match
# so using street number instead
def get_street_number(address):
    """Return the first whitespace-delimited token of an address.

    Parameters
    ----------
    address : str or missing value
        Free-text address such as "123 N MAIN ST".

    Returns
    -------
    str or None
        The leading token (e.g. "123"), or None when the address is not a
        string (NaN / None from a missing CSV field) or contains no tokens.
    """
    # Missing CSV fields arrive as float NaN, which has no .split()
    if not isinstance(address, str):
        return None
    tokens = address.split()
    # An empty or whitespace-only address has no leading token
    return tokens[0] if tokens else None

# Derive the street number column on both tables so it can serve as a join key
licenses["street_number"] = licenses["address"].map(get_street_number)
inspections["street_number"] = inspections["address"].map(get_street_number)

In [16]:
# Match based on DBA name and street number
venue_matches = pd.merge(
    inspections,
    licenses,
    left_on=["dba_name", "street_number"],
    right_on=["doing_business_as_name", "street_number"],
)

# Match based on license numbers
license_matches = pd.merge(inspections, licenses, left_on="license", right_on="license_number")

# Stack both match sets under a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
matches = pd.concat([venue_matches, license_matches], sort=False, ignore_index=True)

# Keep a single row per (inspection, license record) pair
matches = matches.drop_duplicates(["inspection_id", "id"])

# Restrict to matches where inspection falls within license period.
# .copy() makes the result an independent frame so later column assignments
# do not write into a slice of the unfiltered table.
# NOTE(review): the three date columns are compared as loaded (no to_datetime
# before this point), so the test is chronological only if their stored form
# sorts chronologically (e.g. ISO 8601 strings) - confirm.
matches = matches.loc[
    matches.inspection_date.between(matches.license_start_date, matches.expiration_date)
].copy()

In [17]:
# Parse the two date columns used for age arithmetic into datetimes
matches["inspection_date"] = pd.to_datetime(matches["inspection_date"])
matches["license_start_date"] = pd.to_datetime(matches["license_start_date"])

def get_age_data(group):
    """Compute the age, in years, of a license group at each of its inspections.

    Age is measured from the earliest license_start_date in the group to each
    row's inspection_date, as whole days divided by 365.25.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one group; must contain `inspection_id`, and datetime-typed
        `inspection_date` and `license_start_date` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with columns `inspection_id` and `age_at_inspection`,
        indexed like `group`. The input frame is left unmodified.
    """
    min_date = group.license_start_date.min()
    deltas = group.inspection_date - min_date
    # Build a fresh frame instead of adding a column to `group`: the object
    # handed over by groupby.apply should not be mutated
    return group[["inspection_id"]].assign(age_at_inspection=deltas.dt.days / 365.25)

# Compute ages per license (slow: about 3 mins), then discard repeated rows
age_data = (
    matches
    .groupby("license")
    .apply(get_age_data)
    .drop_duplicates()
)

In [19]:
# Attach age_at_inspection to the model data.
# The default inner join ALSO FILTERS OUT ROWS WITHOUT LICENSE INFO
# (inspections that have no entry in age_data).
data = data.merge(age_data, on="inspection_id")

In [69]:
# Keep an independent snapshot of the matched table
backup = matches.copy(deep=True)

In [None]:
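# TODO: build a key (lookup) of the license descriptions of interest.
# For each matched license, check whether its description is in the key; if so, add the key's value.
# Having the key up front also provides the columns needed to build the matrix.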

In [29]:
# Maps each license description to a snake_case name: lower-cased,
# apostrophes removed, " - " and spaces turned into single underscores.
# NOTE(review): keys are title-case; confirm they match the casing of
# matches.license_description at the point of use.
categories = {
    description: description.lower().replace("'", "").replace(" - ", " ").replace(" ", "_")
    for description in (
        "Consumption on Premises - Incidental Activity",
        "Tobacco",
        "Package Goods",
        "Limited Business License",
        "Outdoor Patio",
        "Public Place of Amusement",
        "Children's Services Facility License",
        "Tavern",
        "Regulated Business License",
        "Filling Station",
        "Caterer's Liquor License",
        "Mobile Food License",
    )
}

In [63]:
# Display the license description of every matched row
matches["license_description"]

5                              retail food establishment
8                              retail food establishment
11                             retail food establishment
13                             retail food establishment
16                             retail food establishment
18                             retail food establishment
21                             retail food establishment
44                             retail food establishment
55                             retail food establishment
71                             retail food establishment
79                              limited business license
88                              limited business license
95                             retail food establishment
109        consumption on premises - incidental activity
110                                        outdoor patio
111                            retail food establishment
112                            retail food establishment
113        consumption on premi

In [None]:
# TODO: remove NAs and cap values at a maximum of 1

### ATTACH KDE DATA

### ATTACH WEATHER DATA

In [None]:
# Read the per-inspection weather table
weather = pd.read_csv(os.path.join(root_path, "DATA/weather.csv"))

# Inner-join weather onto the model data by inspection id
data = data.merge(weather, on="inspection_id")