### CALCULATE FEATURES BASED ON FOOD INSPECTION DATA

In [2]:
import numpy as np
import pandas as pd
import os.path

root_path = os.path.dirname(os.getcwd())

# Load food inspection data
inspections = pd.read_csv(os.path.join(root_path, "DATA/food_inspections.csv"))

# Create basis for model_data (risk not included in Chicago repository)
data = inspections[["inspection_date", "license", "inspection_id", "facility_type"]]

In [3]:
len(data)

56559

In [4]:
len(inspections)

56559

In [5]:
len(values)

NameError: name 'values' is not defined

In [6]:
# Create pass / fail flags
data["pass_flag"] = inspections.results.map(lambda x: 1 if x == "Pass" else 0)
data["fail_flag"] = inspections.results.map(lambda x: 1 if x == "Fail" else 0)

# Create risk flags
data["risk_1"] = inspections.results.map(lambda x: 1 if x == "Risk 1 (High)" else 0)
data["risk_2"] = inspections.results.map(lambda x: 1 if x == "Risk 2 (Medium)" else 0)
data["risk_3"] = inspections.results.map(lambda x: 1 if x == "Risk 3 (Low)" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://panda

In [7]:
# Load violation data
values = pd.read_csv(os.path.join(root_path, "DATA/violation_values.csv"))
counts = pd.read_csv(os.path.join(root_path, "DATA/violation_counts.csv"))

# Merge with violation data
data = pd.merge(data, values, on="inspection_id", how="left")
data = pd.merge(data, counts, on="inspection_id", how="left")

# Set default to 0
data.fillna(0, inplace=True)

In [8]:
# Sort inspections by date
grouped = data.sort_values(by="inspection_date", inplace=True)

# Find previous inspections by shifting each license group
past_data = data.groupby("license").shift(1)

In [9]:
# Add past fails
data["past_fail"] = past_data.fail_flag

# Add past violation counts
data["past_critical"] = past_data.critical_count
data["past_serious"] = past_data.serious_count
data["past_minor"] = past_data.minor_count

In [10]:
# Select past violation values, remove past inspection id
past_values = past_data[values.columns].drop("inspection_id", axis=1).add_prefix("p")

# Set violation values to 0 for first inspections
past_values.fillna(0, inplace=True)

# Add past values to model data
data = data.join(past_values)

In [11]:
# Calculate time since previous inspection
deltas = pd.to_datetime(data.inspection_date) - pd.to_datetime(past_data.inspection_date)

# Add years since previous inspection
data["time_since_last"] = deltas.apply(lambda x: x.days / 365.25)

In [12]:
# Check if first record
data["first_record"] = data.time_since_last.map(lambda x: 1 if pd.isnull(x) else 0)

# Set time since last for first inspections to 2
data.time_since_last.fillna(2, inplace=True)

### CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA

In [13]:
# Load business license data
licenses = pd.read_csv(os.path.join(root_path, "DATA/business_licenses.csv"))

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Business licenses have numbers on end preventing simple match
# so using street number instead
def get_street_number(address):
    return address.split()[0]

licenses["street_number"] = licenses.address.apply(get_street_number)
inspections["street_number"] = inspections.address.apply(get_street_number)

In [15]:
# Match based on DBA name and street number
venue_matches = pd.merge(inspections, licenses, left_on=["dba_name", "street_number"], right_on=["doing_business_as_name", "street_number"])

# Match based on license numbers
license_matches = pd.merge(inspections, licenses, left_on="license", right_on="license_number")

# Join matches and reset index
matches = venue_matches.append(license_matches).reset_index(drop=True)

# Drop duplicates and reindex
matches.drop_duplicates(["inspection_id", "id"], inplace=True)

# Restrict to matches where inspection falls within license period
matches = matches.loc[matches.inspection_date.between(matches.license_start_date, matches.expiration_date)]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [16]:
# Convert dates to datetime format
matches.inspection_date = pd.to_datetime(matches.inspection_date)
matches.license_start_date = pd.to_datetime(matches.license_start_date)

def get_age_data(group):
    min_date = group.license_start_date.min()
    deltas = group.inspection_date - min_date
    group["age_at_inspection"] = deltas.apply(lambda x: x.days / 365.25)
    return group[["inspection_id", "age_at_inspection"]]

# 3 mins
age_data = matches.groupby("license").apply(get_age_data)

In [17]:
# Merge age data
data = pd.merge(data, age_data, on="inspection_id", how="left")

In [18]:
data.age_at_inspection

0        0.804928
1        0.722793
2        0.722793
3        0.555784
4        1.886379
5        1.886379
6        1.886379
7        1.555099
8        1.555099
9        1.555099
10       0.136893
11       0.054757
12       1.388090
13       1.976728
14       1.976728
15       1.976728
16       0.555784
17       0.722793
18       0.722793
19       0.651608
20       1.144422
21       1.806982
22       1.806982
23       0.722793
24       0.887064
25       1.642710
26       1.642710
27       1.223819
28       0.643395
29       0.643395
           ...   
94075    8.424367
94076    8.424367
94077    6.759754
94078    7.679671
94079    3.323751
94080    2.847365
94081    9.930185
94082         NaN
94083    7.934292
94084    8.933607
94085    8.933607
94086    6.934976
94087    0.804928
94088    8.933607
94089    3.356605
94090    3.356605
94091    1.026694
94092    0.525667
94093    8.281999
94094    1.207392
94095    7.112936
94096    7.112936
94097    7.112936
94098    7.112936
94099     

In [None]:
matches.license_descriptions = matches.license_description.str.lower()

matches.license_descriptions.replace(" ", "_", inplace=True)

def get_category_data(group):
    
    
category_data = matches.groupby("inspection_id").apply(get_category_data, axis=1)

In [None]:
# merge in categories

In [None]:
# remove nas, set max value to 1

### ATTACH KDE DATA

### ATTACH WEATHER DATA

In [None]:
# Load weather data
weather = pd.read_csv(os.path.join(root_path, "DATA/weather.csv"))

# Merge weather data with model data
data = pd.merge(data, weather, on="inspection_id")