### CALCULATE FEATURES BASED ON FOOD INSPECTION DATA

In [1]:
import numpy as np
import pandas as pd
import os.path

root_path = os.path.dirname(os.getcwd())

# Load food inspection data
inspections = pd.read_csv(os.path.join(root_path, "DATA/food_inspections.csv"))

# Create basis for model_data (risk not included in Chicago repository)
data = inspections[["inspection_date", "license", "inspection_id", "facility_type"]]

In [2]:
# Create pass / fail flags
data["pass_flag"] = inspections.results.map(lambda x: 1 if x == "Pass" else 0)
data["fail_flag"] = inspections.results.map(lambda x: 1 if x == "Fail" else 0)

# Create risk flags
data["risk_1"] = inspections.results.map(lambda x: 1 if x == "Risk 1 (High)" else 0)
data["risk_2"] = inspections.results.map(lambda x: 1 if x == "Risk 2 (Medium)" else 0)
data["risk_3"] = inspections.results.map(lambda x: 1 if x == "Risk 3 (Low)" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://panda

In [3]:
# Load violation data
values = pd.read_csv(os.path.join(root_path, "DATA/violation_values.csv"))
counts = pd.read_csv(os.path.join(root_path, "DATA/violation_counts.csv"))

# Merge with violation data
data = pd.merge(data, values, on="inspection_id")
data = pd.merge(data, counts, on="inspection_id")

In [4]:
# Sort inspections by date
grouped = data.sort_values(by="inspection_date", inplace=True)

# Find previous inspections by shifting each license group
past_data = data.groupby("license").shift(1)

In [5]:
# Add past fails
data["past_fail"] = past_data.fail_flag

# Add past violation counts
data["past_critical"] = past_data.critical_count
data["past_serious"] = past_data.serious_count
data["past_minor"] = past_data.minor_count

In [6]:
# Select past violation values, remove inspection_id
past_values = past_data[values.columns].drop("inspection_id", axis=1).add_prefix("p")

# Set violation values to 0 for first inspections
past_values.fillna(0, inplace=True)

# Add past values to model data
data = data.join(past_values)

In [7]:
# Calculate time since previous inspection
deltas = pd.to_datetime(data.inspection_date) - pd.to_datetime(past_data.inspection_date)

# Add years since previous inspection
data["time_since_last"] = deltas.apply(lambda x: x.days / 365.25)

In [8]:
# Check if first record
data["first_record"] = data.time_since_last.map(lambda x: 1 if pd.isnull(x) else 0)

# Set time since last for first inspections to 2
data.time_since_last.fillna(2, inplace=True)

### CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA

In [9]:
# Load business license data
licenses = pd.read_csv(os.path.join(root_path, "DATA/business_licenses.csv"))

  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# Business licenses have numbers on end preventing simple match
# so using street number instead
def get_street_number(address):
    return address.split()[0]

licenses["street_number"] = licenses.address.apply(get_street_number)
inspections["street_number"] = inspections.address.apply(get_street_number)

In [49]:
licenses

Unnamed: 0,account_number,address,application_created_date,application_requirements_complete,application_type,business_activity,business_activity_id,city,conditional_approval,date_issued,...,payment_date,police_district,precinct,site_number,ssa,state,ward,ward_precinct,zip_code,street_number
0,51755,11243 CHESAPEAKE PLAC 1ST,,2000-06-16T00:00:00,RENEW,,,WESTCHESTER,N,2009-08-24T00:00:00,...,2009-08-21T00:00:00,,,1,,IL,,,60154,11243
1,51944,5410 NEWPORT DR 40,,2000-06-20T00:00:00,ISSUE,,,ROLLING MEADOWS,N,2006-04-12T00:00:00,...,,,,1,,IL,,,60008,5410
2,52896,17021 S MAGNOLIA DR 1ST,,2000-06-20T00:00:00,ISSUE,,,HAZEL CREST,N,2006-04-12T00:00:00,...,,,,1,,IL,,,60429,17021
3,54389,450 SKOKIE BLVD 904,,2000-06-20T00:00:00,ISSUE,,,NORTHBROOK,N,2006-04-12T00:00:00,...,,,,1,,IL,,,60062,450
4,55668,261 N WEST AVE 1ST,,2000-06-20T00:00:00,ISSUE,,,ELMHURST,N,2006-04-12T00:00:00,...,,,,1,,IL,,,60126,261
5,55668,261 N WEST AVE 1ST,,2000-06-20T00:00:00,ISSUE,,,ELMHURST,N,2006-04-12T00:00:00,...,,,,1,,IL,,,60126,261
6,58310,516 W 136 PL 1,,2000-06-20T00:00:00,ISSUE,,,RIVERDALE,N,2006-04-12T00:00:00,...,,,,1,,IL,,,60827,516
7,19866,65 E PALATINE RD 117,,,RENEW,,,PROSPECT HEIGHTS,N,2008-02-28T00:00:00,...,2008-02-28T00:00:00,,,1,,IL,,,60070,65
8,59758,12440 S ASHLAND AVE 1ST,,2003-12-22T00:00:00,RENEW,,,CALUMET PARK,N,2012-11-29T00:00:00,...,,,,1,,IL,,,60643,12440
9,48577,26 STACY CT 1ST,,2000-06-16T00:00:00,RENEW,,,GLENVIEW,N,2010-05-17T00:00:00,...,2010-05-14T00:00:00,,,1,,IL,,,60025,26


In [44]:
len(inspections)

56559

In [50]:
inspections.inspection_id.nunique()

56559

In [52]:
# Match inspections, business licenses based on DBA name, street number
matches = pd.merge(inspections, licenses, left_on=["dba_name", "street_number"], right_on=["doing_business_as_name", "street_number"])

# Restrict to matches where inspection date falls within license range
matches = matches.loc[
    (matches.license_start_date <= matches.inspection_date) &
    (matches.expiration_date > matches.inspection_date)
]

In [89]:
# Match based on DBA name and street number
venue_matches = pd.merge(inspections, licenses, left_on=["dba_name", "street_number"], right_on=["doing_business_as_name", "street_number"])

# Match based on license numbers
license_matches = pd.merge(inspections, licenses, left_on="license", right_on="license_number")

# Join matches
matches = venue_matches.append(license_matches)

# Drop duplicates
matches.drop_duplicates(["inspection_id", "id"], inplace=True)

# Restrict to matches where inspection falls within license period
matches = matches.loc[matches.inspection_date.between(matches.license_start_date, matches.expiration_date)]

In [95]:
matches

Unnamed: 0,account_number,address_x,address_y,aka_name,application_created_date,application_requirements_complete,application_type,business_activity,business_activity_id,city_x,...,state_x,state_y,street_number,street_number_x,street_number_y,violations,ward,ward_precinct,zip,zip_code
5,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,,2015-12-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,"30. FOOD IN ORIGINAL CONTAINER, PROPERLY LABEL...",19.0,19-50,60643.0,60643
8,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,,2015-12-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,22. DISH MACHINES: PROVIDED WITH ACCURATE THER...,19.0,19-50,60643.0,60643
11,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,,2015-12-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,22. DISH MACHINES: PROVIDED WITH ACCURATE THER...,19.0,19-50,60643.0,60643
13,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,,2013-12-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,19.0,19-50,60643.0,60643
16,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,,2013-12-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,"30. FOOD IN ORIGINAL CONTAINER, PROPERLY LABEL...",19.0,19-50,60643.0,60643
18,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,2011-12-13T00:00:00,2011-12-13T00:00:00,ISSUE,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,19.0,19-50,60643.0,60643
21,368388,10936 S VINCENNES AVE,10936 S VINCENNES AVE 1ST,JEAN'S CAFE & RESTAURANT,2011-12-13T00:00:00,2011-12-13T00:00:00,ISSUE,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,10936,,,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,19.0,19-50,60643.0,60643
44,86481,2323 W 111TH ST,2323 W 111TH ST 1,FAIRPLAY FOODS,,2017-10-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,2323,,,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,19.0,19-64,60643.0,60643
55,86481,2323 W 111TH ST,2323 W 111TH ST 1,FAIRPLAY FOODS,,2015-10-15T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,2323,,,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,19.0,19-64,60643.0,60643
71,86481,2323 W 111TH ST,2323 W 111TH ST 1,FAIRPLAY FOODS,,2011-10-14T00:00:00,RENEW,Retail Sales of Perishable Foods,775,CHICAGO,...,IL,IL,2323,,,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,19.0,19-64,60643.0,60643


In [108]:
def get_min_date(group):
    min_date = group.license_start_date.min()
    group["min_date"] = min_date
    return group.min_date

min_date = matches.groupby("license").apply(get_min_date)

deltas = pd.to_datetime(matches.inspection_date) - pd.to_datetime(min_date)

In [None]:
min_2 = matches.groupby("license").min()

In [103]:
len(matches)

93504

In [107]:
len(min_date)

93504

In [69]:

matches2 = matches2.loc[matches2.inspection_date.between(matches2.license_start_date, matches2.expiration_date)]

In [70]:
matches.inspection_id.nunique()

44711

In [88]:
len(matches) / matches.inspection_id.nunique()

1.6709675113477966

In [72]:
matches2.inspection_id.nunique()

55878

In [73]:
len(matches2) / matches2.inspection_id.nunique()

1.000107376785139

In [14]:
# Match by license number and when inspection_date
# falls wrt business license renewal cycle
def get_business_id(inspection):
    matches = licenses.loc[licenses.license_number == inspection.license]
    matches = matches.loc[
        (matches.license_start_date <= inspection.inspection_date) &
        (matches.expiration_date > inspection.inspection_date)
    ]
    if len(matches) > 0:
        print(inspection.address, matches.iloc[0].address)
        return matches.iloc[0].id
    return np.nan

# Sort licenses to allow selection of first match
# licenses.sort_values("license_start_date", inplace=True)

# # 3 mins
# data["business_id"] = data.apply(get_business_id, axis=1)

In [29]:
# Match by license number and when inspection_date
# falls wrt business license renewal cycle
def get_business_id(inspection):
    matches = licenses.loc[licenses.address == inspection.address]
#     matches1 = matches1.loc[matches1.address == inspection.address]
    matches = matches.loc[matches.doing_business_as_name == inspection.dba_name]
    matches = matches.loc[
        (matches.license_start_date <= inspection.inspection_date) &
        (matches.expiration_date > inspection.inspection_date)
    ]
    matches2 = licenses.loc[licenses.license_number == inspection.license]
#     matches1 = matches1.loc[matches1.address == inspection.address]
    matches2 = matches2.loc[
        (matches2.license_start_date <= inspection.inspection_date) &
        (matches2.expiration_date > inspection.inspection_date)
    ]
#     matches2 = matches2.loc[
#         (matches2.license_start_date <= inspection.inspection_date) &
#         (matches2.expiration_date > inspection.inspection_date)
#     ]
#     if (len(matches1) != len(matches2)):
#     print(len(matches), len(matches2))
    for match in matches.license_number:
        print(1, match)
    for match2 in matches2.license_number:
        print(2, match2)
#     if len(matches) > 0:
#         return matches.iloc[0].id
#     return np.nan

# Sort licenses to allow selection of first match
# licenses.sort_values("license_start_date", inplace=True)

# 3 mins
# data["business_id"] = data.apply(get_business_id, axis=1)

In [None]:
inspections.apply(get_business_id, axis=1)

1 10947.0
2 10947.0
1 2404844.0
1 2404845.0
1 2379419.0
2 2379419.0
1 2097959.0
1 2097958.0
1 2097956.0
1 2147229.0
2 2097956.0
1 2522599.0
2 2522599.0
2 2014104.0
1 2437489.0
2 2437489.0
1 2536392.0
2 2536392.0
1 2184404.0
1 2510616.0
2 2184404.0
1 15911.0
1 15910.0
2 15910.0
1 1844663.0
2 1844663.0
1 36468.0
2 36468.0
1 1490322.0
2 1490322.0
2 2551347.0
1 1444218.0
2 1444218.0
1 2307861.0
2 2307861.0
1 2379900.0
2 2379900.0
1 46653.0
2 46653.0
1 1903527.0
2 1903527.0
2 2437640.0
1 1804587.0
2 1804587.0
1 1817904.0
1 2021273.0
2 1817904.0
1 2363725.0
2 2363725.0
1 2488830.0
1 2488875.0
2 2488830.0
2 2189302.0
1 2511391.0
1 2511392.0
2 2511391.0
1 2488187.0
1 2488186.0
2 2488186.0
1 1616455.0
2 1616455.0
1 2027332.0
2 2027332.0
1 2487943.0
1 2487944.0
2 2487943.0
1 1544559.0
2 1544559.0
1 2084794.0
1 2084793.0
2 2084793.0
1 1820058.0
1 1224303.0
2 1224303.0
1 2487938.0
1 2487939.0
2 2487938.0
1 1932383.0
2 1932383.0
1 1740805.0
2 1740805.0
1 1473964.0
2 1473964.0
1 3846.0
2 3846.0
1 25

1 1145606.0
2 1145606.0
1 2549172.0
2 2549172.0
1 404.0
1 405.0
2 404.0
1 2506119.0
2 2506119.0
2 2549860.0
1 1270949.0
1 1426020.0
2 1270949.0
1 36324.0
2 36324.0
1 1124035.0
2 1124035.0
1 2379981.0
2 2379981.0
1 2438033.0
2 2438033.0
1 1959341.0
2 1959341.0
2 2202626.0
2 2269773.0
1 2564673.0
2 2564673.0
1 37096.0
1 1799248.0
2 37096.0
1 83462.0
1 1045677.0
2 83462.0
1 32111.0
1 32112.0
2 32111.0
1 3532.0
2 3532.0
2 2304398.0
1 1922066.0
2 1922066.0
1 2293813.0
2 2293813.0
2 1938611.0
1 2511404.0
1 2511405.0
2 2511404.0
1 1964058.0
2 1964058.0
1 2511389.0
1 2511390.0
2 2511389.0
1 2511386.0
1 2511385.0
2 2511385.0
1 2129321.0
1 2129323.0
1 2129322.0
2 2129321.0
1 1884293.0
2 1884293.0
1 1874917.0
1 1897991.0
2 1874917.0
2 2405944.0
1 15622.0
2 15622.0
1 1843586.0
2 1843586.0
1 2483444.0
2 2483444.0
1 1676717.0
2 1676717.0
2 2363067.0
2 2405951.0
1 1492287.0
2 1492287.0
1 2476990.0
2 2476990.0
1 2362828.0
2 2362828.0
1 6006.0
2 6006.0
1 1842806.0
2 1842806.0
1 2510601.0
2 2510601.0
1 

1 47014.0
1 47013.0
2 47013.0
2 2068996.0
1 2036683.0
1 1997875.0
1 1997874.0
2 1997874.0
1 84376.0
1 84375.0
2 84375.0
1 1879164.0
1 1879166.0
1 1879167.0
1 1879165.0
2 1879164.0
1 2397249.0
1 2397248.0
1 2397250.0
2 2397248.0
2 2284599.0
1 2501184.0
2 2501184.0
2 2463191.0
1 38776.0
2 38776.0
2 38279.0
1 2505997.0
1 2505998.0
2 2505997.0
1 2363766.0
2 2363766.0
1 85112.0
2 85112.0
1 1379435.0
1 1403516.0
2 1379435.0
1 2507003.0
1 2507004.0
1 2507005.0
2 2507003.0
1 34205.0
1 34211.0
1 34219.0
1 34212.0
1 34235.0
1 34228.0
1 2506463.0
1 2583234.0
2 34228.0
1 2358864.0
2 2358864.0
2 2036397.0
1 2432944.0
2 2432944.0
1 2320734.0
1 2320735.0
2 2320734.0
1 2179700.0
1 2179701.0
2 2179700.0
2 2320593.0
1 2109699.0
1 2109700.0
2 2109699.0
1 2043368.0
2 2043368.0
2 2074456.0
1 2385749.0
1 2385153.0
2 2385153.0
1 1594715.0
1 1594714.0
2 1594714.0
1 1272424.0
2 1272424.0
1 1141457.0
2 1141457.0
1 59480.0
2 59480.0
1 1980934.0
2 1980934.0
1 1272426.0
2 1272426.0
1 64032.0
2 64032.0
1 29177.0
1 

1 17249.0
2 17249.0
1 2314219.0
1 2314234.0
1 2314236.0
1 2314233.0
1 2314238.0
1 2314239.0
1 2314220.0
1 2314230.0
1 2314223.0
1 2516654.0
1 2314229.0
1 2563922.0
1 2314221.0
2 2314219.0
1 2320391.0
1 2314185.0
1 2314184.0
2 2320391.0
1 2042566.0
2 2042566.0
1 2246914.0
1 2246915.0
1 2246916.0
2 2246914.0
1 33267.0
1 33268.0
2 33267.0
1 27836.0
1 27835.0
2 27835.0
1 2492766.0
2 2492766.0
2 1543382.0
1 56449.0
1 1171870.0
2 56449.0
1 2177798.0
2 2177798.0
2 2333268.0
1 2412069.0
2 2412069.0
1 2543085.0
2 2543085.0
1 2359057.0
1 2359058.0
1 2363368.0
2 2359057.0
1 2492535.0
2 2492535.0
1 2102832.0
1 2033170.0
2 2033170.0
1 2488404.0
2 2488404.0
1 2404707.0
2 2404707.0
1 2340852.0
2 2340852.0
1 2500613.0
2 2500613.0
1 2129639.0
1 2447045.0
2 2129639.0
2 2348968.0
2 1942785.0
1 1122248.0
1 1141813.0
1 1141814.0
1 1141818.0
1 1170920.0
1 1141823.0
1 1141816.0
1 1141815.0
1 1122244.0
1 1141819.0
1 1225157.0
1 1141817.0
1 2506150.0
1 2506151.0
1 2506247.0
1 2506152.0
1 2506158.0
1 2506246.0


In [None]:
# Chicago matched descriptions from dba name & address
# But address seems to have gunk thrown on the end
# & matching on license seems to give same result
# Only issue is fewer matches? some should have multiple licenses?

In [55]:
inspections.address = inspections.address.str.lower()
licenses.address = licenses.address.str.lower()

In [61]:
licenses.iloc[0].address

455660              3058 w belmont ave  1
454479            4318 w armitage ave 1st
567379                325 w huron st  401
499599             2542 s western ave 1st
492821          5724 n lincoln ave  first
268486               2630 n clark st  1st
412646           5027 w fullerton ave 1st
852614                 2200 e 73rd st 1st
191043            4056 w lawrence ave 1st
216259                  6118 w 63rd st  1
820841                        po box 1463
177629            4606 w lawrence ave 1st
820901                        po box 1463
139191                  2045 w 35th st  2
373168                 2804 w belmont ave
540333                 160 n state st 1st
388420              2146 n halsted st 1st
255546               2414 n cicero ave  1
264856                  5359 w madison st
150662              2015 w irving park rd
253400               1465 w leland ave  1
356501            4805 w cornelia ave 1st
95505             2225 w wabansia ave 1st
483684            6525 w diversey 

In [65]:
# Match by license number and when inspection_date
# falls wrt business license renewal cycle
def get_business_data(inspection):
    matches = licenses.loc[licenses.license_number == inspection.license]
    matches = matches.loc[
        (matches.license_start_date <= inspection.inspection_date) &
        (matches.expiration_date > inspection.inspection_date)
    ]
    
# #     matches2 = matches2.loc[
#         (matches2.license_start_date <= inspection.inspection_date) &
#         (matches2.expiration_date > inspection.inspection_date)
#     ]
#     if (len(matches1) != len(matches2)):
#         print(len(matches1), len(matches2))
    print(len(matches1), len(matches2))
#     if len(matches) > 0:
#         return matches.iloc[0].id
#     return np.nan

# Sort licenses to allow selection of first match
# licenses.sort_values("license_start_date", inplace=True)

# 3 mins
# data["business_id"] = data.apply(get_business_id, axis=1)

In [75]:
matches = pd.merge(data, licenses, left_on="license", right_on="license_number", how="left")

In [76]:
matches = matches.loc[
    (matches.license_start_date <= matches.inspection_date) &
    (matches.expiration_date > matches.inspection_date)
]

In [77]:
print(len(inspections), len(matches))

56559 55648


In [59]:
# Merge with business licenses to create temporary dataframe
temp = pd.merge(data, licenses, left_on="business_id", right_on="id")

# Convert dates to datetime
temp.inspection_date = pd.to_datetime(temp.inspection_date)
temp.license_start_date = pd.to_datetime(temp.license_start_date)

In [21]:
temp.head().inspection_date

NameError: name 'temp' is not defined

In [15]:
def get_age_data(group):
    min_date = group.license_start_date.min()
    deltas = group.inspection_date - min_date
    group["age_at_inspection"] = deltas.apply(lambda x: x.days / 365.25)
    return group[["inspection_id", "age_at_inspection"]]

# Group by licenses and get age at inspection
age_data = temp.groupby("license").apply(get_age_data)

In [15]:
def get_age_data(group):
    min_date = group.license_start_date.min()
    deltas = group.inspection_date - min_date
    group["age_at_inspection"] = deltas.apply(lambda x: x.days / 365.25)
    return group[["inspection_id", "age_at_inspection"]]

# Group by licenses and get age at inspection
age_data = temp.groupby("license").apply(get_age_data)

In [16]:
# Merge in age_at_inspection
thing = pd.merge(data, age_data, on="inspection_id", how="left")

In [None]:
# its about license descriptions

In [17]:
for column in licenses.columns:
    print(column)

account_number
address
application_created_date
application_requirements_complete
application_type
business_activity
business_activity_id
city
conditional_approval
date_issued
doing_business_as_name
expiration_date
id
latitude
legal_name
license_approved_for_issuance
license_code
license_description
license_id
license_number
license_start_date
license_status
license_status_change_date
location
longitude
payment_date
police_district
precinct
site_number
ssa
state
ward
ward_precinct
zip_code


In [18]:
licenses.license_description.value_counts()

Limited Business License                               401630
Retail Food Establishment                              138723
Tobacco                                                 42390
Regulated Business License                              35628
Consumption on Premises - Incidental Activity           29085
Home Occupation                                         27136
Home Repair                                             26102
Tavern                                                  16433
Package Goods                                           14283
Manufacturing Establishments                            11906
Special Event Food                                      11213
Hazardous Materials                                     10937
Motor Vehicle Repair : Engine Only (Class II)           10008
Wholesale Food Establishment                             9078
Peddler, non-food                                        8506
Peddler License                                          7554
Motor Ve

In [None]:
# merge in categories

In [None]:
# remove nas, set max value to 1

### ATTACH KDE DATA

### ATTACH WEATHER DATA

In [None]:
# Load weather data
weather = pd.read_csv(os.path.join(root_path, "DATA/weather.csv"))

# Merge weather data with model data
data = pd.merge(data, weather, on="inspection_id")