In [1]:
# Import necessary packages

In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
# Import original CSV file for all inspections

In [4]:
# Load the raw inspections export. The path is relative to the notebook's
# working directory: the CSV is expected one directory level up.
inspections = pd.read_csv('../inspections.csv')

In [5]:
# Snake case function to standardize and format column headers

In [6]:
def snake_case(title):
    """Convert a column header to a snake_case identifier.

    The header is lower-cased, spaces become underscores, every remaining
    non-word character (anything other than letters, digits and underscores)
    is removed, and trailing underscores left behind by that removal are
    stripped, e.g. "License #" -> "license".

    Parameters
    ----------
    title : str
        Raw header text.

    Returns
    -------
    str
        The normalized header. A header that consists solely of underscores
        after cleaning is returned unchanged rather than collapsed to "".
    """
    title = title.lower().replace(" ", "_")
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on modern Python versions.
    title = re.sub(r'\W+', "", title)
    # Removing punctuation can leave several dangling underscores
    # ("foo # " -> "foo__"), so strip all of them, not just the last one.
    return title.rstrip("_") or title

In [7]:
# Snake case function test

In [8]:
# Smoke test: mixed case, embedded punctuation and trailing symbols should
# all be normalized away; the expected result is 'hello_you_person'.
string = "HEllo *you PErson!@"


snake_case(string)

'hello_you_person'

In [9]:
# Reformat all column headers

In [10]:
# Normalize every header with snake_case, then display the resulting labels.
inspections.columns = list(map(snake_case, inspections.columns))
inspections.columns

Index(['inspection_id', 'dba_name', 'aka_name', 'license', 'facility_type',
       'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
       'inspection_type', 'results', 'violations', 'latitude', 'longitude',
       'location'],
      dtype='object')

In [11]:
# Prototype function for cleaning violations column to separate out comments and separate out violations

In [12]:
def _first_comment(violations):
    """Return the comment attached to the first violation of one inspection.

    Parameters
    ----------
    violations : object
        One cell of the "violations" column. A string holds one or more
        violations separated by " | ", each optionally followed by
        " - Comments: <text>".

    Returns
    -------
    str or float
        The comment text of the first violation, "" when that violation
        carries no comment marker, or NaN when the cell is not a string
        (e.g. a missing value).
    """
    if not isinstance(violations, str):
        # Keep missing cells missing instead of raising AttributeError.
        return np.nan
    parts = violations.split(" | ")[0].split(" - Comments: ")
    return parts[1] if len(parts) > 1 else ""


# Compute the value per row. Indexing row 0 and assigning the resulting scalar
# would broadcast the first inspection's comment to every row of the frame.
inspections["comments"] = inspections["violations"].map(_first_comment)


In [13]:
# Function to pull out all violations from each inspection

In [14]:
def get_violations(inspections):
    """Collect the distinct violation descriptions found in the inspections.

    Every string cell of ``inspections["violations"]`` is split on " | " into
    individual violations, and for each one the text before " - Comments: "
    is kept. Non-string cells (e.g. missing values) are skipped.

    Parameters
    ----------
    inspections : DataFrame or mapping
        Any object for which ``inspections["violations"]`` yields an iterable
        of cells.

    Returns
    -------
    list of str
        The unique violation descriptions, in order of first appearance.
    """
    seen = set()  # constant-time membership test instead of scanning the list
    violation_list = []
    for violations in inspections["violations"]:
        # isinstance also accepts str subclasses, unlike `type(x) is str`.
        if not isinstance(violations, str):
            continue
        for violation in violations.split(" | "):
            single_violation = violation.split(" - Comments: ")[0]
            if single_violation not in seen:
                seen.add(single_violation)
                violation_list.append(single_violation)
    return violation_list

In [15]:
# Function to correctly number violations

In [16]:
def number_violations(violations):
    """Split each violation text into its numeric code and its description.

    Each entry is split once on the first space; the leading token is parsed
    as a number and truncated to an int (so "1." becomes 1).

    Parameters
    ----------
    violations : iterable of str
        Violation texts such as "1. SOME DESCRIPTION".

    Returns
    -------
    list of list
        One ``[number, description]`` pair per entry; an entry without a
        space yields just ``[number]``.

    Raises
    ------
    ValueError
        If the leading token of an entry cannot be parsed as a number.
    """
    parsed = []
    for entry in violations:
        code, *description = entry.split(' ', 1)
        parsed.append([int(float(code)), *description])
    return parsed

In [17]:
# Make sorted violations dataframe

In [18]:
# Gather the distinct violation texts and split off their numeric codes.
unsorted_violations = get_violations(inspections)
numbered_violations = number_violations(unsorted_violations)

# One row per distinct violation text, ordered by violation number.
violations_df = (
    pd.DataFrame(numbered_violations, columns=['violation_number', 'violation'])
    .sort_values('violation_number')
    .reset_index(drop=True)
)



In [19]:
# Add violation type, based on https://webapps1.cityofchicago.org/healthinspection/Code_Violations.jsp

In [20]:
def add_violation_type(row):
    """Classify a violation row by its number.

    Parameters
    ----------
    row : mapping or Series
        Must provide a 'violation_number' entry.

    Returns
    -------
    str
        "critical" for numbers below 15, "serious" for numbers below 30,
        and "minor" for everything else.
    """
    number = row['violation_number']
    # Thresholds are checked in ascending order; the first match wins.
    for upper_bound, label in ((15, "critical"), (30, "serious")):
        if number < upper_bound:
            return label
    return "minor"

# Label every violation row with its severity class, then display the frame.
violations_df['violation_type'] = violations_df.apply(
    add_violation_type,
    axis='columns',
)
violations_df

Unnamed: 0,violation_number,violation,violation_type
0,1,"SOURCE SOUND CONDITION, NO SPOILAGE, FOODS PRO...",critical
1,2,FACILITIES TO MAINTAIN PROPER TEMPERATURE,critical
2,3,POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATURE R...,critical
3,4,SOURCE OF CROSS CONTAMINATION CONTROLLED I.E. ...,critical
4,5,PERSONNEL WITH INFECTIONS RESTRICTED: NO OPEN ...,critical
5,6,"HANDS WASHED AND CLEANED, GOOD HYGIENIC PRACTI...",critical
6,7,WASH AND RINSE WATER: CLEAN AND PROPER TEMPERA...,critical
7,8,SANITIZING RINSE FOR EQUIPMENT AND UTENSILS: ...,critical
8,9,"WATER SOURCE: SAFE, HOT & COLD UNDER CITY PRES...",critical
9,10,"SEWAGE AND WASTE WATER DISPOSAL, NO BACK SIPHO...",critical


In [21]:
# Count rows per business name. Values are matched as exact strings, so
# spelling and case variants (e.g. "SUBWAY" vs "Subway") are tallied separately.
inspections['dba_name'].value_counts()

SUBWAY                          2328
DUNKIN DONUTS                   1209
MCDONALD'S                       510
7-ELEVEN                         396
MCDONALDS                        297
CHIPOTLE MEXICAN GRILL           277
POTBELLY SANDWICH WORKS LLC      237
CORNER BAKERY CAFE               212
POTBELLY SANDWICH WORKS          206
DUNKIN DONUTS/BASKIN ROBBINS     193
DOMINO'S PIZZA                   183
SPORTSERVICE SOLDIER FIELD       176
AU BON PAIN                      175
WHOLE FOODS MARKET               175
FRESHII                          175
SUBWAY SANDWICHES                172
HAROLD'S CHICKEN SHACK           167
KFC                              161
Subway                           154
SEE THRU CHINESE KITCHEN         139
SHARKS FISH & CHICKEN            135
PIZZA HUT                        126
J & J FISH                       124
MC DONALD'S                      123
CITGO                            122
JIMMY JOHN'S                     116
STARBUCKS                        114
J

In [50]:
# Keep only rows without any missing value.
# NOTE(review): dropna() discards a row if ANY column is missing, not just
# facility_type -- confirm that this is the intended population.
cleaned_inspections = inspections.dropna()

# Select the facilities whose type contains 'Wri', then count rows per business.
results = cleaned_inspections.loc[
    lambda frame: frame['facility_type'].str.contains('Wri')
]
results['dba_name'].value_counts()


DOWN THE LINE ROOFTOP    6
THE IVY LEAGUE CLUB      5
Name: dba_name, dtype: int64