In [11]:
import pandas as pd
import os.path
# Project root: the parent of the directory the notebook is run from
root_path = os.path.split(os.getcwd())[0]

# Load the food inspection records
inspections = pd.read_csv(
    os.path.join(root_path, "DATA/food_inspections.csv")
)

In [18]:
# Build the "v_<number>" indicator column names, one list per severity group
critical_columns = list(map("v_{}".format, range(1, 15)))
serious_columns = list(map("v_{}".format, range(15, 30)))
# The minor group covers 30-44 plus the out-of-sequence number 70
minor_columns = list(map("v_{}".format, range(30, 45))) + ["v_70"]

# Full column list, in critical -> serious -> minor order
columns = critical_columns + serious_columns + minor_columns

In [13]:
# Split violations into binary values for each violation
def split_violations(violations):
    """Turn one inspection's violations text into per-violation indicator flags.

    Parameters
    ----------
    violations : str or any
        Violation entries separated by ``' | '``. The text before the first
        ``'.'`` of each entry is used as the violation code. Any non-string
        value (for example NaN from a missing cell) means "no violations".

    Returns
    -------
    pandas.Series
        float64 Series indexed by ``"v_<code>"`` holding 1 for every code
        found in the text; empty when there is nothing to report.
    """
    flags = {}

    if isinstance(violations, str):
        for violation in violations.split(' | '):
            # Strip so stray whitespace around an entry cannot produce a
            # label such as "v_ 3" that matches no expected column.
            code = violation.split('.')[0].strip()
            if code:  # an empty entry carries no violation code
                flags["v_" + code] = 1.0

    # Build the Series once with an explicit dtype: a bare pd.Series([]) has a
    # pandas-version-dependent default dtype (float64 or object), and growing
    # a Series label by label is slower than a single construction.
    return pd.Series(flags, dtype="float64")

In [14]:
# Author's note: "5 mins" (presumably how long this cell takes to run)
# One row of indicator flags per inspection, built from its violations text
values_data = inspections["violations"].apply(split_violations)

In [15]:
# Keep exactly the expected violation columns (a column that never occurred
# is created as all-NaN), replace NaN with 0, then attach the inspection ids
values = (
    pd.DataFrame(values_data, columns=columns)
    .fillna(0)
    .assign(inspection_id=inspections["inspection_id"])
)

In [17]:
# Per-inspection totals: row-wise sum of the indicator columns in each
# severity group, with the inspection ids attached as a final column
counts = pd.DataFrame({
    "critical_count": values.loc[:, critical_columns].sum(axis="columns"),
    "serious_count": values.loc[:, serious_columns].sum(axis="columns"),
    "minor_count": values.loc[:, minor_columns].sum(axis="columns"),
}).assign(inspection_id=inspections["inspection_id"])

In [22]:
import os.path
# Project root: the parent of the directory the notebook is run from
root_path = os.path.split(os.getcwd())[0]

# Write the indicator flags and the per-group counts as CSV, without the index
values.to_csv(
    os.path.join(root_path, "DATA/violation_values.csv"),
    index=False,
)
counts.to_csv(
    os.path.join(root_path, "DATA/violation_counts.csv"),
    index=False,
)
counts.to_csv(os.path.join(root_path, "DATA/violation_counts.csv"), index=False)