In [71]:
# Import necessary libraries and modules
import pandas as pd
import numpy as np
import john_acquire as a  # Custom module for data acquisition
import john_prepare as p
%load_ext autoreload
%autoreload 2

# Set the option to display all columns in DataFrames
pd.set_option('display.max_columns', None)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Dataset Overview

| Column | Description | Data Type |
| --- | --- | --- |
| camis | Unique identifier for each record | int64 |
| dba | Doing Business As (DBA) name | object |
| boro | Borough where the establishment is located | object |
| building | Building number | object |
| street | Street name | object |
| zipcode | Zip code | float64 |
| phone | Phone number | object |
| cuisine\_description | Description of the cuisine type | object |
| inspection\_date | Date of inspection | object |
| action | Action taken during inspection | object |
| critical\_flag | Indicator of critical violations | object |
| score | Inspection score | float64 |
| record\_date | Date of record | object |
| inspection\_type | Type of inspection | object |
| latitude | Latitude of the establishment | float64 |
| longitude | Longitude of the establishment | float64 |
| community\_board | Community board district | float64 |
| council\_district | Council district | float64 |
| census\_tract | Census tract | float64 |
| bin | Building identification number | float64 |
| bbl | Borough block and lot number | float64 |
| nta | Neighborhood Tabulation Area (NTA) | object |
| violation\_code | Code indicating violations | object |
| violation\_description | Description of the violations | object |
| grade | Inspection grade | object |
| grade\_date | Date of inspection grade | object |

In [72]:
# Read the previously acquired inspection records from disk.
# index_col=False keeps pandas from promoting any CSV column to the index.
inspection_df = pd.read_csv(
    'health_inspections.csv',
    index_col=False,
)

# Preview the top of the frame to sanity-check the load
inspection_df.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,cuisine_description,action,score,grade,grade_date,inspection_type,violation_code,violation_description
0,50145150,MARY'S,Manhattan,146,ORCHARD STREET,10002.0,9738091987,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.720534,-73.988982,103.0,1.0,3001.0,1005340.0,1004110000.0,MN27,,,,,,,,
1,50116081,Mysttik Masaala,Manhattan,42,WEST 42 STREET,10036.0,9174003090,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.754068,-73.982321,105.0,4.0,8400.0,1085593.0,1012570000.0,MN17,,,,,,,,
2,50142718,ARCH,Manhattan,140,BROADWAY,10005.0,8663723035,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.709036,-74.010626,101.0,1.0,700.0,1001027.0,1000480000.0,MN25,,,,,,,,
3,50129707,ESTRELLA DEL CARIBE RESTAURANT,Bronx,857,EAST 149 STREET,10455.0,9178554852,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.811889,-73.903881,202.0,17.0,8300.0,2096003.0,2026030000.0,BX33,,,,,,,,
4,50143250,HIDE ROOFTOP,Manhattan,24,JOHN STREET,10038.0,9173635827,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.709758,-74.008765,101.0,1.0,1502.0,1001105.0,1000650000.0,MN25,,,,,,,,


In [73]:
# Per-column quality summary: nulls, numeric zeros, string '0' values, and dtype.
# The index (the original column names) is labelled 'Column'.
inspections_df_status = pd.DataFrame({
    'Null Count': inspection_df.isna().sum(),
    'Zero Count': inspection_df.eq(0).sum(),
    'Zero Count (str)': inspection_df.eq('0').sum(),
    'Data Types': inspection_df.dtypes,
}).rename_axis('Column')

# inspections_df_status.to_csv('inspections_df_status.csv')

# Keep only the columns that have at least one null, zero, or '0' entry
count_columns = ['Null Count', 'Zero Count', 'Zero Count (str)']
needs_attention = inspections_df_status[count_columns].gt(0).any(axis=1)
ispections_prepare = inspections_df_status[needs_attention]
# ispections_prepare.to_csv('ispections_prepare.csv')

ispections_prepare

Unnamed: 0_level_0,Null Count,Zero Count,Zero Count (str),Data Types
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dba,640,0,0,object
boro,0,0,9,object
building,297,0,454,object
street,3,0,0,object
zipcode,2698,0,0,float64
phone,2,0,0,object
latitude,279,2695,0,float64
longitude,279,2695,0,float64
community_board,3227,0,0,float64
council_district,3231,0,0,float64


In [74]:
# Print a schema summary of the loaded frame: the index range plus each
# column's non-null count and dtype (useful for spotting sparsely filled columns)
inspection_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211164 entries, 0 to 211163
Data columns (total 26 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  211164 non-null  int64  
 1   dba                    210524 non-null  object 
 2   boro                   211164 non-null  object 
 3   building               210867 non-null  object 
 4   street                 211161 non-null  object 
 5   zipcode                208466 non-null  float64
 6   phone                  211162 non-null  object 
 7   inspection_date        211164 non-null  object 
 8   critical_flag          211164 non-null  object 
 9   record_date            211164 non-null  object 
 10  latitude               210885 non-null  float64
 11  longitude              210885 non-null  float64
 12  community_board        207937 non-null  float64
 13  council_district       207933 non-null  float64
 14  census_tract           207933 non-nu

### Checking for Missing Values

Summarizing Missing Values by Column

In [75]:
# One-column frame of null counts, indexed by the original column names
# (index labelled 'Column')
inspections_df_isna = (
    inspection_df.isna().sum()
    .to_frame('Null Count')
    .rename_axis('Column')
)

# Restrict the summary to columns that actually contain nulls
has_nulls = inspections_df_isna['Null Count'].gt(0)
inspections_df_isna_true = inspections_df_isna.loc[has_nulls]

inspections_df_isna_true
# inspections_df_isna_true.to_csv('inspections_df_isna.csv')

Unnamed: 0_level_0,Null Count
Column,Unnamed: 1_level_1
dba,640
building,297
street,3
zipcode,2698
phone,2
latitude,279
longitude,279
community_board,3227
council_district,3231
census_tract,3231


In [76]:
# Number of missing entries per column
null_counts_by_column = inspection_df.isna().sum()

# Show only the columns where at least one value is missing
null_counts_by_column.loc[null_counts_by_column.gt(0)]

dba                         640
building                    297
street                        3
zipcode                    2698
phone                         2
latitude                    279
longitude                   279
community_board            3227
council_district           3231
census_tract               3231
bin                        4231
bbl                         532
nta                        3227
cuisine_description        2434
action                     2434
score                     10124
grade                    107757
grade_date               116724
inspection_type            2434
violation_code             3567
violation_description      3567
dtype: int64

### Grades Column

Drop the 'grade' and 'grade\_date' columns. According to the documentation, not all inspections receive a grade. We can simply calculate the grade using the score. Additionally, the documentation mentions that the grade may not match the scores due to input errors.

In [77]:
# Remove the 'grade' and 'grade_date' columns from the working frame
inspection_df = inspection_df.drop(columns=['grade', 'grade_date'])

In [78]:
# Recount missing values now that the grade columns are gone
null_counts_by_column = inspection_df.isna().sum()
null_counts_by_column.loc[null_counts_by_column.gt(0)]

dba                        640
building                   297
street                       3
zipcode                   2698
phone                        2
latitude                   279
longitude                  279
community_board           3227
council_district          3231
census_tract              3231
bin                       4231
bbl                        532
nta                       3227
cuisine_description       2434
action                    2434
score                    10124
inspection_type           2434
violation_code            3567
violation_description     3567
dtype: int64

##### Inferring Missing Values

Our next step is to strategize how to address these missing values by leveraging available data in other columns. The proposed hierarchy for inference is as follows:

`lat&long < building < bin < bbl < nta, zipcode* < community board < council district < census tract`

Given the relatively low count of missing values in the BBL column, it appears to be a promising candidate for inferring related data such as NTA (Neighborhood Tabulation Area), Community Board, Council District, and Census Tract.

Let's examine the first few unique values in the BBL column to understand its content

In [79]:
# Peek at the ten smallest distinct BBL values.
# NaN is dropped before sorting: NaN compares False against every number, so
# leaving it in makes the order produced by sorted() undefined.
sorted(inspection_df['bbl'].dropna().unique())[:10]


[1.0,
 3.0,
 4.0,
 5.0,
 1000000000.0,
 1000020001.0,
 1000020002.0,
 1000030001.0,
 1000047501.0,
 1000070031.0]

The BBL column contains non-standard values that do not conform to the expected 10-digit format (e.g., 1.0, 3.0, 4.0, 5.0).

Now, let's find out the count of these non-standard values:

In [80]:
# Values treated as non-standard BBLs: missing (NaN) plus the single-digit codes 1-5
bbl_values = [np.nan] + [float(code) for code in range(1, 6)]

# Flag every row whose BBL is one of those values, then count the flagged rows
is_non_standard_bbl = inspection_df['bbl'].isin(bbl_values)
is_non_standard_bbl.sum()

4231

The number of non-standard BBL values (4231, counting NaNs) exactly matches the number of NaN values in the BIN column, indicating a pattern of missing values across these key columns. The null counts of the related columns (for BBL, the non-standard count) are:

- census_tract               3231
- bin                        4231
- bbl                        4231
- nta                        3227
- community_board            3227
- council_district           3231

We are unable to rely on BBL to make inferences because the related features are missing across the same rows. We must therefore abandon the hierarchical inference plan. To proceed, we drop rows with NaN values in the BIN column.

In [81]:
# Keep only the rows that have a BIN value (rows with a null 'bin' are discarded)
inspection_df = inspection_df.loc[inspection_df['bin'].notna()]


##### Reevaluating Null Counts After BIN Column Cleanup

In [82]:
# Missing-value tally per column after the BIN cleanup
null_counts_by_column = inspection_df.isna().sum()

# Only columns that still contain nulls are of interest
null_counts_by_column.loc[null_counts_by_column.gt(0)]

dba                       610
zipcode                    31
phone                       2
community_board            31
council_district           35
census_tract               35
nta                        31
cuisine_description      2333
action                   2333
score                    9886
inspection_type          2333
violation_code           3385
violation_description    3385
dtype: int64

As expected, the remaining NaNs are mostly related or in common with the initial set. 

##### Handling Remaining Zoning Nulls

For the small number of remaining NaNs in the zoning columns, we can safely drop the affected rows due to their limited impact on the dataset. I chose to drop the rows with a null 'council_district' to see whether this also removed the NaNs in the other zoning columns.

In [83]:
# Keep only the rows that have a council district
inspection_df = inspection_df.loc[inspection_df['council_district'].notna()]

# Recount the nulls to see which columns are still affected
null_counts_by_column = inspection_df.isna().sum()
null_counts_by_column.loc[null_counts_by_column.gt(0)]

dba                       610
phone                       2
cuisine_description      2332
action                   2332
score                    9883
inspection_type          2332
violation_code           3384
violation_description    3384
dtype: int64

As expected, dropping the rows with a null council_district also removed the nulls in the other zoning columns.

##### Identifying Relevant Inspection Types

Before proceeding with the score nulls, let's identify and focus on inspection types related to food safety.

In [84]:
# Show the records that have no inspection type recorded
missing_inspection_type = inspection_df['inspection_type'].isna()
inspection_df.loc[missing_inspection_type]

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,cuisine_description,action,score,inspection_type,violation_code,violation_description
0,50145150,MARY'S,Manhattan,146,ORCHARD STREET,10002.0,9738091987,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.720534,-73.988982,103.0,1.0,3001.0,1005340.0,1.004110e+09,MN27,,,,,,
1,50116081,Mysttik Masaala,Manhattan,42,WEST 42 STREET,10036.0,9174003090,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.754068,-73.982321,105.0,4.0,8400.0,1085593.0,1.012570e+09,MN17,,,,,,
2,50142718,ARCH,Manhattan,140,BROADWAY,10005.0,8663723035,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.709036,-74.010626,101.0,1.0,700.0,1001027.0,1.000480e+09,MN25,,,,,,
3,50129707,ESTRELLA DEL CARIBE RESTAURANT,Bronx,857,EAST 149 STREET,10455.0,9178554852,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.811889,-73.903881,202.0,17.0,8300.0,2096003.0,2.026030e+09,BX33,,,,,,
4,50143250,HIDE ROOFTOP,Manhattan,24,JOHN STREET,10038.0,9173635827,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.709758,-74.008765,101.0,1.0,1502.0,1001105.0,1.000650e+09,MN25,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210345,50138242,JUST SALAD,Manhattan,4,COLUMBUS CIRCLE,10019.0,7323004245,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.767559,-73.982798,104.0,3.0,13900.0,1026054.0,1.010480e+09,MN15,,,,,,
210462,50135157,,Brooklyn,291,GREENE AVENUE,11238.0,7207715265,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.687346,-73.960326,302.0,35.0,23100.0,3055755.0,3.019520e+09,BK69,,,,,,
210540,50142126,ROSALIA'S PIZZERIA,Queens,10315,QUEENS BLVD,11375.0,7184590300,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.724658,-73.850042,406.0,29.0,71303.0,4051281.0,4.021380e+09,QN17,,,,,,
210677,50130131,Dear Mr Moto,Manhattan,120,SAINT MARKS PLACE,10009.0,2695980939,1900-01-01,Not Applicable,2023-12-17T06:00:14.000,40.727260,-73.984540,103.0,2.0,3200.0,1005832.0,1.004350e+09,MN22,,,,,,


In [85]:
# Number of rows without an inspection type
inspection_df['inspection_type'].isna().sum()

2332

In [86]:
# Discard rows dated '1900-01-01' and renumber the index 0..n-1 in one chain.
# NOTE(review): '1900-01-01' appears on the rows with no inspection details shown
# above — presumably a placeholder for "not yet inspected"; confirm against the
# dataset documentation.
inspection_df = (
    inspection_df
    .loc[inspection_df['inspection_date'].ne('1900-01-01')]
    .reset_index(drop=True)
)

In [87]:
# Distinct inspection types present in the working frame
unique_inspection_types = inspection_df['inspection_type'].unique()

# Copy into a plain Python list and order it alphabetically for display
sorted_inspection_types = unique_inspection_types.tolist()
sorted_inspection_types.sort()
sorted_inspection_types

['Administrative Miscellaneous / Compliance Inspection',
 'Administrative Miscellaneous / Initial Inspection',
 'Administrative Miscellaneous / Re-inspection',
 'Administrative Miscellaneous / Reopening Inspection',
 'Administrative Miscellaneous / Second Compliance Inspection',
 'Calorie Posting / Compliance Inspection',
 'Calorie Posting / Initial Inspection',
 'Calorie Posting / Re-inspection',
 'Cycle Inspection / Compliance Inspection',
 'Cycle Inspection / Initial Inspection',
 'Cycle Inspection / Re-inspection',
 'Cycle Inspection / Reopening Inspection',
 'Cycle Inspection / Second Compliance Inspection',
 'Inter-Agency Task Force / Initial Inspection',
 'Inter-Agency Task Force / Re-inspection',
 'Pre-permit (Non-operational) / Compliance Inspection',
 'Pre-permit (Non-operational) / Initial Inspection',
 'Pre-permit (Non-operational) / Re-inspection',
 'Pre-permit (Non-operational) / Second Compliance Inspection',
 'Pre-permit (Operational) / Compliance Inspection',
 'Pre-per

We will exclude types such as "Calorie Posting," "Pre-permit," "Smoke-Free Air Act," and "Trans Fat," as they do not directly pertain to food safety.

In [88]:
# Row count before filtering, so the number of removed rows can be reported
original_length = len(inspection_df)

# Inspection-type prefixes that are not related to food safety
remove_types = ["Calorie Posting", "Pre-permit", "Smoke-Free Air Act", "Trans Fat"]

# Flag rows whose type starts with any excluded prefix.
# na=False makes a missing inspection_type evaluate to False (row is kept) rather
# than NaN; a NaN in the mask would make the `~` negation below raise a TypeError.
is_excluded_type = inspection_df['inspection_type'].str.startswith(tuple(remove_types), na=False)
inspection_df = inspection_df[~is_excluded_type]

# Report: rows before - rows after = rows removed
new_length = len(inspection_df)
print(f' {original_length} - {new_length} = {(original_length - new_length)}')

 204566 - 157097 = 47469


In [89]:
# Null counts that remain after removing the non-food-safety inspection types
null_counts_by_column = inspection_df.isna().sum()
null_counts_by_column.loc[null_counts_by_column.gt(0)]

score                    6130
violation_code            773
violation_description     773
dtype: int64

Eliminating these rows led to a modest decrease in null values, due to the overlap in missing data among these rows. However, a detailed analysis of a few outstanding violation codes is still required.

Now, let's examine the history of a restaurant with a null value in the violation code to understand the reasons behind this occurrence.

In [90]:
# Full record history for one establishment (CAMIS 40365644)
inspection_df.loc[inspection_df['camis'].eq(40365644)]

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,cuisine_description,action,score,inspection_type,violation_code,violation_description
133913,40365644,JOE ALLEN RESTAURANT,Manhattan,326,WEST 46 STREET,10036.0,2125816464,2022-09-20,Not Critical,2023-12-17T06:00:11.000,40.760179,-73.988882,104.0,3.0,12100.0,1025016.0,1010360000.0,MN15,American,Violations were cited in the following area(s).,12.0,Cycle Inspection / Re-inspection,10B,Anti-siphonage or back-flow prevention device ...
161005,40365644,JOE ALLEN RESTAURANT,Manhattan,326,WEST 46 STREET,10036.0,2125816464,2022-02-09,Not Applicable,2023-12-17T06:00:13.000,40.760179,-73.988882,104.0,3.0,12100.0,1025016.0,1010360000.0,MN15,American,No violations were recorded at the time of thi...,0.0,Cycle Inspection / Initial Inspection,,
174095,40365644,JOE ALLEN RESTAURANT,Manhattan,326,WEST 46 STREET,10036.0,2125816464,2022-09-20,Not Critical,2023-12-17T06:00:11.000,40.760179,-73.988882,104.0,3.0,12100.0,1025016.0,1010360000.0,MN15,American,Violations were cited in the following area(s).,12.0,Cycle Inspection / Re-inspection,10F,Non-food contact surface or equipment made of ...
203837,40365644,JOE ALLEN RESTAURANT,Manhattan,326,WEST 46 STREET,10036.0,2125816464,2022-09-20,Critical,2023-12-17T06:00:11.000,40.760179,-73.988882,104.0,3.0,12100.0,1025016.0,1010360000.0,MN15,American,Violations were cited in the following area(s).,12.0,Cycle Inspection / Re-inspection,04H,"Raw, cooked or prepared food is adulterated, c..."


In [91]:
# A "visit" is identified by establishment (camis) + inspection date
visit_keys = ['camis', 'inspection_date']
grouped = inspection_df.groupby(visit_keys)

# Per visit: does any row lack a violation code?
# Computed with a vectorized group-wise `any` on the null mask rather than
# `grouped.apply(lambda ...)`, which calls Python once per group and emits a
# deprecation warning on recent pandas when it touches the grouping columns.
groups_with_nulls = (
    inspection_df['violation_code'].isna()
    .groupby([inspection_df[key] for key in visit_keys])
    .any()
)

# Count the number of rows in each visit
group_sizes = grouped.size()

# Visits that contain a null 'violation_code' AND have at least 2 rows
filtered_groups = groups_with_nulls[groups_with_nulls].index.intersection(group_sizes[group_sizes >= 2].index)

# Select the rows belonging to those visits; the per-row keys are built directly
# from the two key columns instead of re-indexing (copying) the whole frame
row_keys = pd.MultiIndex.from_frame(inspection_df[visit_keys])
filtered_df = inspection_df[row_keys.isin(filtered_groups)].reset_index(drop=True)

# 'filtered_df' now holds only visits with a null 'violation_code' and 2+ rows
filtered_df.sort_values(by='camis').head()

# inspections_group = filtered_df.sort_values(by='camis').head()
# inspections_group.to_csv('inspections_camisg.csv')

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,cuisine_description,action,score,inspection_type,violation_code,violation_description
253,40390409,THE FAMOUS JIMBO'S HAMBURGER PALACE,Manhattan,1345,AMSTERDAM AVENUE,10027.0,2128658777,2021-07-23,Critical,2023-12-17T06:00:11.000,40.813705,-73.956012,109.0,7.0,20901.0,1084098.0,1019660000.0,MN09,Hamburgers,Violations were cited in the following area(s).,26.0,Cycle Inspection / Initial Inspection,02G,Cold food item held above 41º F (smoked fish a...
87,40390409,THE FAMOUS JIMBO'S HAMBURGER PALACE,Manhattan,1345,AMSTERDAM AVENUE,10027.0,2128658777,2021-07-23,Critical,2023-12-17T06:00:11.000,40.813705,-73.956012,109.0,7.0,20901.0,1084098.0,1019660000.0,MN09,Hamburgers,Violations were cited in the following area(s).,26.0,Cycle Inspection / Initial Inspection,06F,Wiping cloths soiled or not stored in sanitizi...
285,40390409,THE FAMOUS JIMBO'S HAMBURGER PALACE,Manhattan,1345,AMSTERDAM AVENUE,10027.0,2128658777,2021-07-23,Critical,2023-12-17T06:00:11.000,40.813705,-73.956012,109.0,7.0,20901.0,1084098.0,1019660000.0,MN09,Hamburgers,Violations were cited in the following area(s).,26.0,Cycle Inspection / Initial Inspection,06D,"Food contact surface not properly washed, rins..."
69,40390409,THE FAMOUS JIMBO'S HAMBURGER PALACE,Manhattan,1345,AMSTERDAM AVENUE,10027.0,2128658777,2021-07-23,Not Applicable,2023-12-17T06:00:13.000,40.813705,-73.956012,109.0,7.0,20901.0,1084098.0,1019660000.0,MN09,Hamburgers,No violations were recorded at the time of thi...,,Administrative Miscellaneous / Initial Inspection,,
211,40390409,THE FAMOUS JIMBO'S HAMBURGER PALACE,Manhattan,1345,AMSTERDAM AVENUE,10027.0,2128658777,2021-07-23,Critical,2023-12-17T06:00:11.000,40.813705,-73.956012,109.0,7.0,20901.0,1084098.0,1019660000.0,MN09,Hamburgers,Violations were cited in the following area(s).,26.0,Cycle Inspection / Initial Inspection,04M,Live roaches present in facility's food and/or...


The data format clearly shows that individual violations from an inspection are recorded on separate rows. However, it's unusual to observe different types of inspections, such as 'Administrative Miscellaneous', occurring concurrently within a single visit. Notably, rows categorized under 'Administrative Miscellaneous' frequently present missing data in the 'score', 'violation_code' and 'violation_description' fields. The next step in our analysis involves determining the frequency of NaN values within the 'Administrative Miscellaneous' inspection category.

In [92]:
# Count the rows with a null 'score' for each inspection type.
# The null mask is summed per group directly (vectorized) instead of running a
# Python lambda over every group frame via groupby.apply.
null_score_count = (
    inspection_df['score'].isna()
    .groupby(inspection_df['inspection_type'])
    .sum()
)

# The result is a Series indexed by 'inspection_type' whose values are the
# null-'score' counts
print(null_score_count)


inspection_type
Administrative Miscellaneous / Compliance Inspection             99
Administrative Miscellaneous / Initial Inspection              5003
Administrative Miscellaneous / Re-inspection                    973
Administrative Miscellaneous / Reopening Inspection              47
Administrative Miscellaneous / Second Compliance Inspection       8
Cycle Inspection / Compliance Inspection                          0
Cycle Inspection / Initial Inspection                             0
Cycle Inspection / Re-inspection                                  0
Cycle Inspection / Reopening Inspection                           0
Cycle Inspection / Second Compliance Inspection                   0
Inter-Agency Task Force / Initial Inspection                      0
Inter-Agency Task Force / Re-inspection                           0
dtype: int64


The analysis reveals that all the missing 'score' rows are tied to various "Administrative" inspection types. This pattern suggests that "Administrative" inspections might be documenting a distinct category of violations, particularly given the occurrence of both "Administrative" and "Cycle" inspections within the same visit. This finding indicates a potential strategy to either deduce the score for these cases or consider the removal of "Administrative" inspection types from our dataset.

To proceed effectively, it's important to closely examine the specific types of violations recorded under "Administrative" inspections. Understanding the nuances of these violations will assist in determining their relevance to our overall data analysis and their impact on the comprehensive scoring system.

We filter the dataset to include only rows where 'inspection_type' starts with "Administrative" and then count the unique 'violation_description' values to understand the nature of violations in "Administrative" inspections.

In [93]:
# Rows belonging to any "Administrative ..." inspection type
is_administrative = inspection_df['inspection_type'].str.startswith("Administrative")
administrative_rows = inspection_df.loc[is_administrative]

# How often each violation description occurs among those rows
violation_description_counts = administrative_rows['violation_description'].value_counts()

# Show the tally
violation_description_counts

violation_description
Food allergy information poster not conspicuously posted where food is being prepared or processed by food workers.                                                                                                                                                                                                                                  705
Current letter grade or Grade Pending card not posted                                                                                                                                                                                                                                                                                                626
Failure to post or conspicuously post healthy eating information                                                                                                                                                                                                                                

The analysis of "Administrative" inspections revealed that these primarily involve non-food safety violations, such as missing posters, signage, or documentation, rather than critical food safety issues. Common violations in "Administrative" inspections include:

- Missing "Choking first aid" and "Alcohol and pregnancy" posters.
- Failure to post or conspicuously post current letter grades or Grade Pending cards.
- Providing certain items without customer request, such as plastic straws.

Given that "Administrative" inspections do not contribute to our food safety analysis and mainly involve non-critical violations, we made the decision to drop rows where the 'inspection_type' starts with "Administrative." This step helps streamline the dataset and focuses our analysis on relevant food safety factors.

In [94]:
# Keep everything except the "Administrative ..." inspection types
is_administrative = inspection_df['inspection_type'].str.startswith("Administrative")
inspection_df = inspection_df.loc[~is_administrative]

# Recount the nulls that remain after the removal
null_counts_by_column = inspection_df.isna().sum()
null_counts_by_column.loc[null_counts_by_column.gt(0)]

violation_code           416
violation_description    416
dtype: int64

As a result, the null values in the 'score' column have been successfully addressed, leaving no NaNs in this column. Moving forward, we will further investigate the remaining null values in the 'violation_code' and 'violation_description' columns to gain insights into their presence, even though their frequency is relatively low.

In [95]:
# Count the rows with a null 'violation_code' for each inspection type.
# Summing the null mask per group is vectorized and avoids running a Python
# lambda over every group frame via groupby.apply.
null_violation_count = (
    inspection_df['violation_code'].isna()
    .groupby(inspection_df['inspection_type'])
    .sum()
)
null_violation_count

inspection_type
Cycle Inspection / Compliance Inspection             2
Cycle Inspection / Initial Inspection              238
Cycle Inspection / Re-inspection                    37
Cycle Inspection / Reopening Inspection             69
Cycle Inspection / Second Compliance Inspection      0
Inter-Agency Task Force / Initial Inspection        69
Inter-Agency Task Force / Re-inspection              1
dtype: int64

As observed, the presence of null values in the 'violation_code' and 'violation_description' columns varies depending on the inspection type. While this insight doesn't directly explain why these nulls exist, it's a useful observation. To further investigate the underlying reasons behind these null values, we can analyze the 'action' column, which may provide more context.

In [96]:
# Rows that have no violation code
violation_code_null = inspection_df[inspection_df['violation_code'].isna()]

# Tally those rows by the recorded 'action'. Every row in this subset has a null
# 'violation_code', so the per-action null count is simply the group size.
null_violation_count = violation_code_null.groupby('action').size()
null_violation_count

action
Establishment re-opened by DOHMH.                               69
No violations were recorded at the time of this inspection.    344
Violations were cited in the following area(s).                  3
dtype: int64

In [97]:
# Sanity check: the per-action counts should add up to the total number of rows
# with a null 'violation_code'
null_violation_count.sum()

416

Our analysis has revealed that a significant portion of the null values in the 'violation_code' and 'violation_description' columns are associated with inspections where no violations were recorded. To address this, we plan to replace these null values for the 'violation_code' column with "none" and for the 'violation_description' column with "No violations were recorded." 

In [98]:
# Rows with no violation code whose 'action' states that no violations were recorded.
# na=False makes a missing 'action' evaluate to False, so the mask is strictly boolean.
no_violation_action = "No violations were recorded at the time of this inspection."
condition = (
    inspection_df['violation_code'].isna()
    & inspection_df['action'].str.startswith(no_violation_action, na=False)
)

# inspection_df was produced by boolean filtering in earlier cells; take an explicit
# copy so the assignment below targets an independent frame and does not raise
# SettingWithCopyWarning.
inspection_df = inspection_df.copy()

# Fill both violation columns with explicit "no violation" markers for these rows
inspection_df.loc[condition, ['violation_code', 'violation_description']] = ['none', 'No violations were recorded']

In [99]:
# Rows that still have no violation code after the fill above
violation_code_null = inspection_df[inspection_df['violation_code'].isna()]

# Tally the remaining rows by 'action'. Every row in this subset has a null
# 'violation_code', so the per-action null count is simply the group size.
null_violation_count = violation_code_null.groupby('action').size()
null_violation_count

action
Establishment re-opened by DOHMH.                  69
Violations were cited in the following area(s).     3
dtype: int64

To gain further clarity and address the remaining null values in the 'violation_code' and 'violation_description' columns, we will focus on a subset of rows related to reopening inspections. Specifically, we will examine these rows to understand why some of them have null values in these columns.

In [100]:
# Rows whose 'action' records that the establishment was re-opened by DOHMH
is_reopened = inspection_df['action'].str.startswith("Establishment re-opened by DOHMH")
action_reopened = inspection_df.loc[is_reopened]

# Inspect the first ten of them
action_reopened.head(10)
# action_reopened.head().to_csv('action_reopened.csv')

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,cuisine_description,action,score,inspection_type,violation_code,violation_description
0,50053144,XIN,Brooklyn,8324,3 AVENUE,11209.0,3476626222,2022-05-27,Not Applicable,2023-12-17T06:00:11.000,40.625468,-74.030249,310.0,47.0,6200.0,3152394.0,3060160000.0,BK31,Chinese,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,
8,50058053,RED HOT II,Brooklyn,349,7 AVENUE,11215.0,7183692577,2022-09-09,Critical,2023-12-17T06:00:11.000,40.666194,-73.982143,306.0,39.0,15100.0,3026127.0,3010940000.0,BK37,Chinese,Establishment re-opened by DOHMH.,20.0,Cycle Inspection / Reopening Inspection,04L,Evidence of mice or live mice in establishment...
24,50012841,LITTLENECK OUTPOST,Brooklyn,128,FRANKLIN STREET,11222.0,7183833080,2019-11-22,Not Applicable,2023-12-17T06:00:11.000,40.729271,-73.957486,301.0,33.0,56500.0,3064804.0,3025630000.0,BK76,American,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,
31,50038113,NATHAN'S FAMOUS,Bronx,200,BAYCHESTER AVENUE,10475.0,7186711234,2021-07-23,Not Applicable,2023-12-17T06:00:11.000,40.865905,-73.83043,210.0,12.0,46201.0,2120098.0,2051410000.0,BX13,Hotdogs,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,
132,50079943,EL PAISA TEPEACA,Manhattan,1548,SAINT NICHOLAS AVENUE,10040.0,9175210972,2023-03-22,Not Applicable,2023-12-17T06:00:11.000,40.853321,-73.930787,112.0,10.0,26900.0,1063808.0,1021580000.0,MN35,Mexican,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,
153,50096065,YAFA CAFE,Brooklyn,4415,4 AVENUE,11220.0,3474643999,2022-06-15,Not Applicable,2023-12-17T06:00:11.000,40.649672,-74.009226,307.0,38.0,8000.0,3011175.0,3007380000.0,BK32,Middle Eastern,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,
218,50116727,NEW TASTE OF CHINA,Queens,8026,BAXTER AVE,11373.0,7185330883,2023-06-09,Critical,2023-12-17T06:00:11.000,40.744671,-73.884595,404.0,25.0,26902.0,4000000.0,4015080000.0,QN29,Chinese,Establishment re-opened by DOHMH.,13.0,Cycle Inspection / Reopening Inspection,04L,Evidence of mice or live mice in establishment...
281,41337465,CAFE SELECT,Manhattan,212,LAFAYETTE STREET,10012.0,2129259322,2022-09-22,Critical,2023-12-17T06:00:11.000,40.721775,-73.99759,102.0,1.0,4500.0,1007222.0,1004820000.0,MN24,Eastern European,Establishment re-opened by DOHMH.,12.0,Cycle Inspection / Reopening Inspection,04L,Evidence of mice or live mice in establishment...
294,40378827,KOSHER HUT OF BROOKLYN,Brooklyn,709,KINGS HIGHWAY,11223.0,7183768996,2022-01-07,Not Applicable,2023-12-17T06:00:11.000,40.606198,-73.965438,315.0,44.0,41800.0,3177500.0,3066640000.0,BK25,Jewish/Kosher,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,
309,50009079,TING FAI CUISINE,Brooklyn,1962,86 STREET,11214.0,7183733888,2022-10-03,Critical,2023-12-17T06:00:11.000,40.604976,-73.998891,311.0,38.0,28400.0,3166862.0,3063740000.0,BK28,Chinese,Establishment re-opened by DOHMH.,9.0,Cycle Inspection / Reopening Inspection,04M,Live roaches in facility's food or non-food area.


In the case of reopening inspections, we observed that some rows had NaN values in the violation code/description, suggesting the absence of violations, while others had codes and descriptions. Additionally, the "critical_flag" column contained 'Not Applicable' when no violations were present. We can reasonably assume that this indicates no violations were found during those inspections. Therefore, we will replace the violation_code and violation_description NaNs with 'none' and 'No violations were recorded', respectively.

In [101]:
# Build the selection one criterion at a time: the code is missing, the action
# is a re-opening, and the inspection was flagged as 'Not Applicable'.
code_missing = inspection_df['violation_code'].isna()
was_reopened = inspection_df['action'].str.startswith("Establishment re-opened")
flag_not_applicable = inspection_df['critical_flag'].eq('Not Applicable')
condition = code_missing & was_reopened & flag_not_applicable

# Record explicitly that these inspections produced no violations
inspection_df.loc[condition, 'violation_code'] = 'none'
inspection_df.loc[condition, 'violation_description'] = 'No violations were recorded'

In [102]:
# Collect the rows that still have no 'violation_code'
violation_code_null = inspection_df[inspection_df['violation_code'].isna()]

# Every row in this subset has a null code, so the number of rows per 'action'
# is the number of null codes for that action. size() returns that count
# directly, without running a Python-level apply over each group.
null_violation_count = violation_code_null.groupby('action').size()

# Display the count of null 'violation_code' entries for each 'action'
null_violation_count

action
Violations were cited in the following area(s).    3
dtype: int64

That leaves us with:
- Violations were cited in the following area(s).


In [103]:
# Select the inspections whose 'action' reports cited violations and order
# them by 'violation_code' so that rows with a missing code come first.
cited_mask = inspection_df['action'].str.startswith("Violations were cited in the following area(s)")
action_violationcited = inspection_df.loc[cited_mask].sort_values(by='violation_code', na_position='first')

action_violationcited.head()
# Optional export of the preview:
# action_violationcited.head().to_csv('action_violationcited.csv')

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,cuisine_description,action,score,inspection_type,violation_code,violation_description
33637,41644180,MANHATTAN TERRACE BAR,Queens,110-00,ROCKAWAY BOULEVARD,11420.0,2122153542,2016-05-14,Not Applicable,2023-12-17T06:00:11.000,40.677665,-73.828758,410.0,32.0,86400.0,4457718.0,4115430000.0,QN55,American,Violations were cited in the following area(s).,0.0,Cycle Inspection / Initial Inspection,,
134806,41564956,RADIO CITY MUSIC HALL,Manhattan,1260,AVENUE OF THE AMERICAS,10020.0,2124857000,2017-08-09,Not Applicable,2023-12-17T06:00:11.000,40.759983,-73.980349,105.0,4.0,10400.0,1083861.0,1012660000.0,MN17,American,Violations were cited in the following area(s).,0.0,Cycle Inspection / Initial Inspection,,
150601,41688093,PAISANOS BURGERS\MELT,Brooklyn,620,ATLANTIC AVENUE,11217.0,9176186310,2018-05-11,Not Applicable,2023-12-17T06:00:11.000,40.683447,-73.975691,302.0,35.0,12902.0,3398156.0,3011180000.0,BK37,Hamburgers,Violations were cited in the following area(s).,0.0,Cycle Inspection / Initial Inspection,,
36791,41490991,LIPS RESTAURANT,Manhattan,227,EAST 56 STREET,10022.0,2126757710,2022-11-15,Critical,2023-12-17T06:00:11.000,40.759261,-73.967267,106.0,4.0,10800.0,1038592.0,1013300000.0,MN19,American,Violations were cited in the following area(s).,27.0,Cycle Inspection / Initial Inspection,02A,Time/Temperature Control for Safety (TCS) food...
22207,50080345,ANISE,Bronx,3511,JOHNSON AVENUE,10463.0,7185430500,2023-04-27,Critical,2023-12-17T06:00:11.000,40.885875,-73.909922,208.0,11.0,29700.0,2084161.0,2057941000.0,BX29,Chinese,Violations were cited in the following area(s).,66.0,Cycle Inspection / Initial Inspection,02A,Time/Temperature Control for Safety (TCS) food...


We examined inspections with the action "Violations were cited in the following area(s)" which had a mix of nulls and codes in the violation_code column. Since we cannot determine what the code should be for these cases, we have made the decision to drop these rows.

In [104]:
# Rows that report cited violations but carry no violation code cannot be
# repaired, so remove them by their index labels.
uncoded_citations = (
    inspection_df['violation_code'].isna()
    & inspection_df['action'].str.startswith("Violations were cited in the following area(s)")
)
inspection_df = inspection_df.drop(index=inspection_df.index[uncoded_citations])

### Phone

There are only a few rows with nulls in this column. We can fill these remaining nulls with a common placeholder, such as '0000000000':

In [105]:
# Fill the few missing phone numbers with the '0000000000' placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment, which warns on recent pandas versions and leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
inspection_df['phone'] = inspection_df['phone'].fillna('0000000000')

# Reassess the null counts and show only the columns that still have nulls
null_counts_by_column = inspection_df.isnull().sum()
null_counts_by_column[null_counts_by_column > 0]

Series([], dtype: int64)

There are no violation_code nulls left.

In [106]:
# Final confirmation: list any column that still contains nulls
# (an empty Series means every column is fully populated).
null_counts_by_column = inspection_df.isna().sum()
null_counts_by_column.loc[null_counts_by_column.gt(0)]

Series([], dtype: int64)

We have successfully addressed all the nulls in the DataFrame. 

In [107]:
# Per-column audit of placeholder-like values: numeric zeros, the string '0',
# nulls, empty strings and single spaces, alongside each column's dtype.
null_zero_counts = pd.DataFrame({
    'Numeric_Zero_Count': (inspection_df == 0).sum(),
    'String_Zero_Count': (inspection_df == '0').sum(),
    # Keep this per column like the other counts; summing a second time would
    # collapse it to one grand total repeated on every row.
    'Null_Count': inspection_df.isna().sum(),
    'Blank Count': (inspection_df == '').sum(),
    'Space Count': (inspection_df == ' ').sum(),
    'Data Types': inspection_df.dtypes
})

null_zero_counts

# null_zero_counts.to_csv('null_zero_counts.csv')

Unnamed: 0,Numeric_Zero_Count,String_Zero_Count,Null_Count,Blank Count,Space Count,Data Types
camis,0,0,0,0,0,int64
dba,0,0,0,0,0,object
boro,0,0,0,0,0,object
building,0,284,0,0,0,object
street,0,0,0,0,0,object
zipcode,0,0,0,0,0,float64
phone,0,0,0,0,0,object
inspection_date,0,0,0,0,0,object
critical_flag,0,0,0,0,0,object
record_date,0,0,0,0,0,object


#### Dealing with 0s

##### Building

For the 'building' column, it appears to have some 0 values, but there's not much we can do about that, so we will leave it as is.

##### Score
Regarding the 'score' column, we can infer that a score of 0 indicates no violations.

### Dealing with Data Types

In [109]:
# Review each column's dtype and non-null count before converting any types
inspection_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150964 entries, 0 to 204565
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  150964 non-null  int64  
 1   dba                    150964 non-null  object 
 2   boro                   150964 non-null  object 
 3   building               150964 non-null  object 
 4   street                 150964 non-null  object 
 5   zipcode                150964 non-null  float64
 6   phone                  150964 non-null  object 
 7   inspection_date        150964 non-null  object 
 8   critical_flag          150964 non-null  object 
 9   record_date            150964 non-null  object 
 10  latitude               150964 non-null  float64
 11  longitude              150964 non-null  float64
 12  community_board        150964 non-null  float64
 13  council_district       150964 non-null  float64
 14  census_tract           150964 non-null  f

### Building Column
First, let's address the building column.

In [324]:
# True when at least one 'building' value is made up solely of letters
letters_only = inspection_df['building'].str.isalpha()
letters_only.any()

True

Building contains a mix of letters and numbers, so it must remain an object type.

### Score Column

To prepare the 'score' column for numerical analysis, we convert it to an integer type.

In [325]:
# Store 'score' as an integer column.
# NOTE: astype(int) discards any fractional part without raising an error.
score_as_int = inspection_df['score'].astype(int)
inspection_df['score'] = score_as_int

### Float data type columns

The following columns should exclusively contain whole numbers. Currently, they are in float type. To ensure their integrity:

1. Verify if they consist of whole numbers.
2. Convert them to integers to confirm the absence of special characters.
3. Convert them back to strings, as these columns are categorical features.

In [326]:
columns_to_check = ['zipcode', 'score', 'community_board', 'council_district', 'census_tract', 'bin', 'bbl']

# A column holds only whole numbers when every value leaves no remainder
# after division by 1; report that verdict for each column in turn.
for col_name in columns_to_check:
    all_whole = inspection_df[col_name].mod(1).eq(0).all()
    print(f"{col_name} Column: {all_whole}")

zipcode Column: True
score Column: True
community_board Column: True
council_district Column: True
census_tract Column: True
bin Column: True
bbl Column: True


In [327]:
# Cast every checked column to int, then store the identifier-like columns as
# strings because they are categorical features. 'score' is a quantity, not a
# category, so it stays an integer and remains usable for numerical analysis.
for column in columns_to_check:
    inspection_df[column] = inspection_df[column].astype(int)
    if column != 'score':
        inspection_df[column] = inspection_df[column].astype(str)

### Phone Column

Let's work on the 'phone' column. We will perform the following steps:

1. Remove all non-numerical characters from the 'phone' column.
2. Replace missing or empty values with '1000000000' to avoid having all zeros.


In [328]:
# Remove every non-digit character so that only the digits of each phone
# number remain.
digits_only = inspection_df['phone'].str.replace(r'\D', '', regex=True)
inspection_df['phone'] = digits_only

In [329]:
# Swap blank values and the all-zero placeholder for '1000000000'
trimmed_phone = inspection_df['phone'].str.strip()
inspection_df['phone'] = trimmed_phone.replace(['', '0000000000'], '1000000000')

## Inspection Date Column

To standardize the 'inspection_date' column, we will follow these steps:

1. Begin by printing the 'inspection_date' from the first row of the DataFrame to check its current format (the value shown below is already in 'YYYY-MM-DD' form, with no time component).
2. Next, convert the 'inspection_date' column to datetime format and format it to display only the date in 'YYYY-MM-DD' format.
3. Finally, print the 'inspection_date' from the first row of the DataFrame again to confirm that it has been standardized to 'YYYY-MM-DD'.



In [330]:
# Show the 'inspection_date' of the first row. Positional access is used
# because rows were dropped earlier, so the index labels are no longer
# contiguous; a label lookup for 0 only works while that row still exists.
inspection_df['inspection_date'].iloc[0]

'2022-05-27'

In [331]:
# Parse 'inspection_date' as datetimes and write it back as 'YYYY-MM-DD' text,
# which removes any time component from the stored value.
inspection_df['inspection_date'] = pd.to_datetime(inspection_df['inspection_date']).dt.strftime('%Y-%m-%d')

# Confirm the format on the first row. Positional access is used because the
# index labels are not contiguous after earlier row drops, so label 0 is not
# guaranteed to exist.
inspection_df['inspection_date'].iloc[0]


'2022-05-27'

The DataFrame 'inspection_df' has been thoroughly checked and cleaned, resulting in the following characteristics:

- No null values exist in any of the columns.
- The data types of the columns are appropriate.

The data is now ready for further analysis and exploration.

In [332]:
# Re-inspect dtypes and non-null counts after the type conversions above
inspection_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150964 entries, 0 to 204565
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  150964 non-null  int64  
 1   dba                    150964 non-null  object 
 2   boro                   150964 non-null  object 
 3   building               150964 non-null  object 
 4   street                 150964 non-null  object 
 5   zipcode                150964 non-null  object 
 6   phone                  150964 non-null  object 
 7   inspection_date        150964 non-null  object 
 8   critical_flag          150964 non-null  object 
 9   record_date            150964 non-null  object 
 10  latitude               150964 non-null  float64
 11  longitude              150964 non-null  float64
 12  community_board        150964 non-null  object 
 13  council_district       150964 non-null  object 
 14  census_tract           150964 non-null  o

In [None]:
# List the distinct values present in 'record_date'
inspection_df['record_date'].unique()


In [285]:
# Run the packaged preparation routine from john_prepare (imported as `p`) on
# the same CSV and inspect the resulting frame's dtypes and non-null counts.
# NOTE(review): the captured output reports 150959 rows here versus 150964 for
# the frame cleaned step by step in this notebook - confirm whether the
# difference is expected.
df_prepared = p.prepare_data('health_inspections.csv')
df_prepared.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150959 entries, 0 to 204565
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  150959 non-null  int64  
 1   dba                    150959 non-null  object 
 2   boro                   150959 non-null  object 
 3   building               150959 non-null  object 
 4   street                 150959 non-null  object 
 5   zipcode                150959 non-null  object 
 6   phone                  150959 non-null  object 
 7   inspection_date        150959 non-null  object 
 8   critical_flag          150959 non-null  object 
 9   record_date            150959 non-null  object 
 10  latitude               150959 non-null  float64
 11  longitude              150959 non-null  float64
 12  community_board        150959 non-null  object 
 13  council_district       150959 non-null  object 
 14  census_tract           150959 non-null  o