In [1]:
import pandas as pd
import numpy as np
import os
import requests
import env
import john_acquire as a

In [2]:
# Enable the autoreload extension so edits to imported local modules are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

# Set the option to display all columns
# (None removes the limit on how many columns pandas shows for a DataFrame)
pd.set_option('display.max_columns', None)

In [3]:
# Read the previously acquired inspection export from disk and preview its first rows
INSPECTIONS_CSV = 'nyc_health_inspections_2000_to_2023.csv'
inspection_df = pd.read_csv(INSPECTIONS_CSV, index_col=False)
inspection_df.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date
0,50067297,GERBASI RESTAURANT,Bronx,2389,ARTHUR AVENUE,10458.0,7182205735,Italian,2021-09-12T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Inter-Agency Task Force / Initial Inspection,40.85529,-73.887797,206.0,15.0,39100.0,2011897.0,2030650000.0,BX06,,,,
1,50034232,RELISH CATERERS,Bronx,2501,3 AVENUE,10451.0,2122281672,American,2021-09-25T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Inter-Agency Task Force / Initial Inspection,40.810202,-73.928401,201.0,8.0,5100.0,2000795.0,2023200000.0,BX39,,,,
2,50064240,DAXI SICHUAN,Queens,136-20,ROOSEVELT AVENUE,11354.0,9175631983,Chinese,2022-09-21T00:00:00.000,Violations were cited in the following area(s).,Not Critical,13.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.759778,-73.829235,407.0,20.0,85300.0,4113546.0,4050190000.0,QN22,09B,Thawing procedure improper.,A,2022-09-21T00:00:00.000
3,50105603,LE PAIN QUOTIDIEN,Manhattan,81,WEST BROADWAY,10007.0,6468639168,French,2022-11-25T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:09.000,Administrative Miscellaneous / Re-inspection,40.715083,-74.009567,101.0,1.0,2100.0,1001480.0,1001368000.0,MN24,,,,
4,50069583,PHO BEST,Queens,4235,MAIN ST,11355.0,9173618878,Southeast Asian,2022-05-09T00:00:00.000,Violations were cited in the following area(s).,Critical,30.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.754418,-73.827881,407.0,20.0,85300.0,4573539.0,4051358000.0,QN22,02B,Hot food item not held at or above 140º F.,,


In [4]:
# Find what fields are missing values
# (.info() lists each column's non-null count and dtype)
inspection_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207365 entries, 0 to 207364
Data columns (total 26 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  207365 non-null  int64  
 1   dba                    207365 non-null  object 
 2   boro                   207365 non-null  object 
 3   building               207076 non-null  object 
 4   street                 207365 non-null  object 
 5   zipcode                204723 non-null  float64
 6   phone                  207359 non-null  object 
 7   cuisine_description    207365 non-null  object 
 8   inspection_date        207365 non-null  object 
 9   action                 207365 non-null  object 
 10  critical_flag          207365 non-null  object 
 11  score                  199793 non-null  float64
 12  record_date            207365 non-null  object 
 13  inspection_type        207365 non-null  object 
 14  latitude               207119 non-nu

Let's first find out what types of inspections there are; for now we are only focusing on inspections related to food safety.

In [5]:
# Distinct values found in the 'inspection_type' column (numpy array)
unique_inspection_types = inspection_df['inspection_type'].unique()

# Turn the array into a plain list, sort it alphabetically, and display it
sorted_inspection_types = unique_inspection_types.tolist()
sorted_inspection_types.sort()
sorted_inspection_types

['Administrative Miscellaneous / Compliance Inspection',
 'Administrative Miscellaneous / Initial Inspection',
 'Administrative Miscellaneous / Re-inspection',
 'Administrative Miscellaneous / Reopening Inspection',
 'Administrative Miscellaneous / Second Compliance Inspection',
 'Calorie Posting / Compliance Inspection',
 'Calorie Posting / Initial Inspection',
 'Calorie Posting / Re-inspection',
 'Cycle Inspection / Compliance Inspection',
 'Cycle Inspection / Initial Inspection',
 'Cycle Inspection / Re-inspection',
 'Cycle Inspection / Reopening Inspection',
 'Cycle Inspection / Second Compliance Inspection',
 'Inter-Agency Task Force / Initial Inspection',
 'Inter-Agency Task Force / Re-inspection',
 'Pre-permit (Non-operational) / Compliance Inspection',
 'Pre-permit (Non-operational) / Initial Inspection',
 'Pre-permit (Non-operational) / Re-inspection',
 'Pre-permit (Non-operational) / Second Compliance Inspection',
 'Pre-permit (Operational) / Compliance Inspection',
 'Pre-per

The inspection types "Calorie Posting", "Pre-permit", "Smoke-Free Air Act" and "Trans Fat" seem to be different kinds of inspection that do not directly pertain to food safety. We can drop these so we can focus on the other types: Administrative, Cycle and Inter-Agency.

In [7]:
# Inspection-type prefixes that do not directly pertain to food safety
remove_types = ["Calorie Posting", "Pre-permit", "Smoke-Free Air Act", "Trans Fat"]

# Keep only the rows whose 'inspection_type' does not start with one of the prefixes.
# .copy() makes the result an independent frame, so the later column assignments
# (e.g. inspection_df['building'] = ...) do not raise SettingWithCopyWarning.
is_removed_type = inspection_df['inspection_type'].str.startswith(tuple(remove_types))
inspection_df = inspection_df[~is_removed_type].copy()

# Number of rows left after the filter
len(inspection_df)

159325

In [8]:
# Count the missing values in every column, then show only the columns that have any
null_counts_by_column = inspection_df.isna().sum()
null_counts_by_column.loc[null_counts_by_column.gt(0)]

building                   239
zipcode                   2306
phone                        6
score                     6121
latitude                   143
longitude                  143
community_board           2658
council_district          2662
census_tract              2662
bin                       3338
bbl                        352
nta                       2658
violation_code             864
violation_description      864
grade                    79771
grade_date               82894
dtype: int64

In [9]:
# Before borrowing values from other rows of the same restaurant, confirm that a
# CAMIS id always carries the same restaurant name ('dba') and street.

# Number of distinct 'dba' and 'street' values recorded for each CAMIS
unique_dba_street = inspection_df.groupby('camis')[['dba', 'street']].nunique()

# CAMIS ids that carry more than one name or more than one street
has_conflict = unique_dba_street.gt(1).any(axis=1)
inconsistent_camis = unique_dba_street.loc[has_conflict]

# An empty frame here means every CAMIS is consistent
inconsistent_camis

Unnamed: 0_level_0,dba,street
camis,Unnamed: 1_level_1,Unnamed: 2_level_1


We found no inconsistent restaurant street or DBA, so we can assume that each CAMIS holds correct data for a single restaurant. Now we can use the CAMIS to see if we can fill any nulls with values found in a separate observation of the same restaurant.

In [10]:
# Per-restaurant lookup: the first non-null 'building' value seen for each CAMIS
building_mapping = inspection_df.groupby('camis')['building'].first()

# Rows lacking a building number borrow the value recorded for the same CAMIS;
# rows that already have one are left untouched
building_from_camis = inspection_df['camis'].map(building_mapping)
inspection_df['building'] = inspection_df['building'].where(
    inspection_df['building'].notna(), building_from_camis
)

# Report how many building numbers are still missing after the fill
missing_after = inspection_df['building'].isna().sum()
print(f"Remaining missing building numbers: {missing_after}")


Remaining missing building numbers: 239


We attempted to infer the building number, but every instance of the matching CAMIS also had missing building number. 

We attempted the same for: zipcode, phone, community_board, council_district, census_tract, bin, bbl, nta. All had the same result. 

I have decided to drop the following fields with null values because I intend to use the various categories in the model, plus it was a small percentage of the total data.
latitude, longitude, community_board, council_district, census_tract, bin, bbl, nta,

Because not all inspections receive a grade (it depends on the inspection type), we will be dropping the two grade features, grade and grade_date. We can calculate the grade from the score. The documentation also mentions that the grade may not match the score because of input errors, so it's safer this way.

There are still other fields that have missing values: score, violation_code and violation_description. However, we may be able to handle them by grouping by camis + inspection type + inspection date. If any matching observations have the missing values, we can use that data.




We only have 864 missing violation_code values and 6121 missing scores.

In [12]:
# Row masks: no score recorded / no violation code recorded
missing_score_mask = inspection_df['score'].isna()
missing_code_mask = inspection_df['violation_code'].isna()

# Sub-frames holding only the affected rows
inspection_vscore = inspection_df.loc[missing_score_mask]
inspection_vcode = inspection_df.loc[missing_code_mask]

# (rows missing a score, rows missing a violation code)
len(inspection_vscore), len(inspection_vcode)

(6121, 864)

Lets see the history of a restaurant with a null value in the violation code and try to figure out why. 

In [13]:
# Step 1: For every visit (same 'camis' and 'inspection_date'), flag whether any of
# its rows has a null 'score'. count() skips nulls while size() counts every row,
# so a visit contains a null score exactly when count() < size(). This replaces a
# per-group Python lambda with two vectorized aggregations.
grouped = inspection_df.groupby(['camis', 'inspection_date'])
groups_with_nulls = grouped['score'].count() < grouped.size()

# Step 2: Keep every row (scored or not) that belongs to one of the flagged visits
visit_index = inspection_df.set_index(['camis', 'inspection_date']).index
flagged_visits = groups_with_nulls[groups_with_nulls].index
filtered_df = inspection_df[visit_index.isin(flagged_visits)].reset_index(drop=True)

# 'filtered_df' now holds only the visits in which at least one row has a null 'score'
filtered_df.sort_values(by='camis').head(3)


Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date
5276,30112340,WENDY'S,Brooklyn,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,2022-07-13T00:00:00.000,Violations were cited in the following area(s).,Critical,11.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.662652,-73.962081,309.0,40.0,32700.0,3029737.0,3011970000.0,BK60,02G,Cold TCS food item held above 41 °F; smoked or...,A,2022-07-13T00:00:00.000
2252,30112340,WENDY'S,Brooklyn,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,2022-07-13T00:00:00.000,Violations were cited in the following area(s).,Not Critical,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.662652,-73.962081,309.0,40.0,32700.0,3029737.0,3011970000.0,BK60,20-06,Current letter grade or Grade Pending card not...,,
3570,30112340,WENDY'S,Brooklyn,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,2022-07-13T00:00:00.000,Violations were cited in the following area(s).,Not Critical,11.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.662652,-73.962081,309.0,40.0,32700.0,3029737.0,3011970000.0,BK60,10F,Non-food contact surface or equipment made of ...,A,2022-07-13T00:00:00.000


Using the date, we can see that within a single inspection each violation is in a new row. From here we can also see that a single inspection can have more than one inspection type. The Administrative Miscellaneous rows seem to be the ones that hold the NaN data. We should figure out how many rows of the Administrative types have NaN.

In [14]:
# Count the rows with a null 'score' for each 'inspection_type'.
# size() counts every row of a group and count() only the non-null scores,
# so their difference is the number of null scores in that group.
by_type = inspection_df.groupby('inspection_type')
null_score_count = by_type.size() - by_type['score'].count()

# The result is a Series where the index is 'inspection_type' and the values are the counts of null 'score'
print(null_score_count)


inspection_type
Administrative Miscellaneous / Compliance Inspection            100
Administrative Miscellaneous / Initial Inspection              4980
Administrative Miscellaneous / Re-inspection                    990
Administrative Miscellaneous / Reopening Inspection              43
Administrative Miscellaneous / Second Compliance Inspection       8
Cycle Inspection / Compliance Inspection                          0
Cycle Inspection / Initial Inspection                             0
Cycle Inspection / Re-inspection                                  0
Cycle Inspection / Reopening Inspection                           0
Cycle Inspection / Second Compliance Inspection                   0
Inter-Agency Task Force / Initial Inspection                      0
Inter-Agency Task Force / Re-inspection                           0
dtype: int64


We can see that all of the null values come from the Administrative inspection types. It could be that this type of row is used to record violations of a different kind, since there is a mix of admin and cycle inspections for a single visit. We will probably be able to infer the score or drop the Administrative inspection types.

In [15]:
# Number of rows with a null 'violation_code' for each 'inspection_type':
# all rows in the group minus the rows whose 'violation_code' is present
by_type = inspection_df.groupby('inspection_type')
null_violation_count = by_type.size() - by_type['violation_code'].count()

# Series indexed by 'inspection_type'; each value is that type's count of null 'violation_code'
print(null_violation_count)


inspection_type
Administrative Miscellaneous / Compliance Inspection             8
Administrative Miscellaneous / Initial Inspection              285
Administrative Miscellaneous / Re-inspection                    68
Administrative Miscellaneous / Reopening Inspection              0
Administrative Miscellaneous / Second Compliance Inspection      2
Cycle Inspection / Compliance Inspection                         2
Cycle Inspection / Initial Inspection                          312
Cycle Inspection / Re-inspection                                42
Cycle Inspection / Reopening Inspection                         72
Cycle Inspection / Second Compliance Inspection                  0
Inter-Agency Task Force / Initial Inspection                    72
Inter-Agency Task Force / Re-inspection                          1
dtype: int64


In [43]:
# Add up the per-group counts of null 'violation_code' to get the overall total
null_violation_count.sum()

864

The number of missing violation codes varies by inspection type. This does not tell us why they are missing, but it was worth looking at the inspection type for any clues.

Since we want to know why violation code/description is null, we can look at the "action" column to find any hints

In [49]:
# Rows that have no violation code
violation_code_null = inspection_df[inspection_df['violation_code'].isna()]

# Group those rows by 'action' and count them. Every row in 'violation_code_null'
# already has a null 'violation_code', so the group size is the null count.
# NOTE: this rebinds 'null_violation_count', which an earlier cell computed per 'inspection_type'.
null_violation_count = violation_code_null.groupby('action').size()
null_violation_count

action
Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.      1
Establishment re-opened by DOHMH.                                                                                                      72
No violations were recorded at the time of this inspection.                                                                           785
Violations were cited in the following area(s).                                                                                         6
dtype: int64

In [50]:
# Add up the per-group counts of null 'violation_code' to get the overall total
null_violation_count.sum()

864

It seems that the values are null because no violations were found. Although the action "Establishment re-opened by DOHMH." does not specifically say so, "No violations were recorded at the time of this inspection." accounts for almost all of the null values. That leaves only 7 rows where the violation code is null even though the action (akin to a description, but of the inspection) says violations were cited.

we should infer score = 0 and violation description = No Violations and code = No_violation. Then we can simply drop the rows with action = Violations were cited as a possible error in the data. 

In [79]:
# Step 1: For every visit (same 'camis' and 'inspection_date'), flag whether its rows
# carry more than one distinct 'inspection_type'
grouped = inspection_df.groupby(['camis', 'inspection_date'])
diff_inspection_type = grouped['inspection_type'].nunique().gt(1)

# Step 2: Keep every row that belongs to one of the flagged visits
visit_index = inspection_df.set_index(['camis', 'inspection_date']).index
mixed_type_visits = diff_inspection_type[diff_inspection_type].index
filtered_df = inspection_df[visit_index.isin(mixed_type_visits)].reset_index(drop=True)

# 'filtered_df' now holds only the visits whose rows do not all share one 'inspection_type'
filtered_df.sort_values(by='camis').head(3)

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date
4587,30112340,WENDY'S,Brooklyn,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,2022-07-13T00:00:00.000,Violations were cited in the following area(s).,Critical,11.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.662652,-73.962081,309.0,40.0,32700.0,3029737.0,3011970000.0,BK60,02G,Cold TCS food item held above 41 °F; smoked or...,A,2022-07-13T00:00:00.000
3105,30112340,WENDY'S,Brooklyn,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,2022-07-13T00:00:00.000,Violations were cited in the following area(s).,Not Critical,11.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.662652,-73.962081,309.0,40.0,32700.0,3029737.0,3011970000.0,BK60,10F,Non-food contact surface or equipment made of ...,A,2022-07-13T00:00:00.000
1953,30112340,WENDY'S,Brooklyn,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,2022-07-13T00:00:00.000,Violations were cited in the following area(s).,Not Critical,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.662652,-73.962081,309.0,40.0,32700.0,3029737.0,3011970000.0,BK60,20-06,Current letter grade or Grade Pending card not...,,


In [91]:
# Step 1: Most frequent 'inspection_type' of each visit (same 'camis' and 'inspection_date'),
# stored in a column named 'mode_inspection_type'
mode_inspection_type = (
    inspection_df
    .groupby(['camis', 'inspection_date'])['inspection_type']
    .agg(lambda x: x.mode()[0])
    .reset_index()
    .rename(columns={'inspection_type': 'mode_inspection_type'})
)

# Step 2: Attach each visit's mode to every one of its rows
merged_df = inspection_df.merge(mode_inspection_type, on=['camis', 'inspection_date'], how='left')

# Step 3: Keep only the rows whose 'inspection_type' differs from their visit's mode
differs_from_mode = merged_df['inspection_type'].ne(merged_df['mode_inspection_type'])
non_mode_df = merged_df[differs_from_mode]

# ...and, of those, drop the rows whose type starts with "Cycle Inspection"
is_cycle = non_mode_df['inspection_type'].str.startswith("Cycle Inspection")
non_mode_df = non_mode_df[~is_cycle]

# Step 4: Distinct 'inspection_type' values among the remaining rows
unique_non_mode_types = non_mode_df['inspection_type'].unique().tolist()

# Display the list
unique_non_mode_types

['Administrative Miscellaneous / Re-inspection',
 'Administrative Miscellaneous / Initial Inspection',
 'Administrative Miscellaneous / Compliance Inspection',
 'Administrative Miscellaneous / Reopening Inspection',
 'Inter-Agency Task Force / Initial Inspection']

In [92]:
# Distinct 'violation_description' values among the rows in 'non_mode_df'
non_mode_descriptions = non_mode_df['violation_description']
unique_non_mode_violation_description = non_mode_descriptions.unique().tolist()

# Display the list (nan stands for rows that have no description)
unique_non_mode_violation_description

[nan,
 'MISBRANDED AND LABELING',
 'Current letter grade sign not posted.',
 'Failure to post or conspicuously post healthy eating information',
 'Failure to display required signage about plastic straw availability.',
 'Nuisance created or allowed to exist. Facility not free from unsafe, hazardous, offensive or annoying condition.',
 'Failure to display required signage about plastic straw availability',
 'Current letter grade or "Grade Pending" card not posted.',
 'Providing single-use, non-compostable plastic straws to customers without customer request (including providing such straws at a self-serve station)',
 'Current letter grade or Grade Pending card not posted',
 'Permit not conspicuously displayed.',
 'Nuisance created or allowed to exist.  Facility not free from unsafe, hazardous, offensive or annoying conditions.',
 'Food allergy information poster not conspicuously posted where food is being prepared or processed by food workers.',
 'Failure to maintain a sufficient suppl

In [93]:
# Distinct 'violation_code' values among the rows in 'non_mode_df'.
# NOTE: the variable name says "description", but here it holds violation codes.
non_mode_codes = non_mode_df['violation_code']
unique_non_mode_violation_description = non_mode_codes.unique().tolist()

# Display the list (nan stands for rows that have no code)
unique_non_mode_violation_description

[nan,
 '22F',
 '20F',
 '20-08',
 '19-10',
 '20-04',
 '28-01',
 '20-06',
 '19-06',
 '18-08',
 '22A',
 '20A',
 '19-07',
 '18-11',
 '22C',
 '20D',
 '18G',
 '18D',
 '22G',
 '20E',
 '20-01',
 '19-05',
 '19-11',
 '28-03',
 '19-03',
 '18-01',
 '20-05',
 '22E',
 '28-04',
 '18C',
 '19-04',
 '20-07',
 '18F',
 '04J',
 '18B',
 '04A',
 '18-14',
 '18-02',
 '18-13',
 '19-01',
 '19-08']

In [95]:
# Keep only the rows of 'non_mode_df' whose 'inspection_type' starts with "Inter-Agency"
is_task_force = non_mode_df['inspection_type'].str.startswith("Inter-Agency")
task_force_df_ = non_mode_df.loc[is_task_force]

# Distinct 'violation_description' values recorded on those rows
task_force_descriptions = task_force_df_['violation_description']
unique_non_mode_violation_description = task_force_descriptions.unique().tolist()

# Display the list
unique_non_mode_violation_description

['Appropriately scaled metal stem-type thermometer or thermocouple not provided or used to evaluate temperatures of potentially hazardous foods during cooking, cooling, reheating and holding.',
 'Food Protection Certificate (FPC) not held by manager or supervisor of food operations.']

In [96]:
# Display the rows of 'non_mode_df' whose inspection type starts with "Inter-Agency"
task_force_df_

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date,mode_inspection_type
35230,41646811,EL PATRON NIGHTCLUB CABARET-RESTAURANT,Bronx,1465,JEROME AVENUE,10452.0,6462280275,American,2016-11-05T00:00:00.000,Violations were cited in the following area(s).,Critical,8.0,2023-12-01T06:00:08.000,Inter-Agency Task Force / Initial Inspection,40.841056,-73.917106,204.0,16.0,21900.0,2008317.0,2028580000.0,BX26,04J,Appropriately scaled metal stem-type thermomet...,,,Administrative Miscellaneous / Initial Inspection
63888,50081955,STAND UP CAFE,Brooklyn,7215,18 AVENUE,11204.0,7182328866,Chinese,2022-09-16T00:00:00.000,Violations were cited in the following area(s).,Critical,10.0,2023-12-01T06:00:08.000,Inter-Agency Task Force / Initial Inspection,40.614978,-73.99436,311.0,43.0,26600.0,3158517.0,3061940000.0,BK28,04A,Food Protection Certificate (FPC) not held by ...,,,Administrative Miscellaneous / Initial Inspection


In [None]:
# Row masks: no score recorded / no violation code recorded
missing_score_mask = inspection_df['score'].isna()
missing_code_mask = inspection_df['violation_code'].isna()

# Sub-frames holding only the affected rows
inspection_vscore = inspection_df.loc[missing_score_mask]
inspection_vcode = inspection_df.loc[missing_code_mask]

# (rows missing a violation code, rows missing a score)
len(inspection_vcode), len(inspection_vscore)

(864, 6121)

In [69]:
# All rows whose 'inspection_type' is exactly the Administrative Miscellaneous initial inspection
is_admin_initial = inspection_df['inspection_type'].eq('Administrative Miscellaneous / Initial Inspection')
inspection_df.loc[is_admin_initial]

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date
15,50087361,PROOF COFFEE ROASTERS,Manhattan,2286,7 AVENUE,10030.0,3142300768,Coffee/Tea,2023-06-27T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.814888,-73.944014,110.0,9.0,22800.0,1058842.0,1.019400e+09,MN03,,,,
23,50039250,LIN'S GARDEN,Bronx,3986,WHITE PLAINS ROAD,10466.0,7185190135,Chinese,2023-04-27T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.888148,-73.860254,212.0,12.0,40600.0,2063440.0,2.048390e+09,BX44,,,,
30,50050386,PANDA EXPRESS,Manhattan,835,3 AVENUE,10022.0,2123552821,Chinese,2022-07-20T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:09.000,Administrative Miscellaneous / Initial Inspection,40.756303,-73.970402,106.0,4.0,9800.0,1081187.0,1.013240e+09,MN19,,,,
55,50092607,JALLOH FAMILY RESTAURANT,Bronx,3396,3 AVENUE,10456.0,9176595169,African,2023-05-16T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.827899,-73.907168,203.0,17.0,18500.0,2004198.0,2.026080e+09,BX35,,,,
72,50018434,SEA WOLF,Brooklyn,19,WYCKOFF AVENUE,11237.0,7183883265,American,2022-07-13T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:09.000,Administrative Miscellaneous / Initial Inspection,40.706642,-73.922966,304.0,34.0,44700.0,3072408.0,3.031770e+09,BK77,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206948,50127672,TBAR,Manhattan,116,EAST 60 STREET,10022.0,2127720404,American,2022-09-29T00:00:00.000,Violations were cited in the following area(s).,Not Critical,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.763189,-73.968540,108.0,4.0,11402.0,1041906.0,1.013940e+09,MN40,20-04,“Choking first aid” poster not posted. “Alcoho...,,
206949,50090163,PLACE TO BEACH CANTINA,Brooklyn,1301,BOARDWALK WEST,11224.0,7183735862,American,2019-07-13T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:09.000,Administrative Miscellaneous / Initial Inspection,40.573047,-73.980865,313.0,47.0,35200.0,3189663.0,3.070740e+09,BK21,,,,
206989,50136766,Balila Labanese Restaurant,Manhattan,1640,3 AVENUE,10128.0,2014107189,Middle Eastern,2023-06-29T00:00:00.000,Violations were cited in the following area(s).,Not Critical,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.782492,-73.951310,108.0,5.0,15801.0,1078536.0,1.015200e+09,MN40,20-04,“Choking first aid” poster not posted. “Alcoho...,,
207199,41630950,CITY LINE PIZZA & PASTA,Brooklyn,1224,LIBERTY AVENUE,11208.0,7182774992,Pizza,2021-08-25T00:00:00.000,Violations were cited in the following area(s).,Not Critical,,2023-12-01T06:00:08.000,Administrative Miscellaneous / Initial Inspection,40.679015,-73.863896,305.0,37.0,118800.0,3094524.0,3.042060e+09,BK83,22C,"Bulb not shielded or shatterproof, in areas wh...",,


In [47]:
# Rows with no violation code, newest inspection first
vcode_date = inspection_vcode.sort_values(by=['inspection_date'], ascending=False)

# Display them ordered by CAMIS and, within a CAMIS, by inspection date (both descending).
# Both keys go into one sort_values call: a second single-key sort uses the default
# non-stable algorithm and is not guaranteed to keep the date order from the first sort.
vcode_date.sort_values(by=['camis', 'inspection_date'], ascending=False)

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date
197017,50144058,RESPITE CENTERS (STEWART HOTEL),Manhattan,371,7 AVENUE,10001.0,9178480367,American,2023-11-22T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Pre-permit (Non-operational) / Initial Inspection,40.748799,-73.992204,105.0,3.0,10100.0,1015174.0,1.008060e+09,MN17,,,N,
84323,50143779,ROSE VALLEY CAKES BY KAVITA & KAMLA,Queens,25501B,UNION TPKE,11004.0,9174953351,Bakery Products/Desserts,2023-11-13T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Pre-permit (Non-operational) / Initial Inspection,40.745066,-73.715683,413.0,23.0,155102.0,4440956.0,4.085130e+09,QN44,,,N,
136278,50143605,NEPALI MOMO CAFE,Queens,7418,37TH RD,11372.0,3477384199,Asian/Asian Fusion,2023-11-13T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Pre-permit (Non-operational) / Initial Inspection,40.747252,-73.891246,403.0,25.0,28900.0,4029839.0,4.012850e+09,QN28,,,N,
47333,50143038,ZEN KITCHEN,Brooklyn,1736,SHORE PARKWAY,11214.0,7187999134,"Juice, Smoothies, Fruit Salads",2023-10-18T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Pre-permit (Non-operational) / Initial Inspection,40.593522,-73.996817,311.0,43.0,30400.0,3170082.0,3.064910e+09,BK29,,,N,
50885,50141803,L'INDUSTRIE PIZZERIA,Manhattan,104,CHRISTOPHER STREET,10014.0,2122560648,Pizza,2023-10-26T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:09.000,Pre-permit (Operational) / Initial Inspection,40.733275,-74.004925,102.0,3.0,7300.0,1010039.0,1.005880e+09,MN23,,,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106964,40376515,AMERICAN MUSEUM OF NATURAL HISTORY FOOD COURT,Manhattan,,W 79 STREET,10024.0,2127695370,American,2023-06-21T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,,,,,,,,,,,A,2023-06-21T00:00:00.000
145390,40367339,COCKTAIL ROOM (NYAC),Manhattan,180,CENTRAL PARK SOUTH,10019.0,2127677000,American,2022-02-16T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.766718,-73.978737,105.0,4.0,13700.0,1023750.0,1.010110e+09,MN17,,,A,2022-02-16T00:00:00.000
117370,40365942,HOP KEE RESTAURANT,Manhattan,21,MOTT STREET,10013.0,2129648365,Chinese,2022-09-23T00:00:00.000,Establishment re-opened by DOHMH.,Not Applicable,0.0,2023-12-01T06:00:08.000,Cycle Inspection / Reopening Inspection,40.714364,-73.998784,103.0,1.0,2900.0,1001821.0,1.001640e+09,MN27,,,C,2022-09-23T00:00:00.000
53835,40365784,GARGIULO'S RESTAURANT,Brooklyn,2911,WEST 15 STREET,11224.0,7182664891,Italian,2023-03-16T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,0.0,2023-12-01T06:00:09.000,Cycle Inspection / Initial Inspection,40.576555,-73.982584,313.0,47.0,34800.0,3339622.0,3.070640e+09,BK21,,,,


In [30]:
# Per-restaurant lookup: the first non-null 'community_board' value seen for each CAMIS
community_board_mapping = inspection_df.groupby('camis')['community_board'].first()

# Rows lacking a community board borrow the value recorded for the same CAMIS;
# rows that already have one are left untouched
community_board_from_camis = inspection_df['camis'].map(community_board_mapping)
inspection_df['community_board'] = inspection_df['community_board'].where(
    inspection_df['community_board'].notna(), community_board_from_camis
)

# Report how many community boards are still missing after the fill
missing_after = inspection_df['community_board'].isna().sum()
print(f"Remaining missing community_board numbers: {missing_after}")


Remaining missing community_board numbers: 3153


In [36]:
# Inspection history of one example restaurant, oldest inspection first
example_camis = 41360872
is_example_restaurant = inspection_df['camis'].eq(example_camis)
inspection_df.loc[is_example_restaurant].sort_values(by='inspection_date')

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,critical_flag,score,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,violation_code,violation_description,grade,grade_date
125478,41360872,DUNKIN,Staten Island,2945,VETERANS ROAD WEST,10309.0,7189843717,Donuts,2022-03-30T00:00:00.000,Violations were cited in the following area(s).,Not Critical,3.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.527753,-74.231871,503.0,51.0,22600.0,5150130.0,5075110000.0,SI11,10F,Non-food contact surface improperly constructe...,A,2022-03-30T00:00:00.000
106069,41360872,DUNKIN,Staten Island,2945,VETERANS ROAD WEST,10309.0,7189843717,Donuts,2023-04-05T00:00:00.000,Violations were cited in the following area(s).,Not Critical,4.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.527753,-74.231871,503.0,51.0,22600.0,5150130.0,5075110000.0,SI11,10F,Non-food contact surface or equipment made of ...,A,2023-04-05T00:00:00.000
197602,41360872,DUNKIN,Staten Island,2945,VETERANS ROAD WEST,10309.0,7189843717,Donuts,2023-04-05T00:00:00.000,Violations were cited in the following area(s).,Not Critical,4.0,2023-12-01T06:00:08.000,Cycle Inspection / Initial Inspection,40.527753,-74.231871,503.0,51.0,22600.0,5150130.0,5075110000.0,SI11,10E,Accurate thermometer not provided or properly ...,A,2023-04-05T00:00:00.000
205818,41360872,DUNKIN,Staten Island,2945,VETERANS ROAD WEST,10309.0,7189843717,Donuts,2023-04-05T00:00:00.000,No violations were recorded at the time of thi...,Not Applicable,,2023-12-01T06:00:09.000,Administrative Miscellaneous / Initial Inspection,40.527753,-74.231871,503.0,51.0,22600.0,5150130.0,5075110000.0,SI11,,,,
