In [1]:
import requests
import pandas as pd

# Define the URL
url = "https://data.cityofnewyork.us/resource/43nn-pn8j.json"

# Initialize an empty list to store data
data_list = []

# Initialize offset
offset = 0

# Define limit
limit = 1000

# Make API calls until all data is fetched
while True:
    # Construct the URL with the offset
    params = {"$limit": limit, "$offset": offset}
    response = requests.get(url, params=params)
    
    # Check if the response is successful
    if response.status_code == 200:
        # Append the fetched data to the list
        data_list.extend(response.json())
        
        # Increment the offset
        offset += limit
        
        # If the length of fetched data is less than the limit, it means we have reached the end of the dataset
        if len(response.json()) < limit:
            break
    else:
        print("Error occurred while fetching data")
        break

# Convert the list of dictionaries to a DataFrame
restos = pd.DataFrame(data_list)

# Display the first few rows of the DataFrame
print(restos.head())

      camis                      dba       boro building             street  \
0  50102347                 SHUKETTE  Manhattan      230           9 AVENUE   
1  50148522                OK CANAAN     Queens     4318            MAIN ST   
2  50094451                      NaN  Manhattan       45  ROCKEFELLER PLAZA   
3  50117919  FRESCA BOWL & POKE MAHI   Brooklyn      117       ADAMS STREET   
4  50147466       SHAKEN NOT STIRRED  Manhattan     1113           1 AVENUE   

  zipcode       phone          inspection_date   critical_flag  \
0   10001  2122421803  1900-01-01T00:00:00.000  Not Applicable   
1   11355  7188868844  1900-01-01T00:00:00.000  Not Applicable   
2   10111  2126080555  1900-01-01T00:00:00.000  Not Applicable   
3   11201  5706208622  1900-01-01T00:00:00.000  Not Applicable   
4   10065  4046411973  1900-01-01T00:00:00.000  Not Applicable   

               record_date  ...         bbl   nta cuisine_description action  \
0  2024-03-24T06:00:12.000  ...  1007480001  MN1

In [2]:
# Get the shape of the df
num_rows = restos.shape[0]

# Print the number of rows
print("Number of rows:", num_rows)

# Get the column names and data types
column_info = restos.dtypes

# Print the column names and data types
print("Column names and data types:")
for column_name, dtype in column_info.items():
    print(f"{column_name}: {dtype}")

Number of rows: 221726
Column names and data types:
camis: object
dba: object
boro: object
building: object
street: object
zipcode: object
phone: object
inspection_date: object
critical_flag: object
record_date: object
latitude: object
longitude: object
community_board: object
council_district: object
census_tract: object
bin: object
bbl: object
nta: object
cuisine_description: object
action: object
violation_code: object
violation_description: object
score: object
inspection_type: object
grade: object
grade_date: object


In [3]:
# Count the number of N/A values in each column
na_counts = restos.isna().sum()

# Print the column names and their corresponding N/A counts
print("Column names and N/A counts:")
for column_name, na_count in na_counts.items():
    print(f"{column_name}: {na_count}")

Column names and N/A counts:
camis: 0
dba: 561
boro: 0
building: 425
street: 0
zipcode: 2756
phone: 2
inspection_date: 0
critical_flag: 0
record_date: 0
latitude: 287
longitude: 287
community_board: 3426
council_district: 3421
census_tract: 3421
bin: 4478
bbl: 670
nta: 3426
cuisine_description: 2291
action: 2291
violation_code: 3437
violation_description: 3437
score: 10603
inspection_type: 2291
grade: 112526
grade_date: 121230


In [4]:
# Display the top 5 rows of the 'inspection_date' column
top_5_inspection_dates = restos['inspection_date'].head(5)
print(top_5_inspection_dates)

0    1900-01-01T00:00:00.000
1    1900-01-01T00:00:00.000
2    1900-01-01T00:00:00.000
3    1900-01-01T00:00:00.000
4    1900-01-01T00:00:00.000
Name: inspection_date, dtype: object


In [5]:
# Display the bottom 10 rows of the 'inspection_date' column
bottom_10_inspection_dates = restos['inspection_date'].tail(10)
print(bottom_10_inspection_dates)

221716    2022-04-01T00:00:00.000
221717    2023-02-14T00:00:00.000
221718    2023-03-29T00:00:00.000
221719    2023-02-23T00:00:00.000
221720    2024-02-27T00:00:00.000
221721    2021-11-10T00:00:00.000
221722    2022-03-17T00:00:00.000
221723    2022-07-11T00:00:00.000
221724    2023-03-22T00:00:00.000
221725    2021-10-14T00:00:00.000
Name: inspection_date, dtype: object


In [6]:
# Convert 'inspection_date' column to datetime format
restos['inspection_date'] = pd.to_datetime(restos['inspection_date'])

# Count the rows where 'inspection_date' equals '1900-01-01'
count = (restos['inspection_date'] == '1900-01-01').sum()

print("Number of rows with inspection_date equals '1900-01-01':", count)

Number of rows with inspection_date equals '1900-01-01': 2291


In [7]:
# Create a new DataFrame 'restos_2' excluding rows with inspection_date equals '1900-01-01'
restos_2 = restos[restos['inspection_date'] != '1900-01-01']

# Reset index of the new DataFrame
restos_2.reset_index(drop=True, inplace=True)

# Now, 'restos_2' contains rows excluding '1900-01-01' in the 'inspection_date' column
print(restos_2)

           camis                      dba       boro building  \
0       50014686               JONGRO BBQ  Manhattan       22   
1       41168300  ORIGINAL NAPOLI'S PIZZA   Brooklyn      594   
2       50112968          DELICE MACARONS  Manhattan  321 1/2   
3       41171181                  169 BAR  Manhattan      169   
4       50063129              PUBLIC ARTS  Manhattan      215   
...          ...                      ...        ...      ...   
219430  50105869       EL POTE RESTAURANT  Manhattan      718   
219431  50097461       NEW CAMERON BAKERY  Manhattan      242   
219432  50117634              JUNKO SUSHI     Queens     3302   
219433  50115749         BASS AND BOURBON   Brooklyn      113   
219434  50101503      EASTVIEW SR. LIVING  Manhattan     2306   

                            street zipcode       phone inspection_date  \
0                 WEST   32 STREET   10001  2124732233      2022-05-11   
1                  CRESCENT STREET   11208  7188279734      2022-02-15 

In [8]:
num_rows_restos_2 = restos_2.shape[0]
print("Number of rows in restos_2:", num_rows_restos_2)

Number of rows in restos_2: 219435


In [9]:
# Use unique() function to get unique values in the 'inspection_date' column
unique_inspection_dates = restos_2['inspection_date'].unique()

print("Unique inspection dates:")
print(unique_inspection_dates)

Unique inspection dates:
['2022-05-11T00:00:00.000000000' '2022-02-15T00:00:00.000000000'
 '2021-09-01T00:00:00.000000000' ... '2017-05-31T00:00:00.000000000'
 '2021-10-03T00:00:00.000000000' '2022-04-16T00:00:00.000000000']


In [10]:
# Find the minimum and maximum dates
min_date = restos_2['inspection_date'].min()
max_date = restos_2['inspection_date'].max()

print("Range of dates in the 'inspection_date' column:")
print("Minimum date:", min_date)
print("Maximum date:", max_date)

Range of dates in the 'inspection_date' column:
Minimum date: 2015-09-24 00:00:00
Maximum date: 2024-03-22 00:00:00


In [11]:
# Get the shape of the df
num_rows_2 = restos_2.shape[0]

# Print the number of rows
print("Number of rows:", num_rows_2)
# Number of rows: 217475

# Get the column names and data types
column_info_2 = restos_2.dtypes

# Print the column names and data types
print("Column names and data types:")
for column_name, dtype in column_info_2.items():
    print(f"{column_name}: {dtype}")

Number of rows: 219435
Column names and data types:
camis: object
dba: object
boro: object
building: object
street: object
zipcode: object
phone: object
inspection_date: datetime64[ns]
critical_flag: object
record_date: object
latitude: object
longitude: object
community_board: object
council_district: object
census_tract: object
bin: object
bbl: object
nta: object
cuisine_description: object
action: object
violation_code: object
violation_description: object
score: object
inspection_type: object
grade: object
grade_date: object


So we started with 221,726 rows in the original dataframe "restos". And after removing the 2291 rows that have a value of "1900-01-01" in the "inspection_date" column, we are left with 219,435 rows.

In [12]:
# Display the top 10 rows of the 'action' column
top_10_action = restos_2['action'].head(10)
print(top_10_action)

0      Violations were cited in the following area(s).
1                    Establishment re-opened by DOHMH.
2    No violations were recorded at the time of thi...
3    No violations were recorded at the time of thi...
4      Violations were cited in the following area(s).
5      Violations were cited in the following area(s).
6    No violations were recorded at the time of thi...
7      Violations were cited in the following area(s).
8    No violations were recorded at the time of thi...
9      Violations were cited in the following area(s).
Name: action, dtype: object


In [13]:
# Display the bottom 10 rows of the 'action' column
bottom_10_action = restos_2['action'].tail(10)
print(bottom_10_action)

219425      Violations were cited in the following area(s).
219426      Violations were cited in the following area(s).
219427    Establishment Closed by DOHMH. Violations were...
219428      Violations were cited in the following area(s).
219429      Violations were cited in the following area(s).
219430      Violations were cited in the following area(s).
219431      Violations were cited in the following area(s).
219432      Violations were cited in the following area(s).
219433      Violations were cited in the following area(s).
219434      Violations were cited in the following area(s).
Name: action, dtype: object


In [15]:
# Filter rows with NaN or NA values in 'violation_code'
filtered_df = restos_2[restos_2['violation_code'].isna()]

# Count the number of rows with NaN or NA values in 'violation_code'
num_rows_with_nan = len(filtered_df)

# Get unique values in the 'action' column of the filtered DataFrame
unique_actions = filtered_df['action'].unique()

print("Number of rows with NaN or NA values in 'violation_code':", num_rows_with_nan)
print("Unique actions for rows with NaN or NA values in 'violation_code':")
print(unique_actions)

Number of rows with NaN or NA values in 'violation_code': 1146
Unique actions for rows with NaN or NA values in 'violation_code':
['Establishment re-opened by DOHMH.'
 'No violations were recorded at the time of this inspection.'
 'Violations were cited in the following area(s).'
 'Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.']


In [21]:
# Remove rows with NaN or NA values in both "latitude" and "longitude" columns
restos_2 = restos_2.dropna(subset=['latitude', 'longitude'], how='all')

# Reset index of the new DataFrame
restos_2.reset_index(drop=True, inplace=True)

# Now, 'restos_2' contains rows without NaN or NA values in both "latitude" and "longitude" columns
print(restos_2)

           camis                      dba       boro building  \
0       50014686               JONGRO BBQ  Manhattan       22   
1       41168300  ORIGINAL NAPOLI'S PIZZA   Brooklyn      594   
2       50112968          DELICE MACARONS  Manhattan  321 1/2   
3       41171181                  169 BAR  Manhattan      169   
4       50063129              PUBLIC ARTS  Manhattan      215   
...          ...                      ...        ...      ...   
219173  50105869       EL POTE RESTAURANT  Manhattan      718   
219174  50097461       NEW CAMERON BAKERY  Manhattan      242   
219175  50117634              JUNKO SUSHI     Queens     3302   
219176  50115749         BASS AND BOURBON   Brooklyn      113   
219177  50101503      EASTVIEW SR. LIVING  Manhattan     2306   

                            street zipcode       phone inspection_date  \
0                 WEST   32 STREET   10001  2124732233      2022-05-11   
1                  CRESCENT STREET   11208  7188279734      2022-02-15 

In [25]:
# Check if there are any rows with "0" in the "latitude" column
has_zero_latitude = (restos_2['latitude'] == 0).any()

if has_zero_latitude:
    print("There are rows with '0' in the 'latitude' column.")
else:
    print("There are no rows with '0' in the 'latitude' column.")

There are no rows with '0' in the 'latitude' column.


In [26]:
# Get the unique values in the 'boro' column
unique_boro_values = restos_2['boro'].unique()

# Print the unique values
print("Unique values in the 'boro' column:", unique_boro_values)

Unique values in the 'boro' column: ['Manhattan' 'Brooklyn' 'Bronx' 'Queens' 'Staten Island']


In [27]:
# Get the unique values in the "dba" column
unique_dbas = restos_2['dba'].unique()

# Count the number of unique values
num_unique_dbas = len(unique_dbas)

print("Number of unique values in the 'dba' column:", num_unique_dbas)

print("Unique values in the 'dba' column:")
for dba in unique_dbas:
    print(dba)

Number of unique values in the 'dba' column: 21008
Unique values in the 'dba' column:
JONGRO BBQ
ORIGINAL NAPOLI'S PIZZA
DELICE MACARONS
169 BAR
PUBLIC ARTS
MESA COYOACAN
GERMAN DELIGHTS INC
HARDEE
ITO
SHELI RESTAURANT
MADISON CAFE
LOTUS LOUNGE
BLUE SKY
CITI FIELD BUD ISLAND, STAND 140
ORSAY
KING DRAGON
EL MIL SABORES
SUPERFINE
ISLE OF US
MCDONALD'S
SPRING THAI FUSION & BAR
THE LOFT STEAKHOUSE
ALEX TACOS
LA LUNA CAFE
FORTY CARROTS
LANDMARK TAVERN
GRAND AVENUE PIZZA
TIO NACHO
GRAND HYATT NEW YORK HOTEL
UPSIDE PIZZA
JOE'S PIZZA
ROYAL CARIBBEAN BAKERY & RESTAURANT
CABANA BREEZE
GUANG JI BAKERY
SOULFUL FOODS
ORIGINAL JOE'S PIZZA
Bon Appetit
A-1 PIZZA SHOP
STELLA'S PIZZA
REGGIANO'S II
EDEN WOK
GEIDO RESTAURANT
BLUE MEZCAL
MITOUSHI
CHLOE'S
SKINFLINTS RESTAURANT
DYLAN'S CANDY BAR
PENNYLANE COFFEE
CITI FIELD STAND 423
SHANGHAI TIME
SUEDE
1982 COFFEE ROASTERS
BIG WONG RESTAURANT
SURREAL CREAMERY
SUSHI SEKI
CEO
I LOVE DIM SUM
ODD SISTER
JOHN'S PIZZA
VIA CAROTA
THE OASIS CAFE
THE ORIGINAL EMILIO'

In [44]:
# Write the dataframe "restos_2" to a CSV file
restos_2.to_csv("restos_2_mar_24.csv", index=False)