In [18]:
import os
import pandas as pd

data_path = os.path.join("..", "data", "raw", "380K_US_Restaurants.csv")
output_dir = os.path.join("..", "data", "scaled")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "nyc_restaurants.csv")


In [9]:
sample = pd.read_csv(data_path, nrows=5)
print(sample.columns)
sample

Index(['Title', 'Link', 'Category', 'Rating', 'Website', 'Phone', 'Address',
       'Images', 'Categories', 'Geo_Coordinates', 'Time_Zone', 'Latitude',
       'Longitude'],
      dtype='object')


Unnamed: 0,Title,Link,Category,Rating,Website,Phone,Address,Images,Categories,Geo_Coordinates,Time_Zone,Latitude,Longitude
0,Dairy Queen Grill & Chill,https://www.google.com/maps/place/Dairy+Queen+...,Fast food restaurant,3.8,http://www.fourteenfoods.com/?y_source=1_ODk5N...,+1256-496-0404,"Dairy Queen Grill & Chill, 3143 US-280, Alexan...",[{'thumbnail': 'https://lh5.googleusercontent....,"['Fast food restaurant', 'Ice cream shop']","{'latitude': 32.9338695, 'longitude': -85.9704...",America/Chicago,32.93387,-85.970419
1,Jake's Restaurant,https://www.google.com/maps/place/Jake%27s+Res...,American restaurant,4.4,http://jakesonbroad.com/,+1256-234-4300,"Jake's Restaurant, 16 Broad St, Alexander City...",[{'thumbnail': 'https://lh5.googleusercontent....,['American restaurant'],"{'latitude': 32.945406, 'longitude': -85.953806}",America/Chicago,32.945406,-85.953806
2,Carib Kitchen,https://www.google.com/maps/place/Carib+Kitche...,Caribbean restaurant,4.9,https://carib-kitchen.webnode.com/,+1256-392-4433,"Carib Kitchen, 68 Broad St, Alexander City, AL...",[{'thumbnail': 'https://lh5.googleusercontent....,['Caribbean restaurant'],"{'latitude': 32.9446173, 'longitude': -85.954932}",America/Chicago,32.944617,-85.954932
3,Cazadores Mexican Restaurant,https://www.google.com/maps/place/Cazadores+Me...,Mexican restaurant,4.5,,+1256-392-3991,"Cazadores Mexican Restaurant, 910 Cherokee Rd,...",[{'thumbnail': 'https://lh5.googleusercontent....,['Mexican restaurant'],"{'latitude': 32.935643, 'longitude': -85.952365}",America/Chicago,32.935643,-85.952365
4,La Posada Mexican Grill,https://www.google.com/maps/place/La+Posada+Me...,Mexican restaurant,4.3,http://www.laposadamexicangrill.net/,+1256-329-3005,"La Posada Mexican Grill, 3714 US-280, Alexande...",[{'thumbnail': 'https://lh5.googleusercontent....,"['Mexican restaurant', 'Latin American restaur...","{'latitude': 32.926932, 'longitude': -85.9649187}",America/Chicago,32.926932,-85.964919


40.486931, -74.275446
South Amboy, New Jersey

40.935640, -74.273059
Wayne, New Jersey

40.932935, -73.633357
Long Island Sound


40.481485, -73.636937


Latitude: 40.48 to 40.92
Longitude: -74.26 to -73.69



In [12]:
df = pd.read_csv(data_path)
df["Latitude"] = pd.to_numeric(df["Latitude"], errors='coerce')
df["Longitude"] = pd.to_numeric(df["Longitude"], errors='coerce')
lat_min, lat_max = 40.4810, 40.9290
lon_min, lon_max = -74.2690, -73.6990
nyc_restaurants = df[(df['Latitude'] >= lat_min) & (df['Latitude'] <= lat_max) & (df['Longitude'] >= lon_min) & (df['Longitude'] <= lon_max)].copy()
print(f"Filtered resturants in NYC area are: {len(nyc_restaurants)} / {len(df)} total.")


Filtered resturants in NYC area are: 7322 / 380358 total.


  df = pd.read_csv(data_path)


In [19]:
nyc_restaurants.sample()
nyc_restaurants.to_csv(output_path, index=False)

In [22]:
import ast, requests

nyc = pd.read_csv(output_path, nrows=3)
# images column is likely a string that looks like a list -> parse with ast.literal_eval
imgs = ast.literal_eval(nyc.loc[0, "Images"])
imgs[0].keys(), imgs[0].get("thumbnail")

(dict_keys(['thumbnail', 'title']),
 'https://lh5.googleusercontent.com/p/AF1QipN457HX5cp96BN-JEKbCg0axE00OHCroOGzhNlU=w224-h298-k-no')

# Images | how to store them in MongoDB?
- The thumbnail is a google-hosted image URL in the csv.
- We see the image once it is opened.
- Can store images only for top restaurants (like the ones above 4.5 rating or something. And store them as binaries in GridFS.)
- Remove all McDonald's, because they ass.

In [None]:
import re

df_nyc = nyc_restaurants.copy()
df_nyc["title_lower"] = df_nyc["Title"].str.lower()
df_nyc["categories_lower"] = df_nyc["Categories"].astype(str).str.lower()
mcd_pattern = r"mc\s*donald'?s?"
mask = (
    df_nyc["title_lower"].str.contains(mcd_pattern, regex=True, na=False) |
    df_nyc["categories_lower"].str.contains(mcd_pattern, regex=True, na=False)
)

before = len(df_nyc)
df_no_mcd = df_nyc[~mask].copy()
after = len(df_no_mcd)

print(f"Rows before McDonald's removal: {before}")
print(f"Rows after McDonald's removal: {after}")
print(f"Removed: {before - after}")


Rows before McDonald's removal: 7322
Rows after McDonald's removal: 7294
Removed: 28


In [24]:
output_dir = os.path.join("..", "data", "scaled")
os.makedirs(output_dir, exist_ok=True)

clean_path = os.path.join(output_dir, "nyc_restaurants_no_mcD.csv")
df_no_mcd.to_csv(clean_path, index=False)

print("Saved:", clean_path)

Saved: ../data/scaled/nyc_restaurants_no_mcD.csv
