Code to clean JSON and CSV inputs for better outputs

In [2]:
# necessary imports

import pandas as pd
import numpy as np
import csv
import json
import re
from difflib import get_close_matches # for fuzzy matching

In [8]:
# get restaurant names from inspection data

# Paths are named once so the messages below can interpolate them without
# nesting double quotes inside an f-string (a SyntaxError before Python 3.12).
MERGED_CSV_PATH = "../outputs/merged_restaurant_data.csv"
NAMES_TXT_PATH = "../outputs/restaurant_names.txt"


def build_search_queries(names):
    """Format restaurant names as "<name> NYC restaurant" search queries.

    Missing values (None/NaN) and blank names are skipped: formatting them
    would produce "nan NYC restaurant" or a bare " NYC restaurant" line,
    neither of which identifies a restaurant.

    Args:
        names: iterable of raw values taken from the CSV 'name' column.

    Returns:
        list[str]: one query per usable name, in input order.
    """
    queries = []
    for raw_name in names:
        if pd.isna(raw_name):
            continue
        cleaned = str(raw_name).strip()
        if cleaned:
            queries.append(f"{cleaned} NYC restaurant")
    return queries


try:
    df = pd.read_csv(MERGED_CSV_PATH)

    # Extract the 'name' column and turn each usable name into a query
    restaurant_names = df['name'].tolist()
    formatted_names = build_search_queries(restaurant_names)

    # Write one query per line; utf-8 so non-ASCII names survive
    with open(NAMES_TXT_PATH, 'w', encoding='utf-8') as txtfile:
        txtfile.writelines(f"{name}\n" for name in formatted_names)

    print(f"Restaurant names saved to {NAMES_TXT_PATH}")

except FileNotFoundError:
    print(f"Error: Input CSV file '{MERGED_CSV_PATH}' not found.")
except KeyError as e:
    # str(KeyError) already wraps the key in quotes, so none are added here
    print(f"Error: Column {e} not found in the CSV. Check the header.")
except Exception as e:
    print(f"An error occurred: {e}")

Restaurant names saved to ../outputs/restaurant_names.txt


In [10]:
# add instagram data restaurant names

with open("../outputs/IG_posts.json", "r", encoding="utf-8") as f:
    insta_data = json.load(f)

# An @mention: captions are lower-cased first, so only a-z is needed.
# Periods are allowed inside a handle (e.g. "@joes.pizza") but the match must
# end on a letter, digit or underscore so sentence punctuation
# ("thanks @joespizza.") is not swallowed.
MENTION_PATTERN = re.compile(r'@([a-z0-9_](?:[a-z0-9_.]*[a-z0-9_])?)')

mentioned_handles = set()
for entry in insta_data:
    # `or ""` also covers posts whose caption is present but null,
    # which `.get("caption", "")` would pass through as None.
    caption = (entry.get("caption") or "").lower()
    mentioned_handles.update(MENTION_PATTERN.findall(caption))

# Sorted so the output file is identical from run to run
# (set iteration order for strings is not stable across interpreter runs).
restaurant_names = sorted(mentioned_handles)

# put into file
with open("../outputs/restaurant_names.json", "w", encoding="utf-8") as file:
    json.dump(restaurant_names, file, indent=4)

print("Data has been written to file")
    


Data has been written to file


In [14]:
# create nodes for Gephi
# Creates a Gephi-compatible nodes CSV from the restaurant data.

# Paths are named once so the status message can interpolate them without
# nesting double quotes inside an f-string (a SyntaxError before Python 3.12).
MERGED_CSV_PATH = "../outputs/merged_restaurant_data.csv"
NODES_CSV_PATH = "../outputs/nodes.csv"
NODE_COLUMNS = ["name", "BOROUGH", "categories_list", "SCORE", "GRADE"]


def build_nodes_frame(restaurants):
    """Build the Gephi nodes table from the merged restaurant data.

    Args:
        restaurants: DataFrame containing at least the NODE_COLUMNS columns.

    Returns:
        A new DataFrame with columns Id, Label, BOROUGH, categories_list,
        SCORE, GRADE. 'Id' is the 0-based row position, 'Label' is the
        renamed 'name' column, SCORE is coerced to numeric (unparseable
        values become missing), and every missing value is finally replaced
        by an empty string. The input frame is not modified.

    Raises:
        KeyError: if any of NODE_COLUMNS is absent from `restaurants`.
    """
    # Gephi reads the node caption from a column called "Label"
    nodes = restaurants[NODE_COLUMNS].rename(columns={"name": "Label"})

    # Gephi needs an "Id" column; use the row position, placed first
    nodes.insert(0, "Id", range(len(nodes)))

    # Coerce before filling: filling first would turn SCORE into strings
    nodes["SCORE"] = pd.to_numeric(nodes["SCORE"], errors="coerce")

    return nodes.fillna("")


try:
    df = pd.read_csv(MERGED_CSV_PATH)

    nodes_df = build_nodes_frame(df)

    # Write nodes to CSV
    nodes_df.to_csv(NODES_CSV_PATH, index=False, encoding="utf-8")

    print(f"Nodes CSV saved to {NODES_CSV_PATH}")

except FileNotFoundError:
    print("Error: Input CSV file not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Nodes CSV saved to ../outputs/nodes.csv


In [17]:
# create edges for gephi
# Connect every pair of nodes that share a BOROUGH with an undirected,
# weight-1 edge.

df = pd.read_csv("../outputs/nodes.csv")

# Plain lists: looking rows up with df.iloc inside a nested loop rebuilds a
# Series for every comparison and dominates the runtime.
node_ids = df["Id"].tolist()
boroughs = df["BOROUGH"].tolist()

# Bucket row positions by borough, remembering each row's rank inside its
# bucket. Rows with a missing borough are left out: NaN never compares equal
# to NaN, so such rows never share a borough with anything.
positions_by_borough = {}
rank_in_borough = {}
for pos, borough in enumerate(boroughs):
    if pd.isna(borough):
        continue
    bucket = positions_by_borough.setdefault(borough, [])
    rank_in_borough[pos] = len(bucket)
    bucket.append(pos)

# For each row (ascending), pair it with every later row of the same borough.
# This yields edges ordered by (source position, target position), the order
# a full i < j scan over all rows would produce.
edges = [
    [node_ids[pos], node_ids[later], "Undirected", 1]
    for pos, rank in rank_in_borough.items()
    for later in positions_by_borough[boroughs[pos]][rank + 1:]
]

edges_df = pd.DataFrame(edges, columns=["Source", "Target", "Type", "Weight"])
edges_df.to_csv("../outputs/boro_edges.csv", index=False)

In [23]:
# filter google places reviews
# Reduce each aggregated review record to the handful of fields used later.

with open("../outputs/review_aggregated.json", "r", encoding="utf-8") as f:
    review_data = json.load(f)

filtered_reviews = []

for entry in review_data:
    # `or {}` / `or []` cover both a missing key and an explicit JSON null,
    # so the lookups below never run on None.
    updates = entry.get("updatesFromCustomers") or {}
    posted_by = updates.get("postedBy") or {}
    media_items = updates.get("media") or []

    filtered_reviews.append({
        "city": entry.get("city"),
        "category": entry.get("categoryName"),
        "title": entry.get("title"),
        "score": entry.get("totalScore"),
        "review": updates.get("text"),
        "name": posted_by.get("name"),
        "links": [media.get("link") for media in media_items],
    })

# Explicit encoding so the file does not depend on the platform default
with open("../outputs/filtered_reviews.json", "w", encoding="utf-8") as file:
    json.dump(filtered_reviews, file, indent=4)

print("Data has been written to file")

Data has been written to file


In [24]:
# only keep those review names which we have active information for
# NOTE: this cell reads ../outputs/filtered_reviews.json and then overwrites
# it with the narrowed-down list.

QUERY_SUFFIX = " NYC restaurant"

with open("../outputs/restaurant_names.txt", "r", encoding="utf-8") as f:
    restaurant_names = []
    for line in f:
        name = line.strip()
        # Strip the query suffix only at the end of the line, so a name that
        # happens to contain the same words elsewhere is left intact.
        if name.endswith(QUERY_SUFFIX):
            name = name[:-len(QUERY_SUFFIX)]
        restaurant_names.append(name)

with open("../outputs/filtered_reviews.json", "r", encoding="utf-8") as f:
    all_reviews = json.load(f)

# Duplicate candidates cannot change whether a close match exists, so match
# against each distinct name once.
candidate_names = list(dict.fromkeys(restaurant_names))

# Titles repeat across reviews; remember each title's verdict instead of
# re-running the fuzzy match for every review.
title_is_known = {}


def has_known_restaurant(title):
    """Return True if `title` fuzzily matches any known restaurant name.

    Uses difflib.get_close_matches with cutoff=0.6; the comparison is
    case-sensitive. NOTE(review): if the names file and the review titles
    use different casing conventions, matches may be missed -- confirm.
    """
    if title not in title_is_known:
        title_is_known[title] = bool(
            get_close_matches(title, candidate_names, n=1, cutoff=0.6)
        )
    return title_is_known[title]


filtered_data = [
    entry
    for entry in all_reviews
    if entry.get("title") and has_known_restaurant(entry.get("title"))
]

with open("../outputs/filtered_reviews.json", "w", encoding="utf-8") as file:
    json.dump(filtered_data, file, indent=4, ensure_ascii=False)

In [25]:
# dump those restaurants into a .txt file
# One line per kept review, holding that review's restaurant title.

with open("../outputs/filtered_reviews.txt", "w", encoding="utf-8") as titles_file:
    titles_file.writelines(
        review.get("title") + "\n" for review in filtered_data
    )