In [1]:
import openai
from langchain import PromptTemplate
from openai import OpenAI
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
import pandas as pd
import json
import os
import tqdm as tqdm
import random

# Make Subsets of Existing Data

In [2]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def write_json(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

In [76]:
def generate_subset(input_filename, output_filename, subset_size=10000):
    subset = []
    with open(input_filename, 'r') as file:
        for i, line in enumerate(file):
            if i >= subset_size:  # Stop after reading subset_size objects
                break
            try:
                obj = json.loads(line)  # Try to parse each line as a JSON object
                subset.append(obj)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from line {i}: {e}")
                continue

    with open(output_filename, 'w') as outfile:
        json.dump(subset, outfile, ensure_ascii=False, indent=4)



# Example usage
input_filename = '/Users/justin/Downloads/yelp_dataset/yelp_academic_dataset_business.json'
output_filename = '/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/restaurant_10000.json'
generate_subset(input_filename, output_filename)

input_filename = '/Users/justin/Downloads/yelp_dataset/yelp_academic_dataset_user.json'
output_filename = '/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/user_10000.json'
generate_subset(input_filename, output_filename)

input_filename = '/Users/justin/Downloads/yelp_dataset/yelp_academic_dataset_review.json'
output_filename = '/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/review_10000.json'
generate_subset(input_filename, output_filename)

### Restaurant

In [None]:
tastes = [
    "Spicy",
    "Sweet",
    "Sour",
    "Salty",
    "Umami",
    "Bitter",
    "Earthy",
    "Smokey",
    "Herbal",
    "Fruity",
    "Nutty",
    "Buttery",
    "Cheesy",
    "Tangy",
    "Savory",
    "Rich",
    "Creamy",
    "Zesty",
    "Mild",
    "Fiery",
    "Crispy",
    "Tender",
    "Juicy",
    "Dry",
    "Moist",
    "Fluffy",
    "Crunchy",
    "Gooey",
    "Chewy",
    "Frothy"
]

In [77]:
def operating_hours_to_string(hours_dict):
    days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    summary_str = ""
    for day in days_order:
        if day in hours_dict:
            summary_str += f"{day}: {hours_dict[day]}\n"
        else:
            summary_str += f"{day}: Closed\n"  # Assuming closed if not specified
    return summary_str.strip() 

In [78]:
def assign_city_indices(city_list):
    cities = set(dic["city"] for dic in city_list if "city" in dic)
    sorted_cities = sorted(cities)
    city_indices = {city: index for index, city in enumerate(sorted_cities)}
    return city_indices

In [82]:
def generate_restaurant(input_filename, output_filename):
    """
    Restaurants(RestaurantId:INT[PK], 
    RestaurantName:VARCHAR(255), 
    Stars:REAL, 
    Hours:VARCHAR(255), 
    Category:VARCHAR(127), 
    LocationId:VARCHAR(255)[FK to Location.LocationId])
    """
    output = []
    original_file = read_json(input_filename)
    
    city_indices = assign_city_indices(original_file)
    
    for i in range(len(original_file)):
        new_sample = {}
        new_sample["RestaurantId"] = i+1
        # new_sample["LocationId"] = city_indices[original_file[i]["city"]]
        new_sample["LocationId"] = random.randint(1,1000)
        new_sample["Hours"] = operating_hours_to_string(original_file[i]["hours"])
        new_sample["Stars"] = original_file[i]["stars"]
        # new_sample["Category"] = original_file[i]["categories"]
        new_sample["Category"] = tastes[random.randint(0, 29)]
        new_sample["RestaurantName"] = original_file[i]["name"]
        output.append(new_sample)

        
    write_json(output,output_filename)

    

In [90]:
generate_restaurant("/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/restaurant_10000.json", "/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/restaurant_10000.json")

### Users

In [None]:
def assign_city_indices(city_list):
    cities = set(dic["city"] for dic in city_list if "city" in dic)
    sorted_cities = sorted(cities)
    city_indices = {city: index for index, city in enumerate(sorted_cities)}
    return city_indices

In [35]:
data = read_json("/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/user.json")
new_data = []
i = 1
for sample in data:
    new_sample = {}
    new_sample["user_id"] = i
    new_sample["name"] = sample["name"]
    new_sample["Taste"] = tastes[random.randint(0, 29)]
    new_sample["Password"] = sample["user_id"]
    i+=1
    new_data.append(new_sample)
write_json(new_data, "/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/user.json")
    
    

### Review

In [None]:
"""
RestaurantId:INT[FK to Restaurant.RestaurantId], 
UserId:INT[FK to User.UserId], 
Date:DATETIME[PK], 
Star:REAL, 
Text:MEDIUMTEXT
"""

In [43]:
def generate_review(restaurant_data_path, user_data_path, input_filename, output_filename):
    """
    RestaurantId:INT[FK to Restaurant.RestaurantId], 
    UserId:INT[FK to User.UserId], 
    Date:DATETIME[PK], 
    Star:REAL, 
    Text:MEDIUMTEXT
    """
    output = []
    original_file = read_json(input_filename)
    restaurant_data = read_json(restaurant_data_path)
    user_data = read_json(user_data_path)
    
    
    for i in range(len(original_file)):
        new_sample = {}
        new_sample["RestaurantId"] = restaurant_data[i]["RestaurantId"]
        new_sample["UserId"] = user_data[i]["UserId"]
        new_sample["Stars"] = original_file[i]["stars"]
        new_sample["Text"] = original_file[i]["text"]
        new_sample["Date"] = original_file[i]["date"]
        output.append(new_sample)

        
    write_json(output,output_filename)

In [54]:
generate_review(user_data_path="/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/user.json", 
                restaurant_data_path="/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/restaurant.json",
                input_filename="/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/review.json",
                output_filename="/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/review.json")

### Location

In [56]:
def generate_location(restaurant_data_path, original_restaurant_data_path, output_filename):
    '''
    LocationId:INT[PK], 
    Latitude:REAL, 
    Longitude:REAL, 
    PostalCode:VARCHAR(31), 
    State:VARCHAR(15), 
    City:VARCHAR(31)
    '''
    
    output = []
    restaurant_data = read_json(restaurant_data_path)
    original_restaurant_data = read_json(original_restaurant_data_path)
    
    
    for i in range(len(restaurant_data)):
        new_sample = {}
        new_sample["LocationId"] = restaurant_data[i]["LocationId"]
        new_sample["Latitude"] = original_restaurant_data[i]["latitude"]
        new_sample["Longitude"] = original_restaurant_data[i]["longitude"]
        new_sample["PostalCode"] = original_restaurant_data[i]["postal_code"]
        new_sample["State"] = original_restaurant_data[i]["state"]
        new_sample["City"] = original_restaurant_data[i]["city"]
        output.append(new_sample)

        
    write_json(output,output_filename)

In [57]:
generate_location(restaurant_data_path="/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/restaurant.json", original_restaurant_data_path="/Users/justin/Desktop/sp24-cs411-team027-team3x9/original_data/restaurant.json", output_filename="/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/location.json")

### Dishes

In [41]:
client = OpenAI(api_key = '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')

def get_dish_name_completion(prompt, model="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a naming helper."},
            {"role": "user", "content": f"Generate names for 5 dishes or items suitable for a restaurant or store called {prompt}.1, seperate the names by ';', 2, just return the names and nothing else."},
        ]
    )
    return response.choices[0].message.content

In [42]:
get_dish_name_completion("sea food")

"1. Ocean's Delight Platter; \n2. Seaside Bounty Pasta; \n3. Captain's Catch Tacos; \n4. Neptune's Feast Bowl; \n5. Marine Delight Sushi Rolls"

In [43]:
def extract_dish_names(s):
    lines = s.split("\n")
    dish_names = [line.split(". ", 1)[1].split(";")[0] for line in lines if ". " in line]
    return dish_names

In [44]:
extract_dish_names("1. Ocean Bounty Platter; \n2. Seashell Surprise Soup; \n3. Maritime Medley Salad; \n4. Neptune's Delight Pasta; \n5. Coastal Catch Tacos; \n6. Lighthouse Lobster Roll; \n7. Seaside Scallops Stir-fry; \n8. Coral Reef Ceviche; \n9. Tidal Wave Tempura; \n10. Captain's Choice Clambake")

['Ocean Bounty Platter',
 'Seashell Surprise Soup',
 'Maritime Medley Salad',
 "Neptune's Delight Pasta",
 'Coastal Catch Tacos',
 'Lighthouse Lobster Roll',
 'Seaside Scallops Stir-fry',
 'Coral Reef Ceviche',
 'Tidal Wave Tempura',
 "Captain's Choice Clambake"]

In [49]:
from tqdm import tqdm
def generate(restaurant_data_path, output_filename):
    """
    DishId:INT[PK], 
    RestaurantId:INT[FK to Restaurant.RestaurantId], 
    Price:REAL, 
    Name:VARCHAR(127)
    """
    
    output = []
    restaurant_data = read_json(restaurant_data_path)
    
    j=0
    for i in tqdm(range(len(restaurant_data))):
        name_list = extract_dish_names(get_dish_name_completion(restaurant_data[i]["RestaurantName"]))
        for dishname in name_list:
            
            new_sample = {}
            new_sample["DishId"] = j+1
            new_sample["RestaurantId"] = restaurant_data[i]["RestaurantId"]
            new_sample["Price"] = random.randint(5,15)
            new_sample["Name"] = dishname
            output.append(new_sample)

        
    write_json(output,output_filename)

In [50]:
generate("/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/restaurant_10000.json", "/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/dish_50000.json")

100%|██████████| 10000/10000 [2:39:12<00:00,  1.05it/s]  


In [54]:
sample = read_json("/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/dish_50000.json")
print(len(sample))
for i in range(len(sample)):
    sample[i]["DishId"] = i+1
write_json(sample, "/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/dish_50000.json")

21855


### Crime

In [3]:
def assign_crime_indices(location_list):
    postals = set(dic["PostalCode"] for dic in location_list if "PostalCode" in dic)
    sorted_postals = sorted(postals)
    dic = {}
    for index, postal in enumerate(sorted_postals):
        dic[postal] = random.randint(0, 50)
    return dic

In [4]:

def generate_crime(location_data_path, output_filename):
    """
    Crimes(CrimeId:INT[PK]), 
    Count:INT, 
    Type:INT, 
    Cleared:BOOLEAN, 
    LocationId:INT[FK to Location.LocationId])
    """
    
    output = []
    location_data = read_json(location_data_path)
    
    
    for i in range(10000):
        new_sample = {}
        new_sample["CrimeId"] = i+1
        new_sample["Count"] = random.randint(0,30)
        new_sample["Type"] = random.randint(0,6)
        random_boolean = random.choice([0, 1])
        new_sample["Cleared"] = random_boolean
        new_sample["LocationId"] = random.randint(1, 1000)
        
        output.append(new_sample)

        
    write_json(output,output_filename)

In [5]:
generate_crime("/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/location.json", "/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/crime_10000.json")

In [57]:
def generate_occurence(input, output_filename):
    """
    RestaurantId:INT[PK], 
    CrimeId:INT[PK]
    """
    output=[]
    crime = read_json(input)
    for i in tqdm(range(len(crime))):
        new_sample = {}
        new_sample["LocationId"] = crime[i]["LocationId"]
        new_sample["CrimeId"] =crime[i]["CrimeId"]
        
        output.append(new_sample)
        
    write_json(output,output_filename)
    

In [58]:
generate_occurence(input = "/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/crime_10000.json",output_filename="/Users/justin/Desktop/sp24-cs411-team027-team3x9/data/occurence_10000.json")

100%|██████████| 10000/10000 [00:00<00:00, 3315393.25it/s]




In [6]:


def convert_json_to_csv(source_dir, target_dir):
    """
    Converts all JSON files in the source directory to CSV format and saves them in the target directory.

    Args:
    - source_dir (str): The directory containing the JSON files.
    - target_dir (str): The directory where the CSV files will be saved.
    """
    # Create the target directory if it does not exist
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Loop through all files in the source directory
    for filename in os.listdir(source_dir):
        if filename.endswith('.json'):
            # Construct the full file path
            json_path = os.path.join(source_dir, filename)


In [7]:
def convert_json_to_csv(source_dir, target_dir):
    """
    Converts all JSON files in the source directory to CSV format and saves them in the target directory.

    Args:
    - source_dir (str): The directory containing the JSON files.
    - target_dir (str): The directory where the CSV files will be saved.
    """
    # Create the target directory if it does not exist
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Loop through all files in the source directory
    for filename in os.listdir(source_dir):
        if filename.endswith('.json'):
            # Construct the full file path
            json_path = os.path.join(source_dir, filename)
            csv_path = os.path.join(target_dir, filename.replace('.json', '.csv'))
            
            # Load the JSON data
            with open(json_path, 'r') as json_file:
                data = json.load(json_file)
            
            # Convert the JSON data to a DataFrame
            df = pd.DataFrame(data)
            
            # Save the DataFrame to a CSV file
            df.to_csv(csv_path, index=False)
            
            print(f"Converted {filename} to CSV.")

# Example usage:
source_directory = '/Users/justin/Desktop/sp24-cs411-team027-team3x9/data'
target_directory = '/Users/justin/Desktop/sp24-cs411-team027-team3x9/data_csv'
convert_json_to_csv(source_directory, target_directory)


Converted crime.json to CSV.
Converted dish.json to CSV.
Converted restaurant.json to CSV.
Converted location.json to CSV.
Converted restaurant_10000.json to CSV.
Converted user.json to CSV.
Converted occurence_10000.json to CSV.
Converted crime_10000.json to CSV.
Converted review.json to CSV.
Converted occurence.json to CSV.
Converted dish_50000.json to CSV.
