In [None]:
import ast
import json
import re
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

# Notebook-wide pandas display settings: limit rendered cell text to 80
# characters and show at most 50 rows before pandas truncates the output.
pd.options.display.max_colwidth = 80
pd.options.display.max_rows = 50

In [None]:
# --- Dish.csv ---
# Read only the columns the notebook actually uses.
dish_df = pd.read_csv('Dish.csv', usecols=['name', 'times_appeared', 'menus_appeared', 'lowest_price', 'highest_price'])
# Lower-cased copy of the dish name for case-insensitive substring search.
# Missing names become '' instead of the literal text 'nan' (what astype(str)
# produces for NaN), which search terms such as "an" or "na" would match.
dish_df['name_lower'] = dish_df['name'].fillna('').astype(str).str.lower()
print(f'Dish.csv: {len(dish_df):,} dishes loaded')

# --- train.json ---
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default locale encoding (which would garble non-ASCII ingredient names).
with open('train.json', encoding='utf-8') as f:
    recipes_raw = json.load(f)
recipe_df = pd.DataFrame(recipes_raw)
# Lower-cased copy of each ingredient list for case-insensitive matching.
recipe_df['ingredients_lower'] = recipe_df['ingredients'].apply(lambda lst: [i.lower() for i in lst])
print(f'train.json: {len(recipe_df):,} recipes loaded')

# --- US_restaurants.csv ---
# Each row's `menus` cell is parsed into a list of menu-item dicts and
# flattened into one row per menu item. Rows without a menu are dropped first.
rest_df = pd.read_csv('US_restaurants.csv', usecols=['name', 'city', 'menus'])
rest_df = rest_df.dropna(subset=['menus'])

menu_columns = ['restaurant', 'city', 'dish_name', 'description']
menu_rows = []
skipped_rows = 0  # rows whose `menus` cell did not parse into a list

# zip over the columns instead of iterrows(): same values, but no per-row
# Series object is constructed.
for rest_name, rest_city, raw_menus in zip(rest_df['name'], rest_df['city'], rest_df['menus']):
    try:
        items = json.loads(raw_menus)
    except (json.JSONDecodeError, TypeError):
        # Not strict JSON; fall back to parsing the cell as a Python literal.
        try:
            items = ast.literal_eval(raw_menus)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # The exceptions ast.literal_eval documents for malformed input.
            skipped_rows += 1
            continue
    if not isinstance(items, list):
        skipped_rows += 1
        continue
    city = rest_city if pd.notna(rest_city) else ''
    for item in items:
        if not isinstance(item, dict):
            continue
        dish_name = item.get('name')
        description = item.get('descriptions')
        menu_rows.append({
            'restaurant': rest_name,
            'city': city,
            # An explicit None would be stringified to 'None' below and end up
            # in search_text, so treat it like a missing key.
            'dish_name': '' if dish_name is None else dish_name,
            'description': '' if description is None else description,
        })

# Passing `columns` keeps the expected schema even when nothing was parsed, so
# the column accesses below cannot raise KeyError on an empty frame.
menu_df = pd.DataFrame(menu_rows, columns=menu_columns)
menu_df['search_text'] = (menu_df['dish_name'].astype(str) + ' ' + menu_df['description'].astype(str)).str.lower()
# Count restaurants that actually contributed a menu item, not every
# restaurant with a non-null `menus` cell.
print(f'US_restaurants.csv: {len(menu_df):,} menu items parsed from {menu_df["restaurant"].nunique():,} restaurants')
if skipped_rows:
    print(f'  ({skipped_rows:,} restaurant rows skipped: menus could not be parsed as a list)')

In [None]:
# Read the ingredient query and turn it into a list of lower-cased terms.
user_input = input('Enter ingredients to search (e.g. "chicken rice"): ').strip()
# Split on whitespace and/or commas so "chicken, rice" behaves like
# "chicken rice"; dict.fromkeys drops repeated terms but keeps their order.
search_terms = list(dict.fromkeys(t for t in re.split(r'[,\s]+', user_input.lower()) if t))
if not search_terms:
    # With no terms, every "contains ALL of" filter would match every row.
    raise ValueError('No search terms entered; type at least one ingredient.')
print(f'Searching for dishes containing ALL of: {search_terms}')

In [None]:
# --- Popular Dishes (from Dish.csv) ---
# Keep only dishes whose lower-cased name contains every search term
# (plain substring match, no regex).
mask = pd.Series(True, index=dish_df.index)
for needle in search_terms:
    mask = mask & dish_df['name_lower'].str.contains(needle, regex=False)

# Source column -> display heading, in output order.
popular_headings = {
    'name': 'Dish Name',
    'times_appeared': 'Times Appeared',
    'menus_appeared': 'Menus Appeared',
    'lowest_price': 'Lowest Price',
    'highest_price': 'Highest Price',
}

# Rank the matches by how often they appeared and keep the first 15.
ranked_dishes = dish_df.loc[mask].sort_values(by='times_appeared', ascending=False)
popular = (
    ranked_dishes.head(15)[list(popular_headings)]
    .reset_index(drop=True)
    .rename(columns=popular_headings)
)

print(f'\nTop {len(popular)} most popular dishes matching {search_terms}:')
display(popular)

In [None]:
# --- Ingredient Breakdown (from train.json) ---
# A recipe matches when, for every search term, at least one of its
# ingredients contains that term as a substring.
mask = pd.Series(True, index=recipe_df.index)
for term in search_terms:
    mask &= recipe_df['ingredients_lower'].apply(lambda lst: any(term in ing for ing in lst))

matched_recipes = recipe_df[mask].copy()
print(f'Found {len(matched_recipes):,} recipes containing all of {search_terms}\n')

# First 10 matches with cuisine + ingredients
top_recipes = matched_recipes.head(10)[['cuisine', 'ingredients']].reset_index(drop=True)
top_recipes.columns = ['Cuisine', 'Ingredients']
top_recipes['Ingredients'] = top_recipes['Ingredients'].apply(', '.join)
print('--- Sample Recipes ---')
display(top_recipes)

# Most common co-occurring ingredients
all_ingredients = [ing for lst in matched_recipes['ingredients_lower'] for ing in lst]

# Exclude the searched ingredients from the co-occurrence list. Matching above
# is by substring, so the exclusion must be too: an exact-equality test would
# leave e.g. "chicken breasts" in the list for a "chicken" search.
freq = Counter(
    ing for ing in all_ingredients
    if not any(term in ing for term in search_terms)
)
common_df = pd.DataFrame(freq.most_common(20), columns=['Ingredient', 'Count'])
print(f'\n--- Most Common Co-occurring Ingredients (across {len(matched_recipes):,} recipes) ---')
display(common_df)

In [None]:
# --- Restaurant Matches (from US_restaurants.csv) ---
# Keep only menu items whose combined name + description text contains every
# search term (plain substring match, no regex).
mask = pd.Series(True, index=menu_df.index)
for needle in search_terms:
    mask = mask & menu_df['search_text'].str.contains(needle, regex=False)

# Source column -> display heading, in output order.
match_headings = {
    'restaurant': 'Restaurant',
    'city': 'City',
    'dish_name': 'Dish',
    'description': 'Description',
}

# The first 15 matching rows, in their existing order.
rest_matches = (
    menu_df.loc[mask, list(match_headings)]
    .head(15)
    .reset_index(drop=True)
    .rename(columns=match_headings)
)

print(f'Top {len(rest_matches)} restaurant menu items matching {search_terms}:')
display(rest_matches)