In [None]:
# Install required libraries
!pip install requests pandas pymupdf

# Import necessary libraries
import requests
import re
import pandas as pd
import json
import fitz  # PyMuPDF
from google.colab import files

def extract_and_merge_menu_data():
    """Download the configured menu PDFs, parse them and merge the results.

    For every entry of the hard-coded ``menu_pdfs`` list the PDF is fetched,
    written to the working directory under its URL basename, converted to
    plain text with PyMuPDF and passed to ``extract_menu_items``. A failure
    for one restaurant is printed and does not stop the remaining ones.

    When at least one item was extracted, the merged data is written to
    ``merged_restaurant_menus.csv`` and ``merged_restaurant_menus.json`` and
    both files are offered for download through ``google.colab.files``.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted menu item, or
        ``None`` when nothing could be extracted from any PDF.
    """
    # Define the menu PDFs to process
    menu_pdfs = [
        {"url": "https://www.ihcltata.com/content/dam/luxury/hotels/taj-palace-delhi/documents/Capital-Kitchen.pdf",
         "name": "Capital Kitchen"},
        {"url": "https://www.ihcltata.com/content/dam/luxury/hotels/taj-palace-delhi/documents/Spicy-Duck.pdf",
         "name": "Spicy Duck"},
        {"url": "https://www.vivantahotels.com/content/dam/luxury/hotels/Taj_Mahal_Delhi/documents/menu-pdfs/Rick's_Menu.PDF",
         "name": "Rick's Bar"}
    ]

    all_menu_items = []

    # Process each menu PDF
    for menu in menu_pdfs:
        url = menu["url"]
        restaurant_name = menu["name"]

        print(f"\nProcessing {restaurant_name} menu from {url}")

        try:
            # Download the PDF. The timeout keeps a single unresponsive host
            # from blocking the whole run; a timeout error is reported by the
            # handler below like any other per-restaurant failure.
            response = requests.get(url, timeout=30)
            if response.status_code != 200:
                print(f"Failed to download PDF. Status code: {response.status_code}")
                continue

            # Save the PDF locally
            pdf_filename = url.split('/')[-1]
            with open(pdf_filename, "wb") as f:
                f.write(response.content)
            print(f"Downloaded {pdf_filename}")

            # Extract text using PyMuPDF; always release the document, even
            # when text extraction fails part-way through.
            pdf_document = fitz.open(stream=response.content, filetype="pdf")
            try:
                menu_text = "".join(page.get_text() + "\n\n" for page in pdf_document)
            finally:
                pdf_document.close()

            # Process text to extract menu items
            menu_items = extract_menu_items(menu_text, restaurant_name)

            if menu_items:
                all_menu_items.extend(menu_items)
                print(f"Extracted {len(menu_items)} menu items")
            else:
                print(f"No menu items extracted from {restaurant_name}")

        except Exception as e:
            # Best-effort: report the problem and carry on with the next PDF.
            print(f"Error processing {restaurant_name}: {e}")

    # Create merged dataset
    if all_menu_items:
        # Convert to DataFrame
        menu_df = pd.DataFrame(all_menu_items)

        # Save to CSV and JSON
        menu_df.to_csv("merged_restaurant_menus.csv", index=False)
        with open("merged_restaurant_menus.json", "w", encoding="utf-8") as f:
            json.dump(all_menu_items, f, indent=2, ensure_ascii=False)

        # Display summary
        print(f"\nSuccessfully merged {len(all_menu_items)} menu items from all restaurants")
        print("\nSample items:")
        print(menu_df[["restaurant", "item", "price", "description"]].head(10))
        print("\nData saved to 'merged_restaurant_menus.csv' and 'merged_restaurant_menus.json'")

        # Download files in Colab
        files.download("merged_restaurant_menus.csv")
        files.download("merged_restaurant_menus.json")

        return menu_df
    else:
        print("No menu items were extracted from any restaurant")
        return None

def extract_menu_items(text, restaurant_name):
    """Parse plain menu text into a list of menu-item dictionaries.

    The text is scanned line by line:

    * an all-uppercase line of 4-39 characters that does not end in digits
      becomes the current section heading;
    * a line ending in a standalone 3-4 digit number is a menu item, the
      number being its price and the text before it the item name;
    * the line directly after an item is used as its description when it is
      non-empty, does not itself end in digits and is not all-uppercase.

    Args:
        text: Text extracted from a menu, lines separated by ``"\\n"``.
        restaurant_name: Value stored under the ``"restaurant"`` key of
            every returned item.

    Returns:
        A list of dicts with the keys ``restaurant``, ``section``, ``item``,
        ``price`` (prefixed with the rupee sign) and ``description`` (the
        string ``"None"`` when no description line was found).
    """
    # Lines ending in a digit run are never section headings or descriptions.
    trailing_digits = re.compile(r'\d{3,4}$')
    # The lookbehind makes the price a complete number: without it the tail
    # of a longer run (e.g. "12500" or a phone number) would be cut off and
    # reported as the price, leaving stray digits in the item name.
    price_pattern = re.compile(r'(?<!\d)(\d{3,4})$')

    menu_items = []
    current_section = "Menu Items"
    lines = text.split('\n')

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if not line:
            continue

        # Check for section headers (all caps, no price)
        if line.isupper() and 3 < len(line) < 40 and not trailing_digits.search(line):
            current_section = line
            continue

        # Look for menu items with prices (3-4 digit number at end)
        price_match = price_pattern.search(line)
        if not price_match:
            continue

        price = price_match.group(1)
        item_name = line[:price_match.start()].strip()

        # Skip lines that are likely not menu items
        if len(item_name) < 3 or item_name.isdigit():
            continue

        # Look for description in the next line
        description = "None"
        if i + 1 < len(lines):
            next_line = lines[i + 1].strip()
            if next_line and not trailing_digits.search(next_line) and not next_line.isupper():
                description = next_line

        menu_items.append({
            "restaurant": restaurant_name,
            "section": current_section,
            "item": item_name,
            "price": "₹" + price,
            "description": description
        })

    return menu_items

# Run the extraction and merging. `merged_menus` holds the merged DataFrame,
# or None when no items could be extracted from any of the PDFs.
merged_menus = extract_and_merge_menu_data()



Processing Capital Kitchen menu from https://www.ihcltata.com/content/dam/luxury/hotels/taj-palace-delhi/documents/Capital-Kitchen.pdf
Downloaded Capital-Kitchen.pdf
Extracted 67 menu items

Processing Spicy Duck menu from https://www.ihcltata.com/content/dam/luxury/hotels/taj-palace-delhi/documents/Spicy-Duck.pdf
Downloaded Spicy-Duck.pdf
No menu items extracted from Spicy Duck

Processing Rick's Bar menu from https://www.vivantahotels.com/content/dam/luxury/hotels/Taj_Mahal_Delhi/documents/menu-pdfs/Rick's_Menu.PDF
Downloaded Rick's_Menu.PDF
Extracted 2 menu items

Successfully merged 69 menu items from all restaurants

Sample items:
        restaurant                      item  price  \
0  Capital Kitchen             SEAFOOD PIZZA  ₹1450   
1  Capital Kitchen           PIZZA PEPPERONI  ₹1450   
2  Capital Kitchen  THE CAPITAL PIZZA 1450 /  ₹1150   
3  Capital Kitchen  CLASSIC PIZZA MARGHERITA  ₹1150   
4  Capital Kitchen               PIZZA FUNGI  ₹1150   
5  Capital Kitchen       

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows of `menu_df`.
# NOTE(review): no visible cell binds `menu_df` at notebook level (the merge
# function's return value is stored in `merged_menus`), so this presumably
# relies on state from a cell that is not part of this notebook — confirm.
menu_df.head()

Unnamed: 0,section,name,price,description
0,Menu Items,All prices are in Indian Rupees and subject to...,₹1450,"Calamari, shrimp, smoked salmon, con/f_it garl..."
1,Menu Items,PIZZA PEPPERONI,₹1450,"Pork pepperoni, crushed tomatoes, mozzarella, ..."
2,Menu Items,THE CAPITAL PIZZA 1450 /,₹1150,"Barbecued chicken, red onion, fresh coriander,..."
3,Menu Items,CLASSIC PIZZA MARGHERITA,₹1150,"Mozzarella, fresh basil, tomato sauce"
4,Menu Items,PIZZA FUNGI,₹1150,


In [None]:
import pandas as pd
# Merge the per-restaurant frames for 'Spicy Duck' and 'Ricks Bar' when both
# are present; otherwise just print whichever one exists.
# Assuming menu_dataframes is the dictionary returned by extract_multiple_menus()
# It contains DataFrames for each restaurant
# NOTE(review): neither `menu_dataframes` nor `extract_multiple_menus` is
# defined in the cells shown here — confirm where they come from.

if 'Spicy Duck' in menu_dataframes and 'Ricks Bar' in menu_dataframes:
    # Both menus available: stack them into one frame with a fresh index.
    merged_df = pd.concat([menu_dataframes['Spicy Duck'], menu_dataframes['Ricks Bar']], ignore_index=True)
    print("\nMerged Menu Data:")
    print(merged_df)

    # Save the merged dataframe to a new CSV file
    merged_df.to_csv("merged_menu.csv", index=False)
    print("\nMerged data saved to merged_menu.csv")

elif 'Spicy Duck' in menu_dataframes:
    # Only one menu available: nothing is merged or saved, it is just printed.
    print("\nOnly Spicy Duck menu data extracted.")
    print(menu_dataframes['Spicy Duck'])
elif 'Ricks Bar' in menu_dataframes:
    print("\nOnly Ricks Bar menu data extracted.")
    print(menu_dataframes['Ricks Bar'])
else:
    print("\nNo menu data was extracted for either restaurant.")



Only Ricks Bar menu data extracted.
       section                                               name  price  \
0   Menu Items                Sherry Cinzano (Dry, Rosso, Bianco)   ₹600   
1   Menu Items                                            Campari   ₹600   
2   Menu Items                                             Pernod   ₹600   
3   Menu Items                              Harveys Bristol Cream   ₹600   
4   Menu Items                                           Tio Pepe   ₹600   
..         ...                                                ...    ...   
93      N.O.M.                                   Sparkling Frenzy  ₹1350   
94      N.O.M.  A refreshing mix of gin with fresh basil leave...  ₹1000   
95      N.O.M.                                                     ₹1150   
96      N.O.M.                                                     ₹1250   
97      N.O.M.                         Curries of South East ASIA  ₹1350   

                                          descript

In [None]:

# Alias `menu_df` as `df` and print its first rows.
# NOTE(review): `menu_df` is not bound by any cell shown in this notebook —
# confirm which earlier step provides it.
df = menu_df
print(df.head())


      section                                               name  price  \
0  Menu Items  All prices are in Indian Rupees and subject to...  ₹1450   
1  Menu Items                                    PIZZA PEPPERONI  ₹1450   
2  Menu Items                           THE CAPITAL PIZZA 1450 /  ₹1150   
3  Menu Items                           CLASSIC PIZZA MARGHERITA  ₹1150   
4  Menu Items                                        PIZZA FUNGI  ₹1150   

                                         description  
0  Calamari, shrimp, smoked salmon, con/f_it garl...  
1  Pork pepperoni, crushed tomatoes, mozzarella, ...  
2  Barbecued chicken, red onion, fresh coriander,...  
3              Mozzarella, fresh basil, tomato sauce  
4                                                N/A  


In [None]:
import re
import pandas as pd

menu_text = '''
Curried vegetable turnover 350
Puff pastry, mix vegetable masala
Chilli Paneer Croissant 350
Flaky pastry, chili paneer preparation
Corn - spinach quiche 350
Short crust pastry, corn & spinach preparation
Chicken Tikka Puff 400
Puff pastry, chicken tikka masala
Keema Mattar Croissant 400
Flaky pastry, lamb keema masala
Chicken Mushroom quiche 400
Short crust pastry, chicken & mushroom preparation
Red velvet cake half kg 1050
Soft red velvet sponge, cream cheese frosting
Chocolate truffle cake half kg 950
Soft chocolate sponge, dark chocolate truffle
Mix fruit cake half kg 850
Soft vanilla sponge, cream patisserie, assorted fruits
Pineapple cake half kg 850
Soft vanilla sponge, cream patisserie, pineapple
Carrot cake half kg 1050
Grated carrot, egg, refined flour, vegetable oil, cream cheese frosting, walnut
New York cheese cake half kg 1050
Cream cheese, fresh cream, egg, butter, biscuit crumbs
Sugar free/ liqueur/ truffles 4 / 6  / 9 / 12 pc 275 /375 / 475 / 575
Key lime pie 275
White couverture, lime truffle
Banana blossom 275
Milk couverture, banana chocolate truffle
Dulce de leche 275
Milk couverture, caramel filling
Coffee truffle 275
Dark couverture, coffee truffle
Strawberry ginger truffle 275
Milk couverture, strawberry ginger truffle
Lemon 275
Almond meal, powder sugar, white couverture, lemon juice
Chocolate ganache 275
Almond meal, powder sugar, cocoa powder, vegetable oil
Pistachio 275
Almond meal, powder sugar, white couverture, pistachio paste, butter
Raspberry 275
Almond meal, powder sugar, white couverture, raspberry puree, butter
Coffee 275
Almond meal, powder sugar, dark couverture, roasted coffee beans, butter
Vanilla 275
Almond meal, powder sugar, white couverture, butter
Red velvet Pastry 295
Soft red velvet sponge, cream cheese frosting
CARAMEL TIRAMISU SLICE 516
Soft vanilla sponge, italian mascarpone cream, caramel sauce
Fresh Fruit Pastry 347
Soft vanilla sponge, cream patisserie, assorted fruits
Black Forest 100
Soft chocolate sponge, dark chocolate cream, vanilla cream, dark cherry filling
CHOCOLATE HAZELNUT SLICE 639
Soft chocolate sponge, dark chocolate truffle, hazelnut crunch
BAKED BERRY CHEESECAKE 519
Cream cheese, fresh cream, egg , berry confit, biscuit crumbs
Butter Croissant 195
Refined flour, butter, milk
Pain au Chocolate 521
Refined flour, butter, milk, dark couverture
Chocolate Croissant 511
Refined flour, butter, milk, dark couverture
Chocolate Doughnut 615
Dark couverture, butter, milk, flour, egg, cream
Oats and Cranberry Cookies 75 per pc
White couverture, butter, flour, cream, oats, dried cranberry, golden syrup
Double Choco Chip Cookies 75 per pc
Dark couverture, butter, flour, cream, cocoa powder, golden syrup
CHOCOLATE FUDGE COOKIES 75 per pc
Butter, flour, egg, walnut
Blueberry crumble 195
Egg, cream, refined flour, almond meal, yogurt, butter
Cranberry Orange 195
Egg, cream, refined flour, almond meal, yogurt, butter, dried cranberry
Chocolate Coffee Toffee 195
Egg, cream, refined flour, almond meal, yogurt, butter, coffee
'''

# Parse the pasted menu text into item / price / description rows.
# A line that ends in a price is an item; the line right after it is taken as
# its description unless that line ends in a price as well.
# The price may carry a trailing "per pc" unit (e.g. "... Cookies 75 per pc");
# accepting it keeps per-piece items from being silently skipped. The unit is
# kept in the stored price so no information is lost.
price_pattern = re.compile(r'(\d+(?:\s+per\s+pc)?)$', re.IGNORECASE)

lines = menu_text.split('\n')
menu_items = []

for i, line in enumerate(lines):
    line = line.strip()
    if not line:
        continue
    # Match lines with a price at the end (number, optionally "per pc")
    price_match = price_pattern.search(line)
    if price_match:
        price = price_match.group(1)
        # Everything before the matched price is the item name.
        item_name = line[:price_match.start()].strip()
        description = "N/A"
        if i + 1 < len(lines):
            next_line = lines[i+1].strip()
            # A following line that is itself priced is the next item, not
            # a description of this one.
            if next_line and not price_pattern.search(next_line):
                description = next_line
        menu_items.append({"item": item_name, "price": price, "description": description})

df = pd.DataFrame(menu_items)
print(df.head(10))


                             item price  \
0      Curried vegetable turnover   350   
1         Chilli Paneer Croissant   350   
2           Corn - spinach quiche   350   
3              Chicken Tikka Puff   400   
4          Keema Mattar Croissant   400   
5         Chicken Mushroom quiche   400   
6         Red velvet cake half kg  1050   
7  Chocolate truffle cake half kg   950   
8          Mix fruit cake half kg   850   
9          Pineapple cake half kg   850   

                                         description  
0                  Puff pastry, mix vegetable masala  
1             Flaky pastry, chili paneer preparation  
2     Short crust pastry, corn & spinach preparation  
3                  Puff pastry, chicken tikka masala  
4                    Flaky pastry, lamb keema masala  
5  Short crust pastry, chicken & mushroom prepara...  
6      Soft red velvet sponge, cream cheese frosting  
7      Soft chocolate sponge, dark chocolate truffle  
8  Soft vanilla sponge, cream p

In [None]:
# prompt: add this extracted data to merged menu

import pandas as pd
# Assuming 'merged_menus' DataFrame exists from the previous code execution
# and 'df' is the DataFrame created from the new menu_text string.

# Append the rows parsed from the pasted menu text (`df`) to the PDF-derived
# frame (`merged_menus`), save the result as CSV and JSON and offer both files
# for download.
if merged_menus is not None:
    # Concatenate the two DataFrames. Columns are aligned by name, so any
    # column present in only one of the frames is NaN for the other's rows.
    final_merged_df = pd.concat([merged_menus, df], ignore_index=True)

    # Save to CSV and JSON
    final_merged_df.to_csv("completely_merged_restaurant_menus.csv", index=False)

    # json.dump would write missing cells as the bare token NaN, which is not
    # valid JSON. Convert them to None first so they are emitted as null.
    records = (
        final_merged_df.astype(object)
        .where(final_merged_df.notna(), None)
        .to_dict('records')
    )
    with open("completely_merged_restaurant_menus.json", "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    print("\nCompletely Merged Menu Data:")
    print(final_merged_df.head())
    print("\nData saved to 'completely_merged_restaurant_menus.csv' and 'completely_merged_restaurant_menus.json'")

    # Download the files
    files.download("completely_merged_restaurant_menus.csv")
    files.download("completely_merged_restaurant_menus.json")

else:
    print("Previous menu extraction failed. Cannot merge.")



Completely Merged Menu Data:
        restaurant     section                      item  price  \
0  Capital Kitchen  Menu Items             SEAFOOD PIZZA  ₹1450   
1  Capital Kitchen  Menu Items           PIZZA PEPPERONI  ₹1450   
2  Capital Kitchen  Menu Items  THE CAPITAL PIZZA 1450 /  ₹1150   
3  Capital Kitchen  Menu Items  CLASSIC PIZZA MARGHERITA  ₹1150   
4  Capital Kitchen  Menu Items               PIZZA FUNGI  ₹1150   

                                         description  
0  Calamari, shrimp, smoked salmon, conﬁt garlic,...  
1  Pork pepperoni, crushed tomatoes, mozzarella, ...  
2  Barbecued chicken, red onion, fresh coriander,...  
3              Mozzarella, fresh basil, tomato sauce  
4  Assorted mushrooms, crushed tomatoes, crispy g...  

Data saved to 'completely_merged_restaurant_menus.csv' and 'completely_merged_restaurant_menus.json'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install required libraries
!pip install requests beautifulsoup4 pandas

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
from google.colab import files

def scrape_avartana_menu(url, location):
    """Scrape dish names and descriptions from an Avartana restaurant page.

    The page is reduced to its non-empty text lines. A line shorter than 30
    characters that is immediately followed by a line starting with "with"
    (case-insensitive) is recorded as a dish, unless it contains one of the
    known boilerplate words. The fixed degustation-menu entries from
    ``search_degustation_info`` are appended afterwards.

    Args:
        url: Address of the restaurant page to fetch.
        location: Label stored under the ``'location'`` key of every item.

    Returns:
        A list of dicts with the keys ``location``, ``item``, ``description``
        and ``price``. An empty list is returned when the page cannot be
        fetched (including HTTP error responses) or parsing fails; the error
        is printed rather than raised.
    """
    print(f"\nScraping {location} menu...")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Labels from the surrounding page chrome that must not be taken for a dish.
    boilerplate = re.compile(r'(TIMING|CUISINE|CONTACT|DOWNLOAD|submit|cookie)', re.IGNORECASE)

    try:
        # Fetch webpage
        response = requests.get(url, headers=headers, timeout=10)
        # An HTTP error page is a failed scrape, not a menu: without this
        # check its text would be parsed and the fixed degustation entries
        # would still be reported as found for this location.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text content
        page_text = soup.get_text(separator='\n')
        lines = [line.strip() for line in page_text.split('\n') if line.strip()]

        # Extract menu items
        menu_items = []
        i = 0
        # Stop one short of the end: every candidate needs a following line.
        while i < len(lines) - 1:
            name, following = lines[i], lines[i + 1]
            # Menu item pattern: short item name followed by description with "with"
            if (len(name) < 30 and
                    following.lower().startswith('with') and
                    not boilerplate.search(name)):
                menu_items.append({
                    'location': location,
                    'item': name,
                    'description': following,
                    'price': "N/A"  # Avartana uses set menu pricing, not à la carte
                })
                # The description line is consumed together with its item.
                i += 2
            else:
                i += 1

        # Extract degustation menu information
        degustation_info = search_degustation_info(page_text, location)
        menu_items.extend(degustation_info)

        return menu_items

    except Exception as e:
        # Best-effort scrape: report and let the caller continue with other locations.
        print(f"Error scraping {location}: {e}")
        return []

def search_degustation_info(text, location):
    """Return the fixed set of degustation (tasting) menu entries for a location.

    The five menus and their course counts are hard-coded; ``text`` is
    accepted for interface compatibility but is not inspected.

    Args:
        text: Page text (unused).
        location: Label stored under ``'location'``; when it contains
            "New Delhi" the entries carry the price "₹3000+", otherwise "N/A".

    Returns:
        A list of five dicts with the keys ``location``, ``item``,
        ``description`` and ``price``.
    """
    # Only New Delhi has a known starting price; the original note quotes
    # "Cost per person (degustation menu) starts at INR 3000 plus taxes".
    starting_price = "₹3000+" if "New Delhi" in location else "N/A"

    # (menu name, number of courses) for the standard tasting menus.
    tasting_menus = (
        ("Maya", 7),
        ("Bela", 9),
        ("Jiaa", 11),
        ("Anika", 13),
        ("Tara", 14),
    )

    return [
        {
            'location': location,
            'item': f"Degustation Menu: {menu_name}",
            'description': f"{course_count} course tasting menu experience",
            'price': starting_price,
        }
        for menu_name, course_count in tasting_menus
    ]

def extract_all_avartana_menus():
    """Scrape every configured Avartana location and export the combined menu.

    Each location page is passed to ``scrape_avartana_menu`` and the per-site
    result is reported. If anything was collected, the combined items are
    written to ``avartana_menus.csv`` and ``avartana_menus.json``, a sample is
    printed and both files are offered for download via ``google.colab.files``.

    Returns:
        A ``pandas.DataFrame`` of all collected items, or ``None`` when no
        location yielded any item.
    """
    # (page URL, location label) for the three restaurant sites.
    sites = (
        ("https://www.itchotels.com/in/en/itcroyalbengal-kolkata/fine-dine/avartana",
         "Kolkata (ITC Royal Bengal)"),
        ("https://www.itchotels.com/in/en/itcmaurya-new-delhi/fine-dine/avartana",
         "New Delhi (ITC Maurya)"),
        ("https://www.itchotels.com/in/en/itcmaratha-mumbai/fine-dine/avartana",
         "Mumbai (ITC Maratha)"),
    )

    collected = []
    for page_url, place in sites:
        found = scrape_avartana_menu(page_url, place)
        if not found:
            print(f"No menu items found at {place}")
            continue
        collected.extend(found)
        print(f"Found {len(found)} items from {place}")

    # Nothing scraped anywhere: report and bail out before touching any file.
    if not collected:
        print("No menu items found at any location.")
        return None

    frame = pd.DataFrame(collected)

    # Persist the combined data in both formats.
    frame.to_csv("avartana_menus.csv", index=False)
    with open("avartana_menus.json", "w", encoding="utf-8") as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)

    # Show a short summary of what was gathered.
    print("\nSample of extracted menu items:")
    print(frame.head(10))
    print(f"\nTotal menu items extracted: {len(collected)}")

    # Hand the exported files to the browser.
    files.download("avartana_menus.csv")
    files.download("avartana_menus.json")

    return frame

# Run the extraction. `avartana_df` holds the combined DataFrame for all
# locations, or None when no items were found at any of them.
avartana_df = extract_all_avartana_menus()



Scraping Kolkata (ITC Royal Bengal) menu...
Found 17 items from Kolkata (ITC Royal Bengal)

Scraping New Delhi (ITC Maurya) menu...
Found 17 items from New Delhi (ITC Maurya)

Scraping Mumbai (ITC Maratha) menu...
Found 17 items from Mumbai (ITC Maratha)

Sample of extracted menu items:
                     location                 item  \
0  Kolkata (ITC Royal Bengal)       Potato cracker   
1  Kolkata (ITC Royal Bengal)    Coriander chicken   
2  Kolkata (ITC Royal Bengal)       Spiced bolteus   
3  Kolkata (ITC Royal Bengal)    Tomato and millet   
4  Kolkata (ITC Royal Bengal)     Spiced aubergine   
5  Kolkata (ITC Royal Bengal)     Sago and yoghurt   
6  Kolkata (ITC Royal Bengal)     Pan seared quail   
7  Kolkata (ITC Royal Bengal)  Crispy chili potato   
8  Kolkata (ITC Royal Bengal)    Raw mango pudding   
9  Kolkata (ITC Royal Bengal)   Stir fried chicken   

                                 description price  
0                   with tamarind ghee glaze   N/A  
1         

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# prompt: merge it with previous data and give final csv file for download

import pandas as pd
# Assuming 'final_merged_df' DataFrame exists from the previous code execution
# and 'avartana_df' DataFrame exists from the avartana scraping.

# Combine the previously merged menus with the Avartana data and export the
# result; when only one of the two is available, offer its existing CSV instead.
if final_merged_df is not None and avartana_df is not None:
    # The Avartana rows only carry a "location" column. Tag them with the
    # restaurant name so the shared "restaurant" column is not left NaN for
    # every one of them after the concat.
    avartana_rows = avartana_df
    if "restaurant" not in avartana_rows.columns:
        avartana_rows = avartana_rows.assign(restaurant="Avartana")

    # Concatenate the two DataFrames (columns are aligned by name)
    final_df = pd.concat([final_merged_df, avartana_rows], ignore_index=True)

    # Save to CSV
    final_df.to_csv("final_merged_menu.csv", index=False)
    print("\nFinal Merged Menu Data:")
    print(final_df.head())
    print("\nData saved to 'final_merged_menu.csv'")

    # Download the file
    files.download("final_merged_menu.csv")

elif final_merged_df is not None:
    print("Avartana data not available for merging.")
    files.download("completely_merged_restaurant_menus.csv") # Download the previous merged file

elif avartana_df is not None:
    print("Previous menu data not available for merging.")
    files.download("avartana_menus.csv") # Download the avartana menu file

else:
    print("No menu data available for download.")



Final Merged Menu Data:
        restaurant     section                      item  price  \
0  Capital Kitchen  Menu Items             SEAFOOD PIZZA  ₹1450   
1  Capital Kitchen  Menu Items           PIZZA PEPPERONI  ₹1450   
2  Capital Kitchen  Menu Items  THE CAPITAL PIZZA 1450 /  ₹1150   
3  Capital Kitchen  Menu Items  CLASSIC PIZZA MARGHERITA  ₹1150   
4  Capital Kitchen  Menu Items               PIZZA FUNGI  ₹1150   

                                         description location  
0  Calamari, shrimp, smoked salmon, conﬁt garlic,...      NaN  
1  Pork pepperoni, crushed tomatoes, mozzarella, ...      NaN  
2  Barbecued chicken, red onion, fresh coriander,...      NaN  
3              Mozzarella, fresh basil, tomato sauce      NaN  
4  Assorted mushrooms, crushed tomatoes, crispy g...      NaN  

Data saved to 'final_merged_menu.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>