In [None]:
# Install required libraries
!pip install requests beautifulsoup4 PyPDF2 pandas

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import PyPDF2
import io
import re
import pandas as pd
import json

# 1. Function to scrape restaurant details from the website
def _text_after_label(soup, label, default):
    """Return the text of the first <p> that follows a text node matching *label*.

    The label is matched case-insensitively as a regular expression.
    *default* is returned when the label is absent, no <p> follows it, or
    the paragraph's stripped text is empty.
    """
    label_node = soup.find(string=re.compile(label, re.IGNORECASE))
    if label_node is None:
        return default
    paragraph = label_node.find_next('p')
    if paragraph is None:
        return default
    return paragraph.text.strip() or default


def _link_text_for_scheme(soup, scheme, default):
    """Return the text of the first <a> whose href contains *scheme*, else *default*."""
    link = soup.find('a', href=lambda href: href and scheme in href)
    if link is None:
        return default
    return link.text.strip() or default


def scrape_restaurant_details():
    """Scrape the Rick's bar page and return its details as a dict.

    Scraping is best-effort: every field starts from a hard-coded default
    and is only replaced when the page yields a non-empty value. Any failure
    (network error, HTTP error status, parsing error) is reported on stdout
    and the values gathered so far are returned.

    Returns:
        dict with keys ``name``, ``location``, ``contact`` (``phone`` and
        ``email``), ``timings`` (``hours``), ``cuisine`` and ``menu_url``.
    """
    url = "https://www.tajhotels.com/en-in/hotels/taj-mahal-new-delhi/restaurants/ricks-bar?hotelId=51478b7d-f166-4ab9-9f53-167fbe45aac8"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Bind the defaults BEFORE touching the network: the return statement
    # below must stay valid even when the request itself fails.
    name = "Rick's"
    location = "Number One Mansingh Road, New Delhi, Delhi, 110011, India"
    contact = {
        "phone": "+91 11665 13245 / +91 11665 13246",
        "email": "tmhricks.del@tajhotels.com",
    }
    timings = {
        "hours": "4:00 pm - 12:45 am",
    }
    cuisine = "Finger food"

    try:
        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Do not parse an error page as if it were the restaurant page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract restaurant name
        name_tag = soup.find('h1')
        if name_tag:
            name = name_tag.text.strip() or name

        # Extract location
        location = _text_after_label(soup, "CONTACT", location)

        # Extract contact information
        contact["email"] = _link_text_for_scheme(soup, 'mailto:', contact["email"])
        contact["phone"] = _link_text_for_scheme(soup, 'tel:', contact["phone"])

        # Extract timings
        timings["hours"] = _text_after_label(soup, "TIMING", timings["hours"])

        # Extract cuisine type
        cuisine = _text_after_label(soup, "CUISINE", cuisine)

    except Exception as e:
        # Deliberately broad: this is a best-effort scrape and the caller
        # expects a result dict no matter what went wrong.
        print(f"Error scraping website: {e}")
        print("Using default values...")

    # Use the direct menu PDF link from search results
    menu_url = "https://www.vivantahotels.com/content/dam/luxury/hotels/Taj_Mahal_Delhi/documents/menu-pdfs/Rick's_Menu.PDF"

    return {
        "name": name,
        "location": location,
        "contact": contact,
        "timings": timings,
        "cuisine": cuisine,
        "menu_url": menu_url
    }

# 2. Function to download and extract menu information from PDF
def _parse_menu_text(menu_text):
    """Turn raw menu text into a list of menu-item dicts.

    A line that ends in a 3-4 digit number is treated as an item, with the
    number as its price. An all-uppercase line without such a price becomes
    the current section for the items that follow. The line directly after
    an item is taken as its description when it is non-empty, not uppercase
    and does not itself end in a price.
    """
    price_at_end = re.compile(r'(\d{3,4})$')
    lines = [raw.strip() for raw in menu_text.split('\n')]

    menu_items = []
    current_section = "Menu Items"

    for i, line in enumerate(lines):
        if not line:
            continue

        price_match = price_at_end.search(line)
        if price_match is None:
            # No price: the line is either a section header or ignorable text.
            if line.isupper():
                current_section = line
            continue

        price = price_match.group(1)
        # Everything before the trailing price is the item name.
        item_name = line[:price_match.start()].strip()

        description = ""
        if i + 1 < len(lines):
            next_line = lines[i + 1]
            if next_line and not next_line.isupper() and not price_at_end.search(next_line):
                description = next_line

        menu_items.append({
            "section": current_section,
            "item": item_name,
            "price": "₹" + price,
            "description": description
        })

    return menu_items


def extract_menu_from_pdf(pdf_url):
    """Download the menu PDF at *pdf_url* and parse it into menu items.

    On a 200 response the PDF bytes are also saved to ``ricks_menu.pdf`` in
    the working directory.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A list of dicts with keys ``section``, ``item``, ``price`` and
        ``description``, or ``None`` when the download returns a non-200
        status or any error occurs (the error is printed, not raised).
    """
    print(f"Downloading PDF from: {pdf_url}")

    try:
        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(pdf_url, timeout=30)

        if response.status_code != 200:
            print(f"Failed to download PDF. Status code: {response.status_code}")
            return None

        # Save the PDF locally
        with open("ricks_menu.pdf", "wb") as f:
            f.write(response.content)

        print("PDF downloaded successfully. Extracting text...")

        # Extract text using PyPDF2
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))

        # `or ""` guards against a page whose extraction yields nothing, so
        # one unreadable page cannot discard the whole menu.
        menu_text = "".join(
            (page.extract_text() or "") + "\n\n" for page in pdf_reader.pages
        )

        return _parse_menu_text(menu_text)
    except Exception as e:
        # Deliberately broad: callers treat None as "menu unavailable".
        print(f"Error extracting menu: {e}")
        return None

# Main function to run the scraping and display results
def scrape_ricks_bar():
    """Run the whole pipeline: scrape details, parse the menu PDF, write files.

    Progress and results are printed to stdout. When menu items were
    extracted they are written to ``ricks_bar_menu.csv`` and
    ``ricks_bar_menu.json``; the restaurant details are always written to
    ``ricks_bar_details.json``.

    Returns:
        A ``(restaurant_details, menu_items)`` tuple, where ``menu_items`` is
        whatever ``extract_menu_from_pdf`` returned.
    """
    print("==== Rick's Bar Restaurant Scraper ====")

    details = scrape_restaurant_details()

    # Summary of the scraped (or default) details.
    print("\n=== Restaurant Details ===")
    print(f"Name: {details['name']}")
    print(f"Location: {details['location']}")
    print(f"Contact: {details['contact']['phone']} | {details['contact']['email']}")
    print(f"Cuisine: {details['cuisine']}")
    print(f"Timings: {details['timings']['hours']}")
    print(f"\nMenu URL: {details['menu_url']}")

    print("\nExtracting menu from PDF...")
    items = extract_menu_from_pdf(details['menu_url'])

    if not items:
        print("Failed to extract menu items.")
    else:
        # Tabular view of the items, used for the CSV and the preview.
        frame = pd.DataFrame(items)

        frame.to_csv("ricks_bar_menu.csv", index=False)
        with open("ricks_bar_menu.json", "w") as menu_out:
            json.dump(items, menu_out, indent=2)

        print(f"\nExtracted {len(items)} menu items. Sample items:")
        print(frame.head(10))
        print("\nData saved to 'ricks_bar_menu.csv' and 'ricks_bar_menu.json'")

    # The details file is written whether or not the menu was extracted.
    with open("ricks_bar_details.json", "w") as details_out:
        json.dump(details, details_out, indent=2)

    print("Restaurant details saved to 'ricks_bar_details.json'")

    return details, items

# Run the scraper when this cell executes and bind the returned
# (details, menu items) pair to top-level names.
restaurant_details, menu_items = scrape_ricks_bar()


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
==== Rick's Bar Restaurant Scraper ====

=== Restaurant Details ===
Name: Rick’s
Location: Number One Mansingh Road, New Delhi, Delhi, 110011, India
Contact: 1-800-111-825 | reservations@ihcltata.com
Cuisine: None
Timings: None

Menu URL: https://www.vivantahotels.com/content/dam/luxury/hotels/Taj_Mahal_Delhi/documents/menu-pdfs/Rick's_Menu.PDF

Extracting menu from PDF...
Downloading PDF from: https://www.vivantahotels.com/content/dam/luxury/hotels/Taj_Mahal_Delhi/documents/menu-pdfs/Rick's_Menu.PDF
PDF downloaded successfully. Extracting text...

Extracted 98 menu items. Sample items:
      section                                 item  price  description
0  M

In [None]:
# prompt: download files

# Send each file produced by the scraper to the browser, in the same order
# as before: menu CSV, menu JSON, details JSON, then the raw PDF.
from google.colab import files

for artifact in (
    'ricks_bar_menu.csv',
    'ricks_bar_menu.json',
    'ricks_bar_details.json',
    'ricks_menu.pdf',
):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>