<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/Najd_Village_Menu_Attempt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests  # Library to make HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML
import csv  # Library for handling CSV files
import os  # Library for interacting with the operating system (like creating folders)
import re  # Library for regular expressions
import time  # Library for time-related functions

# --- Main Script Execution Starts Here ---

# Define the URL for the menu page
base_url = "https://najdvillage.com/menu/?lang=en"

# Define the folder to save downloaded images
images_folder = 'najd_village_images'
# Check if the folder exists, and create it if it doesn't
if not os.path.exists(images_folder):
    os.makedirs(images_folder)

# List to store all scraped data
scraped_data = []

# --- Step 1: Fetch the HTML content ---
print("Fetching HTML content from the website...")
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
try:
    # Send a GET request to the URL with headers and a timeout
    response = requests.get(base_url, headers=headers, timeout=10)
    # Raise an HTTPError if the response status code is a client or server error
    response.raise_for_status()
    # Store the HTML content
    html_content = response.text
    print("HTML content fetched successfully.")
except requests.exceptions.RequestException as e:
    print(f"Error fetching {base_url}: {e}")
    html_content = None # Set content to None to stop the script

# --- Step 2: Parse the HTML and extract data ---
if html_content:
    print("Parsing HTML and extracting menu items...")
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # --- FIX: Find all div containers with the class 'item main' ---
    # Find all div containers that represent a single menu item
    main_containers = soup.find_all('div', class_='item main')

    # Loop through each item container found
    for main in main_containers:
        # Initialize variables to hold the extracted data for this item
        image_link = "Not found"
        item_name = "Not found"
        ingredients = "Not found"
        local_image_path = "Not found"

        # Find the image tag and extract its source URL
        img_tag = main.find('img', class_='item-img')
        if img_tag:
            image_link = img_tag.get('src')

        # Find the item name from the h3 tag
        title_tag = main.find('div', class_='item-title')
        if title_tag:
            h3_tag = title_tag.find('h3')
            # Extract only the text, ignoring other tags like <p> inside the <h3>
            item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()

        # --- Step 3: Download the image ---
        if image_link != "Not found" and item_name != "Not found":
            try:
                # Sanitize the item name to create a valid filename
                sanitized_name = re.sub(r'[^\w\s\-]', '', item_name).strip().replace(' ', '_')
                if not sanitized_name:
                    sanitized_name = "unknown_item"

                # Construct the full path where the image will be saved
                image_filename = f"{sanitized_name}.jpg"
                local_image_path = os.path.join(images_folder, image_filename)

                # Send a request to get the image data
                img_data = requests.get(image_link, stream=True).content

                # Write the image data to a local file
                with open(local_image_path, 'wb') as handler:
                    handler.write(img_data)

                print(f"Downloaded image for '{item_name}' to {local_image_path}")
            except Exception as e:
                print(f"Could not download image from {image_link} for '{item_name}': {e}")
                local_image_path = "Download failed"

        # Find the ingredients from the p tag inside the 'item-excerpt' div
        ingredients_tag = main.find('div', class_='item-excerpt')
        if ingredients_tag:
            p_tag = ingredients_tag.find('p', style="direction: rtl;")
            if p_tag:
                ingredients = p_tag.get_text(strip=True)

        # Print the extracted data to the console
        print("--- Extracted Menu Item Details ---")
        print(f"Image URL: {image_link}")
        print(f"Item Name: {item_name}")
        print(f"Ingredients: {ingredients}")
        print(f"Local Image Path: {local_image_path}")
        print("-----------------------------------")

        # Append a dictionary of the extracted data to the list
        if item_name != "Not found":
            scraped_data.append({
                'Item Name': item_name,
                'Ingredients': ingredients,
                'Image Link': image_link,
                'Local Image Path': local_image_path
            })

# --- Step 4: Save the scraped data to a CSV file ---
if scraped_data:
    print("\nSaving scraped data to CSV...")
    csv_filename = 'najd_village_menu.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Item Name', 'Ingredients', 'Image Link', 'Local Image Path']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(scraped_data)
    print(f"Data successfully saved to {csv_filename}")
else:
    print("No data was scraped to save.")


Fetching HTML content from the website...
HTML content fetched successfully.
Parsing HTML and extracting menu items...


  item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()


Downloaded image for 'Chicken Badya' to najd_village_images/Chicken_Badya.jpg
--- Extracted Menu Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/بادية-نصف-دجاجة-Half-Chicken-Badya_1-1.png
Item Name: Chicken Badya
Ingredients: Trio of jareesh, Goursan and white rice topped with lamb
Local Image Path: najd_village_images/Chicken_Badya.jpg
-----------------------------------
Downloaded image for 'Chicken Saleek' to najd_village_images/Chicken_Saleek.jpg
--- Extracted Menu Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/سليق-نصف-دجاجة-Half-Chicken-Saleek_1-1.png
Item Name: Chicken Saleek
Ingredients: Rice cooked in milk and meat broth served with half chicken
Local Image Path: najd_village_images/Chicken_Saleek.jpg
-----------------------------------
Downloaded image for 'Camel Vertebrae Kabsa' to najd_village_images/Camel_Vertebrae_Kabsa.jpg
--- Extracted Menu Item Details ---
Image URL: https://najdvillage.com/wp-content/uploa