<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/DataSets/Najd_Village_Menu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv  # Library for handling CSV files
import json  # Library for handling JSON files
import os  # Library for interacting with the operating system (like creating folders)
import re  # Library for regular expressions
import time  # Library for time-related functions
from datetime import datetime, timezone  # timezone gives UTC-aware timestamps

import requests  # Library to make HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML

# --- Main Script Execution Starts Here ---

# Define the URL for the menu page
base_url = "https://najdvillage.com/menu/?lang=en"

# Define the folder to save downloaded images
images_folder = 'najd_village_images'
# Create the folder if it is missing. exist_ok=True makes this a single call
# that is a no-op when the folder is already there, instead of a separate
# "does it exist?" check that can race with the creation.
os.makedirs(images_folder, exist_ok=True)

# List to store all scraped data (one dict per menu item)
scraped_data = []


# HERE
# Filler words (cooking verbs, adjectives, serving notes, ...) that are dropped
# from the ingredients description so that only the ingredient names remain.
# Duplicates are harmless: the list is only used for membership tests.
# NOTE(review): "preassure", "browen" and "unsweatned" look misspelled -
# presumably they mirror typos in the scraped text; confirm before correcting.
words_to_remove = [
    "sauteed", "&", "chunk", "fresh", "minced", "with", "and", "green",
    "whole", "preassure", "circles", "cooked", "in", "chunks", "stew", "thin",
    "brown", "sheets", "of", "mixed", "boneless", "cubes", "seasoned", "strips",
    "garnished", "crushed", "trio", "white", "topped", "broth", "served", "half",
    "back", "choice", "red", "succulent", "cut", "into", "generous", "pieces",
    "a", "fragrant", "bed", "or", "requires", "booking", "prepayment", "at",
    "least", "3", "hours", "in", "advance", "serves", "two", "people",
    "topped", "rose", "disks", "upside", "down", "together", "pressure", "steam",
    "pot", "our", "special", "recipe", "millet", "boiled", "thickened", "corn",
    "four", "crushed", "seedless", "whole", "sauteed", "browen", "flour", "butter",
    "special", "ground", "all", "melted", "small", "dipped", "in", "garnished",
    "paste", "mixed", "najd", "village", "fried", "unsweatned", "spring",
]

def clean_ingredients(text, words_to_remove):
    """
    Reduce an ingredients description to its meaningful words.

    Punctuation is stripped, every word found in ``words_to_remove`` is
    dropped (case-insensitively), and the remaining words are joined with
    single spaces.

    Args:
        text: The raw description. Non-string values are returned unchanged.
        words_to_remove: Iterable of filler words to drop. Entries are
            compared case-insensitively, so their own casing does not matter.

    Returns:
        The cleaned string (possibly empty), or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text

    # Normalise the stop list once into a set: O(1) lookups per word, and an
    # entry written with capitals (e.g. "With") still matches.
    stop_words = {str(word).lower() for word in words_to_remove}

    # Apostrophes are deleted outright so names such as "Dukh'n" stay one word.
    without_apostrophes = re.sub(r"['\u2019]", '', text)

    # Every other punctuation mark becomes a space rather than vanishing;
    # otherwise words separated only by punctuation ("rice,chicken") would be
    # fused into a single bogus word ("ricechicken").
    spaced = re.sub(r'[^\w\s]', ' ', without_apostrophes)

    # split() with no arguments also collapses runs of whitespace and trims
    # both ends, so no further whitespace clean-up is needed after the join.
    kept_words = [word for word in spaced.split() if word.lower() not in stop_words]
    return " ".join(kept_words)
# TO HERE


# --- Step 1: Fetch the HTML content ---
print("Fetching HTML content from the website...")

# Browser-like User-Agent sent with the page request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Stays None unless the download succeeds; later steps are skipped in that case
html_content = None
try:
    # GET the menu page, giving up after 10 seconds
    response = requests.get(base_url, headers=headers, timeout=10)
    # Turn 4xx/5xx status codes into an exception
    response.raise_for_status()
except requests.exceptions.RequestException as fetch_error:
    print(f"Error fetching {base_url}: {fetch_error}")
else:
    # Success path: keep the page markup for the parsing step
    html_content = response.text
    print("HTML content fetched successfully.")

# --- Step 2: Parse the HTML and extract data ---

# Menu sections to scrape, in the order they are processed. Each entry pairs
# the class attribute string of a section's item containers with the label
# used in the console output for that section.
MENU_SECTIONS = [
    ('item starters', 'Starter'),
    ('item main', 'Main'),
    ('item dishes', 'Dish'),
    ('item maglloba', 'Magloba'),
    ('item dessert', 'Dessert'),
]

# Placeholder stored when a field cannot be located in the page markup
NOT_FOUND = "Not found"

# Seconds to wait for an image download before giving up on it
IMAGE_DOWNLOAD_TIMEOUT = 10


def _extract_image_link(container):
    """Return the ``src`` of the item's image tag, or NOT_FOUND when absent."""
    img_tag = container.find('img', class_='item-img')
    if img_tag is None:
        return NOT_FOUND
    # An <img> without a src attribute is treated like a missing image
    return img_tag.get('src') or NOT_FOUND


def _extract_item_name(container):
    """Return the item name from the title's <h3>, or NOT_FOUND when absent."""
    title_tag = container.find('div', class_='item-title')
    h3_tag = title_tag.find('h3') if title_tag is not None else None
    if h3_tag is None:
        # Either the title div or its <h3> is missing
        return NOT_FOUND
    # Only the <h3>'s direct text nodes are used, ignoring nested tags such
    # as a <p> inside the <h3>.
    return "".join(h3_tag.find_all(string=True, recursive=False)).strip()


def _extract_ingredients(container):
    """Return the raw ingredients text of the item, or NOT_FOUND when absent."""
    excerpt_tag = container.find('div', class_='item-excerpt')
    if excerpt_tag is None:
        return NOT_FOUND
    p_tag = excerpt_tag.find('p', style="direction: rtl;")
    if p_tag is None:
        return NOT_FOUND
    return p_tag.get_text(strip=True)


def _download_image(image_link, item_name):
    """
    Download an item's image into ``images_folder``.

    Returns the local file path on success, or "Download failed" when the
    request or the file write fails (the error is printed, not raised).
    """
    # Sanitize the item name to create a valid filename
    sanitized_name = re.sub(r'[^\w\s\-]', '', item_name).strip().replace(' ', '_')
    if not sanitized_name:
        sanitized_name = "unknown_item"

    # NOTE(review): the suffix is always .jpg even though the remote file may
    # be another format (e.g. PNG); kept so existing local paths stay stable.
    local_image_path = os.path.join(images_folder, f"{sanitized_name}.jpg")

    try:
        # The timeout keeps one unresponsive image from hanging the whole run
        img_response = requests.get(image_link, timeout=IMAGE_DOWNLOAD_TIMEOUT)
        # Refuse to save an HTTP error page as if it were the image
        img_response.raise_for_status()
        with open(local_image_path, 'wb') as handler:
            handler.write(img_response.content)
    except Exception as e:
        # Best effort: a failed image must not abort the rest of the scrape
        print(f"Could not download image from {image_link} for '{item_name}': {e}")
        return "Download failed"

    print(f"Downloaded image for '{item_name}' to {local_image_path}")
    return local_image_path


def _scrape_section(soup, container_class, label, date_scraped):
    """
    Extract every item of one menu section.

    Args:
        soup: Parsed menu page.
        container_class: Class attribute string of the section's item divs.
        label: Section label used in the console output.
        date_scraped: ISO date string stored with every record.

    Returns:
        A list with one dict per item whose name was found.
    """
    records = []
    for container in soup.find_all('div', class_=container_class):
        image_link = _extract_image_link(container)
        item_name = _extract_item_name(container)

        # --- Step 3: Download the image (needs both a URL and a name) ---
        local_image_path = NOT_FOUND
        if image_link != NOT_FOUND and item_name != NOT_FOUND:
            local_image_path = _download_image(image_link, item_name)

        # Clean the ingredients string
        ingredients = clean_ingredients(_extract_ingredients(container), words_to_remove)

        # Print the extracted data to the console
        print(f"--- Extracted {label} Item Details ---")
        print(f"Image URL: {image_link}")
        print(f"Item Name: {item_name}")
        print(f"Ingredients: {ingredients}")
        print(f"Local Image Path: {local_image_path}")
        print("-----------------------------------")

        # Items without a name are reported above but not stored
        if item_name != NOT_FOUND:
            records.append({
                'Item Name': item_name,
                'Ingredients': ingredients,
                'Image Link': image_link,
                'Local Image Path': local_image_path,
                'Date Scraped': date_scraped,
            })
    return records


if html_content:
    print("Parsing HTML and extracting menu items...")
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # One UTC date for the whole run, so every record carries the same value
    date_scraped = datetime.now(timezone.utc).date().isoformat()

    for section_index, (container_class, label) in enumerate(MENU_SECTIONS):
        if section_index:
            # Blank line between sections in the console output
            print()
        scraped_data.extend(_scrape_section(soup, container_class, label, date_scraped))


# --- Steps 4 and 5: Persist the scraped data as CSV and JSON ---
if not scraped_data:
    # Nothing was collected, so no output files are written
    print("No data was scraped to save.")
else:
    # --- Step 4: CSV export, one row per scraped item ---
    print("\nSaving scraped data to CSV...")
    csv_filename = 'najd_village_menu.csv'
    fieldnames = ['Item Name', 'Ingredients', 'Image Link', 'Local Image Path', 'Date Scraped']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
        csv_writer.writeheader()
        for record in scraped_data:
            csv_writer.writerow(record)
    print(f"Data successfully saved to {csv_filename}")

    # --- Step 5: JSON export of the same records ---
    print("\nSaving scraped data to JSON...")
    json_filename = 'najd_village_menu.json'
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
    serialized = json.dumps(scraped_data, indent=4, ensure_ascii=False)
    with open(json_filename, 'w', encoding='utf-8') as json_out:
        json_out.write(serialized)
    print(f"Data successfully saved to {json_filename}")


Fetching HTML content from the website...
HTML content fetched successfully.
Parsing HTML and extracting menu items...


  item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()
  'Date Scraped': datetime.utcnow().date().isoformat()


Downloaded image for 'Jareesh Soup' to najd_village_images/Jareesh_Soup.jpg
--- Extracted Starter Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/شوربة-جريش-Jareesh-Soup-2.png
Item Name: Jareesh Soup
Ingredients: Jareesh Soup lamp spices
Local Image Path: najd_village_images/Jareesh_Soup.jpg
-----------------------------------
Downloaded image for 'Lamb Chunk Soup' to najd_village_images/Lamb_Chunk_Soup.jpg
--- Extracted Starter Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/شوربة-قـطع-اللـحم-Lamb-Chunk-Soup-1.png
Item Name: Lamb Chunk Soup
Ingredients: Lamp soup ginger
Local Image Path: najd_village_images/Lamb_Chunk_Soup.jpg
-----------------------------------
Downloaded image for 'Veg & lamb Stew' to najd_village_images/Veg__lamb_Stew.jpg
--- Extracted Starter Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/مرق-خضار-باللحم-Veg-lamb-Stew-1.png
Item Name: Veg & lamb Stew
Ingredients: Lamp veg

  item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()
  'Date Scraped': datetime.utcnow().date().isoformat()


Downloaded image for 'Chicken Saleek' to najd_village_images/Chicken_Saleek.jpg
--- Extracted Main Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/سليق-نصف-دجاجة-Half-Chicken-Saleek_1-1.png
Item Name: Chicken Saleek
Ingredients: Rice milk meat chicken
Local Image Path: najd_village_images/Chicken_Saleek.jpg
-----------------------------------
Downloaded image for 'Camel Vertebrae Kabsa' to najd_village_images/Camel_Vertebrae_Kabsa.jpg
--- Extracted Main Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2024/12/Untitled-1.png
Item Name: Camel Vertebrae Kabsa
Ingredients: Not found
Local Image Path: najd_village_images/Camel_Vertebrae_Kabsa.jpg
-----------------------------------
Downloaded image for 'Chunk of hashi meat on bone' to najd_village_images/Chunk_of_hashi_meat_on_bone.jpg
--- Extracted Main Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2024/12/Untitled-1-Rec.png
Item Name: Chunk of hashi meat on bone

  item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()
  'Date Scraped': datetime.utcnow().date().isoformat()


Downloaded image for 'Matazeez' to najd_village_images/Matazeez.jpg
--- Extracted Dish Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/مطازيز-Matazeez.png
Item Name: Matazeez
Ingredients: lamb wheat dough tomato sauce vegetables meat
Local Image Path: najd_village_images/Matazeez.jpg
-----------------------------------
Downloaded image for 'Margoog' to najd_village_images/Margoog.jpg
--- Extracted Dish Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2025/03/margog.png
Item Name: Margoog
Ingredients: Lamb vegetable dough
Local Image Path: najd_village_images/Margoog.jpg
-----------------------------------
Downloaded image for 'Goursan' to najd_village_images/Goursan.jpg
--- Extracted Dish Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2025/03/goursan.png
Item Name: Goursan
Ingredients: Lamb vegetable goursan bread
Local Image Path: najd_village_images/Goursan.jpg
-----------------------------------
Downloaded 

  item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()
  'Date Scraped': datetime.utcnow().date().isoformat()


Downloaded image for 'Hashi Steam' to najd_village_images/Hashi_Steam.jpg
--- Extracted Magloba Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/مضغوط-حاشي-Hashi-steam-2.png
Item Name: Hashi Steam
Ingredients: Rice Camel meat
Local Image Path: najd_village_images/Hashi_Steam.jpg
-----------------------------------
Downloaded image for 'Chicken Steam' to najd_village_images/Chicken_Steam.jpg
--- Extracted Magloba Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/مضغوط-دجاج-Chicken-steam-1.png
Item Name: Chicken Steam
Ingredients: Rice chicken
Local Image Path: najd_village_images/Chicken_Steam.jpg
-----------------------------------
Downloaded image for 'Naimi Meat Steam' to najd_village_images/Naimi_Meat_Steam.jpg
--- Extracted Magloba Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/مضغوط-لحم-Lamb-steam-1.png
Item Name: Naimi Meat Steam
Ingredients: Rice lamb
Local Image Path: najd_village_images/

  item_name = "".join(h3_tag.find_all(text=True, recursive=False)).strip()
  'Date Scraped': datetime.utcnow().date().isoformat()


Downloaded image for 'Cream Caramel' to najd_village_images/Cream_Caramel.jpg
--- Extracted Dessert Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2024/02/removal.ai_eb3f9dc9-41c5-4fbf-9343-f1e798042e38-caramel1-1.png
Item Name: Cream Caramel
Ingredients: 
Local Image Path: najd_village_images/Cream_Caramel.jpg
-----------------------------------
Downloaded image for 'Millet Cake (Dukh'n)' to najd_village_images/Millet_Cake_Dukhn.jpg
--- Extracted Dessert Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2024/02/كيكة-الدخن-للموقع.png
Item Name: Millet Cake (Dukh'n)
Ingredients: Cake
Local Image Path: najd_village_images/Millet_Cake_Dukhn.jpg
-----------------------------------
Downloaded image for 'Mohalabiya' to najd_village_images/Mohalabiya.jpg
--- Extracted Dessert Item Details ---
Image URL: https://najdvillage.com/wp-content/uploads/2020/07/Mahlabiya-مهلبية-1.png
Item Name: Mohalabiya
Ingredients: milk sugar pistachio
Local Image Path: