<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/Sayyidaty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import csv    # Library for working with CSV files
import json   # Library for working with JSON data
import re   # Library for regular expressions
import time   # Library for handling time-related tasks (like pauses)
from datetime import datetime   # Library for getting the current date and time
from urllib.parse import urljoin   # Helper for resolving relative URLs against a base URL

import requests   # Library for making HTTP requests
from bs4 import BeautifulSoup   # Library for parsing HTML documents

class SaudiDishScraper:
    """Scrape Saudi-cuisine recipes from kitchen.sayidaty.net.

    The scraper walks the paginated cuisine listing, collects every recipe
    link containing ``/node/``, downloads each recipe page and extracts the
    dish name, a cleaned ingredient list and an image URL.

    Attributes:
        base_url: Root URL of the website.
        saudi_cuisine_url: URL of the Saudi cuisine listing page.
        dishes_data: List of dish dictionaries collected so far.
    """

    # Headers that mimic a desktop web browser.
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Single-pass character map used by normalize_ar. Keys are code points;
    # a value of None deletes the character.
    _AR_CHAR_MAP = {
        **dict.fromkeys(range(0x064B, 0x0653)),  # tashkeel U+064B..U+0652 -> removed
        0x0670: None,      # superscript alif -> removed
        0x0640: None,      # tatweel (elongation) -> removed
        0x0622: '\u0627',  # alif with madda -> bare alif
        0x0623: '\u0627',  # alif with hamza above -> bare alif
        0x0625: '\u0627',  # alif with hamza below -> bare alif
        0x0649: '\u064A',  # alif maqsura -> ya
        0x0624: '\u0648',  # waw with hamza -> waw
        0x0626: '\u064A',  # ya with hamza -> ya
        0x0629: '\u0647',  # ta marbuta -> ha
    }

    # Anything that is neither a word character nor in the Arabic block is a
    # separator. The Arabic comma, semicolon and question mark live inside the
    # Arabic block, so they are listed explicitly to keep them out of tokens.
    _TOKEN_SEPARATOR_RE = re.compile(r'(?:[^\w\u0600-\u06FF]|[\u060C\u061B\u061F])+')
    _PARENTHESISED_RE = re.compile(r'\(.*?\)')
    _IMAGE_SRC_RE = re.compile(r'\.(jpg|jpeg|png|webp)', re.I)

    def __init__(self):
        """Set up the site URLs and an empty result list."""
        self.base_url = "https://kitchen.sayidaty.net"    # Base URL for the website
        # The URL for the Saudi cuisine category
        self.saudi_cuisine_url = f"{self.base_url}/recipes/index/cuisine/2419"
        self.dishes_data = []

    def normalize_ar(self, s: str) -> str:
        """Return *s* with Arabic spelling variants folded to one form.

        Removes tashkeel and tatweel, maps every alif variant to a bare alif,
        alif maqsura and hamza-on-ya to ya, hamza-on-waw to waw and ta marbuta
        to ha, then strips surrounding whitespace.

        Args:
            s: Text to normalize; may be empty or None.

        Returns:
            The normalized text, or '' when *s* is empty or None.
        """
        if not s:
            return ''
        # Every rule is a one-character mapping and no rule's output is
        # another rule's input, so one translate pass covers all of them.
        return s.translate(self._AR_CHAR_MAP).strip()

    def tokenize(self, s: str) -> list:
        """Split *s* into words.

        Args:
            s: Text to split; may be empty or None.

        Returns:
            A list of tokens, with separators and Arabic punctuation removed.
            Empty or None input yields an empty list.
        """
        if not s:
            return []
        # str.split() with no argument never yields empty strings.
        return self._TOKEN_SEPARATOR_RE.sub(' ', s).split()

    def get_page_content(self, url):
        """Fetch *url* and return its HTML.

        Args:
            url: Absolute URL to download.

        Returns:
            The response body as text, or None when the request fails or the
            server answers with an error status.
        """
        try:
            # Send an HTTP GET request to the URL with a 10-second timeout
            response = requests.get(url, headers=self._REQUEST_HEADERS, timeout=10)
            # Raise an HTTPError if the response status code is an error
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
        # When the server sends a text content type without a charset,
        # requests assumes ISO-8859-1, which would garble Arabic text; fall
        # back to the encoding detected from the body in that case.
        if (response.encoding or '').lower() == 'iso-8859-1':
            response.encoding = response.apparent_encoding
        return response.text

    def _absolute_url(self, href):
        """Resolve *href* against the site root and return an absolute URL."""
        href = href.strip()
        if href.startswith(('http://', 'https://')):
            return href
        # urljoin copes with 'path', '/path' and protocol-relative '//host/path'.
        return urljoin(f"{self.base_url}/", href)

    def extract_dish_links(self, html_content):
        """Collect the recipe links found in a listing page.

        Args:
            html_content: HTML of a listing page; may be empty or None.

        Returns:
            A list of unique absolute URLs whose path contains '/node/', in
            the order they first appear in the page.
        """
        if not html_content:
            return []
        soup = BeautifulSoup(html_content, 'html.parser')
        # A dict de-duplicates while keeping first-seen order, so repeated
        # runs visit the dishes in the same order.
        unique_links = {}
        for link in soup.select('a[href*="/node/"]'):
            href = link.get('href')
            if href and '/node/' in href:
                unique_links[self._absolute_url(href)] = None
        return list(unique_links)

    def clean_ingredient_text(self, text):
        """Reduce a raw ingredient line to a normalized ingredient name.

        Parenthesised remarks are dropped, only the part before the first
        colon is kept, Arabic spelling is normalized and whitespace runs are
        collapsed to single spaces.

        Args:
            text: Raw ingredient line; may be empty or None.

        Returns:
            The cleaned ingredient text, or '' when nothing remains.
        """
        if not text:
            return ''
        # Drop parentheses before cutting at the colon: a colon inside a
        # remark would otherwise leave a dangling "(" fragment behind.
        text = self._PARENTHESISED_RE.sub('', text)
        text = text.partition(':')[0]
        # Removing a remark from the middle of a line leaves a double space.
        return ' '.join(self.normalize_ar(text).split())

    def _find_image_url(self, soup):
        """Return the dish image URL from a parsed page, or None if absent."""
        meta_image = soup.find('meta', property='og:image')
        candidate = meta_image.get('content', '') if meta_image else ''
        if not candidate:
            # No usable og:image: fall back to the first <img> whose src
            # carries a common image extension.
            img_element = soup.find('img', {'src': self._IMAGE_SRC_RE})
            candidate = img_element.get('src', '') if img_element else ''
        return self._absolute_url(candidate) if candidate else None

    def extract_dish_details(self, dish_url):
        """Scrape one recipe page.

        Args:
            dish_url: Absolute URL of the recipe page.

        Returns:
            A dict with the keys 'dish_name', 'ingredients', 'image_url',
            'dish_url' and 'scrape_date', or None when the page could not be
            fetched.
        """
        html_content = self.get_page_content(dish_url)
        if not html_content:
            return None
        soup = BeautifulSoup(html_content, 'html.parser')
        # Take the dish name from the most likely heading tags, then <title>
        title_element = soup.find('h1') or soup.find('h2') or soup.find('title')
        dish_name = title_element.get_text().strip() if title_element else ''
        if not dish_name:
            dish_name = "Unknown Dish"
        ingredients = []
        ingredients_area = soup.find('div', class_='ingredients-area')
        if ingredients_area:
            # stripped_strings yields each text fragment separately, including
            # those separated by <br>
            for line in ingredients_area.stripped_strings:
                cleaned_line = self.clean_ingredient_text(line)
                if cleaned_line:
                    ingredients.append(cleaned_line)
        # Remove duplicate ingredients while preserving order
        ingredients = list(dict.fromkeys(ingredients))
        return {
            'dish_name': dish_name,
            'ingredients': ingredients,
            'image_url': self._find_image_url(soup),
            'dish_url': dish_url,
            'scrape_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }

    def scrape_saudi_dishes(self, max_pages=5, delay=1):
        """Scrape the listing pages and then every recipe they link to.

        Args:
            max_pages: Number of listing pages to visit, starting at page 1.
            delay: Seconds to pause after each successful listing fetch and
                after each recipe request.

        Returns:
            self.dishes_data, extended with one dict per recipe that has at
            least one ingredient.
        """
        print("Starting to scrape Saudi dishes...")
        # Dict keys give order-preserving de-duplication across pages.
        unique_links = {}
        for page in range(1, max_pages + 1):
            page_url = f"{self.saudi_cuisine_url}?page={page}" if page > 1 else self.saudi_cuisine_url
            print(f"Scraping page {page}: {page_url}")
            html_content = self.get_page_content(page_url)
            if not html_content:
                print(f"Failed to retrieve page {page}")
                continue    # Skip to the next page if fetching fails
            dish_links = self.extract_dish_links(html_content)
            unique_links.update(dict.fromkeys(dish_links))
            print(f"Found {len(dish_links)} dishes on page {page}")
            time.sleep(delay)
        all_dish_links = list(unique_links)
        print(f"Total unique dishes found: {len(all_dish_links)}")
        for i, dish_url in enumerate(all_dish_links, 1):
            print(f"Scraping dish {i}/{len(all_dish_links)}: {dish_url}")
            dish_data = self.extract_dish_details(dish_url)
            if dish_data and dish_data['ingredients']:  # Keep only dishes with ingredients
                self.dishes_data.append(dish_data)
            time.sleep(delay)
        print(f"Successfully scraped {len(self.dishes_data)} dishes with ingredients")
        return self.dishes_data

    def save_to_csv(self, filename="saudi_dishes.csv"):
        """Write the scraped dishes to a UTF-8 CSV file.

        The ingredient list of each dish is stored in one cell, joined
        with '|'. Nothing is written when there is no data.

        Args:
            filename: Path of the CSV file to create or overwrite.
        """
        if not self.dishes_data:
            print("No data to save")
            return
        fieldnames = ['dish_name', 'ingredients', 'image_url', 'dish_url', 'scrape_date']
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for dish in self.dishes_data:
                # Build a new row dict so the stored list is left untouched.
                writer.writerow({**dish, 'ingredients': '|'.join(dish['ingredients'])})
        print(f"Data saved to {filename}")

    def save_to_json(self, filename="saudi_dishes.json"):
        """Write the scraped dishes to a UTF-8 JSON file.

        Nothing is written when there is no data.

        Args:
            filename: Path of the JSON file to create or overwrite.
        """
        if not self.dishes_data:
            print("No data to save")
            return
        with open(filename, 'w', encoding='utf-8') as jsonfile:
            # ensure_ascii=False keeps Arabic text readable in the file
            json.dump(self.dishes_data, jsonfile, ensure_ascii=False, indent=2)
        print(f"Data saved to {filename}")

if __name__ == "__main__":
    # Entry point: runs only when the file is executed directly.
    scraper = SaudiDishScraper()
    # Scrape the first three listing pages and every dish they link to.
    dishes = scraper.scrape_saudi_dishes(max_pages=3)
    if dishes:
        # Persist the results in both formats before showing a preview.
        scraper.save_to_csv()
        scraper.save_to_json()
        print("\nSample of scraped data:")
        # Show at most three dishes, numbered from 1, as a quick check.
        preview = dishes[:3]
        for number, dish in enumerate(preview, start=1):
            print(f"\nDish {number}:")
            print(f"Name: {dish['dish_name']}")
            print(f"Ingredients: {dish['ingredients']}")
            print(f"Image URL: {dish['image_url']}")
            print(f"Scraped on: {dish['scrape_date']}")

Starting to scrape Saudi dishes...
Scraping page 1: https://kitchen.sayidaty.net/recipes/index/cuisine/2419
Found 18 dishes on page 1
Scraping page 2: https://kitchen.sayidaty.net/recipes/index/cuisine/2419?page=2
Found 19 dishes on page 2
Scraping page 3: https://kitchen.sayidaty.net/recipes/index/cuisine/2419?page=3
Found 19 dishes on page 3
Total unique dishes found: 54
Scraping dish 1/54: https://kitchen.sayidaty.net/node/36100/الخبز-الأحمر-الحساوي/وصفات-الخبز/وصفات
Scraping dish 2/54: https://kitchen.sayidaty.net/node/36099/الحنيني-السعودي/حلويات/وصفات
Scraping dish 3/54: https://kitchen.sayidaty.net/node/34178/اللقيمات-الذهبية-الهشة-بدون-بيض/حلويات/وصفات
Scraping dish 4/54: https://kitchen.sayidaty.net/node/36120/الدبيازة-حلى-ضيافة-لليوم-الوطني-السعودي/حلويات/وصفات
Scraping dish 5/54: https://kitchen.sayidaty.net/node/36110/المصابيب-السعودية/حلويات/وصفات
Scraping dish 6/54: https://kitchen.sayidaty.net/node/36118/قهوة-اللوز-الحجازية-التقليدية/مشروبات-وعصائر/وصفات
Scraping dish 7/