<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/datasciphase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Smoke-test cell: confirms the kernel is running.
print("hi")

hi


In [None]:
# Cell 1: Imports & Logging
import time, json, csv, re, sys, traceback
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

def log(msg, level="INFO"):
    """Print ``msg`` to stdout, prefixed with the current local time and ``level``.

    The emitted line has the form ``[YYYY-MM-DD HH:MM:SS][LEVEL] message``.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}][{level}] {msg}")

In [None]:
# Cell 2: Config

# --- Target site ---
BASE = "https://kitchen.sayidaty.net"
START_LIST = "https://kitchen.sayidaty.net/recipes/index/cuisine/2419"  # Saudi cuisine listing

# --- Output files ---
OUT_JSONL = "sayidaty_saudi_recipes.jsonl"
OUT_CSV = "sayidaty_saudi_recipes.csv"
URLS_CHECKPOINT = "sayidaty_urls.txt"

# --- Request behaviour ---
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; research-bot/1.0; +https://example.edu)",
}
REQUEST_TIMEOUT = 20
SLEEP_BETWEEN_REQUESTS = 1.5
MAX_PAGES = 12  # raise this if more listing pages are needed

In [None]:
# Cell 3: HTTP helper
def get_soup(url, retries=2, backoff=2.0):
    """Download ``url`` and return its HTML parsed with ``html.parser``.

    Connection errors and timeouts are retried a bounded number of times,
    because a single dropped connection would otherwise lose the page for
    the whole crawl. HTTP error statuses are not retried.

    Args:
        url: Absolute URL to fetch.
        retries: Extra attempts allowed after a connection error or timeout.
            ``0`` gives the original single-attempt behaviour.
        backoff: Base delay in seconds; the wait before retry ``k`` is
            ``backoff * k``.

    Returns:
        A ``BeautifulSoup`` tree built from the response text.

    Raises:
        requests.HTTPError: ``raise_for_status()`` rejected the response.
        requests.RequestException: Any other requests failure, including a
            connection error or timeout that persisted through all attempts.
        Exception: Any other error, re-raised after being logged.
    """
    total_attempts = max(0, retries) + 1
    for attempt in range(1, total_attempts + 1):
        try:
            log(f"GET {url}")
            r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            return BeautifulSoup(r.text, "html.parser")
        except requests.HTTPError as e:
            log(f"HTTPError {url}: {e}", "ERROR")
            raise
        except (requests.ConnectionError, requests.Timeout) as e:
            if attempt < total_attempts:
                wait = backoff * attempt
                log(
                    f"Transient error {url}: {e}; "
                    f"retry {attempt}/{total_attempts - 1} in {wait:.1f}s",
                    "WARN",
                )
                time.sleep(wait)
                continue
            # Out of attempts: report exactly as a non-retried failure would.
            log(f"RequestException {url}: {e}", "ERROR")
            raise
        except requests.RequestException as e:
            log(f"RequestException {url}: {e}", "ERROR")
            raise
        except Exception as e:
            log(f"Unknown error while fetching {url}: {e}", "ERROR")
            raise

In [None]:
# Cell 4: Listing page -> recipe URLs
def list_page_urls(list_url):
    """Return the sorted, de-duplicated recipe URLs linked from one listing page.

    Any failure (fetching or parsing) is logged with its traceback and an
    empty list is returned instead of raising.
    """
    try:
        page = get_soup(list_url)
        # Keep only /node/ links, since those are most likely recipe pages.
        hrefs = (anchor.get("href") for anchor in page.select("a[href*='/node/']"))
        found = {urljoin(BASE, h) for h in hrefs if h and "/node/" in h}
        result = sorted(found)
        log(f"Found {len(result)} recipe links in listing page")
        return result
    except Exception:
        log("Failed to parse listing page; see traceback below", "ERROR")
        traceback.print_exc()
        return []

In [None]:
# Cell 5: Paginate listing
def paginate_listing(start_url, max_pages=12, empty_retries=1):
    """Walk the listing pages and return all recipe URLs, sorted and unique.

    Page 1 is ``start_url`` itself; page ``p`` is ``{start_url}/?page={p}``.
    Pagination stops at the first page that yields no links. Because
    ``list_page_urls`` also returns an empty list when the fetch fails, an
    empty page is re-checked ``empty_retries`` times before it is treated as
    the end of the listing, so one transient failure does not cut the crawl
    short.

    Args:
        start_url: URL of the first listing page.
        max_pages: Highest page number to request.
        empty_retries: How many times to re-request a page that came back
            empty before stopping. ``0`` gives the original behaviour.

    Returns:
        Sorted list of unique recipe URLs. The same list is also written,
        one URL per line, to ``URLS_CHECKPOINT`` (best effort).
    """
    all_urls = []
    for p in range(1, max_pages + 1):
        page_url = start_url if p == 1 else f"{start_url}/?page={p}"
        log(f"Listing page {p} -> {page_url}")
        try:
            urls = list_page_urls(page_url)
            attempts_left = max(0, empty_retries)
            while not urls and attempts_left > 0:
                log("Listing page came back empty; re-checking before stopping", "WARN")
                time.sleep(SLEEP_BETWEEN_REQUESTS)
                urls = list_page_urls(page_url)
                attempts_left -= 1
            if not urls:
                log("No more urls found, stopping pagination")
                break
            all_urls.extend(urls)
        except Exception:
            log("Error while listing this page; continuing to next page", "ERROR")
            traceback.print_exc()
        time.sleep(SLEEP_BETWEEN_REQUESTS)
    all_urls = sorted(set(all_urls))
    # Checkpoint the URLs so they can be reused if the scraper stops midway.
    try:
        with open(URLS_CHECKPOINT, "w", encoding="utf-8") as f:
            f.write("\n".join(all_urls))
        log(f"Saved URLs checkpoint: {URLS_CHECKPOINT} ({len(all_urls)} urls)")
    except Exception:
        log("Failed to write URLs checkpoint", "ERROR")
        traceback.print_exc()
    return all_urls

In [None]:
# ====== Ingredient cleaner (kept outside extract_recipe) ======
# Quantities: integers, fractions/decimals such as 1/2, 1.5 or 1,5, and the
# vulgar-fraction glyphs. \d also matches Arabic-Indic digits in str patterns.
_QUANTITY_RE = re.compile(r"\d+(?:[/.,٫]\d+)?|[½¼¾]")
# Measurement words, matched as whole words only so that a unit embedded in
# another word (e.g. "لتر" inside "فلتر") is left alone. Longer alternatives
# come first so "كيلوجرام" is not split into "كيلو" + remainder.
_UNIT_RE = re.compile(
    r"\b(?:كيلوجرام|كيلوغرام|كيلو|كوب|أكواب|ملعقة|ملاعق|صغيره|صغيرة|كبيره|كبيرة"
    r"|حبة|حبات|جرام|غرام|لتر|رشة)\b"
)
# The connective "من" ("of") when it stands between whitespace.
_FILLER_RE = re.compile(r"\s+من\s+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_ingredient(text):
    """Strip quantities and common measurement words from an ingredient line.

    Args:
        text: Raw ingredient text, e.g. ``"2 كوب أرز"``. ``None`` or an empty
            string yields ``""``.

    Returns:
        The remaining text with runs of whitespace collapsed to single spaces
        and no leading/trailing whitespace. May be empty.
    """
    if not text:
        return ""
    text = _QUANTITY_RE.sub("", text)
    text = _UNIT_RE.sub("", text)
    text = _FILLER_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# ====== Extract a single recipe ======
def _text_after_label(soup, label_pattern):
    """Return the text of the element following the first <span>/<div> whose
    string matches ``label_pattern``, or ``None`` if there is no such label."""
    label = soup.find(["span", "div"], string=re.compile(label_pattern))
    if not label:
        return None
    following = label.find_next()
    return following.get_text(strip=True) if following else None


def _list_texts_after_header(soup, header_pattern, list_tags):
    """Return the non-empty <li> texts of the first ``list_tags`` element that
    follows the first <h2>/<h3> whose string matches ``header_pattern``."""
    header = soup.find(["h3", "h2"], string=re.compile(header_pattern))
    if not header:
        return []
    container = header.find_next(list_tags)
    if not container:
        return []
    texts = (li.get_text(" ", strip=True) for li in container.find_all("li"))
    return [text for text in texts if text]


def extract_recipe(url):
    """Fetch one recipe page and pull its fields into a dict.

    Args:
        url: Recipe page URL.

    Returns:
        A dict with keys ``title``, ``date``, ``time``, ``serves``,
        ``ingredients`` (cleaned names), ``steps`` (numbered strings),
        ``tags``, ``image`` and ``url``. Scalar fields are ``None`` and list
        fields are empty when the matching markup is not found. Returns
        ``None`` if fetching or parsing raises; the error is logged with its
        traceback.
    """
    try:
        soup = get_soup(url)

        # Title: first <h1>/<h2> whose string matches ".+".
        title_el = soup.find(["h1", "h2"], string=re.compile(r".+"))
        title = title_el.get_text(strip=True) if title_el else None

        # Date: first text node after the title containing YYYY-MM-DD or any
        # run of four digits.
        date_text = None
        if title_el:
            date_node = title_el.find_next(string=re.compile(r"\d{4}-\d{2}-\d{2}|\d{4}"))
            date_text = date_node.strip() if date_node else None

        # Cooking time / serves: text of the element after each label.
        time_text = _text_after_label(soup, "وقت الطه")
        serves = _text_after_label(soup, "يكفي ل")

        # Ingredients (cleaned); entries that clean down to nothing are dropped.
        ingredient_texts = _list_texts_after_header(soup, "المقادير", ["ul", "ol"])
        ingredients = [name for name in map(clean_ingredient, ingredient_texts) if name]

        # Steps: numbered over the non-empty items only, so an empty <li>
        # no longer leaves a gap in the numbering.
        step_texts = _list_texts_after_header(soup, "طريقة التحضير", ["ol", "ul"])
        steps = [f"{number}. {text}" for number, text in enumerate(step_texts, 1)]

        # Tags: link texts inside the element that follows the tags header.
        tags = []
        tags_header = soup.find(["h5", "h4", "h3"], string=re.compile("سمات"))
        if tags_header:
            tag_container = tags_header.find_next()
            if tag_container:
                labels = (a.get_text(strip=True) for a in tag_container.find_all("a", href=True))
                tags = [label for label in labels if label]

        # Image.
        # NOTE(review): this takes the first <img> in the document, which may
        # be a site logo rather than the dish photo — confirm against the page markup.
        img = None
        img_el = soup.find("img")
        if img_el and img_el.get("src"):
            img = urljoin(BASE, img_el["src"])

        return {
            "title": title,
            "date": date_text,
            "time": time_text,
            "serves": serves,
            "ingredients": ingredients,   # names only (rice, milk, ...)
            "steps": steps,
            "tags": tags,
            "image": img,
            "url": url,
        }

    except Exception:
        log(f"extract_recipe failed for: {url}", "ERROR")
        traceback.print_exc()
        return None

In [None]:
# Cell 7: Saudi filter
# Tag values that mark a recipe as Saudi.
SAUDI_HINTS = {
    "المطبخ السعودي",
    "طبخات سعودية",
    "وصفات سعودية",
    "كبسة",
    "سليق",
    "ثريد",
    "مقلقل",
    "مندي",
    "قرصان",
    "مرقوق",
    "جريش",
    "كليجة",
    "لقيمات",
}


def is_saudi_recipe(data):
    """Decide whether an extracted recipe dict should be kept.

    Returns ``False`` for a missing/empty ``data``. A recipe with no tags is
    accepted; otherwise at least one tag must be exactly equal to an entry
    of ``SAUDI_HINTS``.
    """
    if not data:
        return False
    recipe_tags = set(data.get("tags") or ())
    # The crawl starts from the Saudi-cuisine listing, so this is only an
    # extra filter: untagged recipes pass through.
    if not recipe_tags:
        return True
    return bool(recipe_tags & SAUDI_HINTS)

In [None]:
# Cell 8: Crawl loop
def crawl_all():
    """Gather recipe URLs from the listing, extract each one, and return the
    recipes that pass ``is_saudi_recipe``.

    Returns an empty list if the pagination stage raises. A failure on a
    single URL is logged and the loop moves on; the crawl pauses
    ``SLEEP_BETWEEN_REQUESTS`` after every URL.
    """
    try:
        recipe_urls = paginate_listing(START_LIST, max_pages=MAX_PAGES)
        log(f"Total URLs gathered: {len(recipe_urls)}")
    except Exception:
        log("Failed during pagination stage", "ERROR")
        traceback.print_exc()
        return []

    total = len(recipe_urls)
    kept = []
    for position, recipe_url in enumerate(recipe_urls, start=1):
        log(f"[{position}/{total}] Extract -> {recipe_url}")
        try:
            recipe = extract_recipe(recipe_url)
            if not recipe or not is_saudi_recipe(recipe):
                log("Skipped (not Saudi or extraction returned None)", "INFO")
            else:
                kept.append(recipe)
        except Exception:
            log("Unexpected error in crawl loop for this URL", "ERROR")
            traceback.print_exc()
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    log(f"Crawl finished. Valid recipes: {len(kept)}")
    return kept

In [None]:
# Cell 9: Exporters
def export_jsonl(rows, path=OUT_JSONL):
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line.

    Non-ASCII characters are written as-is. Any failure is logged with its
    traceback instead of being raised.
    """
    try:
        with open(path, "w", encoding="utf-8") as sink:
            sink.writelines(
                json.dumps(record, ensure_ascii=False) + "\n" for record in rows
            )
        log(f"Saved JSONL -> {path}")
    except Exception:
        log("Failed to write JSONL", "ERROR")
        traceback.print_exc()

def export_csv(rows, path=OUT_CSV):
    """Write ``rows`` to ``path`` as a UTF-8 CSV with a header row.

    The list-valued fields (``ingredients``, ``steps``, ``tags``) are
    flattened into one cell joined by ``" | "``. A list field that is missing
    or ``None`` becomes an empty cell rather than aborting the export.
    Any failure is logged with its traceback instead of being raised.

    Args:
        rows: Iterable of recipe dicts.
        path: Destination file path.
    """
    columns = ["title", "date", "time", "serves", "ingredients", "steps", "tags", "image", "url"]
    list_columns = {"ingredients", "steps", "tags"}
    try:
        with open(path, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(columns)
            for r in rows:
                w.writerow([
                    # `or []` covers both a missing key and an explicit None.
                    " | ".join(str(item) for item in (r.get(col) or []))
                    if col in list_columns
                    else r.get(col)
                    for col in columns
                ])
        log(f"Saved CSV -> {path}")
    except Exception:
        log("Failed to write CSV", "ERROR")
        traceback.print_exc()

In [None]:
# Cell 10: Main
def main():
    """Run the full scrape: crawl, then export to JSONL and CSV.

    A keyboard interrupt or any other exception is logged rather than
    propagated.
    """
    log("=== START SCRAPE ===")
    try:
        recipes = crawl_all()
        for exporter, target in ((export_jsonl, OUT_JSONL), (export_csv, OUT_CSV)):
            exporter(recipes, target)
        log(f"=== DONE; Recipes: {len(recipes)} ===")
    except KeyboardInterrupt:
        log("Interrupted by user", "ERROR")
    except Exception:
        log("Fatal error in main()", "ERROR")
        traceback.print_exc()


if __name__ == "__main__":
    main()

[2025-09-13 18:37:17][INFO] === START SCRAPE ===
[2025-09-13 18:37:17][INFO] Listing page 1 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419
[2025-09-13 18:37:17][INFO] GET https://kitchen.sayidaty.net/recipes/index/cuisine/2419
[2025-09-13 18:37:18][INFO] Found 19 recipe links in listing page
[2025-09-13 18:37:20][INFO] Listing page 2 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=2
[2025-09-13 18:37:20][INFO] GET https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=2
[2025-09-13 18:37:21][INFO] Found 19 recipe links in listing page
[2025-09-13 18:37:22][INFO] Listing page 3 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=3
[2025-09-13 18:37:22][INFO] GET https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=3
[2025-09-13 18:37:23][INFO] Found 19 recipe links in listing page
[2025-09-13 18:37:25][INFO] Listing page 4 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=4
[2025-09-13 18:37:25][INFO] GET https://k

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 1430, in getresponse
    response.begin()
  File "/usr/lib/python3.12/http/client.py", line 331, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 292, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/ssl.py", line

[2025-09-13 18:38:25][INFO] [6/215] Extract -> https://kitchen.sayidaty.net/node/24769/القرصان-السعودي-بالخضار/وصفات-طبخ/وصفات
[2025-09-13 18:38:25][INFO] GET https://kitchen.sayidaty.net/node/24769/القرصان-السعودي-بالخضار/وصفات-طبخ/وصفات
[2025-09-13 18:38:29][INFO] [7/215] Extract -> https://kitchen.sayidaty.net/node/25041/سمبوسة-حجازية/وصفات-رمضانية/وصفات
[2025-09-13 18:38:29][INFO] GET https://kitchen.sayidaty.net/node/25041/سمبوسة-حجازية/وصفات-رمضانية/وصفات
[2025-09-13 18:38:33][INFO] [8/215] Extract -> https://kitchen.sayidaty.net/node/25366/حلى-الكليجا/حلويات/وصفات
[2025-09-13 18:38:33][INFO] GET https://kitchen.sayidaty.net/node/25366/حلى-الكليجا/حلويات/وصفات
[2025-09-13 18:38:36][INFO] [9/215] Extract -> https://kitchen.sayidaty.net/node/25367/الشكلمه-بدون-بيض/حلويات/وصفات
[2025-09-13 18:38:36][INFO] GET https://kitchen.sayidaty.net/node/25367/الشكلمه-بدون-بيض/حلويات/وصفات
[2025-09-13 18:38:40][INFO] [10/215] Extract -> https://kitchen.sayidaty.net/node/25434/الرز-الحساوي/وصفات

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 1430, in getresponse
    response.begin()
  File "/usr/lib/python3.12/http/client.py", line 331, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 292, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/ssl.py", line

[2025-09-13 18:39:41][INFO] [23/215] Extract -> https://kitchen.sayidaty.net/node/26298/خبز-القرصان-السعودي/وصفات-الخبز/وصفات
[2025-09-13 18:39:41][INFO] GET https://kitchen.sayidaty.net/node/26298/خبز-القرصان-السعودي/وصفات-الخبز/وصفات
[2025-09-13 18:39:45][INFO] [24/215] Extract -> https://kitchen.sayidaty.net/node/26415/كبسة-بيضاء-بالدجاج/وصفات-طبخ/وصفات
[2025-09-13 18:39:45][INFO] GET https://kitchen.sayidaty.net/node/26415/كبسة-بيضاء-بالدجاج/وصفات-طبخ/وصفات
[2025-09-13 18:39:49][INFO] [25/215] Extract -> https://kitchen.sayidaty.net/node/26517/سليق-طائفي-بالدجاج/وصفات-طبخ/وصفات
[2025-09-13 18:39:49][INFO] GET https://kitchen.sayidaty.net/node/26517/سليق-طائفي-بالدجاج/وصفات-طبخ/وصفات
[2025-09-13 18:39:57][INFO] [26/215] Extract -> https://kitchen.sayidaty.net/node/26519/قهوة-اللوز-الخفيفة/مشروبات-وعصائر/وصفات
[2025-09-13 18:39:57][INFO] GET https://kitchen.sayidaty.net/node/26519/قهوة-اللوز-الخفيفة/مشروبات-وعصائر/وصفات
[2025-09-13 18:40:00][INFO] [27/215] Extract -> https://kitchen.

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 1430, in getresponse
    response.begin()
  File "/usr/lib/python3.12/http/client.py", line 331, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 292, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/ssl.py", line

[2025-09-13 18:41:53][INFO] [53/215] Extract -> https://kitchen.sayidaty.net/node/28379/هريس-أحمر-مع-صلصة-الطماطم-والفليفلة-الحارة/وصفات-طبخ/وصفات
[2025-09-13 18:41:53][INFO] GET https://kitchen.sayidaty.net/node/28379/هريس-أحمر-مع-صلصة-الطماطم-والفليفلة-الحارة/وصفات-طبخ/وصفات
[2025-09-13 18:41:57][INFO] [54/215] Extract -> https://kitchen.sayidaty.net/node/28400/رز-بخاري-باللحم-والزبيب/وصفات-طبخ/وصفات
[2025-09-13 18:41:57][INFO] GET https://kitchen.sayidaty.net/node/28400/رز-بخاري-باللحم-والزبيب/وصفات-طبخ/وصفات
[2025-09-13 18:42:00][INFO] [55/215] Extract -> https://kitchen.sayidaty.net/node/28401/القرصان-السعودي-بالخضار-والكشنة/وصفات-طبخ/وصفات
[2025-09-13 18:42:00][INFO] GET https://kitchen.sayidaty.net/node/28401/القرصان-السعودي-بالخضار-والكشنة/وصفات-طبخ/وصفات
[2025-09-13 18:42:03][INFO] [56/215] Extract -> https://kitchen.sayidaty.net/node/28408/كبسة-الدجاج-على-الطريقة-السعودية-التقليدية/وصفات-طبخ/وصفات
[2025-09-13 18:42:03][INFO] GET https://kitchen.sayidaty.net/node/28408/كبسة-ال