<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/datasciphase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================
# Cell 1: Imports & Logging
# =========================
import time, json, csv, re, sys, traceback
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime, timezone
from collections import OrderedDict

def log(msg, level="INFO"):
    """Print ``msg`` to stdout, prefixed with a local timestamp and ``level``.

    Output format: ``[YYYY-MM-DD HH:MM:SS][LEVEL] message``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{stamp}][{level}] {msg}")

In [2]:
# =================
# Cell 2: Config
# =================
# Site root; relative hrefs found while scraping are resolved against it.
BASE = "https://kitchen.sayidaty.net"
START_LIST = f"{BASE}/recipes/index/cuisine/2419"  # Saudi cuisine listing page

# Passed as `timeout=` to the HTTP GET calls.
REQUEST_TIMEOUT = 20
# Passed to time.sleep() between consecutive requests.
SLEEP_BETWEEN_REQUESTS = 1.2
# Upper bound on listing pages walked by paginate_listing (see crawl_all).
MAX_PAGES = 12

# Request headers; installed on the shared session in make_session().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; research-bot/1.0; +https://example.edu)"
}

# Output files
OUT_JSONL = "sayidaty_saudi_recipes.jsonl"
OUT_CSV   = "sayidaty_saudi_recipes.csv"
URLS_CHECKPOINT = "sayidaty_urls.txt"

In [3]:
# =====================================
# Cell 3: HTTP session with retries
# =====================================
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    """Create a ``requests.Session`` with automatic retries and default headers.

    GET and HEAD requests are retried (up to 5 attempts in total, with a
    backoff factor of 0.8) when the server answers 429, 500, 502, 503 or 504.
    The retry policy is mounted for both ``https://`` and ``http://`` and the
    module-level ``HEADERS`` are installed as session defaults.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"],
    )
    session = requests.Session()
    # One adapter per scheme, https first, as the transport for that prefix.
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    session.headers.update(HEADERS)
    return session

# Shared session (retry policy + default headers) created once at cell run time.
SESSION = make_session()

def get_soup(url):
    """Fetch ``url`` through the shared ``SESSION`` and return the parsed HTML.

    The response is parsed with BeautifulSoup's ``html.parser``. Any exception
    (including the HTTP error raised by ``raise_for_status``) is logged at
    ERROR level and then re-raised to the caller.
    """
    try:
        log(f"GET {url}")
        response = SESSION.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        html = response.text
        return BeautifulSoup(html, "html.parser")
    except Exception as exc:
        log(f"Request failed for {url}: {exc}", "ERROR")
        raise

In [4]:
# Cell 4: HTTP + listing
def get_soup(url):
    """Fetch ``url`` and return it parsed with BeautifulSoup's ``html.parser``.

    This definition replaces the earlier ``get_soup`` once the cell runs, so
    it goes through the shared ``SESSION`` as well: a bare ``requests.get``
    here would silently drop the retry/backoff policy configured in
    ``make_session``. ``SESSION`` already carries ``HEADERS`` as defaults.

    Raises:
        Whatever the request or ``raise_for_status`` raises; the failure is
        logged at ERROR level first and then re-raised unchanged.
    """
    log(f"GET {url}")
    try:
        response = SESSION.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except Exception as exc:
        log(f"Request failed for {url}: {exc}", "ERROR")
        raise
    return BeautifulSoup(response.text, "html.parser")

def list_page_urls(list_url):
    """Return the sorted, de-duplicated absolute ``/node/`` links of a listing page.

    Hrefs are resolved against ``BASE``. On any failure the error is logged,
    the traceback is printed, and an empty list is returned.
    """
    try:
        anchors = get_soup(list_url).select("a[href*='/node/']")
        hrefs = (anchor.get("href") for anchor in anchors)
        uniq = sorted({urljoin(BASE, h) for h in hrefs if h and "/node/" in h})
        log(f"Found {len(uniq)} recipe links in listing page")
        return uniq
    except Exception:
        log("Failed to parse listing page", "ERROR")
        traceback.print_exc()
        return []

def paginate_listing(start_url, max_pages=12):
    """Walk up to ``max_pages`` listing pages and return all recipe URLs, sorted.

    Page 1 is ``start_url`` itself; later pages use ``{start_url}/?page=N``.
    Pagination stops at the first page that yields no URLs. The sorted,
    de-duplicated result is also written to ``URLS_CHECKPOINT`` (one URL per
    line) before being returned.
    """
    collected = set()
    for page_no in range(1, max_pages + 1):
        if page_no == 1:
            page_url = start_url
        else:
            page_url = f"{start_url}/?page={page_no}"
        log(f"Listing page {page_no} -> {page_url}")
        try:
            found = list_page_urls(page_url)
            if not found:
                log("No more urls, stop")
                break
            collected.update(found)
        except Exception:
            log("Error while listing, continue", "ERROR")
            traceback.print_exc()
        # Pause between listing requests.
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    all_urls = sorted(collected)
    with open(URLS_CHECKPOINT, "w", encoding="utf-8") as checkpoint:
        checkpoint.write("\n".join(all_urls))
    log(f"Saved URLs checkpoint: {URLS_CHECKPOINT} ({len(all_urls)} urls)")
    return all_urls

In [5]:
# ==========================================
# Cell 4: Listing page -> recipe URLs
# ==========================================
def list_page_urls(list_url):
    """Return the sorted, unique absolute recipe URLs linked from a listing page.

    A link qualifies when its href contains ``/node/`` or ``/recipes/``; after
    resolution against ``BASE``, URLs containing ``/recipes/index`` (section
    and tag listings) are discarded. On any failure the error is logged, the
    traceback is printed, and an empty list is returned.
    """
    try:
        soup = get_soup(list_url)
        hrefs = (anchor.get("href") for anchor in soup.select("a[href]"))
        # Recipe links usually live under /node/ or /recipes/.
        resolved = (
            urljoin(BASE, href)
            for href in hrefs
            if href and ("/node/" in href or "/recipes/" in href)
        )
        # Ignore generic section/tag listing links.
        unique_links = sorted({u for u in resolved if "/recipes/index" not in u})
        log(f"Found {len(unique_links)} recipe links in listing page")
        return unique_links
    except Exception:
        log("Failed to parse listing page; see traceback below", "ERROR")
        traceback.print_exc()
        return []

In [6]:
# ================================
# Cell 5: Paginate listing
# ================================
def paginate_listing(start_url, max_pages=12):
    """Walk up to ``max_pages`` listing pages and return the recipe URLs found.

    Page 1 is ``start_url`` itself; later pages use ``{start_url}/?page=N``.
    Pagination stops at the first page that yields no URLs. Duplicates are
    dropped while keeping first-seen order. The result is written to
    ``URLS_CHECKPOINT`` on a best-effort basis (a write failure is logged but
    does not prevent the list from being returned).
    """
    gathered = []
    for page_no in range(1, max_pages + 1):
        if page_no == 1:
            page_url = start_url
        else:
            page_url = f"{start_url}/?page={page_no}"
        log(f"Listing page {page_no} -> {page_url}")
        try:
            page_links = list_page_urls(page_url)
            if not page_links:
                log("No more urls found, stopping pagination")
                break
            gathered += page_links
        except Exception:
            log("Error while listing this page; continuing", "ERROR")
            traceback.print_exc()
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    # De-duplicate while preserving first-seen order.
    all_urls = list(dict.fromkeys(gathered))

    # Save a checkpoint of the collected URLs.
    try:
        with open(URLS_CHECKPOINT, "w", encoding="utf-8") as checkpoint:
            checkpoint.write("\n".join(all_urls))
        log(f"Saved URLs checkpoint: {URLS_CHECKPOINT} ({len(all_urls)} urls)")
    except Exception:
        log("Failed to write URLs checkpoint", "ERROR")
        traceback.print_exc()
    return all_urls

In [7]:
# =====================================
# Cell 6: Helpers — clean ingredients
# =====================================
import re
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone

# Arabic-Indic digits; interpolated into the digit-stripping character class
# in _strip_numbers_units.
AR_NUMS = "٠١٢٣٤٥٦٧٨٩"
# Single-character fractions, stripped together with the digits.
FRACTIONS = "¼½¾⅓⅔⅛⅜⅝⅞"

# Words to strip (units / adjectives we don't want to keep as standalone tokens)
UNIT_WORDS = {
    "كوب","أكواب","ملعقة","ملاعق","صغيرة","صغيره","كبيرة","كبيره","حبة","حبات",
    "جرام","غرام","غ","غم","كغ","كيلو","مل","ملل","ملليلتر","لتر","رشة","رشّة",
    "حسب","الرغبة","قليل","القليل","مقدار","تقريباً","تقريبا","حوالي",
    "مطحون","ناعم","خشن","مقطع","شرائح","كاملة","مجروش","مهروس",
    "للقلي","للتزيين","للتقديم","طازج","طازجة","طازجه","أو"
}

# Any line containing these is considered "method/verb" → drop entirely
# (_has_verb tests each entry as a substring of the text, not as a whole word).
VERB_HINTS = {
    "أضيفي","اضيفي","نضيف","يضاف","أضف","اضف",
    "اخلطي","اخلط","نخلط","يُخلط",
    "قلّبي","قلبي","قلّب","نقلب","يُقلب",
    "اطحني","نطحن","يطحن",
    "اسكبي","نسكب","يسكب",
    "قطّعي","قطعي","نقطّع","نقطع","يقطع",
    "اخبزي","نخبز","يخبز",
    "اسلقي","نسلق","يسلق",
    "قدّمي","قدمي","نقدّم","نقدم","يقدّم","يقدم",
    "ضعي","ضع","نضع","يوضع",
    "حمّري","حمري","نحمّر","نحمر","يحمّر","يحمر",
    "اقلي","نقلي","يقلى","اضربي","اخفقي","افردي","اعجني",
    "طريقة","التحضير","خطوة","نضيفها","نقلبها","نتركها","نقدّمها"
}

# allow compounds like "زيت الزيتون" / "فلفل أسود" / "صلصة الطماطم"
# COMPOUND_HEADS: first words that always keep the following word (_pick_compound).
COMPOUND_HEADS = {"زيت","صلصة","معجون","لبن","لبنة","حليب","مرق","فلفل","سكر","ملح","طحين","دقيق","ماء","روح","ماءُ"}
# ALLOWED_ADJ: second words that are kept after any first word (_pick_compound).
# NOTE(review): a few entries here also appear in UNIT_WORDS; clean_ingredients_list
# drops UNIT_WORDS tokens before calling _pick_compound, so those entries look
# unreachable on that path — confirm whether that is intended.
ALLOWED_ADJ   = {"أسود","أبيض","أخضر","أحمر","حار","حلو","مالح","مجروش","مطحون","خشِن","ناعم","طماطم","الزيتون","الورد","الزهر"}

# Regex of punctuation separators; clean_ingredients_list splits each cleaned
# line on runs of these characters.
SPLIT_TOKENS = r"[،,\+\-\–\—;:()\[\]{}\/\\\|]+"

def _has_verb(text: str) -> bool:
    """Return True when any ``VERB_HINTS`` entry occurs as a substring of ``text``."""
    for hint in VERB_HINTS:
        if hint in text:
            return True
    return False

def _strip_numbers_units(text: str) -> str:
    """Remove quantities, unit words and filler from one ingredient line.

    Steps, in order:
      1. numeric ranges such as ``1-2`` (Latin or Arabic-Indic digits, or
         fraction characters, on either side of a dash);
      2. any remaining digits / fraction characters;
      3. whole-word occurrences of ``UNIT_WORDS``;
      4. the filler word "من" when surrounded by whitespace;
      5. whitespace runs are collapsed and leading/trailing spaces, dots and
         Arabic commas are trimmed.

    Returns the cleaned text (possibly an empty string).
    """
    numeral = fr"[0-9{AR_NUMS}{FRACTIONS}]"

    # Ranges must go BEFORE the plain digit removal: once the digits are gone
    # there is nothing left for a range pattern to match and the dash would be
    # left dangling in the text.
    text = re.sub(fr"{numeral}+\s*[-–—]\s*{numeral}+", " ", text)
    # remove remaining digits: Arabic/Latin + fractions
    text = re.sub(fr"{numeral}+", " ", text)

    # remove unit words (whole words only)
    if UNIT_WORDS:
        units = "|".join(map(re.escape, UNIT_WORDS))
        text = re.sub(fr"\b({units})\b", " ", text, flags=re.IGNORECASE)

    # common filler
    text = re.sub(r"\s+من\s+", " ", text)

    # collapse spaces, then trim separators left at the edges
    text = re.sub(r"\s+", " ", text)
    return text.strip(" .،").strip()

def _pick_compound(tokens):
    """
    Return one or two-word ingredient:
    - if first token is in COMPOUND_HEADS and there's a next ⇒ use first two
    - or if second starts with 'ال' ⇒ first two
    - or if second in ALLOWED_ADJ ⇒ first two
    else ⇒ first only
    """
    if not tokens:
        return None
    if len(tokens) == 1:
        return tokens[0]
    w1, w2 = tokens[0], tokens[1]
    if (w1 in COMPOUND_HEADS) or w2.startswith("ال") or (w2 in ALLOWED_ADJ):
        return f"{w1} {w2}"
    return w1

def clean_ingredients_list(raw_lines):
    """Turn raw ingredient lines into unique, ordered 1–2 word ingredient names.

    Input: the raw ``<li>`` texts taken from the ingredients section only.
    Output: a list of phrases in first-seen order, without duplicates.

    For each line: lines that look like preparation steps (``_has_verb``) are
    dropped; numbers and units are stripped; the remainder is split on
    punctuation (``SPLIT_TOKENS``) and on a standalone "و"; only purely-Arabic
    words longer than one character and not in ``UNIT_WORDS`` are kept; and
    ``_pick_compound`` chooses the final one- or two-word phrase.
    """
    ordered = {}  # insertion-ordered; keys are the phrases emitted so far
    for line in raw_lines:
        # Skip blanks and anything that smells like a method step.
        if not line or _has_verb(line):
            continue

        stripped = _strip_numbers_units(line)

        # Punctuation separates independent ingredient mentions.
        for chunk in re.split(SPLIT_TOKENS, stripped):
            chunk = chunk.strip()
            if not chunk or _has_verb(chunk):
                continue

            # A standalone conjunction also separates two ingredients.
            for piece in re.split(r"\s+و\s+", chunk):
                piece = piece.strip(" .،")
                if not piece or _has_verb(piece):
                    continue

                # Arabic-only words, longer than one char, and not unit words.
                words = [
                    w
                    for w in piece.split()
                    if re.match(r"^[\u0600-\u06FF]+$", w)
                    and len(w) > 1
                    and w not in UNIT_WORDS
                ]
                if not words:
                    continue

                phrase = _pick_compound(words[:3])
                if not phrase:
                    continue

                phrase = phrase.strip(" .،")
                if phrase:
                    ordered.setdefault(phrase, None)
    return list(ordered)

def extract_title(soup):
    """Return the recipe title, or ``None`` when no title can be found.

    The ``og:title`` meta tag is preferred (only the part before the first
    ``|`` is kept); otherwise the text of the first ``<h1>``, then the first
    ``<h2>``, is used. Empty candidates are skipped.
    """
    meta = soup.find("meta", property="og:title")
    if meta and meta.get("content"):
        candidate = meta["content"].split("|")[0].strip()
        if candidate:
            return candidate

    for heading_name in ("h1", "h2"):
        heading = soup.find(heading_name)
        if not heading:
            continue
        candidate = heading.get_text(" ", strip=True)
        if candidate:
            return candidate

    return None

def extract_image(soup):
    """Return the URL of the recipe image, or ``None`` when none qualifies.

    The ``og:image`` meta content is used when it contains ``/uploads/`` and
    is neither a Facebook tracking URL nor the generic ``fb-sharing.jpg``.
    Otherwise the first ``<img src>`` (resolved against ``BASE``) whose host
    ends with ``sayidaty.net``, whose path contains ``/uploads/`` and which
    does not end with ``fb-sharing.jpg`` is returned.
    """
    rejected_markers = ("facebook.com/tr", "fb-sharing.jpg")

    meta = soup.find("meta", property="og:image")
    if meta and meta.get("content"):
        og_url = meta["content"].strip()
        is_rejected = any(marker in og_url for marker in rejected_markers)
        if not is_rejected and "/uploads/" in og_url:
            return og_url

    # Fall back to the first real upload image hosted on sayidaty.
    for img in soup.select("img[src]"):
        candidate = urljoin(BASE, img["src"])
        parts = urlparse(candidate)
        if not parts.netloc.endswith("sayidaty.net"):
            continue
        if "/uploads/" not in parts.path:
            continue
        if candidate.endswith("fb-sharing.jpg"):
            continue
        return candidate

    return None

In [8]:
# =====================================
# Cell 7: Robust extract_recipe
# =====================================
def _nonempty_texts(elements):
    """Return the space-joined, stripped text of each element, skipping empties."""
    texts = (el.get_text(" ", strip=True) for el in elements)
    return [tx for tx in texts if tx]


def _collect_raw_ingredients(soup):
    """Return raw ingredient lines from the first strategy that yields any."""
    # 1) schema.org markup (if present)
    raw = _nonempty_texts(soup.select('[itemprop="recipeIngredient"]'))
    if raw:
        return raw

    # 2) first list following a heading that contains "المقادير" (ingredients)
    header = soup.find(["h2", "h3", "h4"], string=re.compile(r"المقادير"))
    if header:
        lst = header.find_next(["ul", "ol"])
        if lst:
            raw = _nonempty_texts(lst.find_all("li"))
            if raw:
                return raw

    # 3) common ingredient container classes (fallback).
    # Stop at the first pattern that yields items: "ing" matches ANY class
    # containing those letters, so continuing after a hit could append <li>
    # text from unrelated containers to the real ingredients.
    for cls in ("ingredients", "ing", "recipe-ingredients"):
        box = soup.find(class_=re.compile(cls))
        if box:
            raw = _nonempty_texts(box.find_all("li"))
            if raw:
                return raw

    return []


def extract_recipe(url):
    """Download one recipe page and return its extracted fields.

    Returns a dict with the keys ``title``, ``ingredients`` (cleaned tokens
    from ``clean_ingredients_list``), ``image``, ``url`` and ``scraped_at``
    (UTC, formatted ``YYYY-MM-DDTHH:MM:SSZ``).

    Ingredients are taken strictly from the ingredients section, trying in
    order: schema.org ``recipeIngredient`` items, the list after an
    ingredients heading, then ingredient-like container classes.

    Never raises: on any failure the error is logged, the traceback is
    printed, and ``None`` is returned.
    """
    try:
        soup = get_soup(url)

        title = extract_title(soup)
        image = extract_image(soup)
        ingredients = clean_ingredients_list(_collect_raw_ingredients(soup))

        # scrape time (UTC ISO)
        scraped_at = datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        return {
            "title": title,
            "ingredients": ingredients,     # just tokens!
            "image": image,
            "url": url,
            "scraped_at": scraped_at,
        }

    except Exception:
        log(f"extract_recipe failed for: {url}", "ERROR")
        traceback.print_exc()
        return None

In [9]:
# ===========================
# Cell 8: Saudi pass filter
# ===========================
# Since we start from the "Saudi cuisine" listing page, every link is passed through.
# These hints are kept as a fallback in case the filter is later used outside that context.
# NOTE(review): is_saudi_recipe below does not consult this set.
SAUDI_HINTS = {
    "المطبخ السعودي","طبخات سعودية","وصفات سعودية",
    "كبسة","سليق","ثريد","مقلقل","مندي","قرصان","مرقوق","جريش","كليجة","لقيمات",
    "بخاري","مظبي","المفطح","الحنيذ","الحنيني","العريكة","السقدانة","الحيسة","الرز الحساوي"
}

def is_saudi_recipe(data):
    """Return True for any truthy ``data`` and False otherwise."""
    # Everything is let through because the start listing is already Saudi-only.
    return True if data else False

In [13]:
# ===========================
# Cell 9: Crawl loop
# ===========================
def crawl_all():
    """Collect recipe URLs from the listing and extract every recipe.

    Returns the list of recipe dicts that were extracted successfully and
    passed ``is_saudi_recipe``. If the pagination stage fails, the error is
    logged and an empty list is returned. A failure on a single URL is logged
    and does not stop the crawl; the loop sleeps
    ``SLEEP_BETWEEN_REQUESTS`` after every URL.
    """
    try:
        urls = paginate_listing(START_LIST, max_pages=MAX_PAGES)
        log(f"Total URLs gathered: {len(urls)}")
    except Exception:
        log("Failed during pagination stage", "ERROR")
        traceback.print_exc()
        return []

    recipes = []
    total = len(urls)
    for position, recipe_url in enumerate(urls, start=1):
        log(f"[{position}/{total}] Extract -> {recipe_url}")
        try:
            record = extract_recipe(recipe_url)
            if not (record and is_saudi_recipe(record)):
                log("Skipped (not Saudi or extraction returned None)", "INFO")
            else:
                recipes.append(record)
        except Exception:
            log("Unexpected error in crawl loop for this URL", "ERROR")
            traceback.print_exc()
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    log(f"Crawl finished. Valid recipes: {len(recipes)}")
    return recipes

In [14]:
# =========================
# Cell 10: Exporters (CSV/JSONL)
# =========================
def export_jsonl(rows, path=OUT_JSONL):
    """Write ``rows`` to ``path`` as JSON Lines (one UTF-8 JSON object per line).

    Non-ASCII characters are written as-is (``ensure_ascii=False``). Best
    effort: any failure is logged with its traceback and not re-raised.
    """
    try:
        with open(path, "w", encoding="utf-8") as fh:
            for record in rows:
                line = json.dumps(record, ensure_ascii=False)
                fh.write(line + "\n")
        log(f"Saved JSONL -> {path}")
    except Exception:
        log("Failed to write JSONL", "ERROR")
        traceback.print_exc()

def export_csv(rows, path=OUT_CSV):
    """Write ``rows`` to ``path`` as a UTF-8 CSV file with a header row.

    Columns: title, ingredients, image, url, scraped_at. The ingredient list
    of each row is flattened into a single cell joined with ``" | "``. Best
    effort: any failure is logged with its traceback and not re-raised.
    """
    header = ["title", "ingredients", "image", "url", "scraped_at"]

    def _as_csv_row(record):
        # Flatten one recipe dict into the column order declared in `header`.
        joined = " | ".join(record.get("ingredients", []))
        return [
            record.get("title"),
            joined,
            record.get("image"),
            record.get("url"),
            record.get("scraped_at"),
        ]

    try:
        with open(path, "w", newline="", encoding="utf-8") as fh:
            writer = csv.writer(fh)
            writer.writerow(header)
            for record in rows:
                writer.writerow(_as_csv_row(record))
        log(f"Saved CSV -> {path}")
    except Exception:
        log("Failed to write CSV", "ERROR")
        traceback.print_exc()

In [15]:
# ===========================
# Cell 11: Main
# ===========================
def main():
    """Run the whole pipeline: crawl, then export to JSONL and CSV.

    A ``KeyboardInterrupt`` is logged and swallowed; any other exception is
    logged with its traceback. Nothing is returned.
    """
    log("=== START SCRAPE ===")
    try:
        recipes = crawl_all()
        export_jsonl(recipes, OUT_JSONL)
        export_csv(recipes, OUT_CSV)
        recipe_count = len(recipes)
        log(f"=== DONE; Recipes: {recipe_count} ===")
    except KeyboardInterrupt:
        log("Interrupted by user", "ERROR")
    except Exception:
        log("Fatal error in main()", "ERROR")
        traceback.print_exc()

# Entry point: run the full scrape when this code is executed directly.
if __name__ == "__main__":
    main()

[2025-09-19 21:07:40][INFO] === START SCRAPE ===
[2025-09-19 21:07:40][INFO] Listing page 1 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419
[2025-09-19 21:07:40][INFO] GET https://kitchen.sayidaty.net/recipes/index/cuisine/2419
[2025-09-19 21:07:43][INFO] Found 19 recipe links in listing page
[2025-09-19 21:07:44][INFO] Listing page 2 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=2
[2025-09-19 21:07:44][INFO] GET https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=2
[2025-09-19 21:07:47][INFO] Found 20 recipe links in listing page
[2025-09-19 21:07:48][INFO] Listing page 3 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=3
[2025-09-19 21:07:48][INFO] GET https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=3
[2025-09-19 21:07:51][INFO] Found 20 recipe links in listing page
[2025-09-19 21:07:52][INFO] Listing page 4 -> https://kitchen.sayidaty.net/recipes/index/cuisine/2419/?page=4
[2025-09-19 21:07:52][INFO] GET https://k

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 1430, in getresponse
    response.begin()
  File "/usr/lib/python3.12/http/client.py", line 331, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/http/client.py", line 292, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/ssl.py", line

[2025-09-19 21:21:51][INFO] [202/209] Extract -> https://kitchen.sayidaty.net/node/26519/قهوة-اللوز-الخفيفة/مشروبات-وعصائر/وصفات
[2025-09-19 21:21:51][INFO] GET https://kitchen.sayidaty.net/node/26519/قهوة-اللوز-الخفيفة/مشروبات-وعصائر/وصفات
[2025-09-19 21:21:55][INFO] [203/209] Extract -> https://kitchen.sayidaty.net/node/26605/كليجا-بالهيل-والتمر/حلويات/وصفات
[2025-09-19 21:21:55][INFO] GET https://kitchen.sayidaty.net/node/26605/كليجا-بالهيل-والتمر/حلويات/وصفات
[2025-09-19 21:21:59][INFO] [204/209] Extract -> https://kitchen.sayidaty.net/node/26729/مظبي-لحم/وصفات-طبخ/وصفات
[2025-09-19 21:21:59][INFO] GET https://kitchen.sayidaty.net/node/26729/مظبي-لحم/وصفات-طبخ/وصفات
[2025-09-19 21:22:03][INFO] [205/209] Extract -> https://kitchen.sayidaty.net/node/26946/البف-المديني-طبق-سعودي-متوارث-شهي/معجنات/وصفات
[2025-09-19 21:22:03][INFO] GET https://kitchen.sayidaty.net/node/26946/البف-المديني-طبق-سعودي-متوارث-شهي/معجنات/وصفات
[2025-09-19 21:22:07][INFO] [206/209] Extract -> https://kitchen.s