<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/ALsaudi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# === Cell 1: Install & Base Setup ===
!pip -q install playwright==1.46.0 requests nest_asyncio pillow
!playwright install chromium

import os, re, csv, json, unicodedata, asyncio
from datetime import datetime
from urllib.parse import urljoin, urlparse

import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright

# ------------------ CONFIG ------------------
# Site root; category URLs are built from it and relative image URLs are
# resolved against it.
BASE_URL = "https://www.alsaudi.sa"
# Value sent as the `language` query parameter of every category URL.
LANG = "ar"
# Menu category ids to scrape; each one becomes a /menu/category/<id> page.
CATEGORY_IDS = [3, 9, 8, 2, 19, 6, 5]  # your sections
# User-agent string handed to the Playwright browser context.
UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

# Downloaded dish images are written into this directory.
os.makedirs("images", exist_ok=True)

def build_category_url(cat_id: int) -> str:
    """Return the menu-listing URL for one category id.

    The URL is rooted at the module-level ``BASE_URL`` and carries the
    module-level ``LANG`` as its ``language`` query parameter.
    """
    category_url = f"{BASE_URL}/menu/category/{cat_id}?language={LANG}"
    return category_url

def slugify(text, maxlen=90):
    """Turn *text* into a file-name-safe slug.

    Steps: NFKD-decompose, drop combining marks, delete every character that
    is not an ASCII letter/digit, a code point in U+0600..U+06FF, a space,
    ``_``, ``.`` or ``-``, then collapse whitespace runs to ``_`` and cut the
    result to *maxlen* characters.

    Returns the literal ``"image"`` when *text* is falsy or nothing survives
    the cleaning.
    """
    if not text:
        return "image"

    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    cleaned = re.sub(r"[^0-9A-Za-z\u0600-\u06FF _.-]+", "", "".join(base_chars))
    slug = re.sub(r"\s+", "_", cleaned.strip())[:maxlen]
    return slug if slug else "image"


In [17]:
# === Cell 2 (REPLACE): Saudi-focused classifier with Arabic-safe overrides ===
import re, unicodedata

def _norm_ar(s: str) -> str:
    if not s: return ""
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    return (s.replace("أ","ا").replace("إ","ا").replace("آ","ا")
             .replace("ى","ي").replace("ئ","ي").replace("ؤ","و")
             .replace("ة","ه"))

# Section fallback if nothing matches.
# Keys are menu category ids (the same ids listed in CATEGORY_IDS); the value
# is the label classify_food() uses for an item of that category when none of
# its keyword patterns matched the item text.
CATEGORY_FALLBACK = {
    3: "Appetizer/Side",
    9: "Rice",
    8: "Dessert",
    2: "Rice - Chicken",
    19: "Rice - Chicken",
    6: "Rice",
    5: "Meat",
}

# Core triggers (compact, Saudi-focused).
# Each constant is a regex alternation of keywords for one food group;
# classify_food() applies them with re.search, so a keyword matches anywhere
# in the text, including inside a longer word.
#
# NOTE(review): classify_food() searches text that _norm_ar() has already
# normalised (teh marbuta -> heh, alef maqsura -> yeh), so alternatives spelled
# with those letters (e.g. the first DESSERT alternative) cannot match there.
# They are kept unchanged so the patterns still work on raw text.
RICE     = r"(رز|ارز|كبسه|برياني|بخاري|مندي|مظبي|مدفون|مضغوط|قوزي|سليق|منسف|مقلوبه|جريش|قرصان|مطازيز|مرقوق|هريس|كابلي|بشاور)"
CHICKEN  = r"(دجاج|شوايه|شواية|مسحب|تكا)"
MEAT     = r"(لحم|غنم|خروف|بقر|تيس|حاشي|جمل|كباب|كفته|كبد|مقلقل)"
SEAFOOD  = r"(سمك|روبيان|جمبري|هامور)"
BREAD    = r"(خبز|تميس)"
SALAD    = r"(سلطه|سلطة|حمص|متبل|بابا ?غنوج|تبوله|تبولة|فتوش)"
# The stray Latin "h" that was in this pattern has been removed; the bare stem
# is kept as the last alternative so the same texts still match under search.
SOUP     = r"(شوربه|شوربة|حساء|مرق|ملوخيه|ملوخية|ملوخي|باميه|بامية|عدس)"
DESSERT  = r"(حلى|حلويات|لقيمات|كنافه|كنافة|ام علي|معصوب|عريكه|رز ?بحليب|مهلبيه|مهلبية|كريم ?كراميل|جيلي|بقلاوه|بقلاوة)"
DRINKS   = r"(عصير|مشروب|مشروبات|شاي|قهوه|قهوة|لبن|حليب)"
VEG      = r"(خضار( ?مشكل)?|خيار|طماطم|باذنجان|كوسا)"
CHEESE   = r"(جبن|جبنه|جبنة|قشطه|قشطة|لبنه|لبنة)"

# Portion/size words that classify_food() blanks out with re.sub before
# matching. Regex alternation takes the first alternative that matches, so the
# longer word must come before its own prefix; otherwise only the prefix is
# removed and the trailing letter is left behind in the text.
STOPWORDS = r"(نفر|فرد|حبه|حبة|نصف|نص|كامل[هة]?|ربع|صغير|كبير|ساده|سادة|وجبه|وجبة|علبه|علبة)"

def classify_food(name, extra="", category_id=None):
    """Label a menu item with one or more food groups joined by " - ".

    *name* and *extra* are concatenated, normalised with ``_norm_ar``, cleared
    of punctuation and of ``STOPWORDS`` matches, whitespace-collapsed and
    lower-cased. The cleaned text is then checked in this order:

    1. fixed phrases that map straight to a single label;
    2. sambosa items, labelled "Starter" plus any cheese/meat/vegetable filling;
    3. every ingredient-group pattern, collecting all labels that match;
    4. ``CATEGORY_FALLBACK[category_id]`` when nothing matched.

    Returns "Unclassified" when none of the steps produced a label.
    """
    t = _norm_ar(f"{name or ''} {extra or ''}")
    t = re.sub(r"[ـ،؛!؟:()\[\]{}«»\"'\/\\\-\.]", " ", t)
    t = re.sub(STOPWORDS, " ", t, flags=re.IGNORECASE)
    t = re.sub(r"\s+", " ", t).strip().lower()

    # ---- Arabic-safe hard overrides (plain substring checks, no \b) ----
    hard_overrides = (
        ("خضار مشكل", "Vegetables"),
        ("باميه", "Soup/Stew"),
        ("ملوخيه", "Soup/Stew"),
        ("كريم كراميل", "Dessert"),
        ("جيلي", "Dessert"),
        ("مهلبيه", "Dessert"),
    )
    for phrase, label in hard_overrides:
        if phrase in t:
            return label

    # Any sambosa is a starter; the filling decides the extra labels.
    if "سمبوس" in t or "سنبوس" in t:
        if "جبن" in t:
            return "Starter - Cheese"
        fillings = ((CHEESE, "Cheese"), (MEAT, "Meat"), (VEG, "Vegetables"))
        found = ["Starter"]
        found.extend(label for pattern, label in fillings if re.search(pattern, t))
        return " - ".join(dict.fromkeys(found))

    # Main ingredient groups, in the order their labels appear in the output.
    groups = (
        (RICE, "Rice"),
        (CHICKEN, "Chicken"),
        (MEAT, "Meat"),
        (SEAFOOD, "Fish/Seafood"),
        (BREAD, "Bread"),
        (SALAD, "Appetizer/Side"),
        (SOUP, "Soup/Stew"),
        (DESSERT, "Dessert"),
        (DRINKS, "Drinks"),
        (VEG, "Vegetables"),
        (CHEESE, "Cheese"),
    )
    found = [label for pattern, label in groups if re.search(pattern, t)]

    # Section fallback if still nothing
    if not found and category_id in CATEGORY_FALLBACK:
        found.append(CATEGORY_FALLBACK[category_id])

    if not found:
        return "Unclassified"
    return " - ".join(dict.fromkeys(found))


In [18]:
# === Cell 3 (REPLACE): Playwright scraping helpers with better name fallback ===
async def download_image_with_context(context, img_url, dish_name):
    """Download one image through the Playwright context and save it in images/.

    Best-effort: every failure is reported with ``print`` and turned into an
    empty return value, so one bad image never aborts the scrape.

    Args:
        context: object whose ``request.get(url)`` performs the HTTP fetch
            (the browser context created in ``scrape_many_categories``; the
            original note says this keeps the context's headers).
        img_url: absolute URL, or a path resolved against ``BASE_URL``.
            A falsy value means there is nothing to download.
        dish_name: slugified to form the file stem; ``"item"`` if falsy.

    Returns:
        Path of the written file (``images/<slug><ext>``), or ``""`` when
        there was no URL, the response was not OK, or an error occurred.
    """
    if not img_url:
        return ""
    url = img_url if img_url.startswith("http") else urljoin(BASE_URL, img_url)
    try:
        resp = await context.request.get(url)
        if not resp.ok:
            print(f"Image download failed (HTTP {resp.status}): {url}")
            return ""
        data = await resp.body()

        # Keep the extension from the URL path when it is a known image type.
        ext = os.path.splitext(urlparse(url).path)[1].lower() or ".jpg"
        if ext not in [".jpg", ".jpeg", ".png", ".webp"]:
            ext = ".jpg"

        # Do not rely on another cell having created the target directory.
        os.makedirs("images", exist_ok=True)
        fname = f"images/{slugify(dish_name or 'item')}{ext}"

        # avoid overwriting duplicate names: append _1, _2, ... until free
        base, e = os.path.splitext(fname)
        i = 1
        while os.path.exists(fname):
            fname = f"{base}_{i}{e}"
            i += 1

        with open(fname, "wb") as f:
            f.write(data)
        return fname
    except Exception as exc:
        # Deliberately broad: network, Playwright and filesystem errors are all
        # non-fatal here, but they are reported instead of silently dropped.
        print(f"Image download failed for {url}: {exc}")
        return ""

async def _card_name(card, img):
    """Return the dish name of a card: link title, else link text, else image alt."""
    name = ""
    link = await card.query_selector("a.menu-product.card")
    if link:
        name = (await link.get_attribute("title")) or ""
        if not name:
            try:
                name = (await link.inner_text()) or ""
            except Exception:
                name = ""
        name = name.strip()

    # Fallback: image alt
    if not name and img:
        alt_txt = await img.get_attribute("alt")
        if alt_txt:
            name = alt_txt.strip()
    return name


async def _card_image_url(img):
    """Return the first non-blank URL-bearing attribute of *img*, or ""."""
    if not img:
        return ""
    for attr in ["src", "data-src", "data-original", "data-lazy", "srcset"]:
        value = await img.get_attribute(attr)
        if value and value.strip():
            if attr == "srcset":
                # srcset is "url descriptor, url descriptor, ..."; take the first URL.
                return value.split(",")[0].strip().split(" ")[0]
            return value.strip()
    return ""


async def scrape_one_category(context, category_url: str, category_id: int):
    """Scrape a single category page.

    Opens *category_url* in a new page of *context*, scrolls until the page
    height stops growing (at most 20 rounds) so lazily loaded cards appear,
    then reads every ``div.menu-list article.card-wrapper`` card.

    Returns:
        A list of dicts with keys ``name``, ``img_url``, ``extra`` (the card's
        full inner text) and ``category_id``.

    Raises:
        Whatever Playwright raises for navigation or evaluation failures; the
        page is closed before the exception propagates.
    """
    rows = []
    page = await context.new_page()
    try:
        await page.goto(category_url, wait_until="domcontentloaded", timeout=60000)

        # Scroll to load lazy items
        last = 0
        for _ in range(20):
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(800)
            h = await page.evaluate("document.body.scrollHeight")
            if h == last:
                break
            last = h

        # Find cards
        cards = await page.query_selector_all("div.menu-list article.card-wrapper")
        print(f"[{category_url}] Cards found:", len(cards))

        for card in cards:
            # The first <img> serves both the alt-text fallback and the URL.
            img = await card.query_selector("img")
            name = await _card_name(card, img)
            img_url = await _card_image_url(img)

            # Keep full card text as extra context for classification
            try:
                extra = (await card.inner_text()) or ""
            except Exception:
                extra = ""

            rows.append({
                "name": name,
                "img_url": img_url,
                "extra": extra,
                "category_id": category_id
            })
    finally:
        # Close the page even when navigation or parsing raised, so a failed
        # category does not leave its page open for the rest of the run.
        await page.close()
    return rows

async def scrape_many_categories(category_ids):
    """Launch browser once, scrape all categories, classify + download.

    Each id in *category_ids* is scraped with ``scrape_one_category``; a
    category that raises is reported with ``print`` and skipped. Every scraped
    row is then classified with ``classify_food`` and its image fetched with
    ``download_image_with_context``.

    Returns:
        A list of dicts with keys ``name``, ``image_file``, ``classification``
        and ``scrape_date`` (``YYYY-MM-DD``, taken once for the whole run).
    """
    final = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=["--no-sandbox"])
        try:
            context = await browser.new_context(
                locale="ar-SA", user_agent=UA, viewport={"width":1366,"height":2200}
            )
            try:
                all_rows = []
                for cid in category_ids:
                    url = build_category_url(cid)
                    try:
                        rows = await scrape_one_category(context, url, cid)
                        all_rows.extend(rows)
                    except Exception as e:
                        print(f"Category {cid} failed: {e}")

                # Finalize rows (classify + download)
                scrape_date = datetime.now().strftime("%Y-%m-%d")
                for r in all_rows:
                    cls = classify_food(r["name"], r.get("extra",""), category_id=r["category_id"])
                    img_path = await download_image_with_context(context, r["img_url"], r["name"] or "item")
                    final.append({
                        "name": r["name"],
                        "image_file": img_path,
                        "classification": cls,
                        "scrape_date": scrape_date
                    })
            finally:
                # Runs on success and on error, so cleanup is never skipped.
                await context.close()
        finally:
            await browser.close()
    return final


In [19]:
# === Cell 4: Save outputs ===
import csv, json

OUT_JSON = "alsaudi_menu.json"
OUT_CSV  = "alsaudi_menu.csv"

final = await scrape_many_categories(CATEGORY_IDS)

print(f"Total items scraped: {len(final)}")
missing_img = sum(1 for r in final if not r.get("image_file"))
print(f"Images saved: {len(final)-missing_img} | Missing images: {missing_img}")

# JSON
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(final, f, ensure_ascii=False, indent=2)

# CSV
fieldnames = ["name", "image_file", "classification", "scrape_date"]
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for row in final:
        w.writerow({k: row.get(k, "") for k in fieldnames})

print(f"Saved → {OUT_JSON} & {OUT_CSV}. Images in ./images/")


[https://www.alsaudi.sa/menu/category/3?language=ar] Cards found: 9
[https://www.alsaudi.sa/menu/category/9?language=ar] Cards found: 12
[https://www.alsaudi.sa/menu/category/8?language=ar] Cards found: 7
[https://www.alsaudi.sa/menu/category/2?language=ar] Cards found: 17
[https://www.alsaudi.sa/menu/category/19?language=ar] Cards found: 7
[https://www.alsaudi.sa/menu/category/6?language=ar] Cards found: 10
[https://www.alsaudi.sa/menu/category/5?language=ar] Cards found: 5
Total items scraped: 67
Images saved: 67 | Missing images: 0
Saved → alsaudi_menu.json & alsaudi_menu.csv. Images in ./images/
