<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/Shawayahouse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip -q install playwright==1.46.0 requests nest_asyncio pillow
!playwright install chromium

import os, re, csv, json, time, unicodedata, asyncio, requests
from datetime import datetime
from urllib.parse import urljoin, urlparse
import nest_asyncio
nest_asyncio.apply()

from urllib import robotparser
from playwright.async_api import async_playwright
from PIL import Image
from io import BytesIO

# ---------- Config ----------
# Origin of the restaurant's ordering site; relative links are resolved against it.
BASE_URL = "https://shawayahouse.my.taker.io"
# Arabic-language landing page of the menu.
MENU_ROOT = BASE_URL + "/menu?language=ar"
# Desktop Chrome user-agent string, shared by the browser context and the image downloads.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
# Directory that receives the downloaded images; created up front.
os.makedirs("images", exist_ok=True)
# ----------------------------

# ---------- Utils ----------
# Arabic tanween/tashkeel marks (U+064B-U+0652), superscript alef (U+0670) and tatweel (U+0640).
AR_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")


def ar_clean(text):
    """Normalise Arabic (or mixed-script) text so it can be pattern-matched.

    The text is NFKC-normalised, the marks covered by ``AR_DIACRITICS`` are
    removed, every character that is neither a word character, whitespace,
    nor inside the U+0600-U+06FF block becomes a space, and finally runs of
    whitespace are collapsed and the ends trimmed.

    Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    normalized = unicodedata.normalize("NFKC", text)
    # Drop vowel marks and the elongation character.
    unmarked = AR_DIACRITICS.sub("", normalized)
    # Replace stray symbols/punctuation with spaces.
    spaced = re.sub(r"[^\w\s\u0600-\u06FF]", " ", unmarked)
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.strip()

def slugify(text, maxlen=80):
    """Turn *text* into a string usable as a file-name stem.

    Combining marks are dropped after NFKD decomposition; only ASCII
    letters/digits, characters in U+0600-U+06FF, space, ``_``, ``.`` and
    ``-`` are kept; whitespace runs become ``_``; the result is cut to
    *maxlen* characters.  ``"image"`` is returned when the input is falsy or
    nothing survives the filtering.
    """
    if not text:
        return "image"
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    kept = re.sub(r"[^0-9A-Za-z\u0600-\u06FF _.-]+", "", "".join(base_chars))
    slug = re.sub(r"\s+", "_", kept.strip())[:maxlen]
    return slug if slug else "image"

def robots_allows(url, user_agent="*"):
    """Return whether the site's robots.txt lets *user_agent* fetch *url*.

    The rules are read from ``/robots.txt`` under ``BASE_URL``.  The check is
    best-effort: if the file cannot be fetched or parsed, a warning is
    printed and ``True`` is returned.
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(BASE_URL, "/robots.txt"))
    try:
        rp.read()
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / SystemExit
        # still interrupt the run, and make the fallback visible.
        print("[warn] robots.txt could not be read; assuming allowed:", exc)
        return True
    return rp.can_fetch(user_agent, url)

def ensure_abs(url, base=None):
    """Return *url* as an absolute URL.

    Args:
        url: An absolute, protocol-relative (``//host/path``) or relative URL.
        base: URL to resolve against; defaults to ``BASE_URL``.

    Returns:
        *url* unchanged when it already has both a scheme and a host,
        otherwise the result of joining it onto *base*.
    """
    if base is None:
        base = BASE_URL
    parts = urlparse(url)
    # A protocol-relative URL has a netloc but no scheme; it still has to be
    # joined so that it inherits the base scheme.
    if parts.scheme and parts.netloc:
        return url
    return urljoin(base, url)

def download_image(img_url, name_hint, referer):
    """Download one image into the ``images`` directory and return its path.

    Args:
        img_url: Absolute or site-relative image URL.
        name_hint: Text that is slugified to build the file name.
        referer: Value sent in the ``Referer`` request header.

    Returns:
        Path of the written file (``images/<slug>.<ext>``), or ``""`` when
        *img_url* is empty or is not an http(s) URL.  An existing file with
        the same name is overwritten.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise on network
        failures or error status codes.
    """
    if not img_url:
        return ""
    img_url = ensure_abs(img_url)
    # Inline placeholders (data:) and other non-HTTP schemes cannot be fetched.
    if urlparse(img_url).scheme not in ("http", "https"):
        return ""
    headers = {"User-Agent": UA, "Referer": referer}
    # The whole body is needed for format sniffing, so streaming buys nothing;
    # the context manager closes the response even if raise_for_status() raises.
    with requests.get(img_url, headers=headers, timeout=60) as r:
        r.raise_for_status()
        data = r.content
    # Pick the extension from the actual content, falling back to jpg.
    ext = "jpg"
    try:
        with Image.open(BytesIO(data)) as im:
            detected = (im.format or "").lower()
    except Exception:
        detected = ""
    if detected in ("png", "webp"):
        ext = detected
    os.makedirs("images", exist_ok=True)
    out_path = os.path.join("images", f"{slugify(name_hint)}.{ext}")
    with open(out_path, "wb") as f:
        f.write(data)
    return out_path

# ---------- Classification (updated: Arabic text cleaning + uses the section name and description) ----------
# Each entry is (regex, label). classify_food() searches every pattern
# (case-insensitively) in the ar_clean()-ed "name + description" text and adds
# the label of each pattern that matches, so one dish can receive several labels.
# NOTE(review): ar_clean() strips Arabic diacritics before matching, so an
# alternative written with vowel marks presumably can never match - confirm.
CLASS_MAP = [
    (r"(رز|ارز|كبسه|كبسة|برياني|مندي|mandi|kabsa|rice|biryani|جريش|قرصان|هريس)", "Rice"),
    (r"(سلطه|سلطة|تبوله|تبولة|فتوش|salad|tabbouleh|fattoush|كولسلو)", "Salad"),
    (r"(دجاج|مسحب|شوايه|شواية|chicken|tawook|shawaya|broast)", "Chicken"),
    (r"(لحم|بقري|كباب|برجر|burger|beef)", "Meat"),
    (r"(غنم|ضأن|lamb|mutton|حاشي|camel)", "Lamb"),
    (r"(سمك|هامور|fish|روبيان|shrimp|جمبري)", "Fish"),
    (r"(خبز|صامولي|عربي|pita|tamees|tortilla|saj|bread)", "Bread"),
    (r"(بطاطس|بطاطا|فرايز|fries)", "Fries"),
    (r"(صلصه|صلصة|صوص|ثوم|طحينه|طحينة|مايونيز|كاتشب|sauce|garlic|tahini|ketchup)", "Sauce"),
    # Desserts
    (r"(حلى|حلويات|كنافه|كنافة|لقيمات|بسبوسه|بسبوسة|معمول|dessert|kunafa|basbousa|maamoul|ام_علي|أم_علي|مهلبيه|مهلبية)", "Dessert"),
    (r"(مكرونه|مكرونة|باستا|pasta)", "Pasta"),
    (r"(شاورما|shawarma|راب|wrap|سندوتش|سندويتش|ساندوتش|ساندويتش)", "Sandwich"),
    # Additions: sides / vegetable dishes
    (r"(بشاميل|ملوخيه|ملوخية|خضار مشكل|خضار_مشكل|مصقعه|مصقعة)", "Sides"),
    # Drinks
    (r"(مويه|موية|ماء|مياه|معدنيه|معدنية|مياه معدنية|مياه_معدنيه|لبن|لَبَن|laban|milk|مشلبي|مشلبيه|مشروبات غازيه|مشروبات غازية|بيبسي|pepsi|كوكاكولا|coca[- ]?cola|سفن اب|7up|sprite|سبرايت|fanta|فانتا|cola|coke)", "Drinks"),

]

# (regex, label) pairs matched against the cleaned section name by
# classify_food(): if the section name contains a telltale word, its label is
# added to help the classification.
SECTION_HINTS = [
    (r"(السلطات|سلطات|سلطه|سلطة)", "Salad"),
    (r"(الحلى|حلويات)", "Dessert"),
    (r"(السندوتشات|السندويتشات|الشاورما|ساندويتش|رابس?)", "Sandwich"),
    (r"(الاطباق الرئيسية|الرئيسية|الرز|الارز|المنسف|الكبسه)", "Rice"),
    (r"(المشويات|الشوي|grill|مشوي)", "Meat"),
    (r"(البطاطس|الفرايز)", "Fries"),
    (r"(الصوص|الصلصات)", "Sauce"),
]

def classify_food(name="", desc="", section_name=""):
    """Assign category labels to a dish from its name, description and section.

    Args:
        name: Dish name.
        desc: Dish description (may be empty).
        section_name: Name of the menu section the dish was found in.

    Returns:
        The matching labels joined with ``" - "`` in alphabetical order, or
        ``"Unclassified"`` when no pattern matches.
    """
    text = ar_clean(f"{name} {desc}")
    sec = ar_clean(section_name)
    labels = set()
    # Hints from the section name first...
    labels.update(
        lab for pat, lab in SECTION_HINTS
        if re.search(pat, sec, flags=re.IGNORECASE)
    )
    # ...then from the name + description.
    labels.update(
        lab for pat, lab in CLASS_MAP
        if re.search(pat, text, flags=re.IGNORECASE)
    )
    if not labels:
        return "Unclassified"
    # Sort: iteration order of a set of strings is not stable between
    # interpreter runs, which made multi-label strings irreproducible.
    return " - ".join(sorted(labels))

# ---------- Scraping ----------
async def scroll_to_bottom(page, max_rounds=20, pause=800):
    """Scroll *page* to the bottom repeatedly so more content can load.

    Each round scrolls to ``document.body.scrollHeight``, waits *pause*
    milliseconds and re-reads the height.  Scrolling stops as soon as the
    height equals the one seen in the previous round, or after *max_rounds*
    rounds.
    """
    previous_height = 0
    for _round in range(max_rounds):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(pause)
        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            # Nothing new was appended since the last round.
            return
        previous_height = current_height

async def get_all_category_links(context):
    """Collect the category page URLs linked from the menu root.

    Opens ``MENU_ROOT`` in a new page of *context*, scrolls it so all content
    can load, and gathers every anchor whose href contains
    ``/menu/category/``.  Any query string is replaced with ``?language=ar``
    and the URL is made absolute.

    Returns:
        The unique URLs, sorted as strings.
    """
    page = await context.new_page()
    try:
        await page.goto(MENU_ROOT, wait_until="domcontentloaded", timeout=60000)
        await scroll_to_bottom(page, max_rounds=10)
        anchors = await page.query_selector_all('a[href*="/menu/category/"]')
        links = set()
        for a in anchors:
            href = await a.get_attribute("href")
            if href and "/menu/category/" in href:
                links.add(ensure_abs(href.split("?")[0] + "?language=ar"))
    finally:
        # Close the page even when navigation or scraping fails.
        await page.close()
    return sorted(links)

# Attributes probed, in order, for an image URL: plain src first, then the
# lazy-loading variants.
_IMG_URL_ATTRS = ("src", "data-src", "data-original", "data-lazy", "data-srcset", "srcset")


async def _first_image_url(img):
    """Return the first usable URL found on the *img* element, or ``""``."""
    for attr in _IMG_URL_ATTRS:
        value = await img.get_attribute(attr)
        if not value or not value.strip():
            continue
        # srcset-style values hold several candidates; keep the first URL.
        candidate = value.split()[0].strip()
        if attr.endswith("srcset"):
            candidate = candidate.rstrip(",")
        # An inline data: URI in src is a placeholder, not a downloadable
        # image; keep looking at the lazy-load attributes instead.
        if not candidate or candidate.startswith("data:"):
            continue
        return candidate
    return ""


async def scrape_category(context, category_url):
    """Scrape one category page and return its dishes.

    Product anchors are located first; for each one the name comes from the
    ``title`` attribute or the anchor text, and the image from an ``<img>``
    inside the anchor or inside its closest article/div/li/section ancestor.
    If that yields nothing, every ``<img>`` with a non-empty ``alt`` is used
    instead.

    Args:
        context: Browser context used to open the page.
        category_url: URL of the category page.

    Returns:
        A list of dicts with the keys ``name``, ``img_url``, ``section`` and
        ``classification``.
    """
    rows = []
    page = await context.new_page()
    try:
        await page.goto(category_url, wait_until="domcontentloaded", timeout=60000)
        await scroll_to_bottom(page)
        # The page title stands in for the section name (helps classification).
        section_name = (await page.title()) or ""
        # Product cards.
        cards = await page.query_selector_all('a[href*="/menu/product"]')
        # If none were found, try broader selectors.
        if not cards:
            cards = await page.query_selector_all("a.menu-product, a.card, a[href*='product']")

        # From the product links.
        for a in cards:
            name = (await a.get_attribute("title")) or (await a.inner_text() or "")
            name = ar_clean(name)
            if not name:
                continue
            img = await a.query_selector("img")
            if not img:
                # Try the nearest enclosing card element.
                handle = await a.evaluate_handle(
                    "el => el.closest('article,div,li,section') && el.closest('article,div,li,section').querySelector('img')"
                )
                # as_element() is None when the expression did not yield a DOM node.
                img = handle.as_element()
            img_url = ""
            if img:
                try:
                    img_url = await _first_image_url(img)
                except Exception as exc:
                    # Best-effort: a card whose image cannot be read is skipped.
                    print("[warn] image lookup failed:", name, exc)

            if img_url:
                rows.append({
                    "name": name,
                    "img_url": img_url,
                    "section": section_name,
                    "classification": classify_food(name=name, section_name=section_name)
                })

        # Fallback: any image whose alt text carries a dish name.
        if not rows:
            for img in await page.query_selector_all("img"):
                alt = ar_clean((await img.get_attribute("alt") or ""))
                if not alt:
                    continue
                src = await _first_image_url(img)
                if src:
                    rows.append({
                        "name": alt,
                        "img_url": src,
                        "section": section_name,
                        "classification": classify_food(name=alt, section_name=section_name)
                    })
    finally:
        # Close the page even when scraping fails part-way.
        await page.close()
    return rows

async def run_all():
    """Scrape every menu category and return the combined rows.

    Returns ``[]`` without launching a browser when robots.txt disallows
    ``MENU_ROOT``.  A category that fails is reported with a warning and
    skipped; the remaining categories are still scraped.
    """
    # robots.txt check
    if not robots_allows(MENU_ROOT, user_agent="*"):
        print("[robots] Not allowed. Stop.")
        return []

    all_rows = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=["--no-sandbox"])
        try:
            context = await browser.new_context(locale="ar-SA", user_agent=UA)
            try:
                # 1) Collect all category links.
                cats = await get_all_category_links(context)
                print(f"[info] Categories found: {len(cats)}")
                for c in cats:
                    print(" -", c)

                # 2) Scrape each category.
                for c in cats:
                    try:
                        rows = await scrape_category(context, c)
                        print(f"[info] {c} -> {len(rows)} items")
                        all_rows.extend(rows)
                    except Exception as e:
                        print("[warn] category failed:", c, e)
            finally:
                await context.close()
        finally:
            # Runs even when category discovery raises, so the browser is
            # always shut down.
            await browser.close()
    return all_rows

# ---------- Run ----------
rows = asyncio.get_event_loop().run_until_complete(run_all())
print("Total parsed rows:", len(rows))
print(rows[:5])

# ---------- Save (download images + CSV/JSON) ----------
final = []
scrape_date = datetime.now().strftime("%Y-%m-%d")
for r in rows:
    try:
        img_path = download_image(r["img_url"], r["name"] or "item", MENU_ROOT)
    except (requests.RequestException, OSError) as e:
        # One broken image must not abort the run before anything is saved;
        # the dish is kept with an empty image path.
        print("[warn] image download failed:", r["img_url"], e)
        img_path = ""
    final.append({
        "name": r["name"],
        "image_file": img_path,
        "classification": r["classification"],
        "scrape_date": scrape_date
    })

with open("dishes.csv","w",newline="",encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["name","image_file","classification","scrape_date"])
    w.writeheader()
    w.writerows(final)

with open("dishes.json","w",encoding="utf-8") as f:
    json.dump(final, f, ensure_ascii=False, indent=2)

print(f"Saved {len(final)} items → dishes.csv & dishes.json")

[info] Categories found: 10
 - https://shawayahouse.my.taker.io/menu/category/10?language=ar
 - https://shawayahouse.my.taker.io/menu/category/11?language=ar
 - https://shawayahouse.my.taker.io/menu/category/12?language=ar
 - https://shawayahouse.my.taker.io/menu/category/18?language=ar
 - https://shawayahouse.my.taker.io/menu/category/20?language=ar
 - https://shawayahouse.my.taker.io/menu/category/25?language=ar
 - https://shawayahouse.my.taker.io/menu/category/6?language=ar
 - https://shawayahouse.my.taker.io/menu/category/7?language=ar
 - https://shawayahouse.my.taker.io/menu/category/8?language=ar
 - https://shawayahouse.my.taker.io/menu/category/9?language=ar
[info] https://shawayahouse.my.taker.io/menu/category/10?language=ar -> 5 items
[info] https://shawayahouse.my.taker.io/menu/category/11?language=ar -> 3 items
[info] https://shawayahouse.my.taker.io/menu/category/12?language=ar -> 7 items
[info] https://shawayahouse.my.taker.io/menu/category/18?language=ar -> 1 items
[info]

In [9]:
# Compress the images folder into a ZIP file
import shutil, os, zipfile
from google.colab import files

IMAGES_DIR = "images"  # change this if you changed the images path
ZIP_NAME = "images_backup.zip"

# Build the ZIP from the images folder. make_archive() appends ".zip" itself,
# so the base name is ZIP_NAME without its extension; this keeps the created
# file and the downloaded file in sync when ZIP_NAME is changed.
shutil.make_archive(os.path.splitext(ZIP_NAME)[0], "zip", IMAGES_DIR)

# Download the ZIP file plus the accompanying data files
files.download(ZIP_NAME)            # downloads the zipped images folder
if os.path.exists("dishes.csv"): files.download("dishes.csv")
if os.path.exists("dishes.json"): files.download("dishes.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>