<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/%D8%A7%D9%88%D9%81%D9%83%D8%A7%D8%AA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Install Google Chrome + Python deps (Colab) ---
!wget -q -O /tmp/chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt -qy install /tmp/chrome.deb
!pip -q install selenium webdriver-manager pillow pandas beautifulsoup4 lxml


Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  libvulkan1 mesa-vulkan-drivers
The following NEW packages will be installed:
  google-chrome-stable libvulkan1 mesa-vulkan-drivers
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 10.9 MB/132 MB of archives.
After this operation, 447 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libvulkan1 amd64 1.3.204.1-2 [128 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 mesa-vulkan-drivers amd64 23.2.1-1ubuntu3.1~22.04.3 [10.7 MB]
Get:3 /tmp/chrome.deb google-chrome-stable amd64 140.0.7339.185-1 [121 MB]
Fetched 10.9 MB in 2s (6,380 kB/s)
Selecting previously unselected package libvulkan1:amd64.
(Reading database ... 126435 files and directories currently installed.)
Preparing to unpack .../libvulkan1_1.3.204.1-2_amd64.deb ...
Unpacking libvulkan1:amd64 (1.3.204.1-

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def setup_driver():
    """Create and return a headless Chrome WebDriver.

    Chrome is started with the new headless mode, without the sandbox, without
    /dev/shm usage and with a 1280x2000 window. The chromedriver binary path is
    obtained from ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1280,2000",
    )
    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    driver_service = Service(ChromeDriverManager().install())  # matches Chrome version
    return webdriver.Chrome(service=driver_service, options=chrome_options)


In [None]:
# OVQAT (ovqat.tryorder.net) – menu scraper (Colab-ready)
# - Renders the JS site with Selenium
# - Extracts dish name, downloads image, classifies (rice/salad/chicken/...),
#   and writes each row (local image path, Base64 image, scrape date) to both JSON and CSV.

!pip -q install selenium webdriver-manager pillow pandas beautifulsoup4 lxml

import base64, io, os, re, time, json, pandas as pd, requests
from datetime import datetime
from PIL import Image
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# --- Scraper configuration ---
# Menu page that is rendered with Selenium and then parsed.
START_URL = "https://ovqat.tryorder.net/menu"
# Output files written by main(); paths are relative to the working directory.
OUT_JSON = "ovqat_menu.json"
OUT_CSV  = "ovqat_menu.csv"
# Directory that receives the downloaded dish images (saved as .jpg).
IMG_DIR  = "ovqat_images"
# Created at import/cell-run time so later image writes do not fail on a missing directory.
os.makedirs(IMG_DIR, exist_ok=True)

# --- Classification keywords (Arabic + English) ---
# Maps a category label to the regex patterns that trigger it. classify_food()
# lower-cases the text and applies each pattern with re.search, so the English
# alternatives are written in lower case.
# Labels are returned sorted alphabetically and joined with ";" by main(), so a
# dish matching rice + salad + chicken is written as "chicken;rice;salad".
# The Arabic patterns carry no \b anchors, so they match as substrings of longer words.
# NOTE(review): the "beef" pattern r"لحم" also matches any text containing
# "لحم غنم", so such a dish is tagged both beef and lamb; it also makes the
# longer r"لحم بقري" pattern redundant. Confirm whether that is intended.
CLASS_MAP = {
    "rice":    [r"\b(rice|biryani|kabsa|bukhari|riz)\b", r"رز", r"كبسة", r"بخاري", r"مندي", r"مظبي"],
    "salad":   [r"\b(salad|fattoush|tabbouleh)\b", r"سلطة", r"فتوش", r"تبولة"],
    "chicken": [r"\b(chicken|chkn)\b", r"دجاج", r"فراخ"],
    "beef":    [r"\b(beef|meat|veal)\b", r"لحم", r"لحم بقري"],
    "lamb":    [r"\b(lamb|mutton)\b", r"غنم", r"ضأن", r"لحم غنم"],
    "fish":    [r"\b(fish|hamour|hammour|salmon)\b", r"سمك", r"هامور"],
    "shrimp":  [r"\b(shrimp|prawn)\b", r"روبيان", r"جمبري"],
    "bread":   [r"\b(bread|flatbread|khubz|tamees)\b", r"خبز", r"تميس"],
    "soup":    [r"\b(soup|shorba)\b", r"شوربة"],
    "pasta":   [r"\b(pasta|spaghetti|macaroni)\b", r"مكرونة"],
    "sauce":   [r"\b(sauce|gravy|dip)\b", r"صوص", r"صلصة"],
    "dessert": [r"\b(dessert|sweet|kunafa|qatayef|luqaimat|basbousa)\b", r"حلى", r"حلويات", r"كنافة", r"لقيمات", r"بسبوسة"],
}

def classify_food(text):
    """Return the food-category labels whose patterns match ``text``.

    The text is lower-cased and every pattern list in ``CLASS_MAP`` is tried
    with ``re.search``; a label is reported once as soon as any of its
    patterns matches.

    Args:
        text: Dish name and/or description. ``None`` or an empty string is
            treated as "nothing to classify".

    Returns:
        Alphabetically sorted list of matching labels (empty when nothing
        matches or when ``text`` is empty/None).
    """
    # Guard first: previously None crashed on .lower().
    if not text:
        return []
    lowered = text.lower()
    # CLASS_MAP keys are unique, so each label can appear at most once and no
    # extra set() de-duplication is needed before sorting.
    return sorted(
        label
        for label, patterns in CLASS_MAP.items()
        if any(re.search(pattern, lowered) for pattern in patterns)
    )

def setup_driver():
    """Start headless Chrome and return the Selenium driver.

    NOTE(review): a function with this same name and the same Chrome flags is
    also defined in the preceding notebook cell; whichever cell ran last
    provides the binding that main() uses.

    Returns:
        The ``webdriver.Chrome`` instance; callers must ``quit()`` it.
    """
    options = Options()
    for argument in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1280,2000",
    ):
        options.add_argument(argument)
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

def fully_load_menu(driver, url, wait_timeout=20, max_scrolls=15, scroll_pause=0.8):
    """Open ``url``, wait for menu content, scroll to the bottom and return the HTML.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the menu page.
        wait_timeout: Seconds to wait for the first menu-like element.
        max_scrolls: Upper bound on scroll-to-bottom attempts.
        scroll_pause: Seconds to sleep after each scroll so lazy content can load.

    Returns:
        ``driver.page_source`` after scrolling has finished.

    Raises:
        Whatever ``WebDriverWait.until`` raises when none of the candidate
        selectors appears within ``wait_timeout`` seconds (Selenium's
        ``TimeoutException`` per its documentation).
    """
    driver.get(url)
    # The page structure is not known in advance, so several selectors that are
    # common in menu apps are OR-ed together into one CSS selector list.
    candidate_selectors = [
        # buttons/cards
        "button.menu-item", ".menu-item", ".card.menu-item",
        # generic product cards
        "[class*='menu'] [class*='item']", "[data-testid*='item']",
        # headings inside sections
        "section h2", "h2[class*='text-'], h3[class*='text-']"
    ]
    WebDriverWait(driver, wait_timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ",".join(candidate_selectors)))
    )
    # Scroll until the document height stops growing (or the attempt budget is
    # used up) so lazily loaded items end up in the page source.
    previous_height = 0
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            break
        previous_height = current_height
    return driver.page_source

def parse_items_from_html(html, base_url):
    """
    Heuristically extract menu items from rendered page HTML.

    Every ``<img>`` with a usable source is treated as a possible dish image;
    a name is taken from the image's ``alt`` text or from nearby text, and a
    short description is looked up after the image's container.

    Args:
        html: Page markup (parsed with the "lxml" parser).
        base_url: URL used to turn relative image sources into absolute ones.

    Returns:
        List of dicts with keys ``dish_name``, ``image_url`` and ``desc``,
        one per distinct dish name, in order of first appearance.
    """
    soup = BeautifulSoup(html, "lxml")
    # Substrings of an image source that indicate a logo/icon rather than a dish.
    skip_markers = ("logo", "icon", "sprite", "placeholder", "avatar")

    def nearest_text(node):
        """Best-effort label for ``node``: alt/title, then following text, then parent text."""
        # Only alt and title are consulted; aria-label is not read.
        label = (node.get("alt") or node.get("title") or "").strip()
        if label:
            return label
        # First following element of each tag type, in this priority order.
        # find_next walks the rest of the document, not just the current card.
        for tag in ["h1","h2","h3","h4","h5","p","span","strong","button"]:
            following = node.find_next(tag)
            if following and following.get_text(strip=True):
                return following.get_text(strip=True)
        # Fall back to all text of the parent element.
        enclosing = node.parent
        if enclosing and enclosing.get_text(strip=True):
            return enclosing.get_text(strip=True)
        return ""

    items = []
    seen_names = set()
    for img in soup.select("img"):
        src = img.get("src") or img.get("data-src") or ""
        if not src:
            continue
        # Skip logos/icons
        if any(marker in src.lower() for marker in skip_markers):
            continue
        # Closest enclosing card-like element gives context for name/description lookup.
        container = img.find_parent(["button","a","div","article","section","li"]) or img.parent

        name = (img.get("alt") or "").strip()
        if not name:
            name = nearest_text(container or img) or nearest_text(img)
        name = re.sub(r"\s+", " ", name).strip()

        # Keep only names of at least two characters.
        if len(name) < 2:
            continue

        # Only the first image seen for a given dish name is kept. This single
        # check yields the same result as de-duplicating on (name, url) and
        # then on name, because the first occurrence of a name always survives
        # the (name, url) pass.
        if name in seen_names:
            continue
        seen_names.add(name)

        # First following p/span/div with more than 10 characters of text
        # serves as a description to help classification.
        desc = ""
        for tag in ["p","span","div"]:
            nearby = (container or img).find_next(tag)
            if nearby and len(nearby.get_text(strip=True)) > 10:
                desc = nearby.get_text(" ", strip=True)
                break

        items.append({
            "dish_name": name,
            "image_url": urljoin(base_url, src),  # absolute URL for the image
            "desc": desc
        })

    return items

def fetch_image_to_file(url, fname):
    """Download an image, store it at ``fname`` as JPEG and Base64-encode that file.

    Args:
        url: Image address fetched with a 30 second timeout.
        fname: Destination path; the image is converted to RGB and saved as
            JPEG (quality 90).

    Returns:
        Tuple ``(b64, ok, error)``: on success the Base64 text of the file
        written to ``fname``, ``True`` and ``None``; on any exception ``""``,
        ``False`` and the exception message.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Opening the payload with Pillow doubles as a basic validity check.
        picture = Image.open(io.BytesIO(response.content))
        picture.convert("RGB").save(fname, format="JPEG", quality=90)
        # Encode the bytes actually written to disk, not the downloaded payload.
        with open(fname, "rb") as saved:
            encoded = base64.b64encode(saved.read())
        return encoded.decode("utf-8"), True, None
    except Exception as exc:
        # Best-effort: the failure is reported to the caller instead of raised.
        return "", False, str(exc)

def main():
    """Scrape the menu at START_URL and write OUT_JSON, OUT_CSV and the images.

    Steps: render the page with Selenium, parse candidate dishes, classify
    each one, download its image into IMG_DIR, then dump all rows to JSON and
    CSV. Items that match no category are kept with an empty classification.
    """
    # Local import: only needed here for a timezone-aware UTC date stamp.
    from datetime import timezone

    driver = setup_driver()
    try:
        html = fully_load_menu(driver, START_URL)
    finally:
        # Always release the browser, even when loading fails.
        driver.quit()

    raw_items = parse_items_from_html(html, START_URL)
    print(f"Found ~{len(raw_items)} candidate dishes (before filtering).")

    rows = []
    used_paths = set()
    # datetime.utcnow() is deprecated; this yields the same UTC calendar date.
    today = datetime.now(timezone.utc).date().isoformat()
    for idx, it in enumerate(raw_items, 1):
        name = it["dish_name"]
        img_url = it["image_url"]
        desc = it.get("desc","")
        # Unclassified items are kept; their classification is simply empty.
        cls_tags = classify_food(f"{name} {desc}")

        # download image
        safe_base = re.sub(r"[^-\w_\.]+", "_", name.lower())[:60] or f"item_{idx}"
        img_path = os.path.join(IMG_DIR, f"{safe_base}.jpg")
        # Different names can sanitise/truncate to the same file name; pick a
        # fresh path instead of silently overwriting an earlier dish's image.
        suffix = idx
        while img_path in used_paths:
            img_path = os.path.join(IMG_DIR, f"{safe_base}_{suffix}.jpg")
            suffix += 1
        used_paths.add(img_path)

        image_b64, ok, err = fetch_image_to_file(img_url, img_path)
        if not ok:
            print(f"[img-fail] {name} -> {err}")

        row = {
            "dish_name": name,
            "food_classification": ";".join(cls_tags),
            # Left empty when the download failed, so the row never points at a
            # file this run did not write.
            "image_file": img_path if ok else "",
            "image_b64": image_b64,         # actual image bytes (Base64)
            "date_scraped": today,
            "source_url": START_URL
        }
        rows.append(row)

    # Save JSON (with embedded images)
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    # Save CSV.
    # NOTE(review): the frame is built from the same rows, so the CSV also
    # carries the image_b64 column; drop it here if only file paths are wanted.
    df = pd.DataFrame(rows)
    df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

    print(f"Saved {len(rows)} items → {OUT_JSON} & {OUT_CSV}. Images in {IMG_DIR}/")

if __name__ == "__main__":
    main()


Found ~38 candidate dishes (before filtering).


  today = datetime.utcnow().date().isoformat()


Saved 38 items → ovqat_menu.json & ovqat_menu.csv. Images in ovqat_images/
