<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/emiratiWebscriping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-ready site-wide dish scraper for arabianteahouse.me
# Output: dishes_all.json + dishes_all.csv
# Fields: dish_name, labels, image_url, date_scraped, source_url
# Notes:
# - Crawls arabianteahouse.me, detects "Ingredients" sections to identify dish pages.
# - Throttled & domain-restricted. Adjust LIMITS to cover more pages if needed.

import requests, re, time, json, pandas as pd, sys
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin, urlparse

# Seed pages the crawl queue is initialised with.
START_URLS = [
    "https://arabianteahouse.me/",                         # home page
    "https://arabianteahouse.me/menu/",                    # menu hub (if it exists)
    "https://arabianteahouse.me/podgorica-branch/",        # example branch page
]
# Only links whose host belongs to this domain are followed (see same_domain).
ALLOWED_DOMAIN = "arabianteahouse.me"

# ---- LIMITS / TUNABLES ----
MAX_PAGES_TO_VISIT = 2500      # cap on successfully fetched pages; raise for deeper coverage
REQUEST_TIMEOUT = 25           # seconds, passed to requests.get(timeout=...)
CRAWL_DELAY_SEC = 0.6          # pause between requests, in seconds (be polite)
SAVE_EVERY = 50                # write a checkpoint every N fetched pages

# Request headers sent with every fetch; the User-Agent is a desktop Chrome string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# A lightweight vocabulary used to turn free-text ingredient lines into labels.
# normalize_labels looks for these tokens in the lower-cased ingredient text,
# so every entry must be lower case. Extend freely for better recall.
INGREDIENT_VOCAB = {
    # staples
    "rice","basmati","bread","vermicelli","noodles","bulgur","couscous",
    # proteins
    "chicken","meat","beef","lamb","mutton","fish","shrimp","prawn","egg","eggs",
    # vegetables & legumes
    "tomato","tomatoes","onion","onions","garlic","ginger","potato","potatoes",
    "chickpeas","lentils","okra","eggplant","zucchini","pepper","green chili",
    # herbs
    "coriander","cilantro","parsley","mint","dill",
    # dairy & sauces
    "yogurt","laban","cream","tomato paste","tomato sauce","tahini","ghee","milk",
    # spices
    "cumin","coriander powder","turmeric","cardamom","cinnamon","cloves",
    "black pepper","bay leaf","bay leaves","sumac","zaatar","saffron",
    "paprika","chili","red chili",
    # oils & basics
    "oil","olive oil","vegetable oil","salt","water","vinegar","lemon","lime"
}

# URL fragments that mark utility / non-content pages. is_skippable_link
# rejects any link whose lower-cased URL contains one of these substrings.
SKIP_SUBSTRINGS = [
    # obvious non-content / utility pages to skip enqueuing
    "/wp-json", "/feed", "/xmlrpc", "/tag/", "/author/", "/category/",
    "/privacy", "/terms", "/contact", "/reservation", "/book", "/cart",
    "/checkout", "/my-account", "/login"
]

def same_domain(url, allowed_domain=None):
    """Return True when *url* points at the allowed domain or a subdomain of it.

    The comparison uses the parsed hostname (lower-cased, without port or
    credentials) and requires a label boundary, so ``www.example.me`` matches
    ``example.me`` but the look-alike ``evilexample.me`` does not.

    Args:
        url: Absolute URL to test.
        allowed_domain: Domain to compare against; defaults to the
            module-level ``ALLOWED_DOMAIN``.

    Returns:
        bool: False for foreign hosts, host-less URLs (``mailto:``, relative
        paths) and anything that cannot be parsed.
    """
    domain = ALLOWED_DOMAIN if allowed_domain is None else allowed_domain
    domain = domain.lower().rstrip(".")
    try:
        host = (urlparse(url).hostname or "").rstrip(".")
        # Exact host, or a subdomain ending in ".<domain>"; a bare suffix
        # test would also accept unrelated hosts that merely end the same way.
        return host == domain or host.endswith("." + domain)
    except (ValueError, TypeError, AttributeError):
        # Malformed URL or non-string input: treat as "not our domain".
        return False

def get_soup(url):
    """Fetch *url* and return its parsed HTML as a BeautifulSoup tree.

    The raw response bytes are handed to BeautifulSoup so it can detect the
    document encoding itself (e.g. from a ``<meta charset>`` tag). Decoding
    via ``response.text`` instead falls back to ISO-8859-1 whenever the
    server omits a charset, which garbles UTF-8 pages. A charset that the
    server did declare is passed along as the preferred encoding.

    Args:
        url: Absolute URL to download.

    Returns:
        BeautifulSoup: The parsed document.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    content_type = r.headers.get("Content-Type", "")
    # Trust r.encoding only when it came from an explicit charset header,
    # not from requests' ISO-8859-1 default for text/* responses.
    declared = r.encoding if "charset" in content_type.lower() else None
    return BeautifulSoup(r.content, "html.parser", from_encoding=declared)

def looks_like_dish_page(soup):
    """Tell whether a parsed page looks like a dish/recipe page.

    A page qualifies when it has an h2/h3/h4 heading whose text contains
    "ingredients" (case-insensitive) and the next ``<ul>``/``<ol>`` after
    that heading holds at least one ``<li>``.
    """
    def _is_ingredients_heading(tag):
        if tag is None or tag.name not in ("h2", "h3", "h4"):
            return False
        return "ingredients" in tag.get_text(strip=True).lower()

    heading = soup.find(_is_ingredients_heading)
    if heading:
        listing = heading.find_next(["ul", "ol"])
        if listing and listing.find_all("li"):
            return True
    return False

def extract_ingredients(soup):
    """Return the ingredient lines listed under the page's "Ingredients" heading.

    Looks for the first h2/h3/h4 whose text contains "ingredients"
    (case-insensitive), takes the next ``<ul>``/``<ol>`` after it, and
    returns the text of each ``<li>`` with runs of whitespace collapsed to a
    single space. Returns an empty list when no heading or list is found.
    """
    def _is_ingredients_heading(tag):
        if tag is None or tag.name not in ("h2", "h3", "h4"):
            return False
        return "ingredients" in tag.get_text(strip=True).lower()

    heading = soup.find(_is_ingredients_heading)
    listing = heading.find_next(["ul", "ol"]) if heading else None
    if not listing:
        return []
    return [
        re.sub(r"\s+", " ", item.get_text(" ", strip=True))
        for item in listing.find_all("li")
    ]

def normalize_labels(ingredients_list, vocab=None):
    """Map free-text ingredient lines to a sorted list of vocabulary labels.

    Each vocabulary token is matched as a whole word (or phrase), optionally
    followed by a plural "s"/"es", so "eggplant" no longer yields "egg" and
    "boiled" no longer yields "oil". Overlapping entries are all recorded:
    "olive oil" yields both "olive oil" and "oil".

    Args:
        ingredients_list: Iterable of ingredient strings (any case).
        vocab: Iterable of lower-case tokens to look for; defaults to the
            module-level ``INGREDIENT_VOCAB``.

    Returns:
        list[str]: Alphabetically sorted, de-duplicated labels with plural
        and synonym forms folded into one canonical label.
    """
    vocab = INGREDIENT_VOCAB if vocab is None else vocab
    # Plural / synonym spellings folded into a single canonical label.
    replacements = (
        ("tomatoes", "tomato"),
        ("onions", "onion"),
        ("potatoes", "potato"),
        ("eggs", "egg"),
        ("bay leaves", "bay leaf"),
        ("prawns", "shrimp"),
        ("prawn", "shrimp"),  # the vocabulary stores the singular form
    )

    # Build the matchers once per call instead of once per ingredient line.
    matchers = []
    for token in vocab:
        label = token
        for old, new in replacements:
            label = label.replace(old, new)
        pattern = re.compile(rf"\b{re.escape(token)}(?:e?s)?\b")
        matchers.append((pattern, label))

    labels = set()
    for line in ingredients_list:
        low = line.lower()
        for pattern, label in matchers:
            if pattern.search(low):
                labels.add(label)

    # Prefer the specific spice over the generic herb when both were seen.
    if "coriander powder" in labels:
        labels.discard("coriander")
    return sorted(labels)

def extract_title(soup):
    """Return the page's title text, or "" when none can be found.

    Tries, in order, the first ``<h1>``, an ``<h1>`` inside ``<header>``, an
    ``<h1>`` inside ``<article>``, and finally ``<title>``. A candidate whose
    text is empty is skipped so a blank heading does not hide a usable one.

    Whitespace is collapsed after the text is extracted rather than stripped
    per fragment, so inline markup such as ``Chicken <span>Biryani</span>``
    keeps the space between the words.
    """
    for sel in ("h1", "header h1", "article h1", "title"):
        node = soup.select_one(sel)
        if node is None:
            continue
        text = " ".join(node.get_text().split())
        if text:
            return text
    return ""

def extract_image_url(soup):
    """Return the best image URL found on the page, or "" when there is none.

    Preference order:
      1. the ``og:image`` meta tag,
      2. the first ``<img>`` with a non-blank ``src`` inside the first
         ``<div>`` whose class contains "entry", "post" or "content",
      3. the first ``<img>`` with a non-blank ``src`` anywhere on the page.

    Images without a usable ``src`` are skipped instead of ending the search,
    so a later image that does have one is still found. The returned value is
    whatever the page declares (it may be a relative URL), with surrounding
    whitespace removed.
    """
    def _usable(value):
        # BeautifulSoup passes None when the attribute is missing.
        return bool(value and value.strip())

    # Prefer og:image if present
    og = soup.find("meta", property="og:image")
    if og and _usable(og.get("content")):
        return og["content"].strip()

    # Fallback to the first usable image in the main content, then anywhere
    content = soup.find("div", class_=re.compile(r"(entry|post|content)", re.I)) or soup
    img = content.find("img", src=_usable) or soup.find("img", src=_usable)
    return img["src"].strip() if img else ""

def is_skippable_link(href):
    """Return True when an absolute link should not be added to the crawl queue.

    A link is skipped when it is empty, points outside the allowed domain,
    contains one of ``SKIP_SUBSTRINGS``, carries a ``#`` fragment, targets a
    media/binary file, or has an ``@`` in its path.

    Args:
        href: Absolute URL (already resolved against the page it came from).
    """
    if not href or not same_domain(href):
        return True
    low = href.lower()
    if any(s in low for s in SKIP_SUBSTRINGS):
        return True
    if "#" in low:
        return True

    path = urlparse(low).path
    # Test the path only, so "photo.jpg?v=2" is still recognised as media.
    media_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".pdf", ".svg", ".mp4", ".zip")
    if path.endswith(media_exts):
        return True
    # A bare e-mail address used as an href (a mailto: link missing its
    # scheme) resolves to a page path that does not exist.
    if "@" in path:
        return True
    return False

def extract_links(current_url, soup):
    """Collect the crawlable links found on a page.

    Every ``<a href>`` is resolved against *current_url*, has its
    ``#fragment`` removed (a fragment only addresses a position inside a
    page, so ``/menu/#desserts`` is crawled as ``/menu/`` rather than being
    discarded), and is kept unless ``is_skippable_link`` rejects it.

    Args:
        current_url: URL of the page *soup* was parsed from.
        soup: Parsed page.

    Returns:
        set[str]: Absolute, fragment-free URLs.
    """
    links = set()
    for a in soup.select("a[href]"):
        raw = a.get("href", "").strip()
        if not raw:
            continue
        absolute = urljoin(current_url, raw).split("#", 1)[0]
        if absolute and not is_skippable_link(absolute):
            links.add(absolute)
    return links

def scrape_dish(url, soup=None):
    """Build the output record for one dish page.

    Args:
        url: URL of the dish page; stored as ``source_url`` and used as the
            base for resolving a relative image URL.
        soup: Already-parsed page. When omitted the page is fetched with
            ``get_soup``.

    Returns:
        dict: ``dish_name`` (str), ``labels`` (sorted list of str),
        ``image_url`` (absolute URL or ""), ``date_scraped`` (UTC date,
        ISO format) and ``source_url``.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    if soup is None:
        soup = get_soup(url)
    title = extract_title(soup)
    ingredients = extract_ingredients(soup)
    labels = normalize_labels(ingredients)
    image_url = extract_image_url(soup)
    if image_url:
        # <img src> may be relative; store a URL usable outside the page.
        image_url = urljoin(url, image_url)
    return {
        "dish_name": title,
        "labels": labels,
        "image_url": image_url,
        # datetime.utcnow() is deprecated since Python 3.12; this yields the
        # same UTC calendar date from a timezone-aware value.
        "date_scraped": datetime.now(timezone.utc).date().isoformat(),
        "source_url": url
    }

def checkpoint(data):
    """Write the collected dish records to dishes_all.json and dishes_all.csv.

    The JSON file keeps non-ASCII characters as-is and is indented by two
    spaces. In the CSV each record's ``labels`` list is flattened to a single
    ``;``-separated string; the file is written with a UTF-8 BOM and without
    the index column. Both files are overwritten in the current directory.
    """
    # JSON snapshot
    with open("dishes_all.json", "w", encoding="utf-8") as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=2))

    # CSV snapshot
    frame = pd.DataFrame(data)
    if not frame.empty:
        frame["labels"] = frame["labels"].map(";".join)
    frame.to_csv("dishes_all.csv", index=False, encoding="utf-8-sig")

def crawl():
    """Breadth-first crawl from START_URLS, collecting every dish page found.

    Pages are fetched one at a time with a pause of ``CRAWL_DELAY_SEC``
    between requests (after failed requests too). A page recognised by
    ``looks_like_dish_page`` is scraped and kept when it has both a name and
    at least one label. Results are written by ``checkpoint`` every
    ``SAVE_EVERY`` fetched pages and once more when the crawl ends, including
    when it ends through an exception or a manual interrupt.

    Returns:
        list[dict]: One record per dish page, in discovery order.
    """
    from collections import deque

    queue = deque(START_URLS)
    enqueued = set(START_URLS)   # every URL ever queued, so none is queued twice
    visited = set()
    results = []
    pages_seen = 0               # counts successfully fetched pages only

    try:
        while queue and pages_seen < MAX_PAGES_TO_VISIT:
            url = queue.popleft()
            if url in visited:   # guards against duplicates in START_URLS
                continue
            visited.add(url)

            try:
                soup = get_soup(url)
            except Exception as e:
                # print errors but keep going; still pause so a run of
                # broken links does not hit the server back-to-back
                print(f"[skip] {url} -> {e}", file=sys.stderr)
                time.sleep(CRAWL_DELAY_SEC)
                continue

            pages_seen += 1
            # If page is a dish, extract. Each URL is visited at most once,
            # so no separate per-dish de-duplication is needed.
            if looks_like_dish_page(soup):
                try:
                    item = scrape_dish(url, soup)
                    if item["dish_name"] and item["labels"]:
                        results.append(item)
                        print(f"[dish] {item['dish_name']}  ({url})")
                except Exception as e:
                    print(f"[dish-error] {url} -> {e}", file=sys.stderr)

            # Enqueue more links
            for link in extract_links(url, soup):
                if link not in enqueued:
                    enqueued.add(link)
                    queue.append(link)

            # throttle
            time.sleep(CRAWL_DELAY_SEC)

            # periodic checkpoint
            if pages_seen % SAVE_EVERY == 0:
                print(f"[checkpoint] pages={pages_seen} dishes={len(results)}")
                checkpoint(results)
    finally:
        # final save, also on interruption, so collected dishes are not lost
        checkpoint(results)

    print(f"[done] visited={pages_seen} unique_dishes={len(results)}")
    return results

# Run the full crawl when this file is executed directly (or as a notebook
# cell); crawl() writes dishes_all.json and dishes_all.csv via checkpoint().
if __name__ == "__main__":
    crawl()


  "date_scraped": datetime.utcnow().date().isoformat(),


[dish] Luqaimat  (https://arabianteahouse.me/luqaimat/)
[dish] Kunafa  (https://arabianteahouse.me/kunafa/)
[dish] Umm ALI  (https://arabianteahouse.me/umm-ali/)
[dish] Chicken Biryani  (https://arabianteahouse.me/chicken-biryani/)
[dish] Chicken Machboos  (https://arabianteahouse.me/chicken-machboos/)
[dish] Beef Machboos  (https://arabianteahouse.me/beef-machboos/)


[skip] https://arabianteahouse.me/ar-home/dhmontenegro20@gmail.com -> 404 Client Error: Not Found for url: https://arabianteahouse.me/ar-home/dhmontenegro20@gmail.com


[dish] Liver Hummus  (https://arabianteahouse.me/liver-ummus/)
[checkpoint] pages=50 dishes=7
[done] visited=50 unique_dishes=7
