<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/fufu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Fufu's Kitchen - Middle Eastern category scraper (Colab-ready)
# Outputs: fufus_middle_eastern.json and fufus_middle_eastern.csv
# Fields: dish_name, labels (main ingredients), image_file, date_scraped

import requests, re, time, json, pandas as pd, os
from bs4 import BeautifulSoup
from datetime import datetime

BASE_CATEGORY = "https://www.fufuskitchen.com/category/middle-eastern/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}
REQUEST_TIMEOUT = 25
CRAWL_DELAY_SEC = 0.6   # Be polite to the site
SAVE_EVERY = 30

# Folder for images
IMG_DIR = "images"
os.makedirs(IMG_DIR, exist_ok=True)

# Vocabulary to collapse free-text ingredients into concise "main" labels
INGREDIENT_VOCAB = {
    "rice","bulgur","couscous","vermicelli","bread","pita","khubz","freekeh",
    "chicken","beef","lamb","mutton","fish","shrimp","prawn","egg","eggs","turkey",
    "chickpeas","lentils","fava","beans","tomato","tomatoes","onion","onions","garlic","ginger",
    "eggplant","zucchini","okra","spinach","parsley","cilantro","coriander","mint",
    "potato","potatoes","pepper","bell pepper","green chili",
    "yogurt","labneh","laban","cream","butter","ghee","olive oil","oil","tahini","milk",
    "tomato paste","tomato sauce","pomegranate molasses","harissa",
    "cumin","coriander powder","turmeric","cardamom","cinnamon","cloves","sumac","zaatar","allspice",
    "black pepper","paprika","chili","bay leaf","bay leaves","saffron","nutmeg",
    "lemon","lime","vinegar","salt","water","garlic powder","onion powder"
}

def get_soup(url):
    r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def find_category_pages(first_url):
    pages = []
    page_num = 1
    while True:
        url = first_url if page_num == 1 else first_url.rstrip('/') + f"/page/{page_num}/"
        try:
            soup = get_soup(url)
        except Exception:
            break
        posts = extract_post_links_from_category(soup)
        if not posts:
            break
        pages.append((url, posts))
        page_num += 1
        time.sleep(CRAWL_DELAY_SEC)
    return pages

def extract_post_links_from_category(soup):
    links = set()
    for a in soup.select("article h2 a, h2.entry-title a, .entry-title a, a[rel='bookmark']"):
        href = a.get("href")
        if href and href.startswith("https://www.fufuskitchen.com/"):
            links.add(href)
    if not links:
        for a in soup.select("a[href*='fufuskitchen.com/']"):
            href = a.get("href")
            if href and "/category/" not in href and "/page/" not in href:
                links.add(href)
    return sorted(links)

def detect_ingredients_blocks(soup):
    blocks = []
    wprm_items = soup.select(".wprm-recipe-ingredient, .wprm-recipe-ingredient-name")
    if wprm_items:
        for el in wprm_items:
            txt = el.get_text(" ", strip=True)
            if txt:
                blocks.append(txt)
    if not blocks:
        hdr = soup.find(lambda t: t and t.name in ("h2","h3","h4") and "ingredient" in t.get_text(strip=True).lower())
        if hdr:
            ul = hdr.find_next(["ul","ol"])
            if ul:
                for li in ul.find_all("li"):
                    txt = re.sub(r"\s+", " ", li.get_text(" ", strip=True))
                    if txt:
                        blocks.append(txt)
    if not blocks:
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
            except Exception:
                continue
            def _extract_from_obj(obj):
                nonlocal blocks
                if isinstance(obj, dict):
                    if obj.get("@type") == "Recipe" and obj.get("recipeIngredient"):
                        for it in obj["recipeIngredient"]:
                            if isinstance(it, str) and it.strip():
                                blocks.append(it.strip())
                elif isinstance(obj, list):
                    for o in obj:
                        _extract_from_obj(o)
            _extract_from_obj(data)
    cleaned = []
    for line in blocks:
        line = re.sub(r"\s+", " ", line).strip()
        if line:
            cleaned.append(line)
    return cleaned

def normalize_labels(ingredients_list):
    labels = set()
    for line in ingredients_list:
        low = line.lower()
        for token in sorted(INGREDIENT_VOCAB, key=lambda x: -len(x)):
            if token in low:
                normalized = (token
                              .replace("tomatoes","tomato")
                              .replace("onions","onion")
                              .replace("potatoes","potato")
                              .replace("eggs","egg")
                              .replace("bay leaves","bay leaf")
                              .replace("prawns","shrimp"))
                labels.add(normalized)
    if "coriander powder" in labels and "coriander" in labels:
        labels.discard("coriander")
    return sorted(labels)

def extract_title(soup):
    rname = soup.select_one(".wprm-recipe-name")
    if rname:
        return rname.get_text(strip=True)
    h1 = soup.find("h1")
    if h1:
        return h1.get_text(strip=True)
    t = soup.find("title")
    return t.get_text(strip=True) if t else ""

def extract_image_url(soup):
    og = soup.find("meta", property="og:image")
    if og and og.get("content"):
        return og["content"]
    img = soup.select_one(".wprm-recipe img, article img")
    if img and img.get("src"):
        return img["src"]
    img = soup.find("img")
    return img["src"] if img and img.get("src") else ""

def download_image(url, dish_name):
    if not url:
        return ""
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, stream=True)
        r.raise_for_status()
        base = re.sub(r"[^\w\-]+", "_", dish_name.lower())[:50]
        ext = os.path.splitext(url)[1].split("?")[0] or ".jpg"
        fname = f"{base}{ext}"
        path = os.path.join(IMG_DIR, fname)
        with open(path, "wb") as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
        return path
    except Exception:
        return ""

def scrape_recipe(url):
    soup = get_soup(url)
    dish_name = extract_title(soup)
    ing_lines = detect_ingredients_blocks(soup)
    labels = normalize_labels(ing_lines)
    image_url = extract_image_url(soup)
    image_file = download_image(image_url, dish_name)
    return {
        "dish_name": dish_name,
        "labels": labels,
        "image_url": image_url,
        "date_scraped": datetime.utcnow().date().isoformat()
    }

def checkpoint(rows, json_name="fufus_middle_eastern.json", csv_name="fufus_middle_eastern.csv"):
    with open(json_name, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)
    df = pd.DataFrame(rows)
    if not df.empty:
        df["labels"] = df["labels"].apply(lambda xs: ";".join(xs))
    df.to_csv(csv_name, index=False, encoding="utf-8-sig")

def main():
    pages = find_category_pages(BASE_CATEGORY)
    print(f"Found {len(pages)} category pages.")
    all_links = []
    for url, links in pages:
        print(f"- {url} -> {len(links)} posts")
        all_links.extend(links)

    seen, post_links = set(), []
    for u in all_links:
        if u not in seen:
            seen.add(u)
            post_links.append(u)
    print(f"Unique posts: {len(post_links)}")

    results = []
    for idx, url in enumerate(post_links, 1):
        try:
            item = scrape_recipe(url)
            if item["dish_name"] and item["labels"]:
                results.append(item)
                print(f"[{idx}/{len(post_links)}] {item['dish_name']}  (labels: {', '.join(item['labels'])})")
            else:
                print(f"[{idx}/{len(post_links)}] Skipped: {url}")
        except Exception as e:
            print(f"[{idx}/{len(post_links)}] Error: {url} -> {e}")
        time.sleep(CRAWL_DELAY_SEC)

        if idx % SAVE_EVERY == 0:
            checkpoint(results)
            print(f"Checkpoint saved at {idx} recipes...")

    checkpoint(results)
    print(f"Done. Saved {len(results)} recipes.")

if __name__ == "__main__":
    main()


Found 8 category pages.
- https://www.fufuskitchen.com/category/middle-eastern/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/2/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/3/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/4/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/5/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/6/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/7/ -> 18 posts
- https://www.fufuskitchen.com/category/middle-eastern/page/8/ -> 7 posts
Unique posts: 133


  "date_scraped": datetime.utcnow().date().isoformat()


[1/133] Baba Ghanoush (Roasted Eggplant Dip)  (labels: cloves, cumin, egg, eggplant, garlic, lemon, oil, olive oil, paprika, salt, tahini)
[2/133] Baked Lamb and Vegetables  (labels: allspice, bell pepper, black pepper, cloves, garlic, lamb, oil, olive oil, onion, pepper, salt, tomato, tomato paste, zucchini)
[3/133] Braised Chuck Roast  (labels: bay leaf, black pepper, cardamom, chili, cinnamon, cloves, garlic, garlic powder, oil, onion, onion powder, paprika, pepper, salt, tomato, tomato paste, turmeric)
[4/133] Date Stuffed Bread (Ma’arouk)  (labels: butter, cardamom, cinnamon, cream, egg, ghee, milk, salt)
[5/133] Easy Authentic Hummus Recipe  (labels: cloves, cumin, garlic, lemon, oil, olive oil, parsley, salt, sumac, tahini)
[6/133] Easy Labneh Dip Recipe  (labels: chili, cloves, garlic, labneh, lemon, oil, olive oil, pepper, salt)
[7/133] Herby Potato Salad  (labels: cilantro, cloves, garlic, lemon, mint, oil, olive oil, onion, parsley, potato, salt)
[8/133] Jaj Mahshi (Rice Stu