## CGAS Assignment 1
- Parisha Agrawal | 2021270
- Annu Kumari | 2021312

## Q1) Complete the following analysis using the recipes’ data.

### (a) Scrape (using libraries such as BeautifulSoup) any 10,000 recipes. Submit the raw data. [5]
Include recipe titles, ingredient phrases, cooking instructions, and other relevant details.

In [None]:
# !pip install requests
# !pip install beautifulsoup4
# !pip install pandas
# !pip install spacy

In [None]:
import re
import csv
import json
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry

In [None]:
# Individual recipe page: https://www.allrecipes.com/recipe/<digits>/<anything>.
# Applied with re.match below, so it is anchored at the start of the URL only.
recipe_url_rx = r"https://www\.allrecipes\.com/recipe/\d+/.+"
# Category listing page: https://www.allrecipes.com/recipes/<digits>/<anything>/ (trailing slash required).
# The escaped slashes (\/) behave exactly like plain "/" in a Python regex.
recipes_url_rx = r"https:\/\/www\.allrecipes\.com\/recipes\/\d+\/.+\/"

In [None]:
def format_prep_time(duration):
    """Turn a duration string such as ``PT10M`` into readable text.

    The string is searched for ``<digits>D``, ``<digits>H`` and ``<digits>M``
    and the parts found are reported, in that order, as days, hours and mins
    (e.g. ``"P1DT2H30M"`` -> ``"1 days 2 hours 30 mins"``).

    Args:
        duration: Duration text. Anything that is not a string (for example
            ``None`` from a JSON ``null``) is treated as "no value".

    Returns:
        The formatted duration, or ``"No prep time found"`` when no part
        could be extracted.
    """
    # A JSON null or a missing value must not crash the regex searches.
    if not isinstance(duration, str):
        return "No prep time found"

    # Order matters: the output lists days first, then hours, then minutes.
    units = (("D", "days"), ("H", "hours"), ("M", "mins"))

    formatted_duration = []
    for code, label in units:
        found = re.search(rf"(\d+){code}", duration)
        if found:
            formatted_duration.append(f"{found.group(1)} {label}")

    return " ".join(formatted_duration) if formatted_duration else "No prep time found"

In [None]:
def parse_urls(s, max_categories):
    """Collect category listing URLs from the allrecipes A-Z index page.

    Args:
        s: Session-like object whose ``get(url)`` returns a response with
            ``status_code`` and ``text``.
        max_categories: Stop collecting once this many URLs were gathered.

    Returns:
        List of hrefs that match ``recipes_url_rx``. The list is empty when
        the request raises or the page does not answer with HTTP 200.
    """
    collected = []
    try:
        index_page = s.get("https://www.allrecipes.com/recipes-a-z-6735880")
    except Exception:
        print("Error in retrieving categories")
        return collected

    if index_page.status_code != 200:
        print("Failed to retrieve categories")
        return collected

    parsed_page = BeautifulSoup(index_page.text, "html.parser")
    for anchor in parsed_page.find_all("a", class_="mntl-link-list__link"):
        # Limit check comes first so no more than max_categories are kept.
        if len(collected) == max_categories:
            break
        target = anchor.get("href")
        if target and re.match(recipes_url_rx, target):
            collected.append(target)

    return collected

In [None]:
def get_category(s, category_url):
    """Fetch a category listing page.

    Args:
        s: Session-like object exposing ``get(url)``.
        category_url: URL of the category page.

    Returns:
        Whatever ``s.get`` returned, or ``None`` (after printing a message)
        when the call raised an exception.
    """
    try:
        response = s.get(category_url)
    except Exception:
        print(f"Failed to retrieve the category: {category_url}")
        response = None
    return response

In [None]:
def get_category_recipes_urls(category):
    """Extract the recipe page URLs listed on a category page.

    Args:
        category: Response-like object whose ``text`` holds the page HTML.

    Returns:
        List of hrefs, in page order, taken from ``<a class="card">``
        elements and matching ``recipe_url_rx``.
    """
    category_soup = BeautifulSoup(category.text, "html.parser")
    category_urls = []
    for recipe_link in category_soup.find_all("a", {"class": "card"}):
        # .get avoids a KeyError on card anchors that carry no href at all.
        href = recipe_link.get("href")
        if href and re.match(recipe_url_rx, href):
            category_urls.append(href)
    return category_urls

In [None]:
def _instruction_texts(recipe_instructions):
    """Flatten a ``recipeInstructions`` value into a list of step strings."""
    if isinstance(recipe_instructions, str):
        return [recipe_instructions]
    if isinstance(recipe_instructions, dict):
        recipe_instructions = [recipe_instructions]

    texts = []
    for step in recipe_instructions or []:
        if isinstance(step, str):
            texts.append(step)
        elif isinstance(step, dict):
            if "text" in step:
                texts.append(step["text"])
            else:
                # Section-style entries nest their steps under "itemListElement".
                texts.extend(_instruction_texts(step.get("itemListElement")))
    return texts


def get_recipe(s, recipe_url):
    """Download one recipe page and append its details to ``Q1_a_scraped_recipes.csv``.

    The recipe is read from the page's ``<script class="allrecipes-schema">``
    JSON block. Pages that cannot be fetched, do not answer with HTTP 200,
    lack that block, or hold JSON that cannot be used are skipped instead of
    aborting the whole scrape.

    Args:
        s: Session-like object whose ``get(url)`` returns a response with
            ``status_code`` and ``text``.
        recipe_url: URL of the recipe page.

    Returns:
        None. On success one row is appended to the CSV; the header row is
        written first when the file is still empty. The ingredient and
        instruction lists are written as Python list literals.
    """
    try:
        recipe_req = s.get(recipe_url)
    except Exception:
        print(f"Failed to retrieve the recipe: {recipe_url}")
        return

    if not recipe_req or recipe_req.status_code != 200:
        return

    recipe_soup = BeautifulSoup(recipe_req.text, "html.parser")
    recipe_data = recipe_soup.find("script", {"class": "allrecipes-schema"})
    if not recipe_data:
        return

    try:
        parsed = json.loads(recipe_data.text)
    except ValueError:
        print(f"Failed to parse the recipe data: {recipe_url}")
        return

    # The schema block is normally a one-element list; accept a bare object too.
    recipe_json = parsed[0] if isinstance(parsed, list) and parsed else parsed
    if not isinstance(recipe_json, dict):
        print(f"Failed to parse the recipe data: {recipe_url}")
        return

    headline = recipe_json.get("headline", "No headline found")
    recipe_ingredients = recipe_json.get("recipeIngredient") or []
    formatted_instructions = _instruction_texts(recipe_json.get("recipeInstructions"))

    # Optionals
    region_cuisine = recipe_json.get("recipeCuisine", "No cuisine found")
    if isinstance(region_cuisine, list):
        # str() keeps the join from failing on non-string items.
        region_cuisine = ", ".join(str(item) for item in region_cuisine)

    servings = recipe_json.get("recipeYield", "No servings found")
    if isinstance(servings, list):
        servings = ", ".join(str(item) for item in servings)

    # "or" also covers an explicit JSON null, which .get() would pass through.
    recipe_prep_time = recipe_json.get("prepTime") or "No prep time found"
    formatted_prep_time = format_prep_time(recipe_prep_time)

    # Save to csv
    with open("Q1_a_scraped_recipes.csv", mode="a", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        # In append mode the position is 0 only for a new or empty file.
        if file.tell() == 0:
            writer.writerow(["Recipe name", "Recipe URL", "List of ingredient phrases", "List of instructions", "Region/Cuisine", "Servings", "Preparation time"])

        writer.writerow([headline, recipe_url, recipe_ingredients, formatted_instructions, region_cuisine, servings, formatted_prep_time])

In [None]:
def scrape_recipes(s, progress, recipe_categories, max_categories, max_recipes):
    """Scrape the recipes of each category until a limit is reached.

    Uses and updates the module-level counter ``recipes_count``.

    Args:
        s: Session-like object passed on to ``get_category`` / ``get_recipe``.
        progress: Dict updated in place with the current ``"category"``,
            the current ``"recipe"``, the not-yet-processed
            ``"category_urls"`` of the current category, and
            ``"downloaded_count"``. A category that was fully processed is
            removed from ``progress["categories"]`` when it is listed there.
        recipe_categories: Iterable of category page URLs.
        max_categories: Maximum number of categories to visit.
        max_recipes: Maximum number of recipes to attempt.
    """
    global recipes_count
    for i, category_url in enumerate(recipe_categories):
        # Stop as soon as either limit is hit; fetching more category pages
        # after the recipe limit would only cost network requests.
        if i >= max_categories or recipes_count >= max_recipes:
            break
        progress["category"] = category_url
        category = get_category(s, category_url)
        if not category:
            continue
        category_urls = get_category_recipes_urls(category)

        # Track the remaining URLs in a separate list: shrinking the very
        # list being iterated would make the loop skip every other recipe.
        pending = list(category_urls)
        progress["category_urls"] = pending
        for recipe_url in category_urls:
            if recipes_count >= max_recipes:
                break
            recipes_count += 1
            progress["recipe"] = recipe_url
            get_recipe(s, recipe_url)
            progress["downloaded_count"] += 1
            pending.pop(0)
        else:
            # Reached only when the category was processed without hitting
            # the recipe limit.
            if category_url in progress["categories"]:
                progress["categories"].remove(category_url)

In [None]:
from functools import partial

# Shared HTTP session used by every scraping helper.
s = requests.Session()
retries = Retry(total=6, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
# Every URL scraped here is https://, so the retrying adapter has to be
# mounted for that scheme too; mounting it for http:// alone never applies.
retry_adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', retry_adapter)
s.mount('https://', retry_adapter)
s.headers = {"User-Agent": "Mozilla/5.0 (X11 Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome /"}
s.timeout = 12
# requests does not read a ``timeout`` attribute from the session, so bind
# the value as the default for every request; an explicit timeout= still wins.
s.request = partial(s.request, timeout=s.timeout)

In [None]:
# Scrape limits and the running counter updated by scrape_recipes.
max_recipes, max_categories = 10005, 2000
recipes_count = 0

# Mutable bookkeeping for the scrape run.
progress = dict(
    category=None,
    recipe=None,
    downloaded_count=0,
    categories=[],
    category_urls=[],
    recipe_urls=[],
    failed_recipes=[],
)

In [None]:
# Collect up to max_categories category listing URLs from the A-Z index page.
recipe_categories = parse_urls(s, max_categories)

In [None]:
# Visit the collected categories; each fetched recipe is appended to Q1_a_scraped_recipes.csv.
scrape_recipes(s, progress, recipe_categories, max_categories, max_recipes)

### (b) Write a script to extract information about the ‘name of the ingredients’ from the ingredients section using Named Entity Recognition.

In [None]:
import re
import spacy
import pandas as pd
from spacy.matcher import Matcher

In [None]:
# spaCy's English model; extract_ingredient_entities reads its noun chunks.
nlp = spacy.load('en_core_web_sm')
# Token-pattern matcher built on the model's vocabulary.
# NOTE(review): no call to this matcher is visible in these cells - confirm it is still needed.
matcher = Matcher(nlp.vocab)

In [None]:
# Token pattern: a noun, optionally followed by one alphabetic token.
pattern = [{"POS": "NOUN"}, {"IS_ALPHA": True, "OP": "?"}]
matcher.add("INGREDIENTS_PATTERN", [pattern])

# Words stripped from ingredient phrases before entity extraction:
# articles and connectives, units and measures, sizes, and preparation terms.
fluffwords = {
    'a', 'all', 'an', 'and', 'as', 'beaten', 'baked', 'boiled', 'browned',
    'chopped', 'clove', 'coarsely', 'cold', 'cooked', 'crushed', 'cubed',
    'cup', 'cups', 'dash', 'diced', 'divided', 'finely', 'fresh', 'g',
    'garnished', 'gram', 'grams', 'grated', 'ground', 'halved', 'inch',
    'kg', 'l', 'large', 'liter', 'liters', 'melted', 'medium',
    'medium-sized', 'minced', 'ml', 'of', 'or', 'ounce', 'ounces',
    'package', 'pinch', 'pound', 'pounds', 'purpose', 'raw', 'roasted',
    'shredded', 'slice', 'sliced', 'small', 'steamed', 'stick',
    'sweetened', 'tablespoon', 'tablespoons', 'taste', 'teaspoon',
    'teaspoons', 'the', 'thick', 'thickly', 'thinly', 'to', 'unsalted',
    'unsweetened', 'with', 'whole',
}

In [None]:
# Standalone integers and decimals such as "2" or "1.5".
num_rx = re.compile(r'\b\d+(\.\d+)?\b')
# One whole-word pattern per fluff word. Set iteration order is arbitrary, so
# the words are sorted longest-first (then alphabetically): "medium-sized" is
# always removed before "medium", and the cleaning result is the same on every run.
fluffwords_rx = [
    re.compile(r'\b' + re.escape(fluff) + r'\b')
    for fluff in sorted(fluffwords, key=lambda word: (-len(word), word))
]

In [None]:
# Load the recipes scraped in part (a); the ingredient-phrase column holds text that is parsed into lists further down.
df_recipes = pd.read_csv('Q1_a_scraped_recipes.csv')

In [None]:
# Label the recipes sequentially in file order: "Recipe 1", "Recipe 2", ...
df_recipes['Recipe ID'] = [f'Recipe {number}' for number in range(1, len(df_recipes) + 1)]

In [None]:
def clean_ingredient(ingredient):
    """Strip parentheticals, fluff words and numbers from an ingredient string.

    Args:
        ingredient: Raw ingredient text.

    Returns:
        The text with ``(...)`` groups, every ``fluffwords_rx`` match and
        every ``num_rx`` match removed, whitespace runs collapsed to single
        spaces, and leading/trailing whitespace trimmed.
    """
    text = re.sub(r'\(.*?\)', '', ingredient)
    for fluff_rx in fluffwords_rx:
        text = fluff_rx.sub('', text)
    text = num_rx.sub('', text)
    return re.sub(r'\s+', ' ', text).strip()

def extract_ingredient_entities(ingredient_phrase):
    """Extract candidate ingredient names from one ingredient phrase.

    The phrase is cleaned with ``clean_ingredient``, parsed with the spaCy
    pipeline ``nlp``, and its noun chunks are taken as candidates. When the
    parse yields no noun chunk, the cleaned phrase itself is the only
    candidate. Each candidate is cleaned once more before being returned.

    Args:
        ingredient_phrase: Raw ingredient phrase.

    Returns:
        List of cleaned candidate strings; entries may be empty strings.
    """
    cleaned_text = clean_ingredient(ingredient_phrase)

    parsed = nlp(cleaned_text)
    candidates = [chunk.text for chunk in parsed.noun_chunks]
    if not candidates:
        candidates = [cleaned_text]

    return [clean_ingredient(candidate) for candidate in candidates]

In [None]:
import ast

# Process ingredient phrases and extract cleaned ingredients
ingredient_data = []
for index, row in df_recipes.iterrows():
    recipe_id = row['Recipe ID']
    raw_phrases = row['List of ingredient phrases']
    # A missing cell is read back as NaN rather than text; nothing to parse.
    if not isinstance(raw_phrases, str):
        continue
    # The column holds a Python list literal. literal_eval only accepts
    # literals, so scraped text can never run as code (unlike eval).
    ingredient_phrases = ast.literal_eval(raw_phrases)
    for phrase in ingredient_phrases:
        ingredients = extract_ingredient_entities(phrase)
        # Empty strings are what remains of phrases made of fluff words only.
        ingredient_data.extend({'Recipe ID': recipe_id, 'Ingredient': ingredient} for ingredient in ingredients if ingredient)

In [None]:
# Drop repeated (Recipe ID, Ingredient) rows, then write the table to disk.
ingredient_df = pd.DataFrame(ingredient_data)
ingredient_df = ingredient_df.drop_duplicates()
ingredient_df.to_csv('Q1_b_ingredient_entities.csv', index=False)
print("Ingredient entities saved to 'Q1_b_ingredient_entities.csv'.")

Ingredient entities saved to 'Q1_b_ingredient_entities.csv'.


### (c) Store recipes in (Recipe ID)—(Ingredient Name) form.

In [None]:
import pandas as pd
import random

In [None]:
df = pd.read_csv('Q1_b_ingredient_entities.csv')
recipe_ids = df['Recipe ID'].unique()

# Up to 100 random recipe IDs; random.sample raises ValueError when asked
# for more items than exist, so cap the sample at what is available.
sample_size = min(100, len(recipe_ids))
selected_recipe_ids = random.sample(list(recipe_ids), sample_size)

filtered_df = df[df['Recipe ID'].isin(selected_recipe_ids)]

# Explicit UTF-8: the separator is an em dash and ingredient names may hold
# non-ASCII characters, which the platform default encoding may not cover.
with open('Q1_c_selected_recipes.txt', 'w', encoding='utf-8') as file:
    file.writelines(
        f"{recipe_id} — {ingredient}\n"
        for recipe_id, ingredient in zip(filtered_df['Recipe ID'], filtered_df['Ingredient'])
    )

print("Selected recipes-ingredients saved to 'Q1_c_selected_recipes.txt'")

Selected recipes-ingredients saved to 'Q1_c_selected_recipes.txt'
