# Web Scraping Test

# Process User query
## Detect User Intent

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

RECIPE_INTENT_EXAMPLES = [
    "give me a recipe",
    "find me a recipe",
    "how do you make this",
    "how to cook this",
    "show me how to cook",
    "I want to cook something",
    "tell me the cooking instructions",
]

model = SentenceTransformer("all-MiniLM-L6-v2")

intent_embeddings = model.encode(RECIPE_INTENT_EXAMPLES, normalize_embeddings=True, convert_to_numpy=True)

def detect_recipe_intent(query, threshold=0.50):
    q_emb = model.encode(query, normalize_embeddings=True, convert_to_numpy=True)

    sims = np.dot(q_emb, intent_embeddings.T)
    best = np.max(sims)

    return best >= threshold, best




## Find Recipe Name in User Query

In [2]:
import spacy
spacy.cli.download("en_core_web_sm")
spacy_nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
generic_recipes = [
    "meatloaf","lasagna","chicken parmesan","kung pao chicken","beef and broccoli","Beef tacos","Mac and cheese","Pot roast",
    "Fried rice","Veggie stir fry","Lentil soup","Grilled cheese","Baked potatoes","Quesadillas","Pancakes","French toast",
    "Rice pilaf","Shepherd’s pie","Sloppy joes","Roasted vegetables","Burgers","Vegetable pasta","Chili","Baked salmon"
]

generic_foods = [
    "chicken", "beef", "pork", "salmon", "tofu",
    "carrot", "tomato", "bread", "rice"
]

food_kind_emb = model.encode(generic_recipes, normalize_embeddings=True, convert_to_numpy=True).mean(axis=0)

In [4]:
NON_FOOD_WORDS = set(word.lower() for word in ["recipe","ingredients"])

def remove_words(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in NON_FOOD_WORDS]
    return ' '.join(filtered_words)

def get_nounchunks(text):
    chunks = []
    doc = spacy_nlp(text)
    for chunk in doc.noun_chunks:
        chunks.append(chunk.text)
    return chunks

def extract_recipe_name(text):
    text_filtered = remove_words(text)
    noun_chunks = get_nounchunks(text_filtered)

    if not noun_chunks:
        return None, 0

    ng_embs = model.encode(noun_chunks, normalize_embeddings=True, convert_to_numpy=True)
    
    sims = np.dot(ng_embs, food_kind_emb.T)
    best_idx = sims.argmax()
    #print("Noun chunks:", noun_chunks)
    #print("Similarities:", sims)
    return noun_chunks[best_idx], sims[best_idx]
extract_recipe_name("Get me a low calorie meatloaf recipe")

('a low calorie meatloaf', 0.31611693)

## Scrape allrecipes.com

In [6]:
import requests
from bs4 import BeautifulSoup
import time

url = "https://www.allrecipes.com/recipe/16354/easy-meatloaf/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

In [18]:
query = "a+lowfat+meatloaf"
search_url = f"https://www.allrecipes.com/search?q={query}"

response = requests.get(search_url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

results = soup.find('a', id="mntl-card-list-card--extendable_1-0",
                    class_="comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image")

print(results['href'])

https://www.allrecipes.com/recipe/260225/low-fat-slow-cooker-glazed-meatloaf/


NameError: name 'reuslts' is not defined

In [21]:
def getRecipeDetails(soup):
    detail_list = []
    l = soup.find('div', class_='mm-recipes-details__content')

    if not l:
        print("Could not find recipe details section")
        return []
    
    for item in l.find_all('div', class_='mm-recipes-details__item'):
        label = item.find('div', class_='mm-recipes-details__label')
        value = item.find('div', class_='mm-recipes-details__value')
        
        detail_dict = {
            'label': label.text.strip() if label else '',
            'value': value.text.strip() if value else '',
        }
        
        detail_list.append(detail_dict)
    
    #for detail in detail_list:
        #print(detail)
    return detail_list
getRecipeDetails(soup)

Could not find recipe details section


[]

In [9]:
def getIngredients(soup):
    ingredients_list = []
    l = soup.find('ul', class_='mm-recipes-structured-ingredients__list')

    if not l:
        print("Could not find ingredients section")
        return []
    
    for item in l.find_all('li'):
        quantity = item.find('span', attrs={'data-ingredient-quantity': 'true'})
        unit = item.find('span', attrs={'data-ingredient-unit': 'true'})
        name = item.find('span', attrs={'data-ingredient-name': 'true'})
        
        ingredient_dict = {
            'quantity': quantity.text.strip() if quantity else '',
            'unit': unit.text.strip() if unit else '',
            'name': name.text.strip() if name else '',
            'full_text': item.text.strip()  # Keep original text too
        }
        
        ingredients_list.append(ingredient_dict)
    
    #for ing in ingredients_list:
        #print(ing)
    return ingredients_list

getIngredients(soup)

Could not find ingredients section


[]

In [25]:
def getDirections(soup):
    dir_list = []
    l = soup.find('ol', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--OL')

    if not l:
        print("Could not find recipe details section")
        return []
    
    for i, item in enumerate(l.find_all('li', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI')):
        direction = item.find('p', class_='comp mntl-sc-block mntl-sc-block-html')
        
        detail_dict = {
            'step': "Step " + str(i),
            'direction': direction.text.strip() if direction else '',
        }
        
        dir_list.append(detail_dict)
    
    #for direc in dir_list:
        #print(direc)
    return dir_list
getDirections(soup)

Could not find recipe details section


[]

In [22]:
def getNutrition(soup):
    nutrition_list = []
    l = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')

    if not l:
        print("Could not find nutrition details section")
        return []
        
    for item in l.find_all('tr', class_='mm-recipes-nutrition-facts-summary__table-row'):
        label = item.find('td', class_='mm-recipes-nutrition-facts-summary__table-cell text-body-100')
        value = item.find('td', class_='mm-recipes-nutrition-facts-summary__table-cell text-body-100-prominent')
        
        nut_dict = {
            'label': label.text.strip() if label else '',
            'value': value.text.strip() if value else '',
        }
        
        nutrition_list.append(nut_dict)
    
    #for nut in nutrition_list:
        #print(nut)
    return nutrition_list
getNutrition(soup)

Could not find nutrition details section


[]

# Put it all together

In [27]:
def recipe_request(text):
    if not detect_recipe_intent(text):
        return None
    item = extract_recipe_name(text)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    query = "a+lowfat+meatloaf"
    search_url = f"https://www.allrecipes.com/search?q={query}"

    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    results = soup.find('a', id="mntl-card-list-card--extendable_1-0",
                    class_="comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image")
    
    url = results['href']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    
    recipe_details = getRecipeDetails(soup)
    ingredients = getIngredients(soup)
    directions = getDirections(soup)
    recipe_nutirion = getNutrition(soup)
    return recipe_details, ingredients, directions, recipe_nutirion

user_query = "Get me a recipe for meatloaf"
recipe_request(user_query)

([{'label': 'Prep Time:', 'value': '15 mins'},
  {'label': 'Cook Time:', 'value': '7 hrs'},
  {'label': 'Total Time:', 'value': '7 hrs 15 mins'},
  {'label': 'Servings:', 'value': '6'},
  {'label': 'Yield:', 'value': '1 loaf'}],
 [{'quantity': '',
   'unit': '',
   'name': 'cooking spray',
   'full_text': 'cooking spray'},
  {'quantity': '1 ½',
   'unit': 'pounds',
   'name': 'ground round beef',
   'full_text': '1 ½ pounds ground round beef'},
  {'quantity': '1',
   'unit': 'pound',
   'name': 'ground turkey breast',
   'full_text': '1 pound ground turkey breast'},
  {'quantity': '¾',
   'unit': 'cup',
   'name': 'unseasoned dry bread crumbs',
   'full_text': '¾ cup unseasoned dry bread crumbs'},
  {'quantity': '1',
   'unit': '(1 ounce) package',
   'name': 'dry onion soup mix',
   'full_text': '1 (1 ounce) package dry onion soup mix'},
  {'quantity': '1',
   'unit': 'tablespoon',
   'name': 'parsley flakes',
   'full_text': '1 tablespoon parsley flakes'},
  {'quantity': '½',
   'uni