# ***Web Scrapping***

"First, with the help of the browser, we identify the target page and the HTML code that we will focus on."

![Alt text](assets/Screenshot_20240806_192408.png)


In [1]:
!curl -o data/recipes-a-z.html https://www.allrecipes.com/recipes-a-z-6735880


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  251k    0  251k    0     0   413k      0 --:--:-- --:--:-- --:--:--  413k


In [3]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [13]:
from bs4 import BeautifulSoup

# Abre el archivo y crea el objeto BeautifulSoup
with open('./data/recipes-a-z.html', 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Encuentra el div principal con el ID especificado
main_div = soup.find('div', id='mntl-alphabetical-list_1-0')

# Lista para almacenar los enlaces
links = []

if main_div:
    # Encuentra todos los ul dentro del div principal
    ul_elements = main_div.find_all('ul', class_='mntl-link-list')

    for ul in ul_elements:
        # Encuentra todos los li dentro de cada ul
        li_elements = ul.find_all('li')

        for li in li_elements:
            # Encuentra el elemento <a> dentro de cada li
            a_tag = li.find('a', href=True)
            if a_tag:
                # Extrae el href y lo agrega a la lista de enlaces
                links.append(a_tag['href'])

# Imprime los enlaces extraídos
for link in links:
    print(link)


https://www.allrecipes.com/recipes/23070/everyday-cooking/cookware-and-equipment/air-fryer/
https://www.allrecipes.com/recipes/16492/everyday-cooking/special-collections/allrecipes-allstars/
https://www.allrecipes.com/recipes/385/desserts/cakes/angel-food-cake/
https://www.allrecipes.com/recipes/102/appetizers-and-snacks/antipasto/
https://www.allrecipes.com/recipes/76/appetizers-and-snacks/
https://www.allrecipes.com/recipes/788/desserts/pies/apple-pie/
https://www.allrecipes.com/recipes/1333/side-dish/applesauce/
https://www.allrecipes.com/recipes/14913/appetizers-and-snacks/dips-and-spreads/artichoke-dip/
https://www.allrecipes.com/recipes/1537/bread/yeast-bread/bagels/
https://www.allrecipes.com/recipes/1673/side-dish/beans-and-peas/baked-beans/
https://www.allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/
https://www.allrecipes.com/recipes/836/desserts/cookies/bar-cookies/
https://www.allrecipes.com/recipes/200/meat-and-poultry/beef/
https://www.allrecipes.com

In [18]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0'}

product_links = []
for url in tqdm(links):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Esto lanzará una excepción para códigos de estado HTTP 4xx/5xx
        soup = BeautifulSoup(response.content, "html.parser")

        recipes = soup.find_all('a', class_='comp mntl-card-list-items mntl-document-card mntl-card card card--no-image')
        for recipe in recipes:
            link = recipe.get("href")
            if link:
                product_links.append(link)

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")

# Imprime los enlaces obtenidos
for link in product_links:
    print(link)

100%|██████████| 378/378 [03:22<00:00,  1.87it/s]

https://www.allrecipes.com/air-fryer-buffalo-wings-recipe-8645970
https://www.allrecipes.com/air-fryer-smashed-potatoes-recipe-8644582
https://www.allrecipes.com/air-fryer-quesadillas-recipe-8651442
https://www.allrecipes.com/air-fryer-truffle-polenta-fries-recipe-8643717
https://www.allrecipes.com/air-fryer-firecracker-salmon-bites-recipe-8623119
https://www.allrecipes.com/air-fryer-chicken-bites-recipe-8599352
https://www.allrecipes.com/4-ingredient-air-fryer-pepper-poppers-recipe-8584965
https://www.allrecipes.com/air-fryer-bell-pepper-poppers-recipe-8584648
https://www.allrecipes.com/air-fryer-cinnamon-roll-bites-recipe-8553617
https://www.allrecipes.com/air-fryer-ham-and-cheese-wraps-recipe-8365118
https://www.allrecipes.com/recipe/8537905/air-fryer-buffalo-cauliflower/
https://www.allrecipes.com/air-fryer-honey-mustard-chicken-thighs-recipe-7970816
https://www.allrecipes.com/air-fryer-hearts-of-palm-sticks-recipe-7814346
https://www.allrecipes.com/air-fryer-cheesy-bacon-ranch-fre




In [19]:
print(len(product_links))

18122


In [20]:
titles = []
description = []
ingredients = []
recipes = []


In [21]:
for link in tqdm(product_links):
    try:
        response = requests.get(link, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        #Título
        try:
            titles.append(soup.find("title").text)
        except Exception as e:
            titles.append(None)

        #Descripción
        try:
            description.append(soup.find('p', class_='article-subheading type--dog').text)
        except Exception as e:
            description.append(None)

        #Ingredientes
        try:
            items = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
            content = ""
            for item in items:
                spans = item.find_all("span")

                for span in spans:
                    content += span.text + ""
                content += "\n"

            ingredients.append(content)
        except Exception as e:
            ingredients.append(None)

        try:
            paragraphs = soup.find_all('p', class_='comp mntl-sc-block mntl-sc-block-html')
            text = ""
            for p in paragraphs:
                text += p.text + ' '
                text += '\n'

            recipes.append(text)
        except Exception as e:
            recipes.append(None)

    except Exception as e:
        print("failed to fetch " + link)
        titles.append(None)
        description.append(None)
        ingredients.append(None)
        recipes.append(None)

100%|██████████| 18122/18122 [2:32:43<00:00,  1.98it/s]  


In [22]:
df = pd.DataFrame({'Title': titles, 'Description': description, 'Ingredients': ingredients, 'Recipe': recipes,
                   'URL': product_links})

In [23]:
df


Unnamed: 0,Title,Description,Ingredients,Recipe,URL
0,Air Fryer Buffalo Wings Recipe,These crispy air fryer Buffalo wings are seaso...,2teaspoonssea salt\n1teaspoongarlic powder\n1t...,Preheat an air fryer to 380 degrees F (190 de...,https://www.allrecipes.com/air-fryer-buffalo-w...
1,Air Fryer Smashed Potatoes Recipe,"These golden, crispy air fryer smashed potatoe...",8ouncesbaby gold potatoes\n1tablespoonmelted u...,Preheat an air fryer to 400 degrees F (200 de...,https://www.allrecipes.com/air-fryer-smashed-p...
2,Air Fryer Quesadillas Recipe,These air fryer quesadillas are golden and cri...,2flour tortillas\n1/2cupshredded cheese\nnonst...,Heat tortillas in the microwave until pliable...,https://www.allrecipes.com/air-fryer-quesadill...
3,Air Fryer Truffle Polenta Fries Recipe,"These air fryer truffle polenta fries, flavore...",1(18 ounce) tubepolenta\n1 1/2tablespoonsblack...,Preheat an air fryer to 400 degrees F (200 de...,https://www.allrecipes.com/air-fryer-truffle-p...
4,Air Fryer Firecracker Salmon Bites Recipe,These air fryer firecracker salmon bites get a...,1/4cupbalsamic vinegar\n1/4cupbrown sugar\n3ta...,"Combine balsamic vinegar, brown sugar, oil, s...",https://www.allrecipes.com/air-fryer-firecrack...
...,...,...,...,...,...
18117,Vegan Zucchini Banana Bread Recipe,"This yummy, moist, rich zucchini banana bread ...",3cupsall-purpose flour\n1teaspoonsalt\n1teaspo...,Preheat the oven to 325 degrees F (165 degree...,https://www.allrecipes.com/recipe/215489/vegan...
18118,Zucchini-Raspberry Bread Recipe,It's a simple zucchini nut bread with a splash...,1 ½cupsself-rising flour\n1teaspoonground cinn...,Preheat an oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/202259/zucch...
18119,Healthier Mom's Zucchini Bread Recipe,We packed even more zucchini into Mom's wonder...,1 ½cupsall-purpose flour\n1 ½cupswhite whole w...,Preheat oven to 325 degrees F (165 degrees C)...,https://www.allrecipes.com/recipe/222078/healt...
18120,"Zucchini Bread, Pumpkin Style Recipe","Although I love zucchini bread, I wanted a new...","3mediumzucchini, cut into chunks\n4 ¾cupsall-p...",Preheat an oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/152103/zucch...


In [24]:
df.to_csv('recipes.csv')