In [1]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Create a list of links to access
- First, scrape the menu landing page to collect the URL of each food category.
- Then visit each category and walk through its paginated listing pages (1, 2, 3, ...).
- Finally, collect the article links on every listing page; this list of links is the input for the detail crawl.

In [2]:
# Download the cooking "menu" landing page. `soup` is reused by later cells,
# so fail loudly here (timeout / HTTP error) rather than parse a bad response.
landing_response = requests.get('https://vnexpress.net/doi-song/cooking/thuc-don', timeout=10)
landing_response.raise_for_status()
soup = BeautifulSoup(landing_response.content, 'html.parser')

# Quick look at the raw category links found in the <h3 class="title_news"> headings.
url_categories = soup.find_all('h3', class_='title_news')
for heading in url_categories:
    anchor = heading.find('a')
    if anchor is None:
        # A heading without a link carries no category URL; skip it instead of crashing.
        continue
    print(anchor.get('href'))

/doi-song/cooking/mon-tet-25905
/doi-song/cooking/mon-ngon-hang-ngay-25532
/doi-song/cooking/mon-ngon-ngay-lanh-25839
/doi-song/cooking/mon-ngon-cho-cuoi-tuan-25533
/doi-song/cooking/mon-ngon-theo-vung-mien-25534
/doi-song/cooking/qua-mon-an-vat-25570
/doi-song/cooking/mon-trang-mieng-giai-khat-25536
/doi-song/cooking/mon-chay-26342
/doi-song/cooking/thuc-don-cho-ngay-nang-nong-25535
/doi-song/cooking/thuc-don-hang-ngay-25531
/doi-song/cooking/bua-sang-don-gian-25574
/doi-song/cooking/cac-loai-banh-26379


In [3]:
# Get the url of the food categories
def get_url_of_each_category(soup):
    """Collect the URL and display name of every food category on a parsed page.

    Args:
        soup: Parsed landing page; only ``find_all('h3', class_='title_news')``
            is called on it.

    Returns:
        A tuple ``(list_url_category, list_name_category)`` of two parallel
        lists, in page order: absolute category URLs and category titles.
    """
    list_url_category = []
    list_name_category = []

    # Each category is an <h3 class="title_news"> heading wrapping a link.
    for h3 in soup.find_all('h3', class_='title_news'):
        a_tag = h3.find('a', href=True)
        if a_tag is None:
            continue

        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the site root, so both link styles give a valid URL.
        list_url_category.append(urljoin('https://vnexpress.net', a_tag['href']))
        list_name_category.append(a_tag.get_text(strip=True))

    return list_url_category, list_name_category

In [4]:
# Extract the category URLs and names from the landing page, then print them as a sanity check.
list_url_category, list_name_category = get_url_of_each_category(soup)
for name, url in zip(list_name_category, list_url_category):
    print(f"Category: {name} - {url}")

Category: Món Tết - https://vnexpress.net/doi-song/cooking/mon-tet-25905
Category: Món ngon hàng ngày - https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532
Category: Món ngon ngày lạnh - https://vnexpress.net/doi-song/cooking/mon-ngon-ngay-lanh-25839
Category: Món ngon cho cuối tuần - https://vnexpress.net/doi-song/cooking/mon-ngon-cho-cuoi-tuan-25533
Category: Món ngon theo vùng miền - https://vnexpress.net/doi-song/cooking/mon-ngon-theo-vung-mien-25534
Category: Quà - Món ăn vặt - https://vnexpress.net/doi-song/cooking/qua-mon-an-vat-25570
Category: Món tráng miệng, giải khát - https://vnexpress.net/doi-song/cooking/mon-trang-mieng-giai-khat-25536
Category: Món chay - https://vnexpress.net/doi-song/cooking/mon-chay-26342
Category: Thực đơn cho ngày nắng nóng - https://vnexpress.net/doi-song/cooking/thuc-don-cho-ngay-nang-nong-25535
Category: Thực đơn hàng ngày - https://vnexpress.net/doi-song/cooking/thuc-don-hang-ngay-25531
Category: Bữa sáng đơn giản - https://vnexpress

In [5]:
# Get a list of page urls for each type of food
def get_list_page_for_each_category(url_categories: list, max_pages: int = 100) -> list:
    """Enumerate the paginated listing URLs (``<category>-p<N>``) of each category.

    Pages are probed in order starting from 1. Paging for a category stops at
    the first page that cannot be fetched, returns an HTTP error status, has
    no article titles, or repeats the exact set of titles of an earlier page.

    Args:
        url_categories: Category base URLs.
        max_pages: Upper bound on the number of pages probed per category.

    Returns:
        A flat list of listing-page URLs, grouped by category in input order.
    """
    list_pages_for_each_category = []

    for category in url_categories:
        seen_pages = set()  # title sets of the pages already accepted for this category
        for page in range(1, max_pages + 1):
            url_page = f'{category}-p{page}'
            try:
                # A timeout keeps one stalled connection from hanging the whole crawl.
                response = requests.get(url_page, timeout=10)
            except requests.RequestException as e:
                # Report and give up on this category only; the others are still crawled.
                print(f"[ERROR] Unable to crawl: {url_page}: {e}")
                break

            # An HTTP error page is not a listing page, whatever markup it contains.
            if not response.ok:
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            food_titles = [a.get_text(strip=True) for a in soup.select('h2.title_news a')]

            # If the page is empty or fails to load
            if not food_titles:
                break

            titles_set = frozenset(food_titles)

            # If current page duplicates any previous page -> stop
            if titles_set in seen_pages:
                break

            seen_pages.add(titles_set)
            list_pages_for_each_category.append(url_page)

    return list_pages_for_each_category


In [6]:
# Enumerate the paginated listing URLs of every category (each candidate page is fetched once to check it).
list_pages_for_each_category = get_list_page_for_each_category(list_url_category)
list_pages_for_each_category

['https://vnexpress.net/doi-song/cooking/mon-tet-25905-p1',
 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p2',
 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p3',
 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p4',
 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p5',
 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p6',
 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p7',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p1',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p2',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p3',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p4',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p5',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p6',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p7',
 'https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p8',
 'https://vn

In [7]:
# Total number of listing pages found across all categories.
len(list_pages_for_each_category)

95

In [8]:
# Get a list of urls for each dish on each page
def get_foods_url_on_each_page(url):
    """Return the article links found on one listing page.

    Args:
        url: URL of a category listing page.

    Returns:
        A list of absolute article URLs in page order. The list is empty when
        the page has no ``<h2 class="title_news">`` links or when fetching or
        parsing fails (the error is printed, not raised).
    """
    try:
        # Time out instead of hanging, and treat HTTP error statuses as failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        list_url_foods = []
        # Every article teaser is an <h2 class="title_news"> wrapping a link.
        for h2 in soup.find_all('h2', class_='title_news'):
            a_tag = h2.find('a', href=True)
            if a_tag is None:
                continue
            href = a_tag['href'].strip()
            if href:
                # Resolve relative links against the listing page; absolute ones pass through unchanged.
                list_url_foods.append(urljoin(url, href))
        return list_url_foods

    except Exception as e:
        # Best effort: one broken page must not abort the whole crawl.
        print(f"[ERROR] Unable to crawl: {url}: {e}")
        return []

In [9]:
# Smoke-test the link extractor on a single listing page.
test_url = 'https://vnexpress.net/doi-song/cooking/mon-tet-25905-p2'
test_foods = get_foods_url_on_each_page(test_url)
test_foods

['https://vnexpress.net/doi-song-cooking-mam-co-30-tet-truyen-thong-cua-nguoi-ha-noi-4709863.html',
 'https://vnexpress.net/doi-song-cooking-chao-chan-gio-mon-ngon-tet-vung-ha-nam-ninh-4709895.html',
 'https://vnexpress.net/doi-song-cooking-nem-ran-kieu-truyen-thong-mien-bac-4709469.html',
 'https://vnexpress.net/doi-song-cooking-bi-quyet-lam-canh-mang-muc-mon-tien-vua-xua-4708892.html',
 'https://vnexpress.net/doi-song-cooking-goi-gio-mo-4708130.html',
 'https://vnexpress.net/doi-song-cooking-thit-kho-hot-vit-truyen-thong-mien-tay-4707556.html',
 'https://vnexpress.net/doi-song-cooking-cha-ca-chep-bach-hoa-4707251.html',
 'https://vnexpress.net/doi-song-cooking-mien-xao-long-me-ga-4706819.html',
 'https://vnexpress.net/doi-song-cooking-cat-banh-chung-bang-lat-hay-dao-4706418.html',
 'https://vnexpress.net/doi-song-cooking-5-sai-lam-khien-luoc-ga-le-bi-hong-4705899.html',
 'https://vnexpress.net/doi-song-cooking-bong-bi-xao-thap-cam-kieu-bac-4705604.html',
 'https://vnexpress.net/doi-s

In [10]:
# Crawl every category page by page and collect (category name, article URL) pairs.
list_url_category, list_name_category = get_url_of_each_category(soup)

seen_urls = set()  # article URLs already collected, so a repeated link is stored only once
all_foods = []     # (category name, article URL) tuples in crawl order

for name, url in zip(list_name_category, list_url_category):
    print(f"Crawling category: {name} - {url}")
    # Keep this category's pages in their own variable so the notebook-level
    # `list_pages_for_each_category` is not overwritten by this loop.
    category_pages = get_list_page_for_each_category([url])
    for page in category_pages:
        print(f"---Crawling page: {page}")
        for link in get_foods_url_on_each_page(page):
            if link not in seen_urls:
                all_foods.append((name, link))
                seen_urls.add(link)

print(f"Total number of foods: {len(all_foods)}")

Crawling category: Món Tết - https://vnexpress.net/doi-song/cooking/mon-tet-25905
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p1
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p2
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p3
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p4
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p5
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p6
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-tet-25905-p7
Crawling category: Món ngon hàng ngày - https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p1
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p2
---Crawling page: https://vnexpress.net/doi-song/cooking/mon-ngon-hang-ngay-25532-p3
---Crawling page: https://vnexpress.net/d

In [11]:
# Preview the first ten (category, url) pairs.
all_foods[:10]

[('Món Tết',
  'https://vnexpress.net/doi-song-cooking-co-tet-ha-noi-xua-4843017.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-bon-mon-canh-trong-co-tet-ha-noi-xua-4842990.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-cach-muoi-dua-hanh-truyen-thong-4842499.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-meo-luoc-ga-da-gion-mong-nuoc-khong-tham-mao-4841940.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-goi-y-mam-cung-ong-tao-2025-4841198.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-su-hao-xao-muc-4840299.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-canh-mang-ngay-tet-co-truyen-ha-noi-4839393.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-gia-hanh-nhan-4835734.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-cha-bi-ot-xiem-xanh-4834169.html'),
 ('Món Tết',
  'https://vnexpress.net/doi-song-cooking-moc-dong-mon-ngon-tet-ha-noi-xua-4832776.html')]

In [12]:
# Tabulate the (category, url) pairs; later cells read these two columns by name.
all_foods_df = pd.DataFrame(all_foods, columns=['category', 'url'])
all_foods_df

Unnamed: 0,category,url
0,Món Tết,https://vnexpress.net/doi-song-cooking-co-tet-...
1,Món Tết,https://vnexpress.net/doi-song-cooking-bon-mon...
2,Món Tết,https://vnexpress.net/doi-song-cooking-cach-mu...
3,Món Tết,https://vnexpress.net/doi-song-cooking-meo-luo...
4,Món Tết,https://vnexpress.net/doi-song-cooking-goi-y-m...
...,...,...
934,Các loại bánh,https://vnexpress.net/cach-lam-banh-it-tran-de...
935,Các loại bánh,https://vnexpress.net/cach-lam-banh-tieu-cap-t...
936,Các loại bánh,https://vnexpress.net/cach-lam-banh-khoai-tay-...
937,Các loại bánh,https://vnexpress.net/cach-lam-banh-gao-tokbok...


In [13]:
# Persist the crawled (category, url) pairs so the detail crawl can start from disk.
# The path is held in one variable so the message always names the file actually written.
urls_csv_path = 'vnexpress_foods_urls.csv'
all_foods_df.to_csv(urls_csv_path, index=False)
print(f"DataFrame successfully saved to {urls_csv_path}")

DataFrame successfully saved to output.csv


### Go to each article and get the necessary information

In [14]:
# Reload the article URL list from the saved CSV so the steps below can run without repeating the crawl above.
all_foods_urls_df = pd.read_csv('vnexpress_foods_urls.csv')

In [134]:
import requests
from bs4 import BeautifulSoup

def _node_text(node):
    """Return the whitespace-stripped text of ``node``, or None when the node is missing."""
    return node.get_text(strip=True) if node is not None else None


def _parse_status(soup):
    """Return ``(cook_time, num_of_people, calories)`` read from the status bar; None for absent items."""
    cook_time = num_of_people = calories = None
    status_box = soup.find("div", class_="status flex")
    if status_box is not None:
        for item in status_box.find_all("p", class_="itemt"):
            text = item.get_text(strip=True)
            # Items are told apart by the unit word they contain.
            if "phút" in text or "giờ" in text:
                cook_time = text
            elif "người" in text:
                num_of_people = text
            elif "kcal" in text or "calo" in text:
                calories = text
    return cook_time, num_of_people, calories


def _parse_ingredient_count(soup):
    """Return the ingredient count shown in the checklist header, or None when absent."""
    count_box = soup.find("div", class_="title-detail2 checklist_num")
    if count_box is None:
        return None
    return _node_text(count_box.find("span"))


def _parse_ingredients(soup):
    """Return the ingredient names from the ingredient list, or [] when there is no list."""
    ingredients_tag = soup.find("ul", class_="choose-ingredients")
    if ingredients_tag is None:
        return []
    return [tag.get_text(strip=True) for tag in ingredients_tag.find_all("div", class_="name")]


def _parse_steps(soup):
    """Return the cooking steps as "Bước N: ..." strings, or [] when the steps box is missing."""
    steps_section = soup.find("div", class_="steep")
    if steps_section is None:
        return []
    steps = []
    # N is the position of the <li> in the page, so an empty <li> leaves a gap in the numbering.
    for i, tag in enumerate(steps_section.find_all("li"), start=1):
        text = tag.get_text(separator=" ", strip=True)
        if text:
            steps.append(f"Bước {i}: {text}")
    return steps


def _parse_notes(soup):
    """Return the note items from the extra-info box (first <ol>, else first <ul>), or []."""
    extra_info = soup.find("div", class_="extra_info")
    if extra_info is None:
        return []
    note_section = extra_info.find("ol")
    if note_section is None:
        note_section = extra_info.find("ul")
    if note_section is None:
        return []
    notes = []
    for li in note_section.find_all("li"):
        text = li.get_text(separator=" ", strip=True)
        if text:
            notes.append(text)
    return notes


def get_food_detail(url: str, category: str) -> dict:
    """Download one recipe article and extract its fields.

    Args:
        url: Article URL.
        category: Category name stored unchanged under ``type_of_food``.

    Returns:
        A dict with the keys ``link``, ``type_of_food``, ``title``,
        ``description``, ``author_name``, ``cook_time``, ``num_of_people``,
        ``calories``, ``num_of_ingredients``, ``ingredients``, ``step``,
        ``note`` and ``post_date``. Fields missing from the page stay at their
        default (None, or [] for the three list fields). The same structure is
        returned when the download fails; the error is printed, never raised.
    """
    # Default result: every exit path returns this structure.
    result = {
        "link": url,
        "type_of_food":  category,
        "title": None,
        "description": None,
        "author_name": None,
        "cook_time": None,
        "num_of_people": None,
        "calories": None,
        "num_of_ingredients": None,
        "ingredients": [],
        "step": [],
        "note": [],
        "post_date": None,
    }

    def extract(label, parser, default):
        """Run one field parser; report a failure and fall back to ``default`` so the other fields are still parsed."""
        try:
            return parser()
        except Exception as e:
            print(f" [WARN] {url}: could not parse {label}: {e}")
            return default

    try:
        resp = requests.get(url, timeout=10)
        # Do not scrape an HTTP error page as if it were an article.
        resp.raise_for_status()
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")

        result["title"] = extract(
            "title", lambda: _node_text(soup.find("h1", class_="title-detail")), None)
        result["description"] = extract(
            "description", lambda: _node_text(soup.find("p", class_="description")), None)
        result["author_name"] = extract(
            "author_name", lambda: _node_text(soup.select_one("div.name b")), None)

        cook_time, num_of_people, calories = extract(
            "status", lambda: _parse_status(soup), (None, None, None))
        result["cook_time"] = cook_time
        result["num_of_people"] = num_of_people
        result["calories"] = calories

        result["num_of_ingredients"] = extract(
            "num_of_ingredients", lambda: _parse_ingredient_count(soup), None)
        result["ingredients"] = extract("ingredients", lambda: _parse_ingredients(soup), [])
        result["step"] = extract("step", lambda: _parse_steps(soup), [])
        result["note"] = extract("note", lambda: _parse_notes(soup), [])
        result["post_date"] = extract(
            "post_date", lambda: _node_text(soup.find("span", class_="date")), None)

        return result

    except Exception as e:
        # Download or document-level failure: report it and hand back the defaults.
        print(f" [ERROR] {url}: {e}")
        return result


### Check whether a page uses the expected recipe format

In [34]:
def check_valid_format(url):
    """Tell whether the page at ``url`` has the recipe layout the detail crawler expects.

    A page qualifies when it contains both an ``<h1 class="title-detail">`` and
    a ``<ul class="choose-ingredients">``. Any failure while fetching or parsing
    is printed and reported as False.
    """
    try:
        page = requests.get(url, timeout=10)
        page.encoding = "utf-8"
        document = BeautifulSoup(page.text, "html.parser")

        # The two elements that mark a recipe page
        heading = document.find("h1", class_="title-detail")
        ingredient_list = document.find("ul", class_="choose-ingredients")

        return bool(heading and ingredient_list)

    except Exception as e:
        print(f"   [ERROR] {url}: {e}")
        return False

In [19]:
# Spot-check the format validator on one article URL.
check_valid_format("https://vnexpress.net/doi-song-cooking-gia-hanh-nhan-4835734.html")

True

In [20]:
# Probe every saved article URL and count the pages that lack the recipe layout.
false_format = 0

for index, row in all_foods_urls_df.iterrows():
    print(f"{index + 1}/{len(all_foods_urls_df)} - {row['url']}")
    url = row['url']
    is_valid_format = check_valid_format(url)
    print("TRUE" if is_valid_format else "FALSE")
    if not is_valid_format:
        false_format += 1

print(f"\n\n [RESULTS] Total number of pages in incorrect format: {false_format}")
    

1/939 - https://vnexpress.net/doi-song-cooking-co-tet-ha-noi-xua-4843017.html
FALSE
2/939 - https://vnexpress.net/doi-song-cooking-bon-mon-canh-trong-co-tet-ha-noi-xua-4842990.html
FALSE
3/939 - https://vnexpress.net/doi-song-cooking-cach-muoi-dua-hanh-truyen-thong-4842499.html
TRUE
4/939 - https://vnexpress.net/doi-song-cooking-meo-luoc-ga-da-gion-mong-nuoc-khong-tham-mao-4841940.html
FALSE
5/939 - https://vnexpress.net/doi-song-cooking-goi-y-mam-cung-ong-tao-2025-4841198.html
FALSE
6/939 - https://vnexpress.net/doi-song-cooking-su-hao-xao-muc-4840299.html
TRUE
7/939 - https://vnexpress.net/doi-song-cooking-canh-mang-ngay-tet-co-truyen-ha-noi-4839393.html
TRUE
8/939 - https://vnexpress.net/doi-song-cooking-gia-hanh-nhan-4835734.html
TRUE
9/939 - https://vnexpress.net/doi-song-cooking-cha-bi-ot-xiem-xanh-4834169.html
TRUE
10/939 - https://vnexpress.net/doi-song-cooking-moc-dong-mon-ngon-tet-ha-noi-xua-4832776.html
TRUE
11/939 - https://vnexpress.net/doi-song-cooking-bon-mon-giai-cuu-gi

### Crawl data (combined with valid check)

In [129]:
# Load the saved article URLs again so this section can be run on its own, and show how many rows there are.
all_foods_urls_df = pd.read_csv('vnexpress_foods_urls.csv')
len(all_foods_urls_df)

939

In [130]:
# Take a ten-row sample (row positions 110-119) to try the detail crawl on before the full run.
test_all_foods_urls_df = all_foods_urls_df.iloc[110:120]
test_all_foods_urls_df

Unnamed: 0,category,url
110,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-xoi-man...
111,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-lon-qua...
112,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-long-th...
113,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-ca-ri-b...
114,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-bi-baby...
115,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-bo-kho-...
116,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-trung-v...
117,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-muc-chi...
118,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-long-no...
119,Món ngon hàng ngày,https://vnexpress.net/doi-song-cooking-luon-om...


In [131]:
# Trial run of the detail crawl on the sample rows only.
all_test = []

for _, row in test_all_foods_urls_df.iterrows():
    print(f'{row["category"]} - {row["url"]}')
    if check_valid_format(row["url"]):
        data = get_food_detail(row["url"], row["category"])
        all_test.append(data)
    else:
        print("   [SKIP] Page is not in the correct format.")

len(all_test)

Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-xoi-man-thap-cam-4573609.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-lon-quay-kho-dua-cai-chua-4571409.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-long-thuon-hanh-ram-4569189.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-ca-ri-bo-4568144.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-bi-baby-nhoi-thit-hap-4552430.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-bo-kho-gung-4550930.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-trung-vit-lon-nuong-muoi-ot-4547679.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-muc-chien-nuoc-mam-4546892.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-long-non-doi-sun-4546649.html
Món ngon hàng ngày - https://vnexpress.net/doi-song-cooking-luon-om-chuoi-dau-4545372.html


10

In [132]:
# Inspect the records scraped from the sample rows.
all_test

[{'link': 'https://vnexpress.net/doi-song-cooking-xoi-man-thap-cam-4573609.html',
  'type_of_food': 'Món ngon hàng ngày',
  'title': 'Cách nấu xôi mặn thập cẩm dẻo rền',
  'description': 'Xôi căng bóng, dẻo thơm gói vào lá chuối làm cho mọi thứ quyện lại: chút béo béo từ mỡ hành, mằn mặn từ xì dầu, ngọt nhẹ từ lạp xưởng, beo béo từ pate làm nên mỹ vị khó quên.',
  'author_name': 'Bùi Thủy',
  'cook_time': '60 phút',
  'num_of_people': '6-8 người',
  'calories': '2.388 kcal',
  'num_of_ingredients': '11',
  'ingredients': ['1 kg gạo nếp cái hoa vàng',
   '100 ml nước cốt dừa',
   '100 gr xúc xích',
   '50 gr tôm khô',
   '50 gr lạp xưởng',
   '1/2 củ cà rốt (tùy chọn)',
   '1 chén hành phi',
   '1 bó hành lá',
   'Hành khô, tỏi khô',
   'Gia vị: Nước tương, đường, muối, hạt nêm, hạt tiêu',
   'Mỡ gà hoặc dầu ăn'],
  'step': ['Bước 1: Gạo nếp vo sạch, ngâm nước lạnh từ 6 - 8 giờ tùy theo từng loại để giúp gạo ngậm no nước, khi đồ sẽ nở ra dẻo mềm. Sau đó đổ ra rổ cho ráo nước, xóc 1/2 th

In [135]:
# Full detail crawl: keep one record per article that passes the format check.
all_foods_data = []

for _, row in all_foods_urls_df.iterrows():
    print(f'{row["category"]} - {row["url"]}')
    if check_valid_format(row["url"]):
        data = get_food_detail(row["url"], row["category"])
        all_foods_data.append(data)
    else:
        print("   [SKIP] Page is not in the correct format.")

len(all_foods_data)

Món Tết - https://vnexpress.net/doi-song-cooking-co-tet-ha-noi-xua-4843017.html
   [SKIP] Page is not in the correct format.
Món Tết - https://vnexpress.net/doi-song-cooking-bon-mon-canh-trong-co-tet-ha-noi-xua-4842990.html
   [SKIP] Page is not in the correct format.
Món Tết - https://vnexpress.net/doi-song-cooking-cach-muoi-dua-hanh-truyen-thong-4842499.html
Món Tết - https://vnexpress.net/doi-song-cooking-meo-luoc-ga-da-gion-mong-nuoc-khong-tham-mao-4841940.html
   [SKIP] Page is not in the correct format.
Món Tết - https://vnexpress.net/doi-song-cooking-goi-y-mam-cung-ong-tao-2025-4841198.html
   [SKIP] Page is not in the correct format.
Món Tết - https://vnexpress.net/doi-song-cooking-su-hao-xao-muc-4840299.html
Món Tết - https://vnexpress.net/doi-song-cooking-canh-mang-ngay-tet-co-truyen-ha-noi-4839393.html
Món Tết - https://vnexpress.net/doi-song-cooking-gia-hanh-nhan-4835734.html
Món Tết - https://vnexpress.net/doi-song-cooking-cha-bi-ot-xiem-xanh-4834169.html
Món Tết - https:/

810

In [136]:
# Build the detail table from the list of per-article dicts and preview the first rows.
all_foods_data_df = pd.DataFrame(all_foods_data)
all_foods_data_df.head()

Unnamed: 0,link,type_of_food,title,description,author_name,cook_time,num_of_people,calories,num_of_ingredients,ingredients,step,note,post_date
0,https://vnexpress.net/doi-song-cooking-cach-mu...,Món Tết,Cách muối dưa hành truyền thống,Dưa hành muối là món ăn truyền thống ngày Tết ...,Bùi Thủy,45 phút,8-10 người,459 kcal,5,"[1 kg hành củ tươi, Tro bếp hoặc nước vo gọa, ...",[Bước 1: Chọn hành củ: Nên chọn hành củ ta bán...,[],"Thứ năm, 23/1/2025, 16:32 (GMT+7)"
1,https://vnexpress.net/doi-song-cooking-su-hao-...,Món Tết,Su hào xào mực - món cổ Tết Bát Tràng,Đĩa xào khô ráo với su hào giòn ngọt quyện với...,Bùi Thủy,50 phút,4 - 5 người,1.162 kcal,6,"[2 củ su hào non, 1 con mực khô, 1/2 củ cà rốt...",[Bước 1: Chọn và sơ chế mực: Người dân làng gố...,[Su hào xào mực cùng với canh măng mực là hai ...,"Thứ sáu, 17/1/2025, 17:24 (GMT+7)"
2,https://vnexpress.net/doi-song-cooking-canh-ma...,Món Tết,Canh măng ngày Tết cổ truyền Hà Nội,"Măng ngấu vị, giòn ngon, móng giò hầm vừa độ s...",Bùi Thủy,100 phút,8 - 10 người,4.930 kcal,6,"[800 gr măng khô, 2 móng giò lợn, Nước dùng (g...","[Bước 1: Chọn măng khô: Theo lối cũ, người nội...",[Nếu tận dụng nước luộc gà nấu canh măng thì k...,"Thứ tư, 15/1/2025, 19:00 (GMT+7)"
3,https://vnexpress.net/doi-song-cooking-gia-han...,Món Tết,Giả hạnh nhân - món ngon Tết xưa Hà Nội,Đây là món ăn cổ truyền thường thấy trong cỗ T...,Bùi Thủy,60 phút,4-5 người,1.112 kcal,8,"[2 bộ lòng mề gà, 100 gr lạc, 50 gr hạt đậu Hà...",[Bước 1: Chọn và sơ chế lạc: Chọn lạc khô chắc...,[Hạnh nhân xào (hay giả hạnh nhân) là món ăn c...,"Thứ ba, 7/1/2025, 17:31 (GMT+7)"
4,https://vnexpress.net/doi-song-cooking-cha-bi-...,Món Tết,Chả bì ớt xiêm xanh,"Chả bì bóng đẹp, gói đều tay. Khi ăn vị ngọt m...",Bùi Thủy,60 phút,5-6 người,2.512 kcal,6,"[500 gr giò sống, 300 gr bì lợn, 20 - 30 gr ớt...","[Bước 1: Chọn và sơ chế bì lợn, chuẩn bị giò s...",[Nên sơ chế kỹ bì lợn để chả được thơm. Tùy th...,"Thứ ba, 31/12/2024, 18:00 (GMT+7)"


In [137]:
# Column overview with non-null counts, to see which fields are often missing.
all_foods_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   link                810 non-null    object
 1   type_of_food        810 non-null    object
 2   title               810 non-null    object
 3   description         810 non-null    object
 4   author_name         461 non-null    object
 5   cook_time           447 non-null    object
 6   num_of_people       446 non-null    object
 7   calories            438 non-null    object
 8   num_of_ingredients  810 non-null    object
 9   ingredients         810 non-null    object
 10  step                810 non-null    object
 11  note                810 non-null    object
 12  post_date           810 non-null    object
dtypes: object(13)
memory usage: 82.4+ KB


In [138]:
# Save the crawled recipe details to CSV (index=False leaves the row index out of the file).
# NOTE(review): the ingredients/step/note columns hold Python lists; presumably they are
# written as their string form. Confirm how they should be parsed when the file is read back.
all_foods_data_df.to_csv("vnexpress_foods_detail.csv", index=False)

### Test: extract the main content (cooking steps)

In [127]:
# Scratch test of the cooking-step extraction on a single article.
# The parsed page gets its own name so the notebook-level `soup` (the landing
# page that the category cells read) is not overwritten by this experiment.
steps = []
url = 'https://vnexpress.net/nhung-mon-ngon-dam-da-vi-moi-4311459.html'
steps_test_soup = BeautifulSoup(requests.get(url, timeout=10).content, 'html.parser')

# Look the steps box up once; find() returns None when the article has none.
steps_section = steps_test_soup.find("div", class_="steep")
if steps_section is not None:
    for i, tag in enumerate(steps_section.find_all("li"), start=1):
        text = tag.get_text(separator=" ", strip=True)
        if text:
            steps.append(f"Bước {i}: {text}")

steps

[]

### Test: extract the note content

In [125]:
# Scratch test of the note extraction on a single article.
# The parsed page gets its own name so the notebook-level `soup` (the landing
# page that the category cells read) is not overwritten by this experiment.
notes = []

url = 'https://vnexpress.net/doi-song-cooking-nong-heo-nuong-4523802.html'
notes_test_soup = BeautifulSoup(requests.get(url, timeout=10).content, 'html.parser')

extra_info = notes_test_soup.find("div", class_="extra_info")

# Notes live in the first <ol> of the extra-info box, or else its first <ul>.
# Every lookup is None-checked so an article without notes gives [] instead of an AttributeError.
note_section = None
if extra_info is not None:
    note_section = extra_info.find("ol")
    if note_section is None:
        note_section = extra_info.find("ul")

if note_section is not None:
    for li in note_section.find_all("li"):
        text = li.get_text(separator=" ", strip=True)
        if text:
            notes.append(text)

notes

['Hành, tỏi nên giã vắt nước cốt ướp để thấm vào thịt thơm ngon. Đồng thời khi nướng không bị cháy khét.',
 'Nên nướng thịt 2 lần lửa: Lần 1 cho xém thơm các mặt. Lần 2 phết thêm nước sốt để tạo độ bóng đẹp, đậm đà cho món ăn.',
 'Khi ướp thêm sữa đặc giúp cho món nướng ngọt mềm hơn.']