In [79]:
import json
from urllib.parse import urljoin
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Category slugs; each one is appended to https://eatbydate.com/ to build the
# page URL that gets scraped.
categories = [
    'dairy',
    'drinks',
    'fruits',
    'grains',
    'other',
    'proteins',
    'vegetables',
]

In [72]:
# Collect the sub-category links found in the `ul.sub-menu` lists of every
# category page and save them to eat_by_date_href.json.
#
# NOTE: despite its name, `href_dict` is a *list* of
# {"category", "sub_category", "href"} dicts. The name is kept unchanged
# because other cells read it.
href_dict = []

for category in categories:
    print(f"Scraping category: {category}")
    url = f'https://eatbydate.com/{category}'
    # Use the response as a context manager so the connection is always
    # closed, and set a timeout so one stalled request cannot hang the kernel.
    with urlopen(url, timeout=30) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    data = []
    # find_all() searches recursively, so an anchor inside a nested sub-menu is
    # returned once for the nested <ul> and again for each enclosing <ul>.
    # Track what has been recorded so every link is kept only once per category.
    seen_links = set()

    for sub_menu in soup.find_all('ul', {'class': 'sub-menu'}):
        for link in sub_menu.find_all('a'):
            href = link.get('href')
            text = link.text.strip()

            # An anchor without an href cannot be turned into a page URL later.
            if not href:
                continue
            # Filter out the entries with sub_category equal to "Oil"
            if text == "Oil":
                continue
            if (text, href) in seen_links:
                continue

            seen_links.add((text, href))
            data.append({"category": category, "sub_category": text, "href": href})

    # Extend the overall list with the links for the current category
    href_dict.extend(data)

with open('eat_by_date_href.json', "w") as json_file:
    json.dump(href_dict, json_file, indent=4)



Scraping category: dairy
Scraping category: drinks
Scraping category: fruits
Scraping category: grains
Scraping category: other
Scraping category: proteins
Scraping category: vegetables
[{'category': 'dairy', 'sub_category': 'Butter and Margarine', 'href': '/dairy/spreads/'}, {'category': 'dairy', 'sub_category': 'Cheese', 'href': '/dairy/cheese/'}, {'category': 'dairy', 'sub_category': 'Eggs', 'href': '/dairy/eggs/'}, {'category': 'dairy', 'sub_category': 'Eggs', 'href': '/eggs-shelf-life-expiration-date/'}, {'category': 'dairy', 'sub_category': 'Hard Boiled Eggs', 'href': '/hard-boiled-eggs-shelf-life-expiration-date/'}, {'category': 'dairy', 'sub_category': 'Egg Nog', 'href': '/dairy/milk/how-long-does-egg-nog-last-shelf-life-expiration-date/'}, {'category': 'dairy', 'sub_category': 'Milk & Cream', 'href': '/dairy/milk/'}, {'category': 'dairy', 'sub_category': 'Buttermilk', 'href': '/dairy/milk/buttermilk/'}, {'category': 'dairy', 'sub_category': 'Dairy (Coffee) Cream', 'href': '/da

In [77]:
# Scrape the first <table> on every page listed in `href_dict`, flatten its
# rows into dicts, tag each row with its category / sub-category / storage
# type, and save everything to eat_by_date.json.

BASE_URL = "https://eatbydate.com"
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled page from hanging the run

# Lower-cased header texts whose column is renamed to 'item_description'.
# Headers in UNSPECIFIED_HEADERS additionally mark the row type 'unspecified'.
UNSPECIFIED_HEADERS = ('', '(raw)', '(unopened/opened)', 'open/unopened',
                       '(opened/unopened)', '(whole)')
DESCRIPTION_HEADERS = ('(unopened)', '(opened)') + UNSPECIFIED_HEADERS


def _header_texts(row):
    """Return the lower-cased, stripped text of every <th> cell in `row`."""
    return [th.get_text(strip=True).lower() for th in row.find_all("th")]


def _is_opened_heading(row):
    """Return True when the first <th> in `row` reads exactly "(Opened)"."""
    first_th = row.find("th")
    return first_th is not None and first_th.text.strip() == "(Opened)"


def _rows_to_records(headers, rows):
    """Build one {header: cell text} dict per row in `rows` that has <td> cells.

    Rows without <td> cells, and rows whose cells are all empty, are skipped.
    """
    records = []
    for row in rows:
        record = {}
        for idx, cell in enumerate(row.find_all("td")):
            # A row wider than its header row would raise IndexError on
            # headers[idx]; keep such cells under a positional key instead.
            key = headers[idx] if idx < len(headers) else f"column_{idx}"
            record[key] = cell.get_text(strip=True)
        if any(record.values()):
            records.append(record)
    return records


def _parse_table(table):
    """Return the data rows of `table` as a list of {header: text} dicts.

    The first row supplies the headers. If a later row starts with an
    "(Opened)" heading, the rows after it use that row's headers instead.
    """
    rows = table.find_all("tr")
    if not rows:
        return []
    split_at = next((idx for idx, row in enumerate(rows) if _is_opened_heading(row)), None)
    # rows[1:None] is simply rows[1:], so this covers the single-section case.
    records = _rows_to_records(_header_texts(rows[0]), rows[1:split_at])
    if split_at is not None:
        records += _rows_to_records(_header_texts(rows[split_at]), rows[split_at + 1:])
    return records


def _storage_type(headers):
    """Return 'unspecified', 'opened', 'unopened' or None for a row's headers.

    When several markers are present, 'unspecified' wins over 'opened',
    which wins over 'unopened'.
    """
    if any(header in UNSPECIFIED_HEADERS for header in headers):
        return 'unspecified'
    if '(opened)' in headers:
        return 'opened'
    if '(unopened)' in headers:
        return 'unopened'
    return None


def _normalise_record(record, category, sub_category):
    """Return `record` with category info, a unified description key and a type."""
    normalised = {"category": category, "subcategory": sub_category}
    for key, value in record.items():
        # Replace key name with item_description
        new_key = 'item_description' if key in DESCRIPTION_HEADERS else key
        normalised[new_key] = value
    storage_type = _storage_type(record)
    if storage_type is not None:
        normalised['type'] = storage_type
    return normalised


data_list = []
for item in href_dict:
    href = item['href']
    if not href:
        continue
    # urljoin copes with absolute hrefs and hrefs lacking a leading slash,
    # both of which plain string concatenation would turn into a bad URL.
    url = urljoin(BASE_URL, href)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # Do not parse an error page as if it were shelf-life data.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if table is None:
        continue

    # Normalise only the rows scraped from this page, so each row is processed
    # once rather than re-walking the whole accumulated list on every page.
    data_list.extend(
        _normalise_record(record, item['category'], item['sub_category'])
        for record in _parse_table(table)
    )

# Dump the resulting data list to a JSON file
with open('eat_by_date.json', "w") as json_file:
    json.dump(data_list, json_file, indent=4)

{'category': 'dairy', 'sub_category': 'Butter and Margarine', 'href': '/dairy/spreads/'}
{'category': 'dairy', 'sub_category': 'Cheese', 'href': '/dairy/cheese/'}
{'category': 'dairy', 'sub_category': 'Eggs', 'href': '/dairy/eggs/'}
{'category': 'dairy', 'sub_category': 'Eggs', 'href': '/eggs-shelf-life-expiration-date/'}
{'category': 'dairy', 'sub_category': 'Hard Boiled Eggs', 'href': '/hard-boiled-eggs-shelf-life-expiration-date/'}
{'category': 'dairy', 'sub_category': 'Egg Nog', 'href': '/dairy/milk/how-long-does-egg-nog-last-shelf-life-expiration-date/'}
{'category': 'dairy', 'sub_category': 'Milk & Cream', 'href': '/dairy/milk/'}
{'category': 'dairy', 'sub_category': 'Buttermilk', 'href': '/dairy/milk/buttermilk/'}
{'category': 'dairy', 'sub_category': 'Dairy (Coffee) Cream', 'href': '/dairy/milk/dairy-coffee-cream-shelf-life-expiration-date/'}
{'category': 'dairy', 'sub_category': 'Coffee Mate', 'href': '/dairy/milk/how-long-does-coffee-mate-last-shelf-life/'}
{'category': 'dair