In [1]:
# === Cell 1: Import Necessary Libraries ===
# Description: We've added the 'selenium' library to control a web browser.
# 'webdriver_manager' will automatically handle the browser driver for us.

import os
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

print("Libraries imported successfully.")

Libraries imported successfully.


# Pet Nutrition & Safety Knowledge Base Builder

This notebook scrapes and builds a high-quality knowledge base on dog and cat nutrition, safe/unsafe foods, and essential feeding guidelines. It uses robust, universal scraping logic and post-processing to ensure only reliable, relevant content is included.

**Sources include:**
- American Kennel Club (AKC)
- FDA Animal & Veterinary
- Humane Society of the United States
- Tufts University Cummings School of Veterinary Medicine
- Cornell Feline Health Center
- VCA Animal Hospitals
- American Veterinary Medical Association (AVMA)
- ASPCA Animal Poison Control
- PetMD

The knowledge base is filtered to remove affiliate disclaimers and other junk content, then saved as JSON (by default under the `knowledge_base/` folder) for downstream use.

In [2]:
# === CELL 2 (New and Improved) ===
# Description: This universal function no longer needs a specific selector.
# It finds the main article content by identifying the densest cluster of paragraphs.

def extract_main_content(url, min_chunk_length=150, wait_seconds=3):
    """
    Scrape a URL with headless Chrome and return the main article paragraphs.

    The page is loaded with Selenium, parsed with BeautifulSoup, and the
    <div>/<article>/<main> element whose *direct* child <p> tags contain the
    most text is treated as the main content block. All <p> tags inside that
    block (at any depth) are then cleaned and returned.

    Args:
        url: Address of the page to scrape.
        min_chunk_length: A paragraph is kept only if its cleaned text is
            longer than this many characters (default 150), which drops short
            sidebar/footer snippets.
        wait_seconds: Fixed pause after navigation, in seconds (default 3),
            giving dynamically loaded content time to appear.

    Returns:
        A list of cleaned paragraph strings. The list is empty when the
        browser cannot be started, the page cannot be loaded, or no paragraph
        passes the length check.
    """
    print(f"Attempting to scrape URL: {url}")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=3")
    # Some sites block default headless user agents. This one is more common.
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # The driver is created inside the try block so that a failed driver
    # download or browser start is reported like any other scrape failure
    # instead of aborting the whole run.
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for dynamic content
        page_source = driver.page_source
    except Exception as e:
        print(f"Error: Selenium could not process the URL {url}. {e}")
        return []
    finally:
        # Only quit a driver that was actually started.
        if driver is not None:
            driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # --- Heuristic Logic to Find Content ---
    # Score every candidate container by the total text length of its direct
    # child <p> tags; the highest-scoring container wins.
    max_score = 0
    best_element = None

    for element in soup.find_all(['div', 'article', 'main']):
        paragraphs = element.find_all('p', recursive=False)  # Only direct children
        score = sum(len(p.get_text(strip=True)) for p in paragraphs)

        if score > max_score:
            max_score = score
            best_element = element

    if best_element is None:
        print("Warning: Could not find a dense content block. Falling back to body.")
        best_element = soup.find('body')
        if best_element is None:
            # No <body> in the parsed markup: search the whole document rather
            # than failing on a None element.
            best_element = soup

    # --- Text Extraction and Cleaning (from the best element) ---
    text_chunks = []
    for p in best_element.find_all('p'):
        # get_text(strip=True) strips every text node separately and joins
        # them with no separator, which glues words together around inline
        # tags such as <a> or <b>. Take the raw text and let the whitespace
        # normalisation below do the trimming instead.
        text = p.get_text()
        text = re.sub(r'\[\d+\]', '', text)  # drop citation markers like [12]
        text = re.sub(r'\s+', ' ', text).strip()

        # Stricter length check to avoid sidebar/footer text
        if len(text) > min_chunk_length:
            text_chunks.append(text)

    print(f"-> Found and cleaned {len(text_chunks)} text chunks.")
    return text_chunks

In [3]:
# === Cell 3: Define the Function to Save Data ===
# Description: Writes the collected data to a JSON file, creating the output folder if it does not exist.

def save_to_json(data, folder="knowledge_base", filename="pet_health_data.json"):
    """
    Write `data` as indented JSON to `<folder>/<filename>`.

    Args:
        data: Any JSON-serialisable object.
        folder: Output directory; it is created (including parents) when it
            does not exist. An empty string means the current directory.
        filename: Name of the JSON file inside `folder`.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
        TypeError: If `data` is not JSON-serialisable.
    """
    # An empty folder name means "current directory"; os.makedirs("") would
    # raise, so directory creation is skipped in that case.
    if folder and not os.path.exists(folder):
        # exist_ok=True tolerates the directory appearing between the check
        # above and this call.
        os.makedirs(folder, exist_ok=True)
        print(f"Created directory: {folder}")
    filepath = os.path.join(folder, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print(f"\nData successfully saved to {filepath}")

# === Post-processing: drop junk entries (affiliate disclaimers, donation notices, author bios) ===

def post_process_and_filter(knowledge_base, filter_phrases=None):
    """
    Return a copy of the knowledge base without junk entries.

    An entry is junk when its 'chunk' text contains any of the filter
    phrases. Matching is case-insensitive: both the chunk and the phrases are
    lowercased before comparison.

    Args:
        knowledge_base: Iterable of dicts, each with a 'chunk' string.
        filter_phrases: Optional iterable of phrases that mark an entry as
            junk. When omitted, a built-in list of affiliate, donation and
            author-bio phrases is used.

    Returns:
        A new list holding the entries that matched no phrase, in their
        original order. The input is not modified.
    """
    if filter_phrases is None:
        # List of phrases that identify junk/unwanted paragraphs
        filter_phrases = [
            "affiliate advertising",
            "participant in affiliate",
            "receive a portion of the sale",
            "registered as a 501(c)(3)",
            "tax-deductible",
            "Dr. Cailin Heinze",  # To remove the specific author bio
        ]

    # Chunks are compared in lowercase, so the phrases must be lowercased as
    # well; otherwise any phrase containing a capital letter can never match.
    # Empty phrases are skipped because "" is a substring of every string.
    lowered_phrases = [phrase.lower() for phrase in filter_phrases if phrase]

    cleaned_kb = []
    for entry in knowledge_base:
        chunk_text = entry['chunk'].lower()
        if not any(phrase in chunk_text for phrase in lowered_phrases):
            cleaned_kb.append(entry)

    print(f"\n--- Post-Processing ---")
    print(f"Removed {len(knowledge_base) - len(cleaned_kb)} junk entries.")
    return cleaned_kb

In [4]:
def scrape_and_clean(url, selector, min_chunk_length=100, wait_seconds=3):
    """
    Scrape a URL with headless Chrome and extract paragraphs from one element.

    Args:
        url: Address of the page to scrape.
        selector: Dict describing the container element. 'tag' is the tag
            name (default "div") and 'attrs' is a dict of attribute filters.
            A 'class_' key in 'attrs' is treated as the HTML 'class'
            attribute. The dict passed in is not modified.
        min_chunk_length: A paragraph is kept only if its cleaned text is
            longer than this many characters (default 100).
        wait_seconds: Fixed pause after navigation, in seconds (default 3),
            giving dynamically loaded content time to appear.

    Returns:
        A list of cleaned paragraph strings. The list is empty when the
        browser cannot be started, the page cannot be loaded, the selector
        matches nothing, or no paragraph passes the length check.
    """
    print(f"Scraping: {url}")
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=3")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # The driver is created inside the try block so that a failed driver
    # download or browser start is reported like any other scrape failure.
    driver = None
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for dynamic content
        page_source = driver.page_source
    except Exception as e:
        print(f"Error: Selenium could not process the URL {url}. {e}")
        return []
    finally:
        # Only quit a driver that was actually started.
        if driver is not None:
            driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    tag = selector.get("tag", "div")
    # Work on a copy: renaming 'class_' below must not alter the caller's
    # selector dict.
    attrs = dict(selector.get("attrs") or {})
    # Convert 'class_' to 'class' for BeautifulSoup compatibility
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")
    element = soup.find(tag, attrs=attrs)
    if element is None:
        print(f"Warning: Could not find element {tag} with {attrs} in {url}")
        return []

    text_chunks = []
    for p in element.find_all('p'):
        # get_text(strip=True) strips every text node separately and joins
        # them with no separator, which glues words together around inline
        # tags such as <a> or <b>. Take the raw text and let the whitespace
        # normalisation below do the trimming instead.
        text = p.get_text()
        text = re.sub(r'\[\d+\]', '', text)  # drop citation markers like [12]
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) > min_chunk_length:
            text_chunks.append(text)
    print(f"-> Found and cleaned {len(text_chunks)} text chunks.")
    return text_chunks

# === Main Execution Block - Cell 4 (Updated) ===

if __name__ == "__main__":

    # Each source pairs a page URL with the selector (tag + attribute filters)
    # that locates its article body. Selectors were corrected for June 2025.
    sources = [
        {"url": url, "selector": {"tag": tag, "attrs": attrs}}
        for url, tag, attrs in (
            # The content div is now simply called 'article-body'.
            (
                "https://www.akc.org/expert-advice/nutrition/human-foods-dogs-can-and-cant-eat/",
                "div",
                {"class_": "article-body"},
            ),
            # This selector is still working correctly.
            (
                "https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food",
                "article",
                {},
            ),
            # The content is now within a div with a specific data attribute.
            (
                "https://www.aspca.org/pet-care/animal-poison-control/people-foods-avoid-feeding-your-pets",
                "div",
                {"data-once": "main-content"},
            ),
            # The main content is now within a div with the class 'article-content-body'.
            (
                "https://www.petmd.com/dog/nutrition/best-human-food-for-dogs",
                "div",
                {"class_": "article-content-body"},
            ),
            # They changed their layout; the main content is now in a <main> tag.
            (
                "https://www.humanesociety.org/resources/dangerous-foods-dogs",
                "main",
                {"id": "main-content"},
            ),
            # This selector has also changed. The content is now in an article tag.
            (
                "https://vcahospitals.com/know-your-pet/nutrition-general-feeding-guidelines-for-dogs",
                "article",
                {"class_": "article-layout"},
            ),
        )
    ]

    knowledge_base = []

    print("--- Starting Advanced Scraping Process with Updated Selectors ---")
    for source_info in sources:
        scraped_chunks = scrape_and_clean(source_info["url"], source_info["selector"])
        # An empty result contributes nothing to the knowledge base.
        for chunk in scraped_chunks or ():
            knowledge_base.append(dict(source=source_info["url"], chunk=chunk))
        print("-" * 20)

    print("--- Scraping Process Finished ---\n")

    # Save only when at least one chunk was collected.
    if not knowledge_base:
        print("\nNo data was scraped. Please check selectors or network connection.")
    else:
        save_to_json(knowledge_base)
        print(f"\nTotal chunks collected from all sources: {len(knowledge_base)}")

--- Starting Advanced Scraping Process with Updated Selectors ---
Scraping: https://www.akc.org/expert-advice/nutrition/human-foods-dogs-can-and-cant-eat/
-> Found and cleaned 42 text chunks.
--------------------
Scraping: https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food
-> Found and cleaned 42 text chunks.
--------------------
Scraping: https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food
-> Found and cleaned 13 text chunks.
--------------------
Scraping: https://www.aspca.org/pet-care/animal-poison-control/people-foods-avoid-feeding-your-pets
-> Found and cleaned 13 text chunks.
--------------------
Scraping: https://www.aspca.org/pet-care/animal-poison-control/people-foods-avoid-feeding-your-pets
--------------------
Scraping: https://www.petmd.com/dog/nutrition/best-human-food-for-dogs
--------------------
Scraping: https://www.petmd.com/dog/nutrition/best-human-food-for-dogs
--------------------
Scra

In [5]:
# === CELL 4 (Updated with new sources and new function call) ===

if __name__ == "__main__":

    # Pages to scrape. Only URLs are listed: extract_main_content() is called
    # with the URL alone and needs no per-site selector.
    sources = [
        "https://www.akc.org/expert-advice/nutrition/human-foods-dogs-can-and-cant-eat/",
        "https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food",
        "https://www.humanesociety.org/resources/dangerous-foods-dogs",
        # Tufts University Veterinary School
        "https://vetnutrition.tufts.edu/2016/07/top-5-myths-about-pet-food/",
        # Cornell University Veterinary School
        "https://www.vet.cornell.edu/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics/feeding-your-cat",
        # VCA, added back now that the universal extractor is used
        "https://vcahospitals.com/know-your-pet/nutrition-general-feeding-guidelines-for-dogs",
    ]

    knowledge_base = []

    print("--- Starting Universal Scraping Process ---")
    for url in sources:
        scraped_chunks = extract_main_content(url)
        # An empty result contributes nothing to the knowledge base.
        for chunk in scraped_chunks or ():
            knowledge_base.append(dict(source=url, chunk=chunk))
        print("-" * 20)

    print("--- Scraping Process Finished ---\n")

    # Save only when at least one chunk was collected.
    if not knowledge_base:
        print("\nNo data was scraped. Check network or site availability.")
    else:
        save_to_json(knowledge_base, filename="universal_pet_health_data.json")
        print(f"\nTotal chunks collected: {len(knowledge_base)}")

--- Starting Universal Scraping Process ---
Attempting to scrape URL: https://www.akc.org/expert-advice/nutrition/human-foods-dogs-can-and-cant-eat/
-> Found and cleaned 41 text chunks.
--------------------
Attempting to scrape URL: https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food
-> Found and cleaned 41 text chunks.
--------------------
Attempting to scrape URL: https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food
-> Found and cleaned 10 text chunks.
--------------------
Attempting to scrape URL: https://www.humanesociety.org/resources/dangerous-foods-dogs
-> Found and cleaned 10 text chunks.
--------------------
Attempting to scrape URL: https://www.humanesociety.org/resources/dangerous-foods-dogs
-> Found and cleaned 1 text chunks.
--------------------
Attempting to scrape URL: https://vetnutrition.tufts.edu/2016/07/top-5-myths-about-pet-food/
-> Found and cleaned 1 text chunks.
--------------------
At

In [6]:
# === CELL 4 (Final, Cleaned & Expanded) ===

if __name__ == "__main__":
    # Source pages covering dog and cat nutrition, food safety and feeding
    # guidelines, grouped by topic.
    sources = [
        # --- Dog nutrition & food safety ---
        "https://www.akc.org/expert-advice/nutrition/human-foods-dogs-can-and-cant-eat/",  # AKC
        "https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food",  # FDA
        "https://www.humanesociety.org/resources/dangerous-foods-dogs",  # Humane Society
        "https://vetnutrition.tufts.edu/2016/07/top-5-myths-about-pet-food/",  # Tufts Vet School
        "https://vcahospitals.com/know-your-pet/nutrition-general-feeding-guidelines-for-dogs",  # VCA
        "https://www.aspca.org/pet-care/animal-poison-control/people-foods-avoid-feeding-your-pets",  # ASPCA
        "https://www.petmd.com/dog/nutrition/best-human-food-for-dogs",  # PetMD
        # --- Cat nutrition & food safety ---
        "https://www.vet.cornell.edu/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics/feeding-your-cat",  # Cornell Feline Health
        "https://www.aspca.org/pet-care/animal-poison-control/cats-plant-list",  # ASPCA Cat Toxic Plants
        "https://www.humanesociety.org/resources/foods-are-dangerous-cats",  # Humane Society Cats
        "https://vcahospitals.com/know-your-pet/nutrition-general-feeding-guidelines-for-cats",  # VCA Cats
        # --- General pet food safety ---
        "https://www.avma.org/resources-tools/pet-owners/petcare/safe-pet-food-storage-handling",  # AVMA
    ]

    knowledge_base = []

    print("--- Starting Universal Scraping Process ---")
    for url in sources:
        scraped_chunks = extract_main_content(url)
        # An empty result contributes nothing to the knowledge base.
        for chunk in scraped_chunks or ():
            knowledge_base.append(dict(source=url, chunk=chunk))
        print("-" * 20)

    print("--- Scraping Process Finished ---\n")

    if not knowledge_base:
        print("\nNo data was scraped.")
    else:
        # Junk/affiliate content is filtered out before anything is saved.
        final_knowledge_base = post_process_and_filter(knowledge_base)
        save_to_json(final_knowledge_base, filename="final_pet_health_kb.json")
        print(f"\nTotal chunks in final knowledge base: {len(final_knowledge_base)}")

--- Starting Universal Scraping Process ---
Attempting to scrape URL: https://www.akc.org/expert-advice/nutrition/human-foods-dogs-can-and-cant-eat/
-> Found and cleaned 41 text chunks.
--------------------
Attempting to scrape URL: https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food
-> Found and cleaned 41 text chunks.
--------------------
Attempting to scrape URL: https://www.fda.gov/animal-veterinary/animal-health-literacy/complete-and-balanced-pet-food
-> Found and cleaned 10 text chunks.
--------------------
Attempting to scrape URL: https://www.humanesociety.org/resources/dangerous-foods-dogs
-> Found and cleaned 10 text chunks.
--------------------
Attempting to scrape URL: https://www.humanesociety.org/resources/dangerous-foods-dogs
-> Found and cleaned 1 text chunks.
--------------------
Attempting to scrape URL: https://vetnutrition.tufts.edu/2016/07/top-5-myths-about-pet-food/
-> Found and cleaned 1 text chunks.
--------------------
At