In [1]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urlparse, urljoin
import re

# Define the file to store the scraped content
output_file = "cats_and_dogs_data.txt"
visited_urls = set()  # Set to store and check visited URLs to avoid duplicates
base_domain = "veterinarypartner.vin"  # Adjust this to the domain you're targeting
total_scraped_length = 0  # Counter to keep track of the total length of the scraped data

# Function to check if a URL belongs to the relevant domain
def is_relevant_url(url):
    return urlparse(url).netloc.endswith(base_domain)

# Function to clean content by removing dates and empty lines
def clean_content(content):
    # Remove dates in formats like MM/DD/YYYY or Month Day, Year
    content = re.sub(r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|[A-Za-z]+\s\d{1,2},\s\d{4})\b', '', content)

    # Remove empty lines
    content = '\n'.join([line.strip() for line in content.split('\n') if line.strip()])

    return content

# Function to scrape and filter useful content
def scrape_page(url):
    """Scrapes the content of a given URL and writes filtered content to the file."""
    global total_scraped_length  # Use the global variable to track length

    if url in visited_urls:
        return  # Skip this URL if it was already visited

    # Mark this URL as visited
    visited_urls.add(url)

    try:
        # Fetch the page content
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize flags to indicate whether we are inside the 'Cats' or 'Dogs' sections
        inside_target_section = False
        content = ""

        # Loop through all <h2> tags and search for the target domains "Cats" or "Dogs"
        for tag in soup.find_all("h2"):
            if "Cats" in tag.get_text():  # Look for the Cats section
                inside_target_section = True  # Start scraping the Cats section
                content += f"Found target section: {tag.get_text()}\n\n"
            elif "Dogs" in tag.get_text():  # Look for the Dogs section
                inside_target_section = True  # Start scraping the Dogs section
                content += f"Found target section: {tag.get_text()}\n\n"

            # If we are inside the target section, start collecting content
            if inside_target_section:
                # Extract content after the <h2> tag, check the next tags (paragraphs, divs, etc.)
                if tag.name == 'h2' and "Cats" not in tag.get_text() and "Dogs" not in tag.get_text():
                    break  # Stop scraping if we reach the next unrelated section

                # Add the text from paragraphs or other content under the target section
                for p in tag.find_all_next(["p", "div"]):  # Scrape <p> and <div> tags after the <h2>
                    text = p.get_text(separator="\n").strip()
                    if text:  # Only add non-empty content
                        content += text + "\n"

        # Clean up the content: remove dates and empty lines
        content = clean_content(content)

        # If we have useful content, write it to the file
        if content.strip():  # Only write if there’s actual content
            with open(output_file, "a", encoding="utf-8") as f:
                f.write(f"URL: {url}\n\n")  # Record the page URL
                f.write(content)          # Save the page content
                f.write("\n\n" + "="*80 + "\n\n")  # Separator for readability

            # Update the total length of scraped data
            total_scraped_length += len(content)  # Add the length of the content

        # Find and recursively scrape all unique and relevant links on the page
        for link in soup.find_all("a", href=True):
            full_url = urljoin(url, link["href"])  # Convert to absolute URL

            # Only follow relevant links (same domain) and skip advertisements or irrelevant URLs
            if full_url not in visited_urls and is_relevant_url(full_url) and 'logout' not in full_url:
                time.sleep(1)  # Add a delay to respect the server
                print(f"Following link: {full_url}")
                scrape_page(full_url)  # Recursively scrape the sublink

    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")

def main():
    start_url = "https://veterinarypartner.vin.com/default.aspx?pId=19239&catId=102887"  # Example starting URL
    print(f"Starting scrape at {start_url}")
    scrape_page(start_url)
    print(f"Scraping completed! Data saved in {output_file}")
    print(f"Total length of scraped data: {total_scraped_length} characters")

# Run the main function to begin scraping
if __name__ == "__main__":
    main()


Starting scrape at https://veterinarypartner.vin.com/default.aspx?pId=19239&catId=102887
Scraping completed! Data saved in cats_and_dogs_data.txt
Total length of scraped data: 381221 characters
