In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102899&id=12296225&ind=3&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102899&id=12296225&ind=3&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102899&id=9926296&ind=4&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102899&id=12341168&ind=5&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254086&id=6386334&ind=7&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254086&id=10517463&ind=8&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254086&id=4951475&ind=9&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254086&id=4951973&ind=10&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254086&id=4952543&ind=11&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254086&id=4952581&ind=12&o

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102903&id=12296225&ind=678&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_cat.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_cat.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102903&id=12296225&ind=678&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=6386334&ind=680&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=10517463&ind=681&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=4951475&ind=682&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=4951536&ind=683&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=12289434&ind=684&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=4952543&ind=685&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=4951977&ind=686&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=49

KeyboardInterrupt: 

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=10874323&ind=1165&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_horse.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_horse.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=10874323&ind=1165&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=6684460&ind=1166&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=9084749&ind=1167&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=4952438&ind=1168&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=4952722&ind=1169&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=11497344&ind=1170&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254025&id=4952141&ind=1171&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254026&id=4952623&ind=1173&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=25402

In [9]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254007&id=6664598&ind=1737&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_birds.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_birds.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254007&id=6664598&ind=1737&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254008&id=9580647&ind=1739&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254008&id=4952964&ind=1740&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254009&id=8808771&ind=1742&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254009&id=10158446&ind=1743&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254009&id=9748899&ind=1744&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254010&id=4953021&ind=1746&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254010&id=10158446&ind=1747&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=25401

In [11]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253983&id=6664598&ind=1785&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_reptilesAndAmphibians.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_reptilesAndAmphibians.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253983&id=6664598&ind=1785&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253982&id=7985794&ind=1787&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253984&id=9003821&ind=1789&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253984&id=8453042&ind=1790&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253984&id=8006102&ind=1791&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253984&id=8006545&ind=1792&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253985&id=8808771&ind=1794&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253985&id=7996764&ind=1795&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253985&

In [15]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102923&id=9926296&ind=1844&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_smallMammals.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_smallMammals.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102923&id=9926296&ind=1844&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254112&id=10013653&ind=1846&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254113&id=10149739&ind=1848&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254115&id=6664598&ind=1850&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254114&id=11515989&ind=1852&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254116&id=4952955&ind=1854&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254118&id=8808771&ind=1856&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254118&id=9595564&ind=1857&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=2541

In [17]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253968&id=6664598&ind=1915&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_pigs.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_pigs.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253968&id=6664598&ind=1915&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253969&id=8808771&ind=1917&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253969&id=11025325&ind=1918&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253970&id=11389306&ind=1920&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253970&id=8741468&ind=1921&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253970&id=9793974&ind=1922&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253972&id=11513429&ind=1925&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253973&id=11513429&ind=1927&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253

In [19]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# Crawl entry point: the first page to fetch.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253998&id=11553187&ind=1933&objTypeID=1007'

# Browser-like User-Agent header passed to every requests.get() call.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'}

# Query parameters a URL must carry to be considered for scraping.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}

# 'catId' values whose pages are never scraped.
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# Crawl state filled in as pages are processed: full URLs and 'ind' values already seen.
visited_urls, visited_inds = set(), set()

# Destination for the scraped paragraph text (mode 'w' truncates it on every run).
output_file = open('scraped_data_ruminants.txt', mode='w', encoding='utf-8')

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawl should fetch.

    A URL qualifies only when all of the following hold:
      * its query string carries every name in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds`` and
        lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its network location is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True if the URL should be scraped, False otherwise.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject URLs that lack any of the mandatory query parameters.
    if not all(name in params for name in required_params):
        return False

    # 'ind' has to be numeric; anything else disqualifies the URL.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False

    # Skip indices already crawled and the excluded 2354..2482 window.
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip categories on the restricted list.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Finally, only the veterinarypartner.vin.com host is in scope.
    return parts.netloc == "veterinarypartner.vin.com"

def scrape_page(url):
    """Download ``url`` and append the text of its paragraph elements to the output file.

    Each non-empty, whitespace-stripped paragraph text is written to
    ``output_file`` followed by a blank line. If the request fails or the
    server answers with an error status, a message is printed and nothing
    is written.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return

    document = BeautifulSoup(page.text, 'html.parser')
    paragraphs = (tag.text.strip() for tag in document.find_all('p'))
    # filter(None, ...) drops paragraphs that are empty after stripping.
    for text in filter(None, paragraphs):
        output_file.write(text + '\n\n')

def _save_text_and_collect_links(url):
    """Fetch ``url`` once, write its paragraph text, and return its outgoing links.

    The page is downloaded a single time; the same parsed document is used
    both to write every non-empty paragraph text to ``output_file`` (each
    followed by a blank line) and to collect the page's links.

    Returns:
        list[str]: absolute URLs of all anchor ``href`` targets in document
        order, skipping ``javascript:`` pseudo-links. Empty if the request
        failed (a message is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')

    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if not link['href'].startswith('javascript:')
    ]


def crawl(url):
    """Crawl depth-first from ``url``, saving the text of every eligible page.

    A URL is processed only if it is not yet in ``visited_urls`` and passes
    ``should_scrape_url``. Processing a page records it in ``visited_urls``
    and ``visited_inds``, writes its paragraph text to ``output_file`` and
    then follows its links in document order.

    Each page is requested exactly once, and the traversal uses an explicit
    stack instead of recursion, so a long chain of linked pages cannot exceed
    Python's recursion limit.
    """
    # Stack of link iterators; the top one belongs to the page currently being
    # explored. This yields the same pre-order as a recursive depth-first walk.
    pending = [iter((url,))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # All links of the current page are done; go back to its parent.
            pending.pop()
            continue
        if candidate in visited_urls or not should_scrape_url(candidate):
            continue

        print(f"Crawling: {candidate}")
        visited_urls.add(candidate)

        # Remember the page's 'ind' so other URLs with the same index are skipped.
        # should_scrape_url() has already verified that it parses as an int.
        query_params = parse_qs(urlparse(candidate).query)
        visited_inds.add(int(query_params['ind'][0]))

        pending.append(iter(_save_text_and_collect_links(candidate)))

# Start crawling from the seed URL
try:
    crawl(seed_url)
finally:
    # Close (and flush) the output file even when the crawl raises or is
    # interrupted, so the handle is not leaked and written text reaches disk.
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_ruminants.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253998&id=11553187&ind=1933&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253997&id=4952237&ind=1935&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253996&id=8808771&ind=1937&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253996&id=11293287&ind=1938&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253995&id=10584832&ind=1940&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253994&id=10293707&ind=1942&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253994&id=10293707&ind=1943&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=253994&id=8552700&ind=1944&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=25

In [21]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# --- Crawl configuration ---------------------------------------------------
# Entry page for this run and the text file that receives the scraped text.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=9219873&ind=2048&objTypeID=1007'
output_file = open('scraped_data_medication.txt', 'w', encoding='utf-8')

# HTTP headers passed along with each request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# Query keys a URL must carry, and the catId values that are never crawled.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# --- Crawl state -----------------------------------------------------------
# Full URLs and numeric 'ind' values that have already been crawled.
visited_urls, visited_inds = set(), set()

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawler should fetch.

    A URL qualifies only when all of the following hold:
      * its query string contains every key in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds``
        and lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its host is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True when the URL passes every check, otherwise False.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject as soon as one mandatory query key is absent.
    for key in required_params:
        if key not in params:
            return False

    # 'ind' has to parse as an integer before it can be range-checked.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip pages that belong to a restricted category.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Only pages on the veterinarypartner.vin.com host are accepted.
    return parts.netloc == "veterinarypartner.vin.com"

def _fetch_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup, or None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def scrape_page(url, soup=None):
    """Write the non-empty paragraph text of a page to the output file.

    Args:
        url: Address of the page to scrape.
        soup: Optional already-parsed page. When omitted, the page is
            downloaded from ``url``; passing it lets a caller that has
            already fetched the page avoid a second request.

    Returns:
        None. When the page cannot be downloaded the failure is printed
        and nothing is written.
    """
    if soup is None:
        soup = _fetch_soup(url)
        if soup is None:
            return

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')


def crawl(url):
    """Crawl depth-first from ``url``, scraping every page that qualifies.

    A page is processed when it is not in ``visited_urls`` and
    ``should_scrape_url`` accepts it. Each processed page is recorded in
    ``visited_urls`` / ``visited_inds``, downloaded once, written out via
    ``scrape_page`` and then searched for further links.

    An explicit stack is used instead of recursion so that a long chain of
    linked pages cannot exhaust Python's call stack (RecursionError). Links
    are pushed in reverse so pages are still visited in document order,
    depth-first.
    """
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited_urls or not should_scrape_url(current):
            continue

        print(f"Crawling: {current}")
        visited_urls.add(current)

        # Extract 'ind' from the URL and add it to visited_inds
        query_params = parse_qs(urlparse(current).query)
        visited_inds.add(int(query_params['ind'][0]))

        # Fetch the page once and reuse it for both scraping and link discovery.
        soup = _fetch_soup(current)
        if soup is None:
            continue
        scrape_page(current, soup)

        # Collect the links on the page that have not been crawled yet.
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('javascript:'):
                continue
            absolute_link = urljoin(current, href)
            if absolute_link not in visited_urls:
                links.append(absolute_link)

        # Reverse so the first link on the page is popped (visited) first.
        pending.extend(reversed(links))

# Start crawling from the seed URL. The output file is closed in a `finally`
# clause so that text written so far is flushed to disk even when the crawl
# raises or the cell is interrupted.
try:
    crawl(seed_url)
finally:
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_medication.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=9219873&ind=2048&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=4951450&ind=2049&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=4952834&ind=2050&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=6386334&ind=2051&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=7916713&ind=2052&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=4952068&ind=2053&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=4952652&ind=2054&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&id=4952766&ind=2055&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=102894&

In [23]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
import time

# --- Crawl configuration ---------------------------------------------------
# Entry page for this run and the text file that receives the scraped text.
seed_url = 'https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=4951822&ind=2222&objTypeID=1007'
output_file = open('scraped_data_healthyPetsHappyOwners.txt', 'w', encoding='utf-8')

# HTTP headers passed along with each request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# Query keys a URL must carry, and the catId values that are never crawled.
required_params = {
    'pid',
    'catId',
    'id',
    'ind',
    'objTypeID',
}
restricted_cat_ids = {
    '102896',
    '238492',
    '238491',
    '238331',
    '226839',
}

# --- Crawl state -----------------------------------------------------------
# Full URLs and numeric 'ind' values that have already been crawled.
visited_urls, visited_inds = set(), set()

def should_scrape_url(url):
    """Decide whether ``url`` is a page this crawler should fetch.

    A URL qualifies only when all of the following hold:
      * its query string contains every key in ``required_params``;
      * its ``ind`` value is an integer that is not in ``visited_inds``
        and lies outside the excluded range 2354-2482 (inclusive);
      * its ``catId`` value is not in ``restricted_cat_ids``;
      * its host is exactly ``veterinarypartner.vin.com``.

    Returns:
        bool: True when the URL passes every check, otherwise False.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    # Reject as soon as one mandatory query key is absent.
    for key in required_params:
        if key not in params:
            return False

    # 'ind' has to parse as an integer before it can be range-checked.
    try:
        ind = int(params['ind'][0])
    except (ValueError, IndexError):
        return False
    already_seen = ind in visited_inds
    in_excluded_range = 2354 <= ind <= 2482
    if already_seen or in_excluded_range:
        return False

    # Skip pages that belong to a restricted category.
    cat_ids = params.get('catId')
    if cat_ids and cat_ids[0] in restricted_cat_ids:
        return False

    # Only pages on the veterinarypartner.vin.com host are accepted.
    return parts.netloc == "veterinarypartner.vin.com"

def _fetch_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup, or None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def scrape_page(url, soup=None):
    """Write the non-empty paragraph text of a page to the output file.

    Args:
        url: Address of the page to scrape.
        soup: Optional already-parsed page. When omitted, the page is
            downloaded from ``url``; passing it lets a caller that has
            already fetched the page avoid a second request.

    Returns:
        None. When the page cannot be downloaded the failure is printed
        and nothing is written.
    """
    if soup is None:
        soup = _fetch_soup(url)
        if soup is None:
            return

    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            output_file.write(text + '\n\n')


def crawl(url):
    """Crawl depth-first from ``url``, scraping every page that qualifies.

    A page is processed when it is not in ``visited_urls`` and
    ``should_scrape_url`` accepts it. Each processed page is recorded in
    ``visited_urls`` / ``visited_inds``, downloaded once, written out via
    ``scrape_page`` and then searched for further links.

    An explicit stack is used instead of recursion so that a long chain of
    linked pages cannot exhaust Python's call stack (RecursionError). Links
    are pushed in reverse so pages are still visited in document order,
    depth-first.
    """
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited_urls or not should_scrape_url(current):
            continue

        print(f"Crawling: {current}")
        visited_urls.add(current)

        # Extract 'ind' from the URL and add it to visited_inds
        query_params = parse_qs(urlparse(current).query)
        visited_inds.add(int(query_params['ind'][0]))

        # Fetch the page once and reuse it for both scraping and link discovery.
        soup = _fetch_soup(current)
        if soup is None:
            continue
        scrape_page(current, soup)

        # Collect the links on the page that have not been crawled yet.
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('javascript:'):
                continue
            absolute_link = urljoin(current, href)
            if absolute_link not in visited_urls:
                links.append(absolute_link)

        # Reverse so the first link on the page is popped (visited) first.
        pending.extend(reversed(links))

# Start crawling from the seed URL. The output file is closed in a `finally`
# clause so that text written so far is flushed to disk even when the crawl
# raises or the cell is interrupted.
try:
    crawl(seed_url)
finally:
    output_file.close()
print("Scraping complete. Data saved to 'scraped_data_healthyPetsHappyOwners.txt'.")


Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=4951822&ind=2222&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=4952883&ind=2223&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=9801244&ind=2224&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=9963588&ind=2225&objTypeID=10
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=6048046&ind=2226&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=12063966&ind=2227&objTypeID=10
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=7998415&ind=2228&objTypeID=1007
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=10413473&ind=2229&objTypeID=10
Crawling: https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=225987&id=4