In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
from time import sleep
from tqdm import tqdm

BASE = "https://www.horseclicks.com"

FILTER_PATH = (
    "/for-sale?page=2"
)

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive",
}

def get_soup(url):
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")

def scrape_listings(max_pages=2, sleep_seconds=5):

    rows = []

    for page in tqdm(range(1, max_pages+1), total=max_pages, desc="Pages", leave=True, position=0):
        url = f"{BASE}/horses{FILTER_PATH.format(page=page)}"
        print(f"Scraping page {page}: {url}")

        soup = get_soup(url)

        links = soup.select("div.al-listings-item.is-item")
        listing_urls = set()

        for div in links:
            href = div.get("href")
            if not href:
                continue

            url = BASE+href
            print(url)
            return

            listing_urls.add(url)

        for lurl in tqdm(listing_urls, total=len(listing_urls), desc="Horse Profiles", leave=False, position=0):
            try:
                data = {}
                data["horse_id"] = lurl.split("-")[-1]
                
                #Caracteristicas del caballo
                lsoup = get_soup(lurl).body
                ul_features_horse = lsoup.select_one("ul.meta-data.list-unstyled")

                
                for dl in ul_features_horse.select("dl.row"):
                    key = dl.find("dt").get_text(strip=True)
                    value = dl.find("dd").get_text(strip=True)

                    data[key] = value
                
                # Location
                h5_location = lsoup.select_one("div.col-xs-12.col-sm-5.no-padding-xs div header h5")
                location = h5_location.get_text(strip=True) if h5_location else None

                # Price
                span_price = lsoup.select_one("span.item-price")
                price = span_price.get_text(strip=True) if span_price else None

                # Skills / Disciplines
                dl = lsoup.find("dt", string="Skills / Disciplines")
                skills_disciplines = None
                if dl:
                    dd = dl.find_next_sibling("dd")
                    skills_disciplines = dd.get_text(strip=True) if dd else None
                    
                # Additional comments (span[itemprop="description"])
                desc_span = lsoup.select_one('div.well p span[itemprop="description"]')
                additional_comments = desc_span.get_text(strip=True) if desc_span else None

                # Shipping (segundo <p> después de div.well)
                p_shipping = lsoup.select_one("div.well p ~ p")
                shipping = p_shipping.get_text(strip=True) if p_shipping else None

                # Company name (h4)
                h4_cn = lsoup.select_one("div.well h4")
                company_name = h4_cn.get_text(strip=True) if h4_cn else None

                # Company profile href (primer <a> dentro de los <p> después de h4)
                a_cp = lsoup.select_one("div.well h4 + p ~ p a")
                company_profile = a_cp.get("href") if a_cp else None

                data["Location"] = location
                data["Price"] = price
                data["Horse Profile"] = lurl
                data["Skills"] = skills_disciplines
                data["Comments"] = additional_comments
                data["Shipping"] = shipping
                data["Company Name"] = company_name
                data["Company Profile"] = company_profile

                rows.append(data)

                sleep(sleep_seconds)  # rate limit suave

            except Exception as e:
                print(f"Error scraping {lurl}: {e}")

    df = pd.DataFrame(rows)
    return df

In [8]:
df = scrape_listings(max_pages=100)
df.head()

Pages:   0%|          | 0/100 [00:00<?, ?it/s]

Scraping page 1: https://www.horseclicks.com/horses/for-sale?page=2


Pages:   0%|          | 0/100 [00:00<?, ?it/s]


HTTPError: 403 Client Error: Forbidden for url: https://www.horseclicks.com/horses/for-sale?page=2

In [23]:
df.shape

(892, 25)

In [3]:
df.to_parquet("../../data/raw/equinenow_horses_listings.parquet", index=False)

In [15]:
from playwright.async_api import async_playwright

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

await page.goto("https://www.whatismybrowser.com/detect/what-is-my-user-agent/")

NotImplementedError: 