In [19]:
from time import sleep
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Scheme and host of the site being scraped; browse paths are appended to it.
BASE = "https://www.equinenow.com"

# Path of one browse-results page. The single "{page}" placeholder is filled
# in with str.format(page=...). The other segments are presumably filter flags
# taken from the site's browse URL -- their individual meanings are not
# documented here.
FILTER_PATH = "".join(
    [
        "/browse-ssf--brf--clf--sxf--tgf--htf--agf--prf--",
        "auc-1-sdf-1-slf-1-orf--pg-{page}-",
        "rsf--dsf--svf--cnf--pnf--wbf--rgf-",
    ]
)

# Headers sent with every request: only a desktop-browser User-Agent string.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36",
        )
    )
}

def get_soup(url):
    """Download ``url`` and return the response body parsed as HTML.

    The GET request is sent with the module-level ``HEADERS`` and a
    30-second timeout. ``raise_for_status()`` is called on the response, so
    an HTTP error status raises instead of being parsed.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response text with the
        ``"lxml"`` parser.
    """
    response = requests.get(url, timeout=30, headers=HEADERS)
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "lxml")

def _text_or_none(node):
    """Return the stripped text of a BeautifulSoup node, or None if the node is missing."""
    return node.get_text(strip=True) if node is not None else None


def _collect_listing_urls(soup):
    """Return the distinct horse-ad URLs linked from one browse page, in document order."""
    urls = {}  # dict used as an ordered set so iteration order is deterministic
    for anchor in soup.select("a[href*='horse-ad']"):
        href = anchor.get("href")
        if not href:
            continue

        # Resolve relative / scheme-relative hrefs against BASE first, then keep
        # only scheme + host + path (query string and fragment are dropped).
        parsed = urlparse(urljoin(BASE, href))
        urls[f"{parsed.scheme}://{parsed.netloc}{parsed.path}"] = None
    return list(urls)


def _parse_listing(page_soup, lurl):
    """Extract one listing's fields from its parsed page into a flat dict.

    Raises:
        ValueError: if the metadata list (``ul.meta-data.list-unstyled``) is
            not present on the page.
    """
    # Work on <body> when the parser produced one; otherwise on the whole document.
    lsoup = page_soup.body if page_soup.body is not None else page_soup

    data = {"horse_id": lurl.split("-")[-1]}

    # Horse characteristics: one <dl class="row"> per attribute (<dt> = name, <dd> = value).
    features = lsoup.select_one("ul.meta-data.list-unstyled")
    if features is None:
        raise ValueError("metadata list 'ul.meta-data.list-unstyled' not found")
    for row in features.select("dl.row"):
        dt = row.find("dt")
        dd = row.find("dd")
        if dt is None or dd is None:
            # Skip a malformed row instead of discarding the whole listing.
            continue
        data[dt.get_text(strip=True)] = dd.get_text(strip=True)

    # Location
    location = _text_or_none(
        lsoup.select_one("div.col-xs-12.col-sm-5.no-padding-xs div header h5")
    )

    # Price
    price = _text_or_none(lsoup.select_one("span.item-price"))

    # Skills / Disciplines: the <dd> that follows the <dt> with exactly this label.
    skills_dt = lsoup.find("dt", string="Skills / Disciplines")
    skills_disciplines = (
        _text_or_none(skills_dt.find_next_sibling("dd")) if skills_dt is not None else None
    )

    # Additional comments (span[itemprop="description"])
    additional_comments = _text_or_none(
        lsoup.select_one('div.well p span[itemprop="description"]')
    )

    # Shipping: first <p> inside div.well that is preceded by a sibling <p>.
    shipping = _text_or_none(lsoup.select_one("div.well p ~ p"))

    # Company name (h4)
    company_name = _text_or_none(lsoup.select_one("div.well h4"))

    # Company profile href: first <a> in a <p> that comes after the <p>
    # immediately following the <h4>.
    a_cp = lsoup.select_one("div.well h4 + p ~ p a")
    company_profile = a_cp.get("href") if a_cp is not None else None

    # Assigned after the scraped attributes, so these fixed keys win on a name clash.
    data["Location"] = location
    data["Price"] = price
    data["Horse Profile"] = lurl
    data["Skills"] = skills_disciplines
    data["Comments"] = additional_comments
    data["Shipping"] = shipping
    data["Company Name"] = company_name
    data["Company Profile"] = company_profile
    return data


def scrape_listings(max_pages=2, sleep_seconds=5):
    """Crawl browse pages and scrape every horse listing linked from them.

    For each page index in ``range(0, max_pages)`` the browse page is fetched,
    its ``horse-ad`` links are collected, and each listing page is parsed into
    one row. A listing URL already seen on an earlier page is not fetched
    again, so the result holds at most one row per listing URL.

    Failures are reported with ``print`` and skipped rather than raised: a
    browse page that cannot be downloaded is skipped, and a listing that
    cannot be downloaded or parsed is left out of the result. The pause of
    ``sleep_seconds`` is applied after every listing attempt, successful or
    not, so the rate limit also holds while errors occur.

    NOTE(review): pages are requested as pg-0 ... pg-(max_pages - 1); confirm
    whether the site's pagination is 0- or 1-based.

    Args:
        max_pages: Number of browse pages to visit.
        sleep_seconds: Seconds to wait after each listing request.

    Returns:
        pandas.DataFrame with one row per scraped listing. Columns are the
        attribute names found on the listing pages plus ``horse_id``,
        ``Location``, ``Price``, ``Horse Profile``, ``Skills``, ``Comments``,
        ``Shipping``, ``Company Name`` and ``Company Profile``.
    """
    rows = []
    seen_urls = set()  # listing URLs already attempted on earlier pages

    for page in tqdm(range(0, max_pages), total=max_pages, desc="Pages", leave=True, position=0):
        url = f"{BASE}{FILTER_PATH.format(page=page)}"

        try:
            soup = get_soup(url)
        except requests.RequestException as e:
            # One bad browse page must not throw away rows collected so far.
            print(f"Error scraping page {page} ({url}): {e}")
            sleep(sleep_seconds)
            continue

        listing_urls = [u for u in _collect_listing_urls(soup) if u not in seen_urls]
        seen_urls.update(listing_urls)

        if not listing_urls:
            # Nothing new to fetch: still pause before requesting the next browse page.
            sleep(sleep_seconds)
            continue

        for lurl in tqdm(listing_urls, total=len(listing_urls), desc="Horse Profiles", leave=False, position=0):
            try:
                rows.append(_parse_listing(get_soup(lurl), lurl))
            except Exception as e:
                print(f"Error scraping {lurl}: {e}")
            finally:
                sleep(sleep_seconds)  # gentle rate limit, also after a failure

    df = pd.DataFrame(rows)
    return df

In [20]:
# Scrape 100 browse pages and preview the first rows of the result.
# NOTE: this is a long-running cell -- the recorded run took about 1h37m
# (see the progress bar in the cell output).
df = scrape_listings(max_pages=100)
df.head()

Pages: 100%|██████████| 100/100 [1:37:02<00:00, 58.22s/it]   


Unnamed: 0,horse_id,Breed,Name,Gender,Foal Date,In Foal,Height (hh),Weight (lbs),Color,Registry,...,Price,Horse Profile,Skills,Comments,Shipping,Company Name,Company Profile,State Bred,Markings,Registry Number
0,1188651,Thoroughbred,Pre-purchase Deposit Option,Gelding,Jan 1st 2020,No,16.2,1200.0,Black,JC,...,"$2,995",https://www.equinenow.com/horse-ad-1188651,"All Around, Athletic, Barrel, Beginner, Breedi...",If you are looking for your next horse then - ...,Will help arrange all shipping,Thoroughbred Sport Horse Company,https://www.equinenow.com/farm/thoroughbred_sp...,,,
1,1606295,Lusitano,Felix,Gelding,Jun 29th 2022,No,15.3,1000.0,Black,IALHA,...,"$20,000",https://www.equinenow.com/horse-ad-1606295,"All Around, Flashy, Longe Line, Natural Horsem...","Hello, Felix is a beautiful black with lots of...",Felix is ready to be trained your way.,,,Washington,Blaze and 3 white socks flashy,23147 p
2,1599670,Thoroughbred,Iron Legacy,Gelding,Mar 5th 2022,No,16.0,1200.0,Chestnut,JC,...,"$2,995",https://www.equinenow.com/horse-ad-1599670,"All Around, Athletic, Dressage, Endurance, Equ...",Super nice to ride. Safe young guy...This bea...,We help arrange all shipping,Thoroughbred Sport Horse Company,https://www.equinenow.com/farm/thoroughbred_sp...,,,
3,1597597,Thoroughbred,Lindsy's Shadow,Mare,May 18th 2019,No,16.1,1200.0,Chocolate,JC,...,"$2,995",https://www.equinenow.com/horse-ad-1597597,"All Around, Athletic, Barrel, Barrel Racing, B...",She is a Beautiful Liver chestnut. She is saf...,We help arrange shipping.,Thoroughbred Sport Horse Company,https://www.equinenow.com/farm/thoroughbred_sp...,,white star,
4,1601382,Quarter Pony,Pudding,Gelding,Jan 1st 2018,No,13.3,1000.0,Palomino,,...,"$15,000",https://www.equinenow.com/horse-ad-1601382,"Beginner, Finished, Flashy, Husband Safe, Kid ...",Pudding is a gorgeous golden palomino that I h...,We are glad to help arrange shipping anywhere,Mcnabb Livestock,https://www.equinenow.com/farm/quarter-circle-...,,,


In [23]:
# Sanity check: (number of rows, number of columns) of the scraped frame.
df.shape

(892, 25)

In [3]:
# Persist the raw scrape as Parquet, without writing the DataFrame index.
# The path is relative to the notebook's working directory.
df.to_parquet("../../data/raw/equinenow_horses_listings.parquet", index=False)