In [1]:
# ==============================
# Setup
# ==============================

import random
import numpy as np
from faker import Faker
import pandas as pd

# One seed shared by every random source so that reruns produce the same data.
SEED = 42

fake = Faker()
# Faker draws from its own random generator, so seeding only `random` and
# NumPy left every Faker-produced value (names, dates, countries) different
# on each run.
# NOTE(review): relative date arguments such as "-1y" are presumably resolved
# against the current time, so date columns may still shift with the run date.
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

NUM_HORSES = 1000  # number of horse records to generate
NUM_SELLERS = 200  # number of seller records to generate

# ==============================
# Diccionario completo de razas
# ==============================
# Attribute names stored for every breed, in the order used by the rows below.
_BREED_FIELDS = (
    "height_m",
    "weight_kg",
    "length_m",
    "max_speed_kmh",
    "temperament",
    "main_use",
)

# One row per breed: name followed by the values for _BREED_FIELDS.
# height_m and weight_kg are (min, max) ranges; max_speed_kmh is either a
# single value or a (min, max) range. Row order defines the key order of BREEDS.
_BREED_ROWS = [
    ("Pura Sangre Inglés", (1.6, 1.7), (450, 500), 2.4, 71, "Nervioso / Atlético", "Carreras y Salto"),
    ("Cuarto de Milla", (1.4, 1.6), (500, 600), 2.3, 70, "Tranquilo / Sociable", "Rodeo y Trabajo"),
    ("Árabe", (1.4, 1.5), (350, 450), 2.2, (55, 60), "Inteligente / Leal", "Endurance (Raid)"),
    ("Hannoveriano", (1.6, 1.7), (600, 650), 2.5, 50, "Equilibrado", "Salto y Doma"),
    ("Akhal-Teke", (1.5, 1.6), (420, 480), 2.3, 60, "Independiente", "Resistencia"),
    ("Pura Raza Española", (1.5, 1.6), (500, 550), 2.4, (45, 50), "Noble / Dócil", "Doma y Exhibición"),
    ("Frisón", (1.6, 1.7), (600, 800), 2.6, (40, 45), "Manso / Dispuesto", "Tiro y Cine"),
    ("Percherón", (1.6, 1.8), (800, 1000), 2.8, (25, 30), "Muy calmado", "Tiro pesado"),
    ("Appaloosa", (1.4, 1.6), (450, 550), 2.3, (55, 60), "Activo / Resistente", "Rutas y Western"),
    ("Shetland (Poni)", (0.7, 1.1), (150, 200), 1.5, 20, "Testarudo / Fuerte", "Niños y Compañía"),
]

# Breed name -> attribute dict.
BREEDS = {
    breed_name: dict(zip(_BREED_FIELDS, breed_values))
    for breed_name, *breed_values in _BREED_ROWS
}

# ==============================
# Generador Horse
# ==============================
def generate_horse(i: int) -> dict:
    """Build one synthetic horse record.

    The breed is drawn uniformly from ``BREEDS``. Height, weight and (when the
    breed stores a range) top speed are sampled inside that breed's bounds;
    length, temperament and main use are copied from the breed entry. Roughly
    40% of the records are "incomplete": exactly one nullable field is ``None``.

    Args:
        i: Sequence number used to build ``horse_id`` (``"H<i>"``).

    Returns:
        A dict with one entry per horse column.
    """
    breed = random.choice(list(BREEDS))
    traits = BREEDS[breed]

    # Only fields that the record really contains may be blanked. The list used
    # to include "h_career_top3_rate" as well, which is never generated, so one
    # in five "incomplete" horses silently ended up with no missing value.
    nullable_fields = [
        "height_m",
        "weight_kg",
        "max_speed_kmh",
        "h_days_since_last_race",
    ]
    is_complete = random.random() < 0.6
    missing_field = None if is_complete else random.choice(nullable_fields)

    # A breed's top speed is either a fixed number or a (min, max) range.
    speed_spec = traits["max_speed_kmh"]

    record = {
        "horse_id": f"H{i}",
        "horse_name": fake.word().capitalize(),
        "birth_date": fake.date_of_birth(minimum_age=2, maximum_age=30),
        "h_sex": random.choice(["M", "F"]),
        "raza": breed,
        "height_m": round(random.uniform(*traits["height_m"]), 2),
        "weight_kg": random.randint(*traits["weight_kg"]),
        "length_m": traits["length_m"],
        "max_speed_kmh": (
            random.randint(*speed_spec)
            if isinstance(speed_spec, tuple)
            else speed_spec
        ),
        "h_temperament": traits["temperament"],
        "h_category": traits["main_use"],
        # Cast to a plain int, as generate_seller does for its Poisson draw.
        "h_career_races": int(np.random.poisson(15)),
        "h_days_since_last_race": random.randint(7, 400),
        "h_linaje": random.choices(
            ["Sí", "No", "Desconocido"], weights=[0.4, 0.4, 0.2]
        )[0],
    }

    # Values are always drawn first and blanked afterwards, so the amount of
    # randomness consumed does not depend on which field is missing.
    if missing_field is not None:
        record[missing_field] = None
    return record

# ==============================
# Generador Seller
# ==============================
def generate_seller(i: int) -> dict:
    """Build one synthetic seller record.

    Args:
        i: Sequence number used to build ``seller_id`` (``"S<i>"``).

    Returns:
        A dict with the seller's identity, trust indicators (verified flag,
        dispute count, fraud flag), listing count and the account creation and
        last-activity timestamps. ``s_last_active_at`` is drawn with
        ``s_created_at`` as its lower bound.
    """
    # In Faker's relative-date syntax a lowercase "m" means minutes (months are
    # an uppercase "M"), so the previous end_date="-6m" allowed accounts created
    # only six minutes ago. NOTE(review): a six-month cutoff is presumably what
    # was intended; 26 weeks expresses it with an unambiguous unit - confirm.
    created_at = fake.date_time_between(start_date="-4y", end_date="-26w")
    return {
        "seller_id": f"S{i}",
        "s_first_name": fake.first_name(),
        "s_last_name": fake.last_name(),
        "s_verified": random.random() > 0.2,  # about 80% verified
        "s_disputes": int(np.random.poisson(0.4)),
        "s_num_listings": random.randint(1, 25),
        "s_flagged_fraud": random.random() < 0.05,  # about 5% flagged
        "s_created_at": created_at,
        # No end_date is given, so Faker's default upper bound applies.
        "s_last_active_at": fake.date_time_between(start_date=created_at),
    }

# ==============================
# Generador Listing
# ==============================
def generate_listing(horse_id: str, seller_id: str, i: int) -> dict:
    """Build one synthetic sale listing that links a horse to a seller.

    Args:
        horse_id: Identifier of the listed horse.
        seller_id: Identifier of the seller who owns the listing.
        i: Sequence number used to build ``listing_id`` (``"L<i>"``).

    Returns:
        A dict with the listing id, both foreign keys, a weighted-random
        status, an asking price in USD and a creation timestamp.
    """
    status_options = ["active", "sold", "withdrawn"]
    status_weights = [0.6, 0.25, 0.15]
    (status,) = random.choices(status_options, weights=status_weights)

    # Normally distributed price, truncated to a whole number of dollars.
    asking_price = int(np.random.normal(loc=1500000, scale=40000))

    listed_at = fake.date_time_between(start_date="-1y")

    listing = {}
    listing["listing_id"] = f"L{i}"
    listing["horse_id"] = horse_id
    listing["seller_id"] = seller_id
    listing["l_listing_status"] = status
    listing["l_asking_price_usd"] = asking_price
    listing["l_created_at"] = listed_at
    return listing

# ==============================
# Generador Vet
# ==============================
def generate_vet_record(horse_id: str) -> dict:
    """Build one synthetic veterinary exam record for a horse.

    Args:
        horse_id: Identifier of the examined horse.

    Returns:
        A dict with the horse id, the exam timestamp, a flag that is true for
        roughly 15% of exams, and a confidence score between 0.7 and 1.0
        rounded to two decimals.
    """
    exam_date = fake.date_time_between(start_date="-2y")
    has_major_issue = random.random() < 0.15
    confidence = round(random.uniform(0.7, 1.0), 2)

    record = {"horse_id": horse_id}
    record["v_exam_date"] = exam_date
    record["v_major_issue"] = has_major_issue
    record["v_confidence_score"] = confidence
    return record

# ==============================
# GENERACIÓN PRINCIPAL
# ==============================
# Horses are generated before sellers; the order matters because both
# generators draw from the same seeded random sources.
horses = []
for horse_number in range(NUM_HORSES):
    horses.append(generate_horse(horse_number))
horses_df = pd.DataFrame(horses)

sellers = []
for seller_number in range(NUM_SELLERS):
    sellers.append(generate_seller(seller_number))
sellers_df = pd.DataFrame(sellers)

# Horse -> seller assignment (at most MAX_HORSES_PER_SELLER horses per seller)
MAX_HORSES_PER_SELLER = 8

# The rejection loop below can never finish once every seller is full, so an
# impossible configuration is rejected up front instead of hanging forever.
if len(horses) > MAX_HORSES_PER_SELLER * len(sellers):
    raise ValueError(
        f"Cannot assign {len(horses)} horses to {len(sellers)} sellers "
        f"with at most {MAX_HORSES_PER_SELLER} horses each"
    )

seller_horses = {s["seller_id"]: [] for s in sellers}
for horse in horses:
    # Keep drawing sellers at random until one with spare capacity comes up.
    while True:
        owned = seller_horses[random.choice(sellers)["seller_id"]]
        if len(owned) < MAX_HORSES_PER_SELLER:
            owned.append(horse["horse_id"])
            break

# Listings: one per (seller, horse) assignment.
listings = []
for seller_id, horse_list in seller_horses.items():
    for h_id in horse_list:
        # Number listings with a single running index. The per-seller
        # enumerate() index used before restarted at 0 for every seller, so
        # ids such as "L0" were repeated across sellers.
        listings.append(generate_listing(h_id, seller_id, len(listings)))
listings_df = pd.DataFrame(listings)

# Vet records: each horse gets between one and three exams.
vet_records = []
for horse in horses:
    exam_count = random.randint(1, 3)
    vet_records.extend(
        generate_vet_record(horse["horse_id"]) for _ in range(exam_count)
    )
vet_records_df = pd.DataFrame(vet_records)

# ==============================
# AGREGACIÓN VET
# ==============================
# Collapse the exam rows into one summary row per horse: number of exams,
# number of exams flagged as a major issue, and mean confidence score.
exams_by_horse = vet_records_df.groupby("horse_id")
vet_summary = exams_by_horse.agg(
    vet_total_exams=pd.NamedAgg(column="v_exam_date", aggfunc="count"),
    vet_major_issues=pd.NamedAgg(column="v_major_issue", aggfunc="sum"),
    vet_avg_confidence=pd.NamedAgg(column="v_confidence_score", aggfunc="mean"),
)
# Turn the horse_id index back into a regular column so it can be merged on.
vet_agg_df = vet_summary.reset_index()

# ==============================
# MERGE FINAL
# ==============================
# Join step by step: horses with their listings (inner join), then seller
# attributes and the per-horse vet summary (both left joins).
horses_with_listings = pd.merge(horses_df, listings_df, on="horse_id", how="inner")
listings_with_sellers = pd.merge(
    horses_with_listings, sellers_df, on="seller_id", how="left"
)
final_df = pd.merge(listings_with_sellers, vet_agg_df, on="horse_id", how="left")

# ==============================
# NUEVAS COLUMNAS: Países aleatorios
# ==============================
# Fill each country column with an independent random country per row; the
# current-country column is filled completely before the birth-country one.
row_count = len(final_df)
for country_column in ("h_current_country", "h_birth_country"):
    final_df[country_column] = [fake.country() for _ in range(row_count)]

# ==============================
# VALIDACIÓN
# ==============================
# Report the (rows, columns) shape of the main tables as a quick sanity check.
shape_report = (
    ("Horses:", horses_df),
    ("Listings:", listings_df),
    ("Final dataset:", final_df),
)
for label, frame in shape_report:
    print(label, frame.shape)

# Save the merged dataset as CSV, without the DataFrame index column.
DATA_PATH = "/workspaces/S02-26-Equipo-30-Web-App-/data/horsetrust_database.csv"
final_df.to_csv(DATA_PATH, index=False)
print("✅ Dataset guardado en:", DATA_PATH)


Horses: (1000, 14)
Listings: (1000, 6)
Final dataset: (1000, 32)
✅ Dataset guardado en: /workspaces/S02-26-Equipo-30-Web-App-/data/horsetrust_database.csv
