# Simulacion de tracking de usuarios

No contamos con datos reales de tracking por restricciones de privacidad.
Por lo tanto, construimos un synthetic user behavior dataset calibrado usando el dataset de E-Commerce Behavior Data de Rees46.

## Packages

In [1]:
from pathlib import Path
import os
import requests
import zipfile
import tarfile
import gzip
import shutil
from typing import Literal
from urllib.parse import urlparse
import pandas as pd
from collections import Counter, defaultdict
from faker import Faker
from faker.providers import BaseProvider
import random
from babel import Locale
import pycountry
import numpy as np
import uuid
from datetime import datetime, timedelta

# Descargando el dataset de la competencia [E-Commerce Behavior Data de Rees46](https://rees46.com/en/datasets)

In [2]:
def download_and_prepare(
    url: str,
    download_path: str,
    extract_dir: str | None = None,
    chunk_size: int = 8192,
    mode: Literal["download", "extract", "download_and_extract"] = "download_and_extract",
) -> None:
    """
    Descarga y/o extrae un archivo desde una URL.

    Soporta:
    - ZIP (.zip)
    - TAR.GZ / TGZ (.tar.gz, .tgz)
    - GZ simple (.gz → un solo archivo)
    - Archivos sin compresión (solo descarga)

    Modes
    -----
    - download: solo descarga
    - extract: solo extrae (el archivo debe existir)
    - download_and_extract: descarga y luego extrae (default)
    """

    download_path = Path(download_path)
    os.makedirs(download_path.parent, exist_ok=True)

    # =====================
    # Asegurar extensión
    # =====================
    if download_path.suffix == "":
        url_path = Path(urlparse(url).path)
        if url_path.suffix:
            download_path = download_path.with_suffix(url_path.suffix)

    # =====================
    # Descargar
    # =====================
    if mode in {"download", "download_and_extract"}:
        if not download_path.exists():
            print(f"Descargando archivo desde {url} ...")
            response = requests.get(url, stream=True)
            response.raise_for_status()

            with open(download_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)

            print("Descarga completada.")
        else:
            print("El archivo ya existe, no se descargará.")

    # =====================
    # Extraer
    # =====================
    if mode not in {"extract", "download_and_extract"}:
        return

    if not download_path.exists():
        raise FileNotFoundError(
            f"No existe el archivo {download_path}. "
            "No se puede extraer."
        )

    is_zip = zipfile.is_zipfile(download_path)
    is_tar = tarfile.is_tarfile(download_path)
    is_gz = download_path.suffix == ".gz" and not is_tar

    if not is_zip and not is_tar and not is_gz:
        print("El archivo no es ZIP, TAR.GZ ni GZ. No se requiere extracción.")
        return

    # =====================
    # Directorio de extracción
    # =====================
    if extract_dir is None:
        extract_dir = download_path.as_posix() + "_extracted"

    extract_dir = Path(extract_dir)

    if extract_dir.exists():
        print("La carpeta de extracción ya existe, no se descomprimirá.")
        return

    os.makedirs(extract_dir, exist_ok=True)

    # =====================
    # ZIP
    # =====================
    if is_zip:
        print("Archivo ZIP detectado. Descomprimiendo...")
        with zipfile.ZipFile(download_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)

    # =====================
    # TAR / TAR.GZ / TGZ
    # =====================
    elif is_tar:
        print("Archivo TAR.GZ detectado. Descomprimiendo...")
        with tarfile.open(download_path, "r:*") as tar_ref:
            tar_ref.extractall(extract_dir)

    # =====================
    # GZ simple
    # =====================
    elif is_gz:
        print("Archivo GZ detectado. Descomprimiendo...")

        output_file = extract_dir / download_path.stem  # quita .gz

        with gzip.open(download_path, "rb") as f_in:
            with open(output_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

        print(f"Archivo extraído: {output_file}")

    print("Archivo descomprimido correctamente.")
    return

In [3]:
url = "https://data.rees46.com/datasets/marketplace/2020-Apr.csv.gz"
download_path = "../../data/tracking/rees/2020-Apr.csv.gz"
extract_dir = "../../data/tracking/rees/extracted"

download_and_prepare(url=url, download_path=download_path, extract_dir=extract_dir, mode='extract')

La carpeta de extracción ya existe, no se descomprimirá.


# Cargando datasets

## Cargamos el set de datos de Rees46 por chunks, extraemos la distribucion del target y extraemos una muestra mas pequeña con la misma distribucion

In [4]:
# ============================================================
# CONFIGURACIÓN GENERAL
# ============================================================
file_path = Path("../../data/tracking/rees/extracted/2020-Apr_sample.parquet")
SOURCE_CSV = "../../data/tracking/rees/extracted/2020-Apr.csv"

COL = "event_type"
CHUNKSIZE = 200_000
TARGET = 1_000_000
RANDOM_STATE = 42

# ============================================================
# CREACIÓN DEL DATASET SAMPLEADO (solo si no existe)
# ============================================================
if not file_path.exists():
    # --------------------------------------------------------
    # 1. CÁLCULO DE DISTRIBUCIÓN ORIGINAL POR CLASE
    # --------------------------------------------------------
    counts = Counter()
    total = 0

    for chunk in pd.read_csv(SOURCE_CSV, chunksize=CHUNKSIZE):
        counts.update(chunk[COL].value_counts().to_dict())
        total += len(chunk)

    dist = {k: v / total for k, v in counts.items()}
    print("Distribución original:", dist)

    # --------------------------------------------------------
    # 2. CÁLCULO DE OBJETIVO POR CLASE
    # --------------------------------------------------------
    target_per_class = {
        k: int(v * TARGET)
        for k, v in dist.items()
    }

    # Ajuste fino por redondeo
    diff = TARGET - sum(target_per_class.values())
    if diff != 0:
        biggest = max(target_per_class, key=target_per_class.get)
        target_per_class[biggest] += diff

    print("Objetivo por clase:", target_per_class)

    # --------------------------------------------------------
    # 3. SAMPLEO ESTRATIFICADO POR CHUNKS
    # --------------------------------------------------------
    selected = defaultdict(list)

    for chunk in pd.read_csv(SOURCE_CSV, chunksize=CHUNKSIZE):

        for cls, n_target in target_per_class.items():
            remaining = n_target - sum(len(x) for x in selected[cls])

            if remaining <= 0:
                continue

            rows = chunk[chunk[COL] == cls]

            if rows.empty:
                continue

            take = min(len(rows), remaining)

            sampled = rows.sample(
                n=take,
                random_state=RANDOM_STATE
            )

            selected[cls].append(sampled)

        # Corte temprano si ya alcanzamos el target global
        total_selected = sum(
            sum(len(x) for x in v)
            for v in selected.values()
        )

        if total_selected >= TARGET:
            break

    # --------------------------------------------------------
    # 4. CONSOLIDACIÓN Y GUARDADO
    # --------------------------------------------------------
    df_sessions = pd.concat(
        [pd.concat(v) for v in selected.values()],
        ignore_index=True
    )

    df_sessions.to_parquet(file_path)

# ============================================================
# CARGA FINAL DEL DATASET
# ============================================================

df_sessions = pd.read_parquet(file_path)
df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-04-01 02:37:24 UTC,view,1005225,2232732093077520756,construction.tools.light,samsung,178.51,565276246,a37bfb5b-8a32-4e64-bba2-1e9e4f600b88
1,2020-04-01 00:19:24 UTC,view,1005287,2232732093077520756,construction.tools.light,xiaomi,308.55,550480042,c13278a7-b518-4053-9145-f99665caf677
2,2020-04-01 03:27:48 UTC,view,1002525,2232732093077520756,construction.tools.light,apple,658.96,552255608,c949a06d-1c14-42ca-8b49-98a2060c2150
3,2020-04-01 00:47:47 UTC,view,33600000,2070005009172398851,,fiesta,291.87,578951037,c12adc7f-dc27-4471-b49a-35ceb63709b6
4,2020-04-01 01:34:00 UTC,view,100159174,2053013555220840837,appliances.kitchen.juicer,adata,64.44,621157362,e886a3df-5b88-4252-9130-ed0c4cc38773


In [5]:
len(df_sessions)

1000000

## Cargamos los set de datos de caballos y los concatenamos

In [6]:
pd.set_option("display.max_columns", None)
df_horses_en = pd.read_parquet("../../data/clean/equinenow_horses_listings_limpio.parquet")
df_horses_en.head()

Unnamed: 0,Horse_ID,Breed,Name,Gender,In Foal,Height (hh),Weight (lbs),Temperament,Ad Created,Last Update,Location,Price,Horse Profile,Skills,Comments,Shipping,Company Name,Company Profile,Color,Registry,Markings,Age,Has_Face_Markings
0,1605454,miniature,hershey's lucy,filly,no,6.0,150.0,5 10,2026-01-22,2026-01-22,"millersburg, oh",1900.0,httpswww.equinenow.comhorse-ad-1605454,"kid safe, therapy",hershey's lucy is born 8-30-25. here is the mo...,we can help you arrange shipping,hershberger's farm,httpswww.equinenow.comfarmhershbergers-farm.htm,sin información,sin información,sin información,0.5,0
1,1601263,quarter horse,bar x pumbaa,gelding,no,14.2,1000.0,1 10,2025-12-01,2026-02-06,"san antonio, tx",35000.0,httpswww.equinenow.comhorse-ad-1601263,"all around, athletic, beginner, flashy, husban...",elite aqha gelding proven bloodlines family fr...,pumba is located at our ranch in arkansas with...,horse of my dreams,httpswww.equinenow.comfarmhorse-of-my-dreams-4...,bay,aqha,sin información,6.1,0
2,1596672,standardbred,sin información,filly,no,16.0,750.0,5 10,2026-01-12,2026-01-12,"blairsville, pa",4500.0,httpswww.equinenow.comhorse-ad-1596672,sin información,lacy is a fancy colored blue eyed splash white...,sheer heaven pet resort and equestrian center ...,sheer heaven farm,httpswww.equinenow.comfarmsheerheavenfarm.htm,bay,usta,"splash white, and blue eyes",7.65,0
3,1602502,draft,sherman tank,gelding,no,15.3,1900.0,1 10,2025-12-11,2025-12-11,"ava, mo",12500.0,httpswww.equinenow.comhorse-ad-1602502,"cross, draft, driving, experienced, flashy, fl...",meet our sherman tankhe's stouta 5yo - coming ...,delivery hauling available,sin información,sin información,chestnut,sin información,solid,6.0,0
4,1603216,quarter horse,invitation to roan,gelding,no,16.0,1200.0,6 10,2025-12-21,2025-12-21,"auburn, pa",7500.0,httpswww.equinenow.comhorse-ad-1603216,"all around, ranch, ranch work, trail, trail ri...","rio is a 2017 gelding. hes an amazing horse, b...",sin información,sin información,sin información,buckskin,aqha,roan,8.8,0


In [7]:
len(df_horses_en)

1905

In [8]:
df_horses_hd = pd.read_parquet("../../data/clean/horsedeals_horses_listings_limpio.parquet")
df_horses_hd.head()

Unnamed: 0,Horse_ID,Breed,Name,Gender,Foal Date,In Foal,Height (hh),Weight (lbs),Temperament,Ad Created,Last Update,Location,Price,Horse Profile,Skills,Comments,Shipping,Company Name,Company Profile,Color,Markings
0,6062071769402210483,paints,stunning mare with a super trainable temperament,mare,2022-02-12,unknown,14.1,1100.0,unknown,2026-02-11,2026-02-11,garfield vic,15000.0,https://www.horsedeals.com.au/classifieds/item...,"allrounders, , interschool",jigsaw is an exciting young mare with a super ...,unknown,unknown,unknown,unknown,unknown
1,4919341762915834996,arabian & arabian derivative,anglo arab pony,gelding,2003-02-17,unknown,15.2,1100.0,unknown,2026-02-11,2026-02-11,"avoca, vic",3150.0,https://www.horsedeals.com.au/classifieds/item...,"endurance, , allrounders","great pony, regretfully having to move him on ...",unknown,unknown,unknown,unknown,unknown
2,6096571770359476081,paints,dame 2024 paint x qh filly,filly,2025-02-11,unknown,14.3,1100.0,unknown,2026-02-11,2026-02-11,"weranga, qld",5000.0,https://www.horsedeals.com.au/classifieds/item...,project,triangle february online horse sale**as descri...,unknown,unknown,unknown,unknown,unknown
3,4519141765256036460,australian stock horses,double destiny,colt,2025-02-11,unknown,15.0,1100.0,unknown,2026-02-11,2026-02-11,"kingaroy, qld",6600.0,https://www.horsedeals.com.au/classifieds/item...,"campdrafting, , challenge",very nice type well grown yearling colt ash 2...,unknown,unknown,unknown,unknown,unknown
4,5994061767858464097,gypsy cob,kallarroo lava-gypsy cob filly,filly,2026-02-11,unknown,14.3,1100.0,unknown,2026-02-11,2026-02-11,"nimmitabel, nsw",18000.0,https://www.horsedeals.com.au/classifieds/item...,"allrounders, , breeding","friendly, curious filly born nov 2025. will be...",unknown,unknown,unknown,unknown,unknown


In [9]:
len(df_horses_hd)

1083

In [10]:
df_horses = pd.concat([df_horses_en, df_horses_hd])
df_horses.head()

Unnamed: 0,Horse_ID,Breed,Name,Gender,In Foal,Height (hh),Weight (lbs),Temperament,Ad Created,Last Update,Location,Price,Horse Profile,Skills,Comments,Shipping,Company Name,Company Profile,Color,Registry,Markings,Age,Has_Face_Markings,Foal Date
0,1605454,miniature,hershey's lucy,filly,no,6.0,150.0,5 10,2026-01-22,2026-01-22,"millersburg, oh",1900.0,httpswww.equinenow.comhorse-ad-1605454,"kid safe, therapy",hershey's lucy is born 8-30-25. here is the mo...,we can help you arrange shipping,hershberger's farm,httpswww.equinenow.comfarmhershbergers-farm.htm,sin información,sin información,sin información,0.5,0.0,NaT
1,1601263,quarter horse,bar x pumbaa,gelding,no,14.2,1000.0,1 10,2025-12-01,2026-02-06,"san antonio, tx",35000.0,httpswww.equinenow.comhorse-ad-1601263,"all around, athletic, beginner, flashy, husban...",elite aqha gelding proven bloodlines family fr...,pumba is located at our ranch in arkansas with...,horse of my dreams,httpswww.equinenow.comfarmhorse-of-my-dreams-4...,bay,aqha,sin información,6.1,0.0,NaT
2,1596672,standardbred,sin información,filly,no,16.0,750.0,5 10,2026-01-12,2026-01-12,"blairsville, pa",4500.0,httpswww.equinenow.comhorse-ad-1596672,sin información,lacy is a fancy colored blue eyed splash white...,sheer heaven pet resort and equestrian center ...,sheer heaven farm,httpswww.equinenow.comfarmsheerheavenfarm.htm,bay,usta,"splash white, and blue eyes",7.65,0.0,NaT
3,1602502,draft,sherman tank,gelding,no,15.3,1900.0,1 10,2025-12-11,2025-12-11,"ava, mo",12500.0,httpswww.equinenow.comhorse-ad-1602502,"cross, draft, driving, experienced, flashy, fl...",meet our sherman tankhe's stouta 5yo - coming ...,delivery hauling available,sin información,sin información,chestnut,sin información,solid,6.0,0.0,NaT
4,1603216,quarter horse,invitation to roan,gelding,no,16.0,1200.0,6 10,2025-12-21,2025-12-21,"auburn, pa",7500.0,httpswww.equinenow.comhorse-ad-1603216,"all around, ranch, ranch work, trail, trail ri...","rio is a 2017 gelding. hes an amazing horse, b...",sin información,sin información,sin información,buckskin,aqha,roan,8.8,0.0,NaT


In [11]:
len(df_horses)

2988

# Generando informacion sintetica de usuarios

## Creador de perfil de usuarios

In [13]:
class EquestrianJobProvider(BaseProvider):
    def equestrian_job(self):
        equestrian_jobs = []
        with open("../../data/tracking/equestrian_jobs.txt") as f:
            for line in f:
                if not line.startswith('#'):
                    equestrian_jobs.append(line.strip())
        return random.choice(equestrian_jobs)

def user_info_for_country(country):
    faker = COUNTRY_TO_LOCALE.get(country)
    if faker is None:
        return None

    faker.add_provider(EquestrianJobProvider)
    gender = random.choice(["male", "female"])

    if gender == "male":
        first_name = faker.first_name_male()
    else:
        first_name = faker.first_name_female()

    last_name = faker.last_name()
    name = f"{first_name} {last_name}"
    city = faker.city()
    address = faker.street_address()
    phone = faker.phone_number()
    email = f"{first_name}.{last_name}@{faker.free_email_domain()}".lower()
    credit_card = faker.credit_card_full().split("\n")
    credit_card[1] = name
    credit_card = "\n".join(credit_card)
    job = {
        "title": faker.equestrian_job(),
        "company": faker.company(),
        "suffix": faker.company_suffix()
    }

    return name, gender, email, phone, city, address, credit_card, job

def country_from_locale(locale_str: str):
    locale = Locale.parse(locale_str)

    country_code = locale.territory  # ej: 'AR', 'US'
    if country_code is None:
        return None

    country = pycountry.countries.get(alpha_2=country_code)
    return country.name if country else None

locales = set()
with open("../../data/tracking/locale.txt") as f:
    for locale in f.readlines():
        locale = locale.replace("\n", "")
        country_name = country_from_locale(locale)
        locales.add((country_name, locale))

locales = list(locales)
COUNTRY_TO_LOCALE = {key: Faker(value) for key, value in locales}

print(user_info_for_country("United Kingdom"))

('Dennis Morgan', 'male', 'dennis.morgan@yahoo.co.uk', '+441414960898', 'New Martinport', '706 Wood Corner', 'VISA 13 digit\nDennis Morgan\n4267422680802 09/27\nCVC: 138\n', {'title': 'Bloodstock agent', 'company': 'Simpson-Evans', 'suffix': 'LLC'})


## Set de datos de usuarios

In [14]:
def generate_users(n_users: int, locales: list):
    devices = ["mobile", "desktop"]
    sources = ["organic", "paid", "referral"]
    base_date = datetime(2020, 1, 1)
    
    users = []
    for _ in range(n_users):
        user_id = str(uuid.uuid4())
        X = np.random.randint(1, 1800)
        first_seen = base_date - timedelta(days=X)
        country_id = np.random.choice(len(locales))
        country, _ = locales[country_id]
        name, gender, email, phone, city, address, credit_card, job = user_info_for_country(country)
        users.append({
            "user_id": user_id,
            "name": name,
            "gender": gender,
            "country": country, 
            "city": city,
            "addres": address, 
            "credit_card_info": credit_card, 
            "email": email, 
            "phone_number": phone,
            "job_info": job,
            "device_type": np.random.choice(devices, p=[0.6, 0.4]),
            "traffic_source": np.random.choice(sources, p=[0.7, 0.2, 0.1]),
            "first_seen": first_seen.date()
        })

    return pd.DataFrame(users)

df_users = generate_users(n_users=200000, locales=locales)
df_users.head()

Unnamed: 0,user_id,name,gender,country,city,addres,credit_card_info,email,phone_number,job_info,device_type,traffic_source,first_seen
0,2eaf17c0-e972-453b-a1e1-b80faa1ca344,Daire Mac Breen,male,Ireland,Nikki Ville,12 Carbery Street,Maestro\nDaire Mac Breen\n676189742924 09/31\n...,daire.mac breen@yahoo.com,475.914.9686x4418,"{'title': 'Equine biomechanics specialist', 'c...",mobile,paid,2018-04-04
1,6b5f323f-65cd-4f7b-b3d4-b868a8ff77b4,Erick Santos,male,Brazil,Caldeira do Norte,"Trecho de Costela, 69",VISA 16 digit\nErick Santos\n4636098634562258 ...,erick.santos@gmail.com,+55 (071) 2659-3856,"{'title': 'Sport horse syndicate member', 'com...",mobile,referral,2017-11-21
2,64796621-90e7-438d-a7c1-150412ef0f7f,Bianca Martinez,female,Argentina,San Ferando del Valle de Catamarca,Avenida Santiago del Estero N° 72,Maestro\nBianca Martinez\n630415022817 08/29\n...,bianca.martinez@yahoo.com,+54 15 2471 0580,"{'title': 'Equine biomechanics specialist', 'c...",mobile,organic,2017-07-25
3,c2fcf023-dd1f-4557-ab91-5207d3c79253,Sheryl Yates,female,Cyprus,Port Johnny,602 Lopez Camp,Diners Club / Carte Blanche\nSheryl Yates\n304...,sheryl.yates@hotmail.com,554.944.9594x5881,{'title': 'Sports marketing director (equestri...,mobile,organic,2016-01-24
4,91a228a2-71ac-48e5-b0ad-2844dc1a8191,Gonzalo Muñoz,male,Argentina,San Ferando del Valle de Catamarca,Avenida 1 N° 726,VISA 16 digit\nGonzalo Muñoz\n4395379143620063...,gonzalo.muñoz@yahoo.com,+54 9 3811 9531,"{'title': 'Professional polo player', 'company...",desktop,organic,2015-04-07


In [15]:
df_users.to_parquet("../../data/clean/users_info.parquet")

## Asignando a cada sesion un usuario

In [16]:
def assign_sessions_to_users(df_sessions, users_ids, seed=42):
    rng = np.random.default_rng(seed)
    sessions = df_sessions["user_session"].unique().tolist()

    if len(users_ids) < len(sessions):
        raise ValueError("No hay suficientes users para asignar sesiones")

    rng.shuffle(sessions)
    rng.shuffle(users_ids)

    session_to_user = dict(zip(sessions, users_ids[:len(sessions)]))

    # asignación vectorizada
    df_sessions = df_sessions.copy()
    df_sessions["user_id"] = df_sessions["user_session"].map(session_to_user)

    return df_sessions

users_ids = df_users["user_id"].astype(str).tolist()

df_sessions = assign_sessions_to_users(df_sessions, users_ids)
df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-04-01 02:37:24 UTC,view,1005225,2232732093077520756,construction.tools.light,samsung,178.51,0d243a42-c44e-4a1d-837f-b5ba173903b8,a37bfb5b-8a32-4e64-bba2-1e9e4f600b88
1,2020-04-01 00:19:24 UTC,view,1005287,2232732093077520756,construction.tools.light,xiaomi,308.55,90d96f27-730b-4b8c-9b45-916c34b5e185,c13278a7-b518-4053-9145-f99665caf677
2,2020-04-01 03:27:48 UTC,view,1002525,2232732093077520756,construction.tools.light,apple,658.96,875103a5-2170-438f-9721-1019f3571196,c949a06d-1c14-42ca-8b49-98a2060c2150
3,2020-04-01 00:47:47 UTC,view,33600000,2070005009172398851,,fiesta,291.87,536b89ce-952c-4f87-bdb0-cbf225c822e8,c12adc7f-dc27-4471-b49a-35ceb63709b6
4,2020-04-01 01:34:00 UTC,view,100159174,2053013555220840837,appliances.kitchen.juicer,adata,64.44,76681f1a-767e-4a35-bcc0-43c85bce51ea,e886a3df-5b88-4252-9130-ed0c4cc38773


# Relacionando categorias visitadas en cada sesion con cada categoria de caballo con el metodo Rank-based Projection

## Checkeamos las distribuciones de cada set de categorias

In [17]:
product_categories_dist = df_sessions.category_code.value_counts(normalize=True)
product_categories_dist

category_code
construction.tools.light               0.263342
electronics.audio.headphone            0.156989
appliances.personal.massager           0.040773
appliances.kitchen.refrigerators       0.037350
sport.bicycle                          0.032916
                                         ...   
computers.components.hdd               0.000012
appliances.kitchen.microwave           0.000008
appliances.kitchen.steam_cooker        0.000008
appliances.environment.water_heater    0.000004
appliances.kitchen.coffee_machine      0.000004
Name: proportion, Length: 136, dtype: float64

In [18]:
horse_categories_dist = df_horses.Breed.value_counts(normalize=True)
horse_categories_dist

Breed
quarter horse          0.178046
thoroughbred           0.125837
quarter horses         0.048862
gypsy vanner           0.033802
andalusian             0.033467
                         ...   
suffolk punch          0.000335
american saddlebred    0.000335
baroque                0.000335
dartmoor               0.000335
waler                  0.000335
Name: proportion, Length: 100, dtype: float64

## Balanceamos las cantidades agrupando la cola sobrante de categorias

In [19]:
product_categories = product_categories_dist.index.to_list()
product_categories = product_categories[: len(horse_categories_dist)-1]

df_sessions.loc[~df_sessions.category_code.isin(product_categories), "category_code"] = "nc"

product_categories_dist = df_sessions.category_code.value_counts(normalize=True)
product_categories_dist

category_code
construction.tools.light            0.241041
electronics.audio.headphone         0.143695
nc                                  0.088568
appliances.personal.massager        0.037320
appliances.kitchen.refrigerators    0.034187
                                      ...   
auto.accessories.videoregister      0.000483
kids.carriage                       0.000480
apparel.jumper                      0.000474
accessories.umbrella                0.000449
construction.tools.painting         0.000380
Name: proportion, Length: 100, dtype: float64

## Asignamos las categorías más frecuentes de A a las más frecuentes de B

In [20]:
horse_categories = horse_categories_dist.index.to_list()
product_categories = product_categories_dist.index.to_list()

df_sessions.category_code = df_sessions.category_code.map(dict(zip(product_categories, horse_categories)))
df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-04-01 02:37:24 UTC,view,1005225,2232732093077520756,quarter horse,samsung,178.51,0d243a42-c44e-4a1d-837f-b5ba173903b8,a37bfb5b-8a32-4e64-bba2-1e9e4f600b88
1,2020-04-01 00:19:24 UTC,view,1005287,2232732093077520756,quarter horse,xiaomi,308.55,90d96f27-730b-4b8c-9b45-916c34b5e185,c13278a7-b518-4053-9145-f99665caf677
2,2020-04-01 03:27:48 UTC,view,1002525,2232732093077520756,quarter horse,apple,658.96,875103a5-2170-438f-9721-1019f3571196,c949a06d-1c14-42ca-8b49-98a2060c2150
3,2020-04-01 00:47:47 UTC,view,33600000,2070005009172398851,quarter horses,fiesta,291.87,536b89ce-952c-4f87-bdb0-cbf225c822e8,c12adc7f-dc27-4471-b49a-35ceb63709b6
4,2020-04-01 01:34:00 UTC,view,100159174,2053013555220840837,irish draught,adata,64.44,76681f1a-767e-4a35-bcc0-43c85bce51ea,e886a3df-5b88-4252-9130-ed0c4cc38773


# Asignando un caballo a cada sesion basandose en la categoria principal visitada

In [21]:
def assign_horses(df_sessions, df_horses, seed=42):
    rng = np.random.default_rng(seed)

    # Breed -> Horse_IDs
    horses_by_breed = (
        df_horses
        .groupby("Breed")["Horse_ID"]
        .apply(np.array)
        .to_dict()
    )

    # Para cada fila, samplear un Horse_ID válido
    def sample_horse(breed):
        return rng.choice(horses_by_breed[breed])

    df_sessions = df_sessions.copy()
    df_sessions["Horse_ID"] = df_sessions["category_code"].map(sample_horse)

    return df_sessions

df_sessions = assign_horses(df_sessions, df_horses, seed=42)
df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,Horse_ID
0,2020-04-01 02:37:24 UTC,view,1005225,2232732093077520756,quarter horse,samsung,178.51,0d243a42-c44e-4a1d-837f-b5ba173903b8,a37bfb5b-8a32-4e64-bba2-1e9e4f600b88,1605424
1,2020-04-01 00:19:24 UTC,view,1005287,2232732093077520756,quarter horse,xiaomi,308.55,90d96f27-730b-4b8c-9b45-916c34b5e185,c13278a7-b518-4053-9145-f99665caf677,1605377
2,2020-04-01 03:27:48 UTC,view,1002525,2232732093077520756,quarter horse,apple,658.96,875103a5-2170-438f-9721-1019f3571196,c949a06d-1c14-42ca-8b49-98a2060c2150,1604124
3,2020-04-01 00:47:47 UTC,view,33600000,2070005009172398851,quarter horses,fiesta,291.87,536b89ce-952c-4f87-bdb0-cbf225c822e8,c12adc7f-dc27-4471-b49a-35ceb63709b6,6526461770017608019
4,2020-04-01 01:34:00 UTC,view,100159174,2053013555220840837,irish draught,adata,64.44,76681f1a-767e-4a35-bcc0-43c85bce51ea,e886a3df-5b88-4252-9130-ed0c4cc38773,1596697


# Borrando columnas redundantes

In [22]:
df_sessions = df_sessions[["user_id", "user_session", "Horse_ID", "event_type", "event_time"]]
df_sessions.head()

Unnamed: 0,user_id,user_session,Horse_ID,event_type,event_time
0,0d243a42-c44e-4a1d-837f-b5ba173903b8,a37bfb5b-8a32-4e64-bba2-1e9e4f600b88,1605424,view,2020-04-01 02:37:24 UTC
1,90d96f27-730b-4b8c-9b45-916c34b5e185,c13278a7-b518-4053-9145-f99665caf677,1605377,view,2020-04-01 00:19:24 UTC
2,875103a5-2170-438f-9721-1019f3571196,c949a06d-1c14-42ca-8b49-98a2060c2150,1604124,view,2020-04-01 03:27:48 UTC
3,536b89ce-952c-4f87-bdb0-cbf225c822e8,c12adc7f-dc27-4471-b49a-35ceb63709b6,6526461770017608019,view,2020-04-01 00:47:47 UTC
4,76681f1a-767e-4a35-bcc0-43c85bce51ea,e886a3df-5b88-4252-9130-ed0c4cc38773,1596697,view,2020-04-01 01:34:00 UTC


# Normalizando nombres

In [23]:
df_sessions = df_sessions.rename(
    columns={
        "Horse_ID": "horse_id",
    }
)
df_sessions.head()

Unnamed: 0,user_id,user_session,horse_id,event_type,event_time
0,0d243a42-c44e-4a1d-837f-b5ba173903b8,a37bfb5b-8a32-4e64-bba2-1e9e4f600b88,1605424,view,2020-04-01 02:37:24 UTC
1,90d96f27-730b-4b8c-9b45-916c34b5e185,c13278a7-b518-4053-9145-f99665caf677,1605377,view,2020-04-01 00:19:24 UTC
2,875103a5-2170-438f-9721-1019f3571196,c949a06d-1c14-42ca-8b49-98a2060c2150,1604124,view,2020-04-01 03:27:48 UTC
3,536b89ce-952c-4f87-bdb0-cbf225c822e8,c12adc7f-dc27-4471-b49a-35ceb63709b6,6526461770017608019,view,2020-04-01 00:47:47 UTC
4,76681f1a-767e-4a35-bcc0-43c85bce51ea,e886a3df-5b88-4252-9130-ed0c4cc38773,1596697,view,2020-04-01 01:34:00 UTC


# Guardando set de datos de tracking

In [None]:
df_sessions.to_parquet("../../data/clean/horses_sessions_info.parquet")