# Simulación de tracking de usuarios

No contamos con datos reales de tracking por restricciones de privacidad.
Por lo tanto, construimos un synthetic user behavior dataset calibrado usando el dataset de E-Commerce Behavior Data de Rees46.

## Packages

In [32]:
from pathlib import Path
import os
import requests
import zipfile
import tarfile
import gzip
import shutil
from typing import Literal
from urllib.parse import urlparse
import pandas as pd
from collections import Counter, defaultdict
from faker import Faker
from faker.providers import BaseProvider
import random
from babel import Locale
import pycountry
import numpy as np
import uuid
from datetime import datetime, timedelta

# Descargando el dataset de la competencia [E-Commerce Behavior Data de Rees46](https://rees46.com/en/datasets)

In [33]:
def download_and_prepare(
    url: str,
    download_path: str,
    extract_dir: str | None = None,
    chunk_size: int = 8192,
    mode: Literal["download", "extract", "download_and_extract"] = "download_and_extract",
) -> None:
    """
    Descarga y/o extrae un archivo desde una URL.

    Soporta:
    - ZIP (.zip)
    - TAR.GZ / TGZ (.tar.gz, .tgz)
    - GZ simple (.gz → un solo archivo)
    - Archivos sin compresión (solo descarga)

    Modes
    -----
    - download: solo descarga
    - extract: solo extrae (el archivo debe existir)
    - download_and_extract: descarga y luego extrae (default)
    """

    download_path = Path(download_path)
    os.makedirs(download_path.parent, exist_ok=True)

    # =====================
    # Asegurar extensión
    # =====================
    if download_path.suffix == "":
        url_path = Path(urlparse(url).path)
        if url_path.suffix:
            download_path = download_path.with_suffix(url_path.suffix)

    # =====================
    # Descargar
    # =====================
    if mode in {"download", "download_and_extract"}:
        if not download_path.exists():
            print(f"Descargando archivo desde {url} ...")
            response = requests.get(url, stream=True)
            response.raise_for_status()

            with open(download_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)

            print("Descarga completada.")
        else:
            print("El archivo ya existe, no se descargará.")

    # =====================
    # Extraer
    # =====================
    if mode not in {"extract", "download_and_extract"}:
        return

    if not download_path.exists():
        raise FileNotFoundError(
            f"No existe el archivo {download_path}. "
            "No se puede extraer."
        )

    is_zip = zipfile.is_zipfile(download_path)
    is_tar = tarfile.is_tarfile(download_path)
    is_gz = download_path.suffix == ".gz" and not is_tar

    if not is_zip and not is_tar and not is_gz:
        print("El archivo no es ZIP, TAR.GZ ni GZ. No se requiere extracción.")
        return

    # =====================
    # Directorio de extracción
    # =====================
    if extract_dir is None:
        extract_dir = download_path.as_posix() + "_extracted"

    extract_dir = Path(extract_dir)

    if extract_dir.exists():
        print("La carpeta de extracción ya existe, no se descomprimirá.")
        return

    os.makedirs(extract_dir, exist_ok=True)

    # =====================
    # ZIP
    # =====================
    if is_zip:
        print("Archivo ZIP detectado. Descomprimiendo...")
        with zipfile.ZipFile(download_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)

    # =====================
    # TAR / TAR.GZ / TGZ
    # =====================
    elif is_tar:
        print("Archivo TAR.GZ detectado. Descomprimiendo...")
        with tarfile.open(download_path, "r:*") as tar_ref:
            tar_ref.extractall(extract_dir)

    # =====================
    # GZ simple
    # =====================
    elif is_gz:
        print("Archivo GZ detectado. Descomprimiendo...")

        output_file = extract_dir / download_path.stem  # quita .gz

        with gzip.open(download_path, "rb") as f_in:
            with open(output_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

        print(f"Archivo extraído: {output_file}")

    print("Archivo descomprimido correctamente.")
    return

In [34]:
# Remote archive, local download target and extraction directory for the April 2020 file.
url = "https://data.rees46.com/datasets/marketplace/2020-Apr.csv.gz"
download_path = "../../data/tracking/rees/2020-Apr.csv.gz"
extract_dir = "../../data/tracking/rees/extracted"

# mode='extract' skips the download step, so the .gz must already be at download_path.
download_and_prepare(url=url, download_path=download_path, extract_dir=extract_dir, mode='extract')

La carpeta de extracción ya existe, no se descomprimirá.


# Cargando datasets

## Cargamos el set de datos de Rees46 por chunks, extraemos la distribución del target y extraemos una muestra más pequeña con la misma distribución

In [36]:
# ============================================================
# GENERAL CONFIGURATION
# ============================================================
# file_path is the cached stratified sample (built below on the first run);
# SOURCE_CSV is the raw CSV it is drawn from.
file_path = Path("../../data/tracking/rees/extracted/2020-Apr_sample.parquet")
SOURCE_CSV = "../../data/tracking/rees/extracted/2020-Apr.csv"

COL = "event_type"      # stratification column
CHUNKSIZE = 200_000     # rows per chunk while streaming the CSV
TARGET = 1_000_000      # total number of rows wanted in the sample
RANDOM_STATE = 84       # seed used for every per-chunk draw

# ============================================================
# BUILD THE SAMPLED DATASET (only if it does not exist yet)
# ============================================================
if not file_path.exists():
    # --------------------------------------------------------
    # 1. ORIGINAL CLASS DISTRIBUTION (first pass over the CSV)
    # --------------------------------------------------------
    counts = Counter()
    total = 0

    for chunk in pd.read_csv(SOURCE_CSV, chunksize=CHUNKSIZE):
        # to_dict() hands Counter a real mapping of label -> count.
        counts.update(chunk[COL].value_counts().to_dict())
        total += len(chunk)

    if not counts:
        raise ValueError(f"El archivo {SOURCE_CSV} no contiene filas con '{COL}'.")

    dist = {k: v / total for k, v in counts.items()}
    print("Distribución original:", dist)

    # --------------------------------------------------------
    # 2. TARGET NUMBER OF ROWS PER CLASS
    # --------------------------------------------------------
    target_per_class = {
        k: int(v * TARGET)
        for k, v in dist.items()
    }

    # int() truncates, so hand the rounding remainder to the largest class.
    diff = TARGET - sum(target_per_class.values())
    if diff != 0:
        biggest = max(target_per_class, key=target_per_class.get)
        target_per_class[biggest] += diff

    print("Objetivo por clase:", target_per_class)

    # --------------------------------------------------------
    # 3. STRATIFIED SAMPLING, CHUNK BY CHUNK (second pass)
    # --------------------------------------------------------
    # NOTE(review): every class takes as many rows as it still needs from
    # each chunk, in file order, and the loop stops once all targets are met.
    # The sample therefore comes from the first chunks of the CSV only; if
    # the file is sorted by event_time (not verified here) it covers just the
    # earliest part of the month.
    selected = defaultdict(list)
    taken = dict.fromkeys(target_per_class, 0)  # running rows kept per class
    total_selected = 0                          # running rows kept overall

    for chunk in pd.read_csv(SOURCE_CSV, chunksize=CHUNKSIZE):

        for cls, n_target in target_per_class.items():
            remaining = n_target - taken[cls]

            if remaining <= 0:
                continue

            rows = chunk[chunk[COL] == cls]

            if rows.empty:
                continue

            take = min(len(rows), remaining)

            sampled = rows.sample(
                n=take,
                random_state=RANDOM_STATE
            )

            selected[cls].append(sampled)
            taken[cls] += take
            total_selected += take

        # Stop early once the global target has been reached.
        if total_selected >= TARGET:
            break

    # --------------------------------------------------------
    # 4. CONSOLIDATE AND SAVE
    # --------------------------------------------------------
    # Classes without any fragment (e.g. a target that truncated to 0) are
    # skipped: pd.concat raises on an empty list.
    per_class_frames = [
        pd.concat(selected[cls])
        for cls in target_per_class
        if selected[cls]
    ]

    if not per_class_frames:
        raise ValueError("No se seleccionó ninguna fila; no se puede construir la muestra.")

    df_sessions = pd.concat(per_class_frames, ignore_index=True)

    df_sessions.to_parquet(file_path)

# ============================================================
# FINAL LOAD OF THE DATASET
# ============================================================

df_sessions = pd.read_parquet(file_path)
df_sessions.head()

Distribución original: {'view': 0.9363957717631015, 'cart': 0.0490859878501743, 'purchase': 0.014518240386724179}
Objetivo por clase: {'view': 936397, 'cart': 49085, 'purchase': 14518}


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-04-01 05:24:33 UTC,view,100008570,2053013563835941749,appliances.kitchen.refrigerators,pulser,409.51,604166320,fcf6bd48-b655-4b5e-8271-ba642e600415
1,2020-04-01 02:30:17 UTC,view,6200700,2232732091961835858,appliances.environment.air_heater,,64.35,514644182,c85c49de-6212-4794-bc4a-03b07b46da5b
2,2020-04-01 01:23:34 UTC,view,1307135,2053013554658804075,electronics.audio.headphone,hp,324.08,536790299,63ba33d7-42ef-4b87-a557-bac366755a6c
3,2020-04-01 03:06:16 UTC,view,4100170,2232732098228126185,apparel.shoes,sony,480.84,592391712,0b6f1c7a-7c4c-4822-ab91-03d6a5870849
4,2020-04-01 05:30:30 UTC,view,1005143,2232732093077520756,construction.tools.light,apple,1433.87,549858079,e30ee1a4-4821-4e6c-bf56-c5f2056712d1


In [37]:
# Row count of the loaded sample.
len(df_sessions)

1000000

## Cargamos los set de datos de caballos y los concatenamos

In [45]:
# Load the product listing from the clean-data folder and preview the first rows.
df_prods = pd.read_parquet("../../data/clean/doversaddlery_products_listing_limpio.parquet")
df_prods.head()

Unnamed: 0,Item_ID,Name,Stock,Description,Price,Images,URL
0,360064,troxel® sport 2.0™ helmet,0,browse our carefully curated categories to dis...,61.99,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/helm...
1,240084,weatherbeeta® free standard neck turnout sheet,0,browse our carefully curated categories to dis...,254.99,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/hors...
2,43229,rider’s international™ laced rein dog collar,0,browse our carefully curated categories to dis...,19.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/farm...
3,381034,noble equestrian™ ladies' traditions paddock b...,0,browse our carefully curated categories to dis...,139.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/ridi...
4,381037,tredstep™ ladies’ donatello iii dress boots,10,browse our carefully curated categories to dis...,379.0,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/ridi...


In [50]:
# Category = fifth "/"-separated piece of the product URL, with hyphens turned into spaces.
df_prods["Category"] = df_prods["URL"].map(lambda url: url.split("/")[4].replace("-", " "))

In [51]:
# Number of products in the listing.
len(df_prods)

937

# Set de datos de usuarios

In [40]:
# Load the user table and preview the first rows.
df_users = pd.read_parquet("../../data/clean/users_info.parquet")
df_users.head()

Unnamed: 0,user_id,name,gender,country,city,addres,credit_card_info,email,phone_number,job_info,device_type,traffic_source,first_seen
0,2eaf17c0-e972-453b-a1e1-b80faa1ca344,Daire Mac Breen,male,Ireland,Nikki Ville,12 Carbery Street,Maestro\nDaire Mac Breen\n676189742924 09/31\n...,daire.mac breen@yahoo.com,475.914.9686x4418,"{'company': 'Lane, McGerr and Kinaghan', 'suff...",mobile,paid,2018-04-04
1,6b5f323f-65cd-4f7b-b3d4-b868a8ff77b4,Erick Santos,male,Brazil,Caldeira do Norte,"Trecho de Costela, 69",VISA 16 digit\nErick Santos\n4636098634562258 ...,erick.santos@gmail.com,+55 (071) 2659-3856,"{'company': 'Cassiano da Luz S/A', 'suffix': '...",mobile,referral,2017-11-21
2,64796621-90e7-438d-a7c1-150412ef0f7f,Bianca Martinez,female,Argentina,San Ferando del Valle de Catamarca,Avenida Santiago del Estero N° 72,Maestro\nBianca Martinez\n630415022817 08/29\n...,bianca.martinez@yahoo.com,+54 15 2471 0580,"{'company': 'Villalba Group', 'suffix': 'and S...",mobile,organic,2017-07-25
3,c2fcf023-dd1f-4557-ab91-5207d3c79253,Sheryl Yates,female,Cyprus,Port Johnny,602 Lopez Camp,Diners Club / Carte Blanche\nSheryl Yates\n304...,sheryl.yates@hotmail.com,554.944.9594x5881,"{'company': 'Reeves-Le', 'suffix': 'Inc', 'tit...",mobile,organic,2016-01-24
4,91a228a2-71ac-48e5-b0ad-2844dc1a8191,Gonzalo Muñoz,male,Argentina,San Ferando del Valle de Catamarca,Avenida 1 N° 726,VISA 16 digit\nGonzalo Muñoz\n4395379143620063...,gonzalo.muñoz@yahoo.com,+54 9 3811 9531,"{'company': 'Sanchez Group', 'suffix': 'Inc', ...",desktop,organic,2015-04-07


## Asignando a cada sesion un usuario

In [52]:
def assign_sessions_to_users(df_sessions, users_ids, seed=42):
    """
    Give every distinct session its own user id, chosen at random.

    Parameters
    ----------
    df_sessions : DataFrame with a ``user_session`` column.
    users_ids : sequence of candidate user ids; it needs at least as many
        entries as there are distinct sessions. It is not modified.
    seed : seed for the NumPy random generator, so the mapping is reproducible.

    Returns
    -------
    A copy of ``df_sessions`` whose ``user_id`` column holds the user mapped
    to each row's session. The input frame is left untouched.

    Raises
    ------
    ValueError
        If there are fewer user ids than distinct sessions.
    """
    rng = np.random.default_rng(seed)
    sessions = df_sessions["user_session"].unique().tolist()

    if len(users_ids) < len(sessions):
        raise ValueError("No hay suficientes users para asignar sesiones")

    # Shuffle a private copy: shuffling users_ids itself would reorder the
    # caller's list and make a second call with the same seed give a
    # different assignment.
    shuffled_users = list(users_ids)

    rng.shuffle(sessions)
    rng.shuffle(shuffled_users)

    # One user per session; surplus users are simply not used.
    session_to_user = dict(zip(sessions, shuffled_users[:len(sessions)]))

    # Vectorised assignment through the session -> user lookup.
    df_sessions = df_sessions.copy()
    df_sessions["user_id"] = df_sessions["user_session"].map(session_to_user)

    return df_sessions

# User ids as plain strings; the function needs at least one id per distinct session.
users_ids = df_users["user_id"].astype(str).tolist()

# Rebind df_sessions to the returned copy, whose user_id column holds the assigned users.
df_sessions = assign_sessions_to_users(df_sessions, users_ids)
df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-04-01 05:24:33 UTC,view,100008570,2053013563835941749,appliances.kitchen.refrigerators,pulser,409.51,56affb3c-f347-405a-b901-9221fe439763,fcf6bd48-b655-4b5e-8271-ba642e600415
1,2020-04-01 02:30:17 UTC,view,6200700,2232732091961835858,appliances.environment.air_heater,,64.35,84a2aedd-f800-4d4f-b06d-d7bc51842ccc,c85c49de-6212-4794-bc4a-03b07b46da5b
2,2020-04-01 01:23:34 UTC,view,1307135,2053013554658804075,electronics.audio.headphone,hp,324.08,05d7eaf4-437c-4b8d-bc5d-6aa3ba6c30fb,63ba33d7-42ef-4b87-a557-bac366755a6c
3,2020-04-01 03:06:16 UTC,view,4100170,2232732098228126185,apparel.shoes,sony,480.84,546e452a-fc9a-4251-8d9d-d3e932e300fc,0b6f1c7a-7c4c-4822-ab91-03d6a5870849
4,2020-04-01 05:30:30 UTC,view,1005143,2232732093077520756,construction.tools.light,apple,1433.87,56fc2e11-3404-4cf2-88fa-b0854af9a78d,e30ee1a4-4821-4e6c-bf56-c5f2056712d1


# Relacionando categorías visitadas en cada sesión con cada categoría de caballo con el método Rank-based Projection

## Checkeamos las distribuciones de cada set de categorias

## Asignamos las categorías más frecuentes de A a las más frecuentes de B

Siguiendo un criterio de demanda y comportamiento del usuario, no de inventario: prioricé categorías que los usuarios necesitan buscar con mayor frecuencia (consumibles y reposición como horse care y horse tack equipment), que requieren comparación o ajuste (ropa y botas), y que tienen intención clara de compra, por encima de categorías durables, estacionales, exploratorias (new, clearance) o no core (farm dog gear).

La idea fue estimar qué se busca más en un ecommerce ecuestre cuando no hay logs de búsquedas, usando heurísticas reales de uso, recurrencia y contexto, no la cantidad de productos disponibles.

In [69]:
# Relative search weight per product category (keys are category names).
# The values as written add up to 1.01, so they must be normalised before
# being used as probabilities.
SEARCH_WEIGHTS = {
    "horse care": 0.22,
    "horse tack equipment": 0.20,
    "horse riding apparel": 0.15,
    "riding boots chaps": 0.12,
    "helmets protective gear": 0.09,
    "horse blankets": 0.08,
    "kids horse riding apparel": 0.05,
    "stable arena supplies": 0.04,
    "new": 0.03,
    "clearance": 0.015,
    "farm dog gear": 0.015,
}

In [84]:
N = len(df_sessions)

# Turn the raw weights into probabilities that add up to 1.
weights = pd.Series(SEARCH_WEIGHTS)
weights = weights.div(weights.sum())

# Whole number of rows per category (astype(int) truncates).
counts = weights.mul(N).astype(int)

# Give the rows lost to truncation to the first category so the counts add up to N.
diff = N - counts.sum()
if diff:
    counts.iloc[0] += diff

# "" marks a row that has no category yet.
df_sessions["product_category"] = ""
df_out = []

for category, n in counts.items():
    # Draw n of the still-unlabelled rows and label them with this category.
    still_empty = df_sessions["product_category"].eq("")
    empty_product_category = df_sessions[still_empty]
    sampled = empty_product_category.sample(n=n, random_state=42)
    df_sessions.loc[sampled.index, "product_category"] = category

df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,product_category
44820,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse care
262650,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse tack equipment
460669,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse riding apparel
609183,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,helmets protective gear
727994,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse riding apparel


# Asignando un producto a cada sesion basandose en la categoria principal visitada

In [86]:
def assign_prods(df_sessions, df_prods, seed=42):
    """
    Pick one random product for every row, from the row's own category.

    ``df_prods`` provides the pool of ``Item_ID`` values per ``Category``;
    each row of ``df_sessions`` gets one draw from the pool that matches its
    ``product_category``. Draws are made row by row from a generator seeded
    with ``seed``. Returns a new frame with the ``Item_ID`` column set; the
    input frame is not modified.
    """
    rng = np.random.default_rng(seed)

    # Category -> array of the item ids listed under it.
    prods_by_cat = {
        category: np.array(item_ids)
        for category, item_ids in df_prods.groupby("Category")["Item_ID"]
    }

    picked = df_sessions["product_category"].map(
        lambda cat: rng.choice(prods_by_cat[cat])
    )

    # assign() works on a copy, leaving the caller's frame as it was.
    return df_sessions.assign(Item_ID=picked)

# Attach an Item_ID to every row; df_sessions is rebound to the returned frame.
df_sessions = assign_prods(df_sessions, df_prods, seed=42)
df_sessions.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,product_category,Item_ID
44820,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse care,26019
262650,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse tack equipment,3299
460669,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse riding apparel,36887
609183,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,helmets protective gear,36869
727994,2020-04-01 00:00:00 UTC,view,,2232732101407408685,apparel.shoes.slipons,samsung,230.38,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,horse riding apparel,401107


# Borrando columnas redundantes

In [88]:
# Keep only the five columns listed here; every other column is dropped.
df_sessions = df_sessions[["user_id", "user_session", "Item_ID", "event_type", "event_time"]]
df_sessions.head()

Unnamed: 0,user_id,user_session,Item_ID,event_type,event_time
44820,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,26019,view,2020-04-01 00:00:00 UTC
262650,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,3299,view,2020-04-01 00:00:00 UTC
460669,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,36887,view,2020-04-01 00:00:00 UTC
609183,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,36869,view,2020-04-01 00:00:00 UTC
727994,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,401107,view,2020-04-01 00:00:00 UTC


# Normalizando nombres

In [89]:
# Rename Item_ID to item_id so it follows the lowercase naming of the other columns.
df_sessions = df_sessions.rename(
    columns={
        "Item_ID": "item_id",
    }
)
df_sessions.head()

Unnamed: 0,user_id,user_session,item_id,event_type,event_time
44820,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,26019,view,2020-04-01 00:00:00 UTC
262650,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,3299,view,2020-04-01 00:00:00 UTC
460669,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,36887,view,2020-04-01 00:00:00 UTC
609183,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,36869,view,2020-04-01 00:00:00 UTC
727994,aa9032d3-4cb0-47be-b572-f9570e51d5fb,e2456cef-2d4f-42b9-a53a-8893cb0c6851,401107,view,2020-04-01 00:00:00 UTC


# Guardando set de datos de tracking

In [90]:
# Write the resulting frame to Parquet in the clean-data folder.
df_sessions.to_parquet("../../data/clean/prods_sessions_info.parquet")