In [0]:
%pip install faker

Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.8.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# MAGIC %md
# MAGIC # 🎲 01_Generate_Synthetic_Data
# MAGIC Genera datasets sintéticos de transacciones QR, merchants, payers y devices.
# MAGIC - Datos realistas con patrones de fraude inyectados.
# MAGIC - Guarda directamente en Volume: `/Volumes/fraude_qr/bronze/raw_data/synthetic_data`
# MAGIC - Tipos de datos 100% compatibles con Spark/Delta Lake.
# MAGIC - Particionado por `date` listo para Auto Loader.

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Instalar Librerías Necesarias (SIN reiniciar)

# COMMAND ----------

# MAGIC %pip install --quiet faker

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Importar Librerías y Configurar Parámetros

# COMMAND ----------

import pandas as pd
import numpy as np
from faker import Faker
import os
from datetime import datetime, timedelta

# ===========================================
# 🔧 PARÁMETROS CONFIGURABLES
# ===========================================
num_transactions = 1_000_000    # Total de transacciones a generar
fraud_rate = 0.002              # Tasa de fraude (0.2%)
start_date = "2025-08-01"       # Fecha de inicio
output_dir = "/Volumes/fraude_qr/bronze/raw_data/synthetic_data"  # ¡Ruta en tu Volume!

# Inicializar Faker
fake = Faker()

print(f"📌 Parámetros configurados:")
print(f"   - Transacciones: {num_transactions:,}")
print(f"   - Tasa de fraude: {fraud_rate:.4f} ({fraud_rate*100:.2f}%)")
print(f"   - Fecha inicio: {start_date}")
print(f"   - Salida: {output_dir}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Función: Generar Dimensiones (Merchants, Payers, Devices)

# COMMAND ----------

def generate_dimensions(num_merchants=10000, num_payers=50000, num_devices=75000):
    """Genera tablas de dimensiones con tipos explícitos compatibles con Spark."""
    print("📊 Generando dimensiones...")
    
    merchants = pd.DataFrame({
        'merchant_id': np.arange(1, num_merchants + 1, dtype='int64'),
        'merchant_name': [fake.company() for _ in range(num_merchants)],
        'country': ['AR'] * num_merchants,
        'state': [fake.state() for _ in range(num_merchants)],
        'city': [fake.city() for _ in range(num_merchants)],
        'merchant_lat': np.random.uniform(-38, -34, num_merchants).astype('float64'),
        'merchant_lon': np.random.uniform(-63, -58, num_merchants).astype('float64'),
        'mcc': np.random.randint(1000, 9999, num_merchants, dtype='int64'),
        'risk_score': np.random.uniform(0.1, 1.0, num_merchants).astype('float64'),
        'on_watchlist': np.random.choice([0, 1], num_merchants, p=[0.95, 0.05]).astype('int64')
    })

    payers = pd.DataFrame({
        'payer_id': np.arange(1, num_payers + 1, dtype='int64'),
        'segment': np.random.choice(['new', 'regular', 'vip'], num_payers, p=[0.4, 0.5, 0.1]),
        'age_band': np.random.choice(['18-25', '26-35', '36-50', '50+'], num_payers, p=[0.3, 0.4, 0.2, 0.1]),
        'tenure_days': np.random.randint(1, 1000, num_payers, dtype='int64')
    })

    device_ids = [fake.sha256() for _ in range(num_devices)]
    devices = pd.DataFrame({
        'device_id': device_ids,
        'device_type': np.random.choice(['ios', 'android'], num_devices, p=[0.4, 0.6]),
        'app_version': [f"{np.random.randint(2,5)}.{np.random.randint(0,10)}.{np.random.randint(0,20)}" for _ in range(num_devices)],
        'rooted': np.random.choice([0, 1], num_devices, p=[0.9, 0.1]).astype('int64')
    })
    
    print(f"✅ Dimensiones generadas: {num_merchants} merchants, {num_payers} payers, {num_devices} devices.")
    return merchants, payers, devices

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Función: Generar Transacciones QR con Fraude

# COMMAND ----------

def generate_qr_transactions(num_transactions, fraud_rate, start_date_str, merchants, payers, devices):
    """Genera transacciones con patrones de fraude y tipos compatibles."""
    print(f"💳 Generando {num_transactions:,} transacciones...")
    
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    
    df = pd.DataFrame({
        'tx_id': np.arange(1, num_transactions + 1, dtype='int64'),
        'merchant_id': np.random.choice(merchants['merchant_id'].values, num_transactions),
        'payer_id': np.random.choice(payers['payer_id'].values, num_transactions),
        'device_id': np.random.choice(devices['device_id'].values, num_transactions),
        'created_at': pd.to_datetime([
            start_date + timedelta(seconds=int(s)) 
            for s in np.random.randint(0, 30*24*3600, num_transactions)
        ], utc=True),
        'currency': ['ARS'] * num_transactions,
        'qr_type': np.random.choice(['static', 'dynamic', 'cpm'], num_transactions, p=[0.6, 0.3, 0.1]),
        'channel': np.random.choice(['app', 'web'], num_transactions, p=[0.9, 0.1]),
        'has_error': np.random.choice([0, 1], num_transactions, p=[0.98, 0.02]).astype('int64'),
    })

    df['amount'] = np.random.lognormal(mean=8.5, sigma=1.5, size=num_transactions).round(2).astype('float64')
    df.loc[df['amount'] > 50000, 'amount'] = np.random.uniform(50000, 150000, df[df['amount'] > 50000].shape[0])

    df = df.merge(merchants[['merchant_id', 'merchant_lat', 'merchant_lon', 'mcc']], on='merchant_id', how='left')

    df['payer_lat'] = (df['merchant_lat'] + np.random.normal(0, 0.05, num_transactions)).astype('float64')
    df['payer_lon'] = (df['merchant_lon'] + np.random.normal(0, 0.05, num_transactions)).astype('float64')

    df['is_fraud'] = np.zeros(num_transactions, dtype='int64')
    num_fraud = int(num_transactions * fraud_rate)
    fraud_indices = np.random.choice(df.index, num_fraud, replace=False)

    print("⚠️ Inyectando patrones de fraude...")

    dist_fraud_indices = np.random.choice(fraud_indices, int(num_fraud * 0.2), replace=False)
    df.loc[dist_fraud_indices, 'payer_lat'] = df.loc[dist_fraud_indices, 'merchant_lat'] + np.random.uniform(1, 5)
    df.loc[dist_fraud_indices, 'payer_lon'] = df.loc[dist_fraud_indices, 'merchant_lon'] + np.random.uniform(1, 5)

    velocity_fraud_indices = np.random.choice(fraud_indices, int(num_fraud * 0.2), replace=False)
    for idx in velocity_fraud_indices[:len(velocity_fraud_indices)//2]:
        if idx + 1 < len(df):
            df.loc[idx + 1, ['payer_id', 'is_fraud']] = [df.loc[idx, 'payer_id'], 1]
        if idx + 2 < len(df):
            df.loc[idx + 2, ['payer_id', 'is_fraud']] = [df.loc[idx, 'payer_id'], 1]

    risky_fraud_indices = np.random.choice(fraud_indices, int(num_fraud * 0.4), replace=False)
    risky_devices = devices[devices['rooted'] == 1]['device_id'].values
    risky_mcc = merchants[merchants['risk_score'] > 0.8]['mcc'].values
    if len(risky_devices) > 0:
        df.loc[risky_fraud_indices, 'device_id'] = np.random.choice(risky_devices, len(risky_fraud_indices))
    if len(risky_mcc) > 0:
        df.loc[risky_fraud_indices, 'mcc'] = np.random.choice(risky_mcc, len(risky_fraud_indices))

    df.loc[fraud_indices, 'is_fraud'] = 1

    df['qr_hash'] = [fake.sha256() for _ in range(num_transactions)]
    df['error_code'] = np.nan
    df.loc[df['has_error'] == 1, 'error_code'] = np.random.choice(['501', '503', '504', '401'], df[df['has_error'] == 1].shape[0])

    # 🔥 FORZAR A MILISEGUNDOS
    df['created_at'] = df['created_at'].dt.tz_localize(None)
    df['created_at'] = df['created_at'].astype('datetime64[ms]')  # ← ¡ms, no us!

    df['date'] = df['created_at'].dt.date.astype(str)

    print(f"✅ Transacciones generadas: {len(df):,} registros, {df['is_fraud'].sum():,} fraudes inyectados.")
    return df

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Ejecutar Generación y Guardar en Volume

# COMMAND ----------

# Crear directorio de salida
print(f"📁 Creando directorio: {output_dir}")
dbutils.fs.mkdirs(output_dir)

# Generar dimensiones
print("\n🧩 Generando tablas de dimensiones...")
merchants, payers, devices = generate_dimensions()

# Guardar dimensiones
print("💾 Guardando dimensiones...")
merchants.to_parquet(
    os.path.join(output_dir, 'synthetic_merchants.parquet'),
    engine='pyarrow'
)
payers.to_parquet(
    os.path.join(output_dir, 'synthetic_payers.parquet'),
    engine='pyarrow'
)
devices.to_parquet(
    os.path.join(output_dir, 'synthetic_devices.parquet'),
    engine='pyarrow'
)

# Generar transacciones
print("\n🧩 Generando transacciones...")
transactions_df = generate_qr_transactions(
    num_transactions,
    fraud_rate,
    start_date,
    merchants,
    payers,
    devices
)

# Ruta de salida para transacciones
output_path = os.path.join(output_dir, 'synthetic_qr_transactions')
print(f"📂 Ruta de salida: {output_path}")

# 🔥 ELIMINAR DATOS ANTIGUOS
print("🧹 Eliminando datos antiguos (si existen)...")
try:
    dbutils.fs.rm(output_path, True)
    print("✅ Carpeta eliminada.")
except Exception as e:
    print(f"ℹ️ No se encontró carpeta anterior: {str(e)}")

# 🔥 GUARDAR CON PARÁMETROS COMPATIBLES
print("💾 Guardando datos con configuración Spark-friendly...")
transactions_df.to_parquet(
    output_path,
    partition_cols=['date'],
    engine='pyarrow',
    use_deprecated_int96_timestamps=False,
    coerce_timestamps='ms',          # ← Milisegundos
    allow_truncated_timestamps=True,
    compression='snappy'
)

print("\n🎉 ¡GENERACIÓN COMPLETADA!")
print(f"📊 Transacciones: {len(transactions_df):,}")
print(f"📈 Fraudes: {transactions_df['is_fraud'].sum():,} ({transactions_df['is_fraud'].mean()*100:.3f}%)")
print(f"📁 Ubicación: {output_path}")


📌 Parámetros configurados:
   - Transacciones: 1,000,000
   - Tasa de fraude: 0.0020 (0.20%)
   - Fecha inicio: 2025-08-01
   - Salida: /Volumes/fraude_qr/bronze/raw_data/synthetic_data
📁 Creando directorio: /Volumes/fraude_qr/bronze/raw_data/synthetic_data

🧩 Generando tablas de dimensiones...
📊 Generando dimensiones...
✅ Dimensiones generadas: 10000 merchants, 50000 payers, 75000 devices.
💾 Guardando dimensiones...

🧩 Generando transacciones...
💳 Generando 1,000,000 transacciones...
⚠️ Inyectando patrones de fraude...
✅ Transacciones generadas: 1,000,000 registros, 2,400 fraudes inyectados.
📂 Ruta de salida: /Volumes/fraude_qr/bronze/raw_data/synthetic_data/synthetic_qr_transactions
🧹 Eliminando datos antiguos (si existen)...
✅ Carpeta eliminada.
💾 Guardando datos con configuración Spark-friendly...

🎉 ¡GENERACIÓN COMPLETADA!
📊 Transacciones: 1,000,000
📈 Fraudes: 2,400 (0.240%)
📁 Ubicación: /Volumes/fraude_qr/bronze/raw_data/synthetic_data/synthetic_qr_transactions


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-5120634732865348>, line 244[0m
[1;32m    234[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;124m📁 Ubicación: [39m[38;5;132;01m{[39;00moutput_path[38;5;132;01m}[39;00m[38;5;124m"[39m)
[1;32m    236[0m [38;5;66;03m# COMMAND ----------[39;00m
[1;32m    237[0m 
[1;32m    238[0m [38;5;66;03m# MAGIC %md[39;00m
[0;32m   (...)[0m
[1;32m    242[0m 
[1;32m    243[0m [38;5;66;03m# Reconectar Spark si la sesión expiró[39;00m
[0;32m--> 244[0m spark [38;5;241m=[39m SparkSession[38;5;241m.[39mbuilder[38;5;241m.[39mgetOrCreate()
[1;32m    246[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124m🔍 Verificando estructura de archivos...[39m[38;5;124m"[39m)
[1;32m    247[0m dbutils[38;5;241m.[39mfs[38;5;241m.[39mls(output_path)

[0;31mNameError[0m: 

In [0]:
# En una nueva celda
output_path = "/Volumes/fraude_qr/bronze/raw_data/synthetic_data/synthetic_qr_transactions"

print("📁 Archivos generados:")
dbutils.fs.ls(output_path)[:3]  # Muestra primeras 3 particiones

print("\n📋 Esquema de los datos:")
df = spark.read.parquet(output_path)
df.printSchema()

print(f"\n📊 Total registros: {df.count():,}")
df.select("is_fraud", "date").groupBy("date").sum("is_fraud").orderBy("date").show(5)

📁 Archivos generados:

📋 Esquema de los datos:
root
 |-- tx_id: long (nullable = true)
 |-- merchant_id: long (nullable = true)
 |-- payer_id: long (nullable = true)
 |-- device_id: string (nullable = true)
 |-- created_at: timestamp_ntz (nullable = true)
 |-- currency: string (nullable = true)
 |-- qr_type: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- has_error: long (nullable = true)
 |-- amount: double (nullable = true)
 |-- merchant_lat: double (nullable = true)
 |-- merchant_lon: double (nullable = true)
 |-- mcc: long (nullable = true)
 |-- payer_lat: double (nullable = true)
 |-- payer_lon: double (nullable = true)
 |-- is_fraud: long (nullable = true)
 |-- qr_hash: string (nullable = true)
 |-- error_code: string (nullable = true)
 |-- __index_level_0__: long (nullable = true)
 |-- date: date (nullable = true)


📊 Total registros: 1,000,000
+----------+-------------+
|      date|sum(is_fraud)|
+----------+-------------+
|2025-08-01|           79|
|2025-0