In [0]:
# Volume directory where the raw Kaggle file is stored.
volume_path = "/Volumes/fraude_qr/bronze/raw_data/kaggle"

# Make sure the directory is there before anything is uploaded into it;
# the message below covers both the "created" and "already existed" cases.
dbutils.fs.mkdirs(volume_path)
print("✅ Carpeta creada o ya existente: " + volume_path)

✅ Carpeta creada o ya existente: /Volumes/fraude_qr/bronze/raw_data/kaggle


In [0]:
# Verify that the Kaggle CSV is present in the Volume before ingesting it.
kaggle_path = "/Volumes/fraude_qr/bronze/raw_data/kaggle/PS_20174392719_1491204439457_log.csv"

try:
    # Listing the exact file path fails when the file is missing or unreadable.
    dbutils.fs.ls(kaggle_path)
except Exception as e:
    print(f"❌ ERROR: {str(e)}")
    # Re-raise after reporting: swallowing the error would let "Run All" or a
    # scheduled job continue into the ingestion step with a missing input.
    raise
else:
    print("✅ Archivo encontrado en el Volume.")

✅ Archivo encontrado en el Volume.


In [0]:
# MAGIC %md
# MAGIC # 📥 01b_Ingest_Kaggle_Data
# MAGIC Ingesta el dataset público de Kaggle (PaySim) a la capa Bronze.

# COMMAND ----------

# Importamos la nueva función 'substring'
from pyspark.sql.functions import col, lit, when, from_unixtime, sha2, conv, substring
from pyspark.sql.types import LongType, StringType, DoubleType, TimestampType

# --- 1. Path configuration ---
# Source CSV inside the Volume and the Bronze table it is loaded into.
kaggle_csv_path = "/Volumes/fraude_qr/bronze/raw_data/kaggle/PS_20174392719_1491204439457_log.csv"
bronze_table_name = "fraude_qr.bronze.qr_transactions_kaggle_raw"

# Echo both locations so the run log shows exactly what was read and written.
print("📂 Ruta del dataset de Kaggle: " + kaggle_csv_path)
print("💾 Tabla Bronze de destino: " + bronze_table_name)

# --- 2. Read the Kaggle dataset ---
print("\n📂 Leyendo dataset de Kaggle...")

# The first CSV row is treated as the header and column types are inferred
# from the data (schema inference makes Spark scan the file an extra time).
csv_reader = spark.read.options(header="true", inferSchema="true")
df_kaggle = csv_reader.csv(kaggle_csv_path)

print("📋 Esquema original de PaySim:")
df_kaggle.printSchema()

# --- 3. Align the schema to the QR data model ---
print("\n🧩 Alineando esquema al modelo de datos QR...")


def _stable_hash(column_name, hex_start=1):
    """Build a deterministic, non-negative integer hash of a string column.

    A SHA-256 digest is 64 hex characters (256 bits), but Spark's ``conv``
    only handles 64-bit values: converting the whole digest overflows and
    yields the same saturated value (18446744073709551615) for every row, or
    an error when ANSI mode is enabled. Slicing 15 hex digits (60 bits)
    *before* converting keeps the value inside a signed 64-bit long, so the
    result actually varies with the input.

    Args:
        column_name: Name of the string column to hash.
        hex_start: 1-based position of the first digest character to use.
            Non-overlapping starts (1, 16, 31, 46) give distinct values
            derived from the same digest.

    Returns:
        A LongType Column expression in the range [0, 16**15).
    """
    digest_slice = substring(sha2(col(column_name), 256), hex_start, 15)
    return conv(digest_slice, 16, 10).cast(LongType())


def _unit_offset(hash_column):
    """Map an integer hash Column to a fractional offset in [0.0, 0.999]."""
    return (hash_column % 1000) / 1000.0


df_aligned = (
    df_kaggle
    # --- IDs ---
    # tx_id packs the step into the high digits and a 6-digit hash of the
    # payer into the low digits.
    # NOTE(review): this is deterministic but not guaranteed unique (the same
    # payer can appear twice in a step, and different payers can share the
    # low six digits) -- confirm whether downstream needs a unique key.
    .withColumn(
        "tx_id",
        (col("step") * 1000000 + _stable_hash("nameOrig") % 1000000).cast(LongType())
    )
    # Synthetic merchant ids in 1..10000, derived from the destination account.
    .withColumn(
        "merchant_id",
        (_stable_hash("nameDest") % 10000 + 1).cast(LongType())
    )
    # Synthetic payer ids in 1..50000, derived from the origin account.
    .withColumn(
        "payer_id",
        (_stable_hash("nameOrig") % 50000 + 1).cast(LongType())
    )

    # --- Device and QR ---
    # The source data has no such fields, so fixed placeholders are used.
    .withColumn("device_id", lit("kaggle_device_placeholder"))
    .withColumn("qr_hash", lit("kaggle_qr_placeholder"))

    # --- Timestamp ---
    # Each step is converted to seconds (x 3600) counted from the Unix epoch.
    .withColumn(
        "created_at",
        from_unixtime(col("step") * 3600).cast(TimestampType())
    )

    # --- Amount and currency ---
    .withColumn("amount", col("amount").cast(DoubleType()))
    .withColumn("currency", lit("USD"))

    # --- QR type and channel ---
    .withColumn("qr_type", lit("dynamic"))
    .withColumn("channel", lit("app"))

    # --- Location ---
    # Synthetic coordinates around (-34.5, -58.4). Latitude and longitude use
    # different digest slices so their offsets are not identical (identical
    # offsets would place every point on a single diagonal line).
    # Merchants are shifted in the positive direction, payers in the negative.
    .withColumn(
        "merchant_lat",
        (lit(-34.5) + _unit_offset(_stable_hash("nameDest", 16))).cast(DoubleType())
    )
    .withColumn(
        "merchant_lon",
        (lit(-58.4) + _unit_offset(_stable_hash("nameDest", 31))).cast(DoubleType())
    )
    .withColumn(
        "payer_lat",
        (lit(-34.5) - _unit_offset(_stable_hash("nameOrig", 16))).cast(DoubleType())
    )
    .withColumn(
        "payer_lon",
        (lit(-58.4) - _unit_offset(_stable_hash("nameOrig", 31))).cast(DoubleType())
    )

    # --- MCC (Merchant Category Code) ---
    # One synthetic code in 1000..9998 per transaction type.
    .withColumn(
        "mcc",
        (_stable_hash("type") % 8999 + 1000).cast(LongType())
    )

    # --- Error and fraud flags ---
    # No error information exists in the source: has_error is always 0 and
    # error_code is always null.
    .withColumn("has_error", lit(0).cast("int"))
    .withColumn("error_code", lit(None).cast(StringType()))
    .withColumn("is_fraud", col("isFraud").cast("int"))
)

# --- Keep only the QR-model columns, in their final order ---
final_columns = [
    "tx_id",
    "created_at",
    "merchant_id",
    "payer_id",
    "amount",
    "currency",
    "qr_type",
    "qr_hash",
    "mcc",
    "device_id",
    "payer_lat",
    "payer_lon",
    "merchant_lat",
    "merchant_lon",
    "channel",
    "has_error",
    "error_code",
    "is_fraud",
]
df_final = df_aligned.select(*final_columns)

print("✅ Esquema alineado.")
print("📋 Nuevo esquema:")
df_final.printSchema()

# --- 4. Write to the Bronze layer ---
print(f"\n💾 Escribiendo en tabla Bronze: {bronze_table_name}")

# Each run replaces the table contents, and the table schema as well if it changed.
bronze_writer = df_final.write.mode("overwrite").option("overwriteSchema", "true")
bronze_writer.saveAsTable(bronze_table_name)

print("🎉 ¡Dataset de Kaggle ingerido exitosamente en la capa Bronze!")

📂 Ruta del dataset de Kaggle: /Volumes/fraude_qr/bronze/raw_data/kaggle/PS_20174392719_1491204439457_log.csv
💾 Tabla Bronze de destino: fraude_qr.bronze.qr_transactions_kaggle_raw

📂 Leyendo dataset de Kaggle...
📋 Esquema original de PaySim:
root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)


🧩 Alineando esquema al modelo de datos QR...
✅ Esquema alineado.
📋 Nuevo esquema:
root
 |-- tx_id: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- merchant_id: long (nullable = true)
 |-- payer_id: long (nullable = true)
 |-- amount: double (nullable = true)
 |-- c