In [0]:
# Databricks notebook cell
%pip install haversine

Collecting haversine
  Downloading haversine-2.9.0-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading haversine-2.9.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.9.0
Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
# MAGIC %md
# MAGIC # 🛠️ 04_Feature_Engineering
# MAGIC Crea características (features) predictivas a partir de los datos limpios de la capa Silver.
# MAGIC - Lee desde `fraude_qr.silver.qr_transactions`.
# MAGIC - Calcula features temporales (ventanas de tiempo), geográficas (distancia) y de comportamiento.
# MAGIC - Escribe el resultado en una tabla versionada: `fraude_qr.features.qr_tx_features_v1`.

# COMMAND ----------

from haversine import haversine
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit, avg, stddev, count, when, udf
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

# --- 1. Configuration ---
# Source table (clean Silver data) and destination table (versioned features).
silver_table = "fraude_qr.silver.qr_transactions"
feature_table = "fraude_qr.features.qr_tx_features_v1"

# Announce source and destination in one call; sep="\n" keeps one line per message.
print(
    f"📖 Leyendo datos limpios desde: {silver_table}",
    f"✍️ Tabla de features de destino: {feature_table}",
    sep="\n",
)

# --- 2. Load the clean data ---
df = spark.read.table(silver_table)

# --- 3. Feature creation ---
print("\n✨ Calculando nuevas características...")

# 3.1 Geographic feature: payer-to-merchant distance
# Plain-Python helper that is wrapped as a Spark UDF further down.
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance between two (lat, lon) points.

    Delegates to ``haversine.haversine`` with its default settings; the
    notebook stores the result in a column named ``distance_km``.

    Args:
        lat1, lon1: Coordinates of the first point.
        lat2, lon2: Coordinates of the second point.

    Returns:
        The value returned by ``haversine``, or ``None`` when any of the four
        coordinates is ``None`` (so missing locations yield a null distance).
    """
    coordinates = (lat1, lon1, lat2, lon2)
    if any(value is None for value in coordinates):
        return None
    origin, destination = (lat1, lon1), (lat2, lon2)
    return haversine(origin, destination)

# The Python UDF stays registered under its original name so any other cell
# that references `haversine_udf` keeps working. It is no longer used for the
# main feature: a Python UDF serialises every row to a Python worker, while
# the built-in column functions below run inside the JVM.
haversine_udf = udf(haversine_distance, DoubleType())


def haversine_km_column(lat1, lon1, lat2, lon2, earth_radius_km=6371.0088):
    """Build a Spark Column with the haversine distance in kilometres.

    Args:
        lat1, lon1: Columns holding the first point, in decimal degrees.
        lat2, lon2: Columns holding the second point, in decimal degrees.
        earth_radius_km: Sphere radius. The default is intended to match the
            mean Earth radius used by the `haversine` package so values stay
            in line with the previous UDF output (NOTE(review): confirm
            against the installed package version).

    Returns:
        A double Column. It is null whenever any input coordinate is null,
        mirroring the None-guard of `haversine_distance`.

    Note:
        Unlike the `haversine` package, this expression does not validate
        coordinate ranges; out-of-range inputs produce a number, not an error.
    """
    lat1_rad = F.radians(lat1)
    lat2_rad = F.radians(lat2)
    half_dlat = (lat2_rad - lat1_rad) / 2
    half_dlon = F.radians(lon2 - lon1) / 2
    a = F.pow(F.sin(half_dlat), 2) + F.cos(lat1_rad) * F.cos(lat2_rad) * F.pow(F.sin(half_dlon), 2)
    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would turn asin into NaN. `when` (not `least`) is used for the clamp
    # because `least` skips nulls and would hide missing coordinates.
    a = F.when(a > 1.0, F.lit(1.0)).otherwise(a)
    return F.lit(2.0 * earth_radius_km) * F.asin(F.sqrt(a))


df_features = df.withColumn(
    "distance_km",
    haversine_km_column(
        col("payer_lat"), col("payer_lon"), col("merchant_lat"), col("merchant_lon")
    ),
)
print("✅ Feature 'distance_km' creada.")

# 3.2 Temporal features (time windows)
# Every window is per payer, ordered by `created_at` cast to long, and spans
# from N units before the current row up to and including the current row
# (upper bound 0). The current transaction therefore counts towards its own
# tx counts and towards the 7-day average / standard deviation.
_payer_timeline = Window.partitionBy("payer_id").orderBy(col("created_at").cast("long"))
window_1h = _payer_timeline.rangeBetween(-3600, 0)      # 60 * 60
window_24h = _payer_timeline.rangeBetween(-86400, 0)    # 24 * 60 * 60
window_7d = _payer_timeline.rangeBetween(-604800, 0)    # 7 * 24 * 60 * 60

# Insertion order of the dict fixes the order in which the columns are appended.
_windowed_features = {
    # Payer transaction velocity over the last hour and the last 24 hours.
    "payer_tx_count_1h": count("tx_id").over(window_1h),
    "payer_tx_count_24h": count("tx_id").over(window_24h),
    # Payer's mean and standard deviation of the amount over the last 7 days.
    "avg_amount_payer_7d": avg("amount").over(window_7d),
    "stddev_amount_payer_7d": stddev("amount").over(window_7d),
}
for _feature_name, _feature_expr in _windowed_features.items():
    df_features = df_features.withColumn(_feature_name, _feature_expr)
print("✅ Features de ventanas de tiempo (velocidad) creadas.")

# 3.3 Behavioural feature: z-score of the amount
# The z-score expresses how many standard deviations the amount sits above or
# below the payer's 7-day average; a large absolute value may flag an anomaly.
_payer_amount_std = col("stddev_amount_payer_7d")
_amount_deviation = col("amount") - col("avg_amount_payer_7d")
# Falls back to 0 whenever the standard deviation is null or not positive,
# which also avoids dividing by zero.
_amount_zscore = when(_payer_amount_std > 0, _amount_deviation / _payer_amount_std).otherwise(0)
df_features = df_features.withColumn("amount_zscore_payer_7d", _amount_zscore)
print("✅ Feature de Z-Score del monto creada.")

# --- 4. Final selection of features and target ---
# Only the columns the model needs are kept, in the same order as before.
_id_and_context_cols = [
    "tx_id",
    "created_at",
    "transaction_date",  # used as the partition column when writing
    "amount",
]
_engineered_feature_cols = [
    "distance_km",
    "payer_tx_count_1h",
    "payer_tx_count_24h",
    "amount_zscore_payer_7d",
]
_original_cols = [
    "merchant_id",  # grouping key for training (GroupKFold)
    "mcc",
]
_target_col = "is_fraud"  # the variable to predict

# Zero-imputation is restricted to the engineered features. The previous
# blanket fillna(0) also rewrote nulls in the target, the ids and the amount,
# so an unlabeled transaction silently became "not fraud" and a missing id
# became id 0. Those columns now keep their nulls so bad rows stay visible.
# NOTE(review): a null `distance_km` (missing coordinates) is still stored as
# 0 km, which is indistinguishable from a co-located payer and merchant;
# consider a sentinel or a missing-indicator column in the next table version.
df_final_features = (
    df_features
    .select(*_id_and_context_cols, *_engineered_feature_cols, *_original_cols, _target_col)
    .fillna(0, subset=_engineered_feature_cols)
)

print("\n📋 Esquema final de la tabla de features:")
df_final_features.printSchema()

# --- 5. Write to the feature table (Feature Store) ---
print(f"\n💾 Escribiendo en la tabla de features: {feature_table}")

# Full overwrite of the table, partitioned by transaction_date; overwriteSchema
# lets the stored schema be replaced by the DataFrame's schema.
_feature_writer = df_final_features.write.mode("overwrite")
_feature_writer = _feature_writer.partitionBy("transaction_date")
_feature_writer = _feature_writer.option("overwriteSchema", "true")
_feature_writer.saveAsTable(feature_table)

print("🎉 ¡Tabla de features creada exitosamente!")

# --- 6. Verification ---
# Read back from the table (not from the in-memory DataFrame) to confirm the write.
print(f"\n🔍 Muestra de datos en {feature_table}:")
_written_sample = spark.table(feature_table).limit(10)
_written_sample.display()

📖 Leyendo datos limpios desde: fraude_qr.silver.qr_transactions
✍️ Tabla de features de destino: fraude_qr.features.qr_tx_features_v1

✨ Calculando nuevas características...
✅ Feature 'distance_km' creada.
✅ Features de ventanas de tiempo (velocidad) creadas.
✅ Feature de Z-Score del monto creada.

📋 Esquema final de la tabla de features:
root
 |-- tx_id: long (nullable = false)
 |-- created_at: timestamp (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- amount: double (nullable = false)
 |-- distance_km: double (nullable = false)
 |-- payer_tx_count_1h: long (nullable = false)
 |-- payer_tx_count_24h: long (nullable = false)
 |-- amount_zscore_payer_7d: double (nullable = false)
 |-- merchant_id: long (nullable = false)
 |-- mcc: long (nullable = false)
 |-- is_fraud: integer (nullable = false)


💾 Escribiendo en la tabla de features: fraude_qr.features.qr_tx_features_v1
🎉 ¡Tabla de features creada exitosamente!

🔍 Muestra de datos en fraude_qr.features.qr_tx_features_v1:

tx_id,created_at,transaction_date,amount,distance_km,payer_tx_count_1h,payer_tx_count_24h,amount_zscore_payer_7d,merchant_id,mcc,is_fraud
84850007,1970-01-04T12:00:00.000Z,1970-01-04,4830219.15,123.59099254865092,1,1,4.504058368929395,1850,5567,1
94900007,1970-01-04T22:00:00.000Z,1970-01-04,15972.31,141.1017614227,1,2,-0.3256485193900746,8971,4905,0
94650025,1970-01-04T22:00:00.000Z,1970-01-04,139393.35,137.1816839643395,1,1,-0.043848563682035,1926,5567,0
95850025,1970-01-04T23:00:00.000Z,1970-01-04,80125.39,105.64726022448062,2,2,-0.4077193043434227,708,7312,0
94800048,1970-01-04T22:00:00.000Z,1970-01-04,65581.7,106.647519687021,1,1,-0.7529262598847426,1692,7312,0
93000093,1970-01-04T21:00:00.000Z,1970-01-04,195903.67,36.02813179725994,1,1,-0.1934656255722805,3158,7312,0
93800116,1970-01-04T21:00:00.000Z,1970-01-04,40084.81,72.52474735185565,1,1,-0.5479812725239606,388,7312,0
95100158,1970-01-04T23:00:00.000Z,1970-01-04,11920.72,35.44015984480222,1,1,-0.8250055419368838,9089,4905,0
94900186,1970-01-04T22:00:00.000Z,1970-01-04,5706.15,138.81092770468643,1,1,-0.5350709852521653,1777,4905,0
94200191,1970-01-04T22:00:00.000Z,1970-01-04,135.23,90.3977491727163,1,1,-0.7180853197511361,9437,4905,0
