In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, dayofweek, avg, count, lit, stddev
from pyspark.sql.window import Window

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# Storage locations for the bronze, silver and gold layers.
bronze_path = "Files/bronze/transacciones_servicios/"
silver_path = "Files/silver/transacciones_servicios/"
gold_path = "Files/gold/dataset_recurrencia_servicios/"

# Version tag for this notebook.
notebook_version = "1.0"

# Echo the configured paths, one per line, so the cell output records them.
print(
    f"Ruta Bronze: {bronze_path}",
    f"Ruta Silver: {silver_path}",
    f"Ruta Gold: {gold_path}",
    sep="\n",
)

StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 9, Finished, Available, Finished)

Ruta Bronze: Files/bronze/transacciones_servicios/
Ruta Silver: Files/silver/transacciones_servicios/
Ruta Gold: Files/gold/dataset_recurrencia_servicios/


In [8]:
# Load the Delta dataset stored at gold_path.
df_modelo_servicios = (
    spark.read
    .format("delta")
    .load(gold_path)
)

# count() triggers a Spark action; report how many records were read.
total_registros = df_modelo_servicios.count()
print(f"Lectura completada. Se leyeron {total_registros} registros.")

StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 10, Finished, Available, Finished)

Lectura completada. Se leyeron 3563259 registros.


In [9]:
# Running window per (cedula_remitente, numero_cuenta_servicio) pair: rows are
# ordered by fecha_transaccion and the frame spans from the first row of the
# partition up to and including the current row.
claves_particion = ["cedula_remitente", "numero_cuenta_servicio"]
window_spec = (
    Window.partitionBy(*claves_particion)
    .orderBy("fecha_transaccion")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Aggregates evaluated over the running window (current row included).
conteo_hist = count(lit(1)).over(window_spec)
promedio_dias = avg(col("dias_diferencia_anterior_transaccion")).over(window_spec)
promedio_monto = avg(col("monto")).over(window_spec)

df_features = (
    df_modelo_servicios
    .withColumn("conteo_transacciones_hist", conteo_hist)
    .withColumn("promedio_dias_hist", promedio_dias)
    .withColumn("promedio_monto_hist", promedio_monto)
)


StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 11, Finished, Available, Finished)

In [10]:
# Columns carried through unchanged, in output order.
columnas_base = [
    "cedula_remitente",
    "numero_cuenta_servicio",
    "fecha_transaccion",
    "mes_transaccion",
    "dia_transaccion",
    "dia_semana",
    "fecha_anterior_transaccion",
    "fecha_siguiente_transaccion",
    "conteo_transacciones_hist",
    "monto",
    "monto_anterior_transaccion",
    "promedio_monto_hist",
    "dias_diferencia_anterior_transaccion",
    "promedio_dias_hist",
]

# The "next transaction" columns are renamed to serve as the label columns.
columnas_label = [
    col("dias_diferencia_siguiente_transaccion").alias("label_fecha"),
    col("monto_siguiente_transaccion").alias("label_monto"),
]

df_features = df_features.select(*columnas_base, *columnas_label)

StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 12, Finished, Available, Finished)

In [11]:
# Drop rows that lack either label BEFORE filling nulls. fillna(0) replaces
# nulls in every numeric column, so running it first would turn missing labels
# into 0 and the drops below would never remove anything, leaving rows with no
# next transaction in the dataset with a bogus label of 0.
df_features = df_features.na.drop(subset=["label_fecha", "label_monto"])

# Fill the remaining nulls in numeric columns with 0. With an integer fill
# value, fillna leaves non-numeric columns untouched.
df_features = df_features.fillna(0)

StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 13, Finished, Available, Finished)

In [12]:
# Render df_features in the notebook's DataFrame widget for visual inspection.
display(df_features)

StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 889095dc-a56c-42d0-8eb4-b92b2bbb2480)

In [13]:
# Save df_features as a table in the Spark metastore, stored as Parquet under
# the name "fs_servicios"; "overwrite" mode replaces any existing data.
(
    df_features.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("fs_servicios")
)


StatementMeta(, a843dfd2-10c8-43fb-9a47-0926a5581bd4, 15, Finished, Available, Finished)