## 1) Iniciar/Reiniciar Spark y configuración de recursos


In [1]:
from pyspark.sql import SparkSession

# Stop a session left over from a previous run of this notebook before building
# a new one. On a fresh kernel the name `spark` does not exist yet, which is the
# only failure we expect here; any other error from stop() is allowed to surface
# instead of being silently swallowed.
try:
    spark.stop()
except NameError:
    pass

# Local session with two worker threads.
# NOTE(review): spark.driver.memory is presumably only honoured when the JVM is
# first launched; re-running this cell in the same kernel may keep the old heap
# size - confirm, and restart the kernel when changing it.
spark = (
    SparkSession.builder
    .appName("churn-analysis")
    .master("local[2]")
    .config("spark.driver.memory", "6g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

## 2) Carga del parquet y construcción de `activity` (cliente × mes)


In [2]:
from pyspark.sql import functions as F, Window as W

# Input location and the source column names used to build the activity table.
PATH_TRANSACTIONS = "/home/jovyan/data/transactions.parquet"
COL_TIMESTAMP = "t_dat"
COL_CUSTOMER_ID = "customer_id"

df = spark.read.parquet(PATH_TRANSACTIONS)

# Parse the raw date column through its string form, then truncate it to the
# month so that every transaction maps to a single monthly period.
event_ts = F.to_timestamp(F.col(COL_TIMESTAMP).cast("string"))
month_start = F.date_trunc("month", event_ts)

# One row per (customer, month) in which the customer has at least one
# transaction; rows with a null id or unparseable date are dropped.
activity = (
    df.select(
        F.col(COL_CUSTOMER_ID).alias("customer_id"),
        month_start.alias("period"),
    )
    .dropna()
    .dropDuplicates()
)

## 3) Activos, `prev_actives`, `survivors` y métricas de churn/retención

In [3]:
from pyspark.storagelevel import StorageLevel

# Persist the activity table on disk and run count() so it is materialised
# once before the aggregations below reuse it.
activity = activity.repartition(400, "period").persist(StorageLevel.DISK_ONLY)
activity.count()

# Distinct active customers per period, with the previous row's count
# (ordered by period) alongside as the baseline for the rates.
by_period = W.orderBy("period")
actives = (
    activity.groupBy("period")
    .agg(F.countDistinct("customer_id").alias("actives"))
    .withColumn("prev_actives", F.lag("actives").over(by_period))
)

# A row counts as a survivor when the same customer's previous active period
# is exactly one month earlier; a customer's first row has no previous period,
# so the flag is null there and is ignored by the sum.
per_customer = W.partitionBy("customer_id").orderBy("period")
is_consecutive = F.expr("months_between(period, prev_period)=1")
seq = (
    activity
    .withColumn("prev_period", F.lag("period").over(per_customer))
    .withColumn("is_survivor", is_consecutive.cast("int"))
)
survivors = seq.groupBy("period").agg(F.sum("is_survivor").alias("survivors"))

# Churn and retention relative to the previous period's actives. The first
# period has no baseline, so its metrics are left null rather than computed.
no_baseline = F.col("prev_actives").isNull()
lost_customers = F.col("prev_actives") - F.col("survivors")
churn_share = F.round(F.col("churn_count") / F.col("prev_actives"), 4)
retained_share = F.round(F.col("survivors") / F.col("prev_actives"), 4)
churn = (
    actives.join(survivors, "period", "left")
    .fillna({"survivors": 0})
    .withColumn("churn_count", F.when(no_baseline, None).otherwise(lost_customers))
    .withColumn("churn_rate", F.when(no_baseline, None).otherwise(churn_share))
    .withColumn("retention_rate", F.when(no_baseline, None).otherwise(retained_share))
    .orderBy("period")
)

## 4) Nuevos y reactivados; ensamblado de KPIs (`kpis`)

In [4]:
# Earliest active period of every customer.
first_period = activity.groupBy("customer_id").agg(
    F.min("period").alias("first_period")
)

# New users per period: customers whose activity row falls on their first period.
is_first_month = (F.col("period") == F.col("first_period")).cast("int")
new_counts = (
    activity.join(first_period, "customer_id", "left")
    .withColumn("is_new", is_first_month)
    .groupBy("period")
    .agg(F.sum("is_new").alias("new_users"))
)

# Reactivated = actives that are neither survivors nor new, floored at zero.
neither_retained_nor_new = F.col("actives") - F.col("survivors") - F.col("new_users")
kpis = (
    churn.join(new_counts, "period", "left")
    .fillna({"new_users": 0})
    .withColumn("reactivated", F.greatest(neither_retained_nor_new, F.lit(0)))
    .orderBy("period")
)

## 5) Identificación de churners y período de churn


In [5]:
from pyspark.sql import functions as F, Window as W

w = W.partitionBy("customer_id").orderBy("period")

# Last month present in the data, kept as a one-row frame so the pipeline stays
# lazy. Used to discard churn events that the data cannot confirm.
last_observed = activity.agg(F.max("period").alias("last_period"))

# A customer churns after `period` when their next active period is missing or
# is not the immediately following month.
breaks_streak = F.col("next_period").isNull() | (
    F.months_between("next_period", "period") != 1
)

# One row per churn event: the customer and the month in which they were
# absent. Events dated after the last observed month are dropped: everyone
# active in the final month has no next period simply because the data ends
# there (right-censoring), not because they left.
churners = (
    activity
    .withColumn("next_period", F.lead("period").over(w))
    .withColumn("churn_at", F.when(breaks_streak, F.add_months("period", 1)))
    .crossJoin(F.broadcast(last_observed))
    .where(
        F.col("churn_at").isNotNull()
        & (F.col("churn_at") <= F.to_date("last_period"))
    )
    .select("churn_at", "customer_id")
)
churners.orderBy("churn_at", "customer_id").show(50, truncate=False)

+----------+----------------------------------------------------------------+
|churn_at  |customer_id                                                     |
+----------+----------------------------------------------------------------+
|2018-10-01|0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa|
|2018-10-01|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|
|2018-10-01|0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c9199e53dbb81641becd7|
|2018-10-01|0002b9088196900626214bcdc6d5f3d85f38fb462a480605bd50a03431600dee|
|2018-10-01|0002e6cdaab622b5047407efc0d0bf85e23220e092012095d98119fa9cb3cee7|
|2018-10-01|0003abe64294e66a6310c3436fa9e5b754cc5603deef4f26fc8ab8d043af9358|
|2018-10-01|0003c98542d16e740156638a7d133d93360d3ff3d0b7ec3ccb78932a915b1cfe|
|2018-10-01|0004068f54dbe1c7054b23c615edc5f733a508ecc54930bf323209f20410898c|
|2018-10-01|00048f2f68760664d2d0fa1e7fbfe083f05287f342484c29a13661772507ee6b|
|2018-10-01|0005ef6698aeae9819c35dd1d63446f251797d07fe5b4a621bee