## 1) Iniciar/Reiniciar Spark y configuración de recursos

In [2]:
try:
    spark.stop()
except:
    pass

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("cohort-analysis")
         .master("local[2]")
         .config("spark.driver.memory", "4g")
         .config("spark.sql.shuffle.partitions", "200")
         .getOrCreate())

spark


## 2) Carga del parquet y periodo mensual

In [3]:
from pyspark.sql import functions as F, Window as W

PATH_TRANSACTIONS="/home/jovyan/data/transactions.parquet"
COL_TIMESTAMP="t_dat"
COL_CUSTOMER_ID="customer_id"

df=spark.read.parquet(PATH_TRANSACTIONS)
dfn=(df
     .withColumn("_ts",F.to_timestamp(F.col(COL_TIMESTAMP).cast("string"),"yyyy-MM-dd"))
     .withColumn("period",F.date_trunc("month",F.col("_ts")))
     .select(F.col(COL_CUSTOMER_ID).alias("customer_id"),"period")
     .dropna())
activity=dfn.dropDuplicates()


## 3) Cohorte por mes de primera compra (muestra)

In [4]:
first_tx=(activity.groupBy("customer_id")
           .agg(F.min("period").alias("cohort_month")))
cohort_size=(first_tx.groupBy("cohort_month")
             .agg(F.countDistinct("customer_id").alias("n0")))
first_tx.orderBy("cohort_month","customer_id").show(20,truncate=False)


+----------------------------------------------------------------+-------------------+
|customer_id                                                     |cohort_month       |
+----------------------------------------------------------------+-------------------+
|0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa|2018-09-01 00:00:00|
|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|2018-09-01 00:00:00|
|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|2018-09-01 00:00:00|
|0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c9199e53dbb81641becd7|2018-09-01 00:00:00|
|00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d60c6d7be26f06264d6|2018-09-01 00:00:00|
|00018385675844f7a6babbed41b5655b5727fb16483b6ea51d5798a6ab947344|2018-09-01 00:00:00|
|0001f8cef6b9702d54abf66fd89eb21014bf98567065a9b5e42f37bc99528cf5|2018-09-01 00:00:00|
|00022754ec18c5e53757eea8b281632a5c4a499368ecc53a21adfc64a665ba8b|2018-09-01 00:00:00|
|00025f8226be50dcab09402a2cacd520a99e112fe0

## 4) Actividad mensual y cohort_index (muestra)

In [5]:
tx_monthly=(activity.groupBy("customer_id","period")
              .agg(F.count("*").alias("hits")))
txm=(tx_monthly.join(first_tx,"customer_id","left")
     .withColumn("cohort_index",
                 (F.months_between("period","cohort_month")/F.lit(1)).cast("int")))
txm.orderBy("cohort_month","cohort_index","customer_id").show(20,truncate=False)


+----------------------------------------------------------------+-------------------+----+-------------------+------------+
|customer_id                                                     |period             |hits|cohort_month       |cohort_index|
+----------------------------------------------------------------+-------------------+----+-------------------+------------+
|0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa|2018-09-01 00:00:00|1   |2018-09-01 00:00:00|0           |
|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|2018-09-01 00:00:00|1   |2018-09-01 00:00:00|0           |
|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|2018-09-01 00:00:00|1   |2018-09-01 00:00:00|0           |
|0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c9199e53dbb81641becd7|2018-09-01 00:00:00|1   |2018-09-01 00:00:00|0           |
|00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d60c6d7be26f06264d6|2018-09-01 00:00:00|1   |2018-09-01 00:00:00|0           |


## 5) Retención por cohorte y mes relativo (muestra)

In [6]:
active=(txm.select("cohort_month","cohort_index","customer_id").dropDuplicates())
ret=(active.groupBy("cohort_month","cohort_index")
      .agg(F.countDistinct("customer_id").alias("active_users"))
      .join(cohort_size,"cohort_month","left")
      .withColumn("retention",F.round(F.col("active_users")/F.col("n0"),4))
      .orderBy("cohort_month","cohort_index"))
ret.show(60,truncate=False)


+-------------------+------------+------------+------+---------+
|cohort_month       |cohort_index|active_users|n0    |retention|
+-------------------+------------+------------+------+---------+
|2018-09-01 00:00:00|0           |140340      |140340|1.0      |
|2018-09-01 00:00:00|1           |61551       |140340|0.4386   |
|2018-09-01 00:00:00|2           |59793       |140340|0.4261   |
|2018-09-01 00:00:00|3           |56736       |140340|0.4043   |
|2018-09-01 00:00:00|4           |53683       |140340|0.3825   |
|2018-09-01 00:00:00|5           |50781       |140340|0.3618   |
|2018-09-01 00:00:00|6           |53249       |140340|0.3794   |
|2018-09-01 00:00:00|7           |55296       |140340|0.394    |
|2018-09-01 00:00:00|8           |55332       |140340|0.3943   |
|2018-09-01 00:00:00|9           |59931       |140340|0.427    |
|2018-09-01 00:00:00|10          |57399       |140340|0.409    |
|2018-09-01 00:00:00|11          |48356       |140340|0.3446   |
|2018-09-01 00:00:00|12  

## 6) Matriz de retención (vista amplia)

In [7]:
ret_matrix=(ret.groupBy("cohort_month")
             .pivot("cohort_index")
             .agg(F.first("retention"))
             .orderBy("cohort_month"))
ret_matrix.show(20,truncate=False)


+-------------------+---+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+
|cohort_month       |0  |1     |2     |3     |4     |5     |6     |7     |8     |9     |10    |11    |12    |13    |14    |15    |16    |17    |18    |19    |20    |21    |22    |23    |24   |
+-------------------+---+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+
|2018-09-01 00:00:00|1.0|0.4386|0.4261|0.4043|0.3825|0.3618|0.3794|0.394 |0.3943|0.427 |0.409 |0.3446|0.3638|0.3561|0.3646|0.3509|0.3156|0.3035|0.2758|0.268 |0.3006|0.3657|0.328 |0.3152|0.243|
|2018-10-01 00:00:00|1.0|0.3582|0.3335|0.3168|0.2961|0.3109|0.3258|0.33  |0.3579|0.3466|0.2818|0.2886|0.2975|0.3011|0.2926|0.2588|0.2486|0.2243|0.2171|0.247 |0.3076|0.2743|0.2599|0.1932|NULL |
|2018-11-01 00:00:00|1.0|0.2637|0.2