In [22]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [23]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Insights')
    .getOrCreate()
)

In [32]:
# Load the cleaned dataset; the path is resolved relative to the working directory.
df = spark.read.parquet(
    "../../data/processed/cleanedData.parquet"
)

In [45]:
# Churn feature engineering.
# Produces one row per Customer_ID with total spend, total purchases, the gap
# in days between the two most recent purchase dates ("Recency"), a binary
# churn label and the average order value, then writes the result to parquet.

# A customer is labelled as churned when their last purchase is more than this
# many days before the latest date present in the dataset.
CHURN_THRESHOLD_DAYS = 90

# Per-customer aggregates. Purchase_Dates is sorted ascending, so the most
# recent dates sit at the end of the array.
customerAgg = df.groupBy("Customer_ID").agg(
    F.array_sort(F.collect_list("Date")).alias("Purchase_Dates"),
    F.round(F.sum("Total_Amount"), 2).alias("Total_Spend"),
    F.sum("Total_Purchases").alias("Total_Purchases"),
)

purchaseDates = F.col("Purchase_Dates")

# NOTE(review): "Recency" here is the number of days between the last two
# entries of Purchase_Dates, not the time elapsed since the last purchase.
# Customers with a single entry get 0 and are removed by the filter below.
# Because collect_list keeps duplicates, customers whose two most recent rows
# share the same date also get 0 and are removed -- confirm this is intended.
dfGrouped = (
    customerAgg
    .withColumn("Last_Purchase", F.element_at(purchaseDates, -1))
    .withColumn(
        "Second_Last_Purchase",
        # No otherwise(): stays null when the customer has only one entry.
        F.when(F.size(purchaseDates) > 1, F.element_at(purchaseDates, -2)),
    )
    .withColumn(
        "Recency",
        F.when(F.col("Second_Last_Purchase").isNull(), 0)
         .otherwise(F.datediff(F.col("Last_Purchase"), F.col("Second_Last_Purchase"))),
    )
    .drop("Purchase_Dates")
    .where(F.col("Recency") > 0)
)

# Reference date for the churn label: the most recent Date in the input data.
latestDate = df.select(F.max("Date")).collect()[0][0]

daysSinceLastPurchase = F.datediff(F.lit(latestDate), F.col("Last_Purchase"))

churnDf = (
    dfGrouped
    .withColumn(
        "Churn",
        F.when(daysSinceLastPurchase > CHURN_THRESHOLD_DAYS, 1).otherwise(0),
    )
    .withColumn(
        "Avg_Order_Value",
        F.round(F.col("Total_Spend") / F.col("Total_Purchases"), 2),
    )
    # The helper date columns are only needed to derive Recency and Churn.
    .drop("Last_Purchase", "Second_Last_Purchase")
)

# NOTE(review): this output path is relative to the working directory and does
# not carry the "../../" prefix used when reading the input -- confirm the
# intended destination.
churnDf.write.parquet('data/processed/temp/churnData.parquet', mode='overwrite')
