In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql import functions as F
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [23]:
# Obtain the Spark session for this notebook under the app name 'Insights';
# getOrCreate() hands back an already-running session if one exists.
sessionBuilder = SparkSession.builder.appName('Insights')
spark = sessionBuilder.getOrCreate()

In [49]:
# Load the cleaned transaction data as a Spark DataFrame.
# The path is relative to the notebook's working directory.
cleanedDataPath = "../../data/processed/cleanedData.parquet"
df = spark.read.parquet(cleanedDataPath)

In [50]:
# Build one row of churn-model features per customer and persist the result
# as parquet so it can be reloaded with pandas.

# --- Per-customer aggregates plus the two most recent purchase dates ---------
purchaseDates = F.col("Purchase_Dates")
dfGrouped = (
    df.groupBy("Customer_ID")
    .agg(
        # collect_list keeps duplicate dates; array_sort orders them ascending.
        F.array_sort(F.collect_list("Date")).alias("Purchase_Dates"),
        F.round(F.sum("Total_Amount"), 2).alias("Total_Spend"),
        F.sum("Total_Purchases").alias("Total_Purchases"),
    )
    # A negative index in element_at counts from the end of the array.
    .withColumn("Last_Purchase", F.element_at(purchaseDates, -1))
    # No otherwise-branch: the column is null when only one date was collected.
    .withColumn(
        "Second_Last_Purchase",
        F.when(F.size(purchaseDates) > 1, F.element_at(purchaseDates, -2)),
    )
)

# --- "Recency": days between the two most recent purchase dates --------------
secondLast = F.col("Second_Last_Purchase")
gapInDays = F.datediff(F.col("Last_Purchase"), secondLast)
dfGrouped = (
    dfGrouped
    .withColumn("Recency", F.when(secondLast.isNotNull(), gapInDays).otherwise(0))
    .drop("Purchase_Dates")
    # Keeps only positive gaps. This removes customers with a single collected
    # date (Recency forced to 0 above) and also customers whose two latest
    # dates are the same day (datediff == 0).
    # NOTE(review): confirm the same-day exclusion is intended.
    .where(F.col("Recency") > 0)
)

# --- Churn label and average order value -------------------------------------
# Most recent date in the whole dataset, used as the reference point for churn.
latestDate = df.agg(F.max("Date")).first()[0]
daysSinceLastPurchase = F.datediff(F.lit(latestDate), F.col("Last_Purchase"))
churnDf = (
    dfGrouped
    # Churn = 1 when the last purchase is more than 90 days before latestDate.
    .withColumn("Churn", F.when(daysSinceLastPurchase > 90, 1).otherwise(0))
    .withColumn(
        "Avg_Order_Value",
        F.round(F.col("Total_Spend") / F.col("Total_Purchases"), 2),
    )
    # Remove the identifier and helper date columns before writing.
    .drop("Last_Purchase", "Second_Last_Purchase", "Customer_ID")
)
churnDf.write.mode("overwrite").parquet('data/processed/temp/churnData.parquet')


In [56]:
# Reload the churn features written by Spark as a pandas DataFrame (pyarrow reader).
# Note: this rebinds `df`, which referred to the Spark DataFrame in earlier cells.
churnDataPath = 'data/processed/temp/churnData.parquet'
df = pd.read_parquet(churnDataPath, engine='pyarrow')

In [66]:
# Plain-text preview of the first five rows of the feature table.
print(df.head(n=5))

   Total_Spend  Total_Purchases  Recency  Churn  Avg_Order_Value
0      1997.33               13       38      1           153.64
1      5194.43               22       93      1           236.11
2      7291.80               30       26      1           243.06
3      5157.83               18       40      1           286.55
4      9831.52               33       79      0           297.92


In [57]:
# Separate the target ('Churn') from the feature columns, then hold out 20% of
# the rows; the fixed random_state makes the shuffle repeatable.
# NOTE(review): X_test / y_test are not used by any cell visible in this notebook.
y = df['Churn']
X = df.drop(columns=['Churn'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Fit an XGBoost classifier on the training split.
# The seed is fixed for repeatability; log-loss is set as the evaluation metric.
xgbParams = {"random_state": 42, "eval_metric": "logloss"}
model = XGBClassifier(**xgbParams)
# Kept as the last expression so the cell still displays the fitted estimator.
model.fit(X_train, y_train)

In [None]:
# Score one hypothetical customer with the trained model.
# Column names and order mirror the training features (every column of the
# churn dataset except 'Churn').
newCustomer = pd.DataFrame({
    "Total_Spend": [9831.52],
    "Total_Purchases": [33],
    "Recency": [79],
    # NOTE(review): the training pipeline derives this column as
    # round(Total_Spend / Total_Purchases, 2), which is 297.92 for the values
    # above; 300 looks hand-entered. Confirm it is intentional.
    "Avg_Order_Value": [300]
})

# Hard 0/1 class label for the customer.
churnPrediction = model.predict(newCustomer)
# predict_proba returns one column per class; column 1 is P(Churn == 1).
churnProbability = model.predict_proba(newCustomer)[:, 1]

# Label both values so the printed output is self-describing and matches the
# output recorded for this cell ("Churn Prediction: ..." / "Churn Probability: ...").
print("Churn Prediction:", churnPrediction)
print("Churn Probability:", churnProbability)

Churn Prediction: [0]
Churn Probability: [0.38429555]
