In [None]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pyspark.sql import functions as F

In [22]:
# Obtain the Spark session for this notebook (reuses an existing one if present,
# otherwise creates it) under the application name 'Insights'.
spark = (
    SparkSession.builder
    .appName('Insights')
    .getOrCreate()
)

In [23]:
# Load the cleaned transaction-level dataset (Parquet) into a Spark DataFrame.
# The path is relative to the notebook's working directory.
df = spark.read.parquet(
    "../../data/processed/cleanedData.parquet"
)

In [24]:
# Preview the loaded frame: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out here).
df.show(n=20, truncate=True)

+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      4|


In [29]:
# --- Customer lifetime value (CLV) per customer -------------------------------
#
# Step 1: collapse the transaction-level frame to one row per Customer_ID.
#   Type                - max() over the Customer_Segment strings, i.e. the
#                         lexicographically greatest label seen for the customer
#   Total_Spend         - sum of Total_Amount, rounded to 2 decimals
#   Total_Purchases     - sum of Total_Purchases
#   First/Last_Purchase - earliest / latest Date for the customer
dfGrouped = df.groupBy('Customer_ID').agg(
    F.max('Customer_Segment').alias('Type'),
    F.round(F.sum("Total_Amount"), 2).alias("Total_Spend"),
    F.sum("Total_Purchases").alias("Total_Purchases"),
    F.min("Date").alias("First_Purchase_Date"),
    F.max("Date").alias("Last_Purchase_Date"),
)

# Step 2: Lifespan = days between first and last purchase, divided by 365 and
# rounded to 2 decimals. The helper date columns and the Customer_ID key are
# dropped afterwards, so only the feature columns remain.
lifeSpanDF = dfGrouped.withColumn(
    'Lifespan',
    F.round(
        F.datediff(F.col("Last_Purchase_Date"), F.col("First_Purchase_Date")) / 365,
        2,
    ),
).drop('First_Purchase_Date', 'Last_Purchase_Date', 'Customer_ID')

# Step 3: CLV = (Total_Purchases / Lifespan) * Total_Spend, rounded to 2 decimals.
#
# Lifespan is 0 when all of a customer's purchases fall on one date (or when the
# gap rounds to 0.00 years). Dividing by that 0 only yields NULL while ANSI SQL
# mode is off; with spark.sql.ansi.enabled=true it raises a divide-by-zero
# error. The when() below has no otherwise(), so a zero Lifespan becomes a NULL
# divisor and CLV is NULL for those customers in either mode.
#
# NOTE(review): CLV is computed from the *rounded* Lifespan, so very short
# lifespans lose precision (or become NULL). Rows with NULL CLV need to be
# filtered or imputed before being used downstream - confirm intended handling.
clvDF = lifeSpanDF.withColumn(
    'CLV',
    F.round(
        (F.col('Total_Purchases') / F.when(F.col('Lifespan') != 0, F.col('Lifespan')))
        * F.col('Total_Spend'),
        2,
    ),
)
clvDF.show()

+-------+-----------+---------------+--------+---------+
|   Type|Total_Spend|Total_Purchases|Lifespan|      CLV|
+-------+-----------+---------------+--------+---------+
|Regular|    1997.33|             13|   22.62|  1147.89|
|Regular|    2506.75|             23|   12.28|  4695.05|
|Regular|    1014.54|              7|     0.0|     NULL|
|Regular|    5194.43|             22|   10.83| 10551.94|
|Regular|     7291.8|             30|    16.1|  13587.2|
|Premium|    5157.83|             18|   21.86|  4247.07|
|Regular|    9831.52|             33|   20.33| 15958.69|
|    New|    2595.12|              7|     0.0|     NULL|
|Regular|     5289.4|             13|   13.36|  5146.87|
|Regular|     8172.4|             22|   17.17| 10471.33|
|Regular|    3397.38|             12|    5.17|   7885.6|
|Regular|     1742.4|             20|    0.93| 37470.97|
|Regular|     642.28|              4|   10.39|   247.27|
|Premium|    5897.23|             19|   19.76|  5670.41|
|Regular|    8834.12|          