In [5]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pyspark.sql import functions as F

In [2]:
# Obtain the Spark session used by this notebook; getOrCreate() hands back
# an already-active session instead of building a second one.
spark = (
    SparkSession.builder
    .appName('Insights')
    .getOrCreate()
)

In [3]:
# Read the processed Parquet file into a Spark DataFrame. The path is
# relative, so it resolves against the kernel's working directory.
df = spark.read.parquet(
    "../../data/processed/cleanedData.parquet"
)

In [4]:
# Preview the loaded data; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      4|


In [15]:
# Collapse the transaction rows to one row per customer: summed spend
# (rounded to 2 decimals), summed purchase count, and the earliest and
# latest values of Date.
dfGrouped = (
    df.groupBy('Customer_ID')
    .agg(
        F.round(F.sum("Total_Amount"), 2).alias("Total_Spend"),
        F.sum("Total_Purchases").alias("Total_Purchases"),
        F.min("Date").alias("First_Purchase_Date"),
        F.max("Date").alias("Last_Purchase_Date"),
    )
)

# Lifespan = days between first and last purchase divided by 365, rounded to
# 2 decimals. The date columns and Customer_ID are dropped afterwards, so
# only Total_Spend, Total_Purchases and Lifespan remain.
# NOTE(review): the displayed output contains lifespans above 20 years --
# confirm the Date column covers the range you expect.
lifeSpanDF = (
    dfGrouped
    .withColumn(
        'Lifespan',
        F.round(
            F.datediff(F.col("Last_Purchase_Date"), F.col("First_Purchase_Date")) / 365,
            2,
        ),
    )
    .drop('First_Purchase_Date', 'Last_Purchase_Date', 'Customer_ID')
)

# CLV as defined here is Total_Purchases * Lifespan, rounded to 2 decimals.
# NOTE(review): Total_Spend is not part of this product, and a customer whose
# first and last purchase fall on the same date gets CLV = 0 -- confirm this
# is the intended definition.
clvDF = lifeSpanDF.withColumn(
    'CLV',
    F.round(F.col('Total_Purchases') * F.col('Lifespan'), 2),
)
clvDF.show()



+-----------+---------------+--------+------+
|Total_Spend|Total_Purchases|Lifespan|   CLV|
+-----------+---------------+--------+------+
|    8280.01|             23|   17.11|393.53|
|   15413.02|             49|   18.51|906.99|
|    3395.03|             15|   21.59|323.85|
|   11753.28|             31|   18.61|576.91|
|    4122.42|             28|    22.0| 616.0|
|    7837.99|             29|    15.1| 437.9|
|    6934.44|             24|   22.66|543.84|
|    3161.61|             22|   21.24|467.28|
|    6048.51|             31|   19.08|591.48|
|    1631.95|             20|   11.36| 227.2|
|    4474.39|             24|    4.34|104.16|
|    8080.24|             33|   18.49|610.17|
|    4016.68|             12|     0.5|   6.0|
|    9775.98|             25|   15.78| 394.5|
|    7440.34|             23|   15.84|364.32|
|     228.87|              2|     0.0|   0.0|
|    7718.09|             21|    0.83| 17.43|
|    2439.95|              7|   15.15|106.05|
|    6374.66|             15|    0