In [1]:

from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql import functions as F
from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [8]:
spark = SparkSession.builder.appName('Insights').getOrCreate()

In [9]:
df = spark.read.parquet("../../data/processed/cleanedData.parquet")

In [24]:
df.show()

+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      4|


In [11]:
dfGrouped = df.groupBy('Customer_ID').agg(F.max('Customer_Segment').alias('Type'), F.round(F.sum("Total_Amount") ,2).alias("Total_Spend"), F.sum("Total_Purchases").alias("Total_Purchases"), F.min("Date").alias("First_Purchase_Date"), F.max("Date").alias("Last_Purchase_Date"))
lifeSpanDF = dfGrouped.withColumn('Lifespan', F.round(((F.datediff(F.col("Last_Purchase_Date"), F.col("First_Purchase_Date")))/365), 2))
lifeSpanDF = lifeSpanDF.drop('First_Purchase_Date', 'Last_Purchase_Date', 'Customer_ID')
clvDF = lifeSpanDF.withColumn('CLV', F.round(((F.col('Total_Purchases'))/(F.col('Lifespan')))*(F.col('Total_Spend')), 2))
clvDF = clvDF.where(F.col('Lifespan') != 0)
clvDF.show()

+-------+-----------+---------------+--------+---------+
|   Type|Total_Spend|Total_Purchases|Lifespan|      CLV|
+-------+-----------+---------------+--------+---------+
|Regular|    1997.33|             13|   22.62|  1147.89|
|Regular|    2506.75|             23|   12.28|  4695.05|
|Regular|    5194.43|             22|   10.83| 10551.94|
|Regular|     7291.8|             30|    16.1|  13587.2|
|Premium|    5157.83|             18|   21.86|  4247.07|
|Regular|    9831.52|             33|   20.33| 15958.69|
|Regular|     5289.4|             13|   13.36|  5146.87|
|Regular|     8172.4|             22|   17.17| 10471.33|
|Regular|    3397.38|             12|    5.17|   7885.6|
|Regular|     1742.4|             20|    0.93| 37470.97|
|Regular|     642.28|              4|   10.39|   247.27|
|Premium|    5897.23|             19|   19.76|  5670.41|
|Regular|    8834.12|             30|   20.08| 13198.39|
|Regular|    6067.29|             17|   20.59|  5009.42|
|Regular|    4745.26|          

In [None]:

print(df.head())

   Total_Spend  Total_Purchases  Lifespan       CLV  Type_New  Type_Premium  \
0      1997.33               13     22.62   1147.89         1             0   
1      2506.75               23     12.28   4695.05         0             0   
2      1014.54                7      0.00       NaN         0             0   
3      5194.43               22     10.83  10551.94         1             0   
4      7291.80               30     16.10  13587.20         0             1   

   Type_Regular  
0             0  
1             1  
2             1  
3             0  
4             0  


In [None]:
df = pd.read_parquet('../../data/processed/model/clvData.parquet', engine='pyarrow')
df = pd.get_dummies(df, columns=['Type'], dtype=int)
  
X = df[['Total_Spend', 'Total_Purchases', 'Lifespan', 'Type_New', 'Type_Premium', 'Type_Regular']]
y = df['CLV']
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print(X_train)
print(y_train)

[[0.19810105 0.16666667 0.0064907  0.         0.         1.        ]
 [0.07728412 0.17948718 0.83643444 1.         0.         0.        ]
 [0.31142748 0.23076923 0.02163566 0.         0.         1.        ]
 ...
 [0.22552058 0.30769231 0.89917785 1.         0.         0.        ]
 [0.35511632 0.35897436 0.03894418 0.         0.         1.        ]
 [0.03815449 0.05128205 0.         0.         1.         0.        ]]
27797    504560.93
46681      1641.46
70857    322722.60
64326       536.11
11010      1735.29
           ...    
6265       1891.67
54886      4868.20
76820      7402.36
860      311996.82
15795          NaN
Name: CLV, Length: 69024, dtype: float64


In [None]:
model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1) 
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))
  
  

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m4205/4314[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - loss: nan - mae: nan

KeyboardInterrupt: 