In [9]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [10]:
spark = SparkSession.builder.appName('sparkApp').config("spark.python.worker.timeout", "120") .getOrCreate()

In [20]:
df = spark.read.parquet("../../data/processed/cleanedData.parquet")
df.show()

+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      4|


In [21]:
df = df.groupBy("Product_Category", "Date").agg(F.sum(F.col('Total_Purchases')).alias("Total_Purchases")).orderBy("Date", "Product_Category")
df.show(50)

+----------------+----------+---------------+
|Product_Category|      Date|Total_Purchases|
+----------------+----------+---------------+
|           Books|2001-01-24|            747|
|        Clothing|2001-01-24|            732|
|     Electronics|2001-01-24|           1062|
|         Grocery|2001-01-24|            882|
|      Home Decor|2001-01-24|            774|
|           Books|2001-02-24|            954|
|        Clothing|2001-02-24|            733|
|     Electronics|2001-02-24|            982|
|         Grocery|2001-02-24|            821|
|      Home Decor|2001-02-24|            651|
|           Books|2001-03-24|            840|
|        Clothing|2001-03-24|            838|
|     Electronics|2001-03-24|            868|
|         Grocery|2001-03-24|            954|
|      Home Decor|2001-03-24|            897|
|           Books|2001-04-24|            775|
|        Clothing|2001-04-24|            743|
|     Electronics|2001-04-24|            949|
|         Grocery|2001-04-24|     

In [56]:
df.createOrReplaceTempView("sales_data")

In [57]:
minMaxDates = spark.sql("""
    SELECT 
        Product_Category, 
        MIN(Date) AS min_date, 
        MAX(Date) AS max_date 
    FROM sales_data 
    GROUP BY Product_Category
""")
minMaxDates.createOrReplaceTempView("minMaxDates")


dateSeries = spark.sql("""
    SELECT 
        Product_Category, 
        date_add(min_date, idx) AS Date
    FROM (
        SELECT 
            Product_Category, 
            min_date, 
            max_date, 
            posexplode(
                split(space(datediff(max_date, min_date)), ' ')
            ) AS (idx, _)
        FROM minMaxDates
    )
""")
dateSeries.createOrReplaceTempView("dateSeries")


dfFilled = spark.sql("""
    SELECT 
        ds.Product_Category, 
        ds.Date, 
        COALESCE(sd.Total_Purchases, 0) AS Total_Purchases
    FROM dateSeries ds
    LEFT JOIN sales_data sd
    ON ds.Product_Category = sd.Product_Category AND ds.Date = sd.Date
""")
dfFilled.createOrReplaceTempView("filled_data")


dfFilled.show()


+----------------+----------+---------------+
|Product_Category|      Date|Total_Purchases|
+----------------+----------+---------------+
|         Grocery|2001-01-24|            882|
|         Grocery|2001-01-25|              0|
|         Grocery|2001-01-26|              0|
|         Grocery|2001-01-27|              0|
|         Grocery|2001-01-28|              0|
|         Grocery|2001-01-29|              0|
|         Grocery|2001-01-30|              0|
|         Grocery|2001-01-31|              0|
|         Grocery|2001-02-01|              0|
|         Grocery|2001-02-02|              0|
|         Grocery|2001-02-03|              0|
|         Grocery|2001-02-04|              0|
|         Grocery|2001-02-05|              0|
|         Grocery|2001-02-06|              0|
|         Grocery|2001-02-07|              0|
|         Grocery|2001-02-08|              0|
|         Grocery|2001-02-09|              0|
|         Grocery|2001-02-10|              0|
|         Grocery|2001-02-11|     

In [58]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate

In [30]:
def salesPreProcess(data, seqLength = 30):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaledData = scaler.fit_transform(data)
    
    X, y = [], []
    for i in range(seqLength, len(scaledData)):
        X.append(scaledData[i-seqLength:i, 0])
        y.append(scaledData[i, 0])
    
    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    return X, y, scaler

In [32]:
def lstmModel(inputShape):
    model = Sequential()
    model.add(Input(shape=inputShape)) 
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(25))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [33]:
def prediction(model, data, scaler, seqLength = 30):
    lastSequence = data[-seqLength:]
    lastSequence = np.reshape(lastSequence, (1, seqLength, 1))
    predictedScaled = model.predict(lastSequence)
    predictedValue = scaler.inverse_transform(predictedScaled)
    return predictedValue[0][0]

In [34]:
predictions = {}
pdf = dfInterpolated.toPandas()

for product_category in pdf['Product_Category'].unique():
    categoryDf = pdf[pdf['Product_Category'] == product_category]
    categoryDf = categoryDf.sort_values(by='Date')
    
    data = categoryDf['Total_Purchases'].values.reshape(-1, 1)
    X, y, scaler = salesPreProcess(data)

    inputShape = (X.shape[1], 1)
    model = lstmModel(inputShape)
    model.fit(X, y, batch_size=1, epochs=1, verbose=2)
    
    predictedResults = prediction(model, data, scaler)
    predictions[product_category] = predictedResults
    
for category, pred in predictions.items():
    print(f"Predicted Total Purchases for {category} next month: {pred:.2f}")

8407/8407 - 60s - 7ms/step - loss: 0.0159
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step
8407/8407 - 73s - 9ms/step - loss: 0.0171
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step
8407/8407 - 104s - 12ms/step - loss: 0.0181
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 788ms/step
8407/8407 - 156s - 19ms/step - loss: 0.0167
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 651ms/step


KeyboardInterrupt: 

In [59]:
def preprocessData(df, seqLength):
    scaler = MinMaxScaler(feature_range=(0, 1))
    df['Total_Purchases'] = scaler.fit_transform(df[['Total_Purchases']])
    encoder = OneHotEncoder()  # No sparse argument
    categoryEncoded = encoder.fit_transform(df[['Product_Category']]).toarray()  # Convert to dense array
    X, y = [], []
    for category in df['Product_Category'].unique():
        categoryDf = df[df['Product_Category'] == category]
        categoryData = categoryDf['Total_Purchases'].values
        categoryIndices = df[df['Product_Category'] == category].index
        categoryFeatures = categoryEncoded[categoryIndices]  # Use indices for proper alignment
        for i in range(seqLength, len(categoryData)):
            X.append(np.column_stack((categoryData[i-seqLength:i], categoryFeatures[i-seqLength:i])))
            y.append(categoryData[i])
    X, y = np.array(X), np.array(y)
    return X, y, scaler, encoder

In [60]:
def buildLSTMModel(inputShape, numCategories):
    inputSequence = Input(shape=inputShape)
    inputCategory = Input(shape=(numCategories,))
    
    # LSTM for time series data
    lstmOut = LSTM(50, return_sequences=True)(inputSequence)
    lstmOut = LSTM(50, return_sequences=False)(lstmOut)
    
    # Concatenate LSTM output with category features
    combined = Concatenate()([lstmOut, inputCategory])
    
    # Dense layers
    output = Dense(25, activation='relu')(combined)
    output = Dense(1)(output)
    
    # Define model
    model = Model(inputs=[inputSequence, inputCategory], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [61]:
def predictNextMonth(model, data, scaler, encoder, seqLength, category):
    categoryData = data[data['Product_Category'] == category]
    lastSequence = categoryData['Total_Purchases'].values[-seqLength:]
    lastCategoryFeatures = encoder.transform([[category]] * seqLength).toarray()  # Convert to dense array
    lastSequence = np.column_stack((lastSequence, lastCategoryFeatures))
    lastSequence = np.reshape(lastSequence, (1, seqLength, -1))
    predictedScaledValue = model.predict([lastSequence, lastCategoryFeatures[:1]])
    predictedValue = scaler.inverse_transform(predictedScaledValue)
    return predictedValue[0][0]

In [62]:
pandasDf = dfFilled.toPandas()
seqLength = 30
X, y, scaler, encoder = preprocessData(pandasDf, seqLength)
numCategories = len(pandasDf['Product_Category'].unique())

# Fix input shape for LSTM
inputShape = (seqLength, 1 + numCategories)  # sequence_length, num_features (time series + one-hot encoded categories)
model = buildLSTMModel(inputShape, numCategories)

# Fix input data for training
X_sequence = X[:, :, 0:1]  # Time series data (sequence_length, 1)
X_category = X[:, -1, 1:]  # Category features (last row of one-hot encoded features)
model.fit([X_sequence, X_category], y, batch_size=1, epochs=10, verbose=1)

predictions = {}
for category in pandasDf['Product_Category'].unique():
    predictedPurchases = predictNextMonth(model, pandasDf, scaler, encoder, seqLength, category)
    predictions[category] = predictedPurchases

for category, pred in predictions.items():
    print(f"Predicted Total Purchases for {category} next month: {pred:.2f}")

Epoch 1/10


ValueError: Exception encountered when calling LSTMCell.call().

[1mDimensions must be equal, but are 1 and 6 for '{{node functional_30_1/lstm_18_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](functional_30_1/lstm_18_1/strided_slice_1, functional_30_1/lstm_18_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [1,1], [6,200].[0m

Arguments received by LSTMCell.call():
  • inputs=tf.Tensor(shape=(1, 1), dtype=float32)
  • states=('tf.Tensor(shape=(1, 50), dtype=float32)', 'tf.Tensor(shape=(1, 50), dtype=float32)')
  • training=True