In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
spark = SparkSession.builder.appName('sales').getOrCreate()

In [4]:
df = spark.read.parquet("../../data/processed/cleanedData.parquet")
df.show()

+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      4|


In [None]:
groupedDf = df.groupBy('Date').agg(F.sum(F.col('Total_Amount')).alias('Total_Sales')).orderBy('Date')
minDate, maxDate = df.select(F.min("Date"), F.max("Date")).first()
newDf = spark.range(0, (maxDate - minDate).days + 1).withColumn("Date", F.date_add(F.lit(minDate), F.col("id").cast("int"))).drop("id")
resultDf = newDf.join(groupedDf, on="Date", how="left")
resultDf = resultDf.fillna(0, subset=["Total_Sales"])
orderedDf = resultDf.orderBy("Date")

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, ReLU
from datetime import timedelta

pandas_df = orderedDf.toPandas()

# Normalize the 'Total_Sales' column
scaler = MinMaxScaler(feature_range=(0, 1))
pandas_df['Total_Sales'] = scaler.fit_transform(pandas_df[['Total_Sales']])

# Prepare the time series data
def create_sequences(data, seq_length, pred_length):
    X, y = [], []
    for i in range(len(data) - seq_length - pred_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length:i + seq_length + pred_length])
    return np.array(X), np.array(y)

# Define sequence length and prediction length
seq_length = 30  # Use the last 30 days to predict the next 6 months
pred_length = 180  # 6 months * 30 days/month

# Create sequences
X, y = create_sequences(pandas_df['Total_Sales'].values, seq_length, pred_length)

# Split into training and testing sets
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Reshape X to be compatible with LSTM input (samples, timesteps, features)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [None]:
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(30, 1)))  # First LSTM layer
model.add(LSTM(50, return_sequences=False))  # Second LSTM layer
model.add(Dense(180))  # Output layer to predict the next 6 months
model.add(ReLU())  # Add ReLU activation to ensure non-negative predictions

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test))

  super().__init__(**kwargs)


Epoch 1/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - loss: 0.0157 - val_loss: 0.0631
Epoch 2/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.0144 - val_loss: 0.0629
Epoch 3/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.0134 - val_loss: 0.0665
Epoch 4/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.0132 - val_loss: 0.0670
Epoch 5/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.0129 - val_loss: 0.0678
Epoch 6/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - loss: 0.0129 - val_loss: 0.0677
Epoch 7/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 0.0126 - val_loss: 0.0681
Epoch 8/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 0.0126 - val_loss: 0.0681
Epoch 9/50
[1m103/103[0m [32m

<keras.src.callbacks.history.History at 0x25787561d60>

In [10]:
last_sequence = pandas_df['Total_Sales'].values[-seq_length:].reshape((1, seq_length, 1))

# Predict the next 6 months
predicted_sales = model.predict(last_sequence)

# Clip negative values to 0
predicted_sales = np.clip(predicted_sales, 0, None)

# Inverse transform the predictions to get actual sales values
predicted_sales = scaler.inverse_transform(predicted_sales.reshape(-1, 1)).flatten()

# Generate the dates for the next 6 months
predicted_dates = [maxDate + timedelta(days=i) for i in range(1, pred_length + 1)]

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({
    'Date': predicted_dates,
    'Predicted_Sales': predicted_sales
})

# Convert 'Date' column to datetime
predictions_df['Date'] = pd.to_datetime(predictions_df['Date'])

# Aggregate predictions by month
predictions_df['YearMonth'] = predictions_df['Date'].dt.to_period('M')  # Extract year-month
monthly_predictions = predictions_df.groupby('YearMonth')['Predicted_Sales'].sum().reset_index()

# Convert YearMonth back to string for better readability
monthly_predictions['YearMonth'] = monthly_predictions['YearMonth'].astype(str)

print(monthly_predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
  YearMonth  Predicted_Sales
0   2024-03       12759434.0
1   2024-04       15174281.0
2   2024-05       10402098.0
3   2024-06        6792693.5
4   2024-07       10741728.0
5   2024-08        5615401.0
