In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.appName('sparkApp').getOrCreate()

In [11]:
df = spark.read.parquet("../../data/processed/cleanedData.parquet")
unique_values_df = df.select("Product_Type").distinct()
print(unique_values_df.count())
df.show()

33
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      

In [4]:
df = df.groupBy("Product_Type", "Date").agg(F.sum(F.col('Total_Purchases')).alias("Total_Purchases")).orderBy("Date", "Product_Type")
df.show(50)

+--------------------+----------+---------------+
|        Product_Type|      Date|Total_Purchases|
+--------------------+----------+---------------+
|            Bathroom|2001-01-24|             95|
|             Bedding|2001-01-24|             59|
|         BlueStar AC|2001-01-24|             28|
|          Children's|2001-01-24|             64|
|           Chocolate|2001-01-24|             48|
|              Coffee|2001-01-24|             85|
|         Decorations|2001-01-24|            185|
|               Dress|2001-01-24|             47|
|             Fiction|2001-01-24|            253|
|              Fridge|2001-01-24|            131|
|           Furniture|2001-01-24|            175|
|          Headphones|2001-01-24|             69|
|              Jacket|2001-01-24|             72|
|               Jeans|2001-01-24|            144|
|               Juice|2001-01-24|            176|
|             Kitchen|2001-01-24|             50|
|              Laptop|2001-01-24|             94|


In [56]:
df.createOrReplaceTempView("sales_data")

In [101]:
minMaxDates = spark.sql("""
    SELECT 
        Product_Category, 
        MIN(Date) AS min_date, 
        MAX(Date) AS max_date 
    FROM sales_data 
    GROUP BY Product_Category
""")
minMaxDates.createOrReplaceTempView("minMaxDates")


dateSeries = spark.sql("""
    SELECT 
        Product_Category, 
        date_add(min_date, idx) AS Date
    FROM (
        SELECT 
            Product_Category, 
            min_date, 
            max_date, 
            posexplode(
                split(space(datediff(max_date, min_date)), ' ')
            ) AS (idx, _)
        FROM minMaxDates
    )
""")
dateSeries.createOrReplaceTempView("dateSeries")


dfFilled = spark.sql("""
    SELECT 
        ds.Product_Category, 
        ds.Date, 
        COALESCE(sd.Total_Purchases, 0) AS Total_Purchases
    FROM dateSeries ds
    LEFT JOIN sales_data sd
    ON ds.Product_Category = sd.Product_Category AND ds.Date = sd.Date
""")
dfFilled.createOrReplaceTempView("filled_data")


dfFilled.show()


+----------------+----------+---------------+
|Product_Category|      Date|Total_Purchases|
+----------------+----------+---------------+
|         Grocery|2001-01-24|            882|
|         Grocery|2001-01-25|              0|
|         Grocery|2001-01-26|              0|
|         Grocery|2001-01-27|              0|
|         Grocery|2001-01-28|              0|
|         Grocery|2001-01-29|              0|
|         Grocery|2001-01-30|              0|
|         Grocery|2001-01-31|              0|
|         Grocery|2001-02-01|              0|
|         Grocery|2001-02-02|              0|
|         Grocery|2001-02-03|              0|
|         Grocery|2001-02-04|              0|
|         Grocery|2001-02-05|              0|
|         Grocery|2001-02-06|              0|
|         Grocery|2001-02-07|              0|
|         Grocery|2001-02-08|              0|
|         Grocery|2001-02-09|              0|
|         Grocery|2001-02-10|              0|
|         Grocery|2001-02-11|     

In [14]:
import pandas as pd
from prophet import Prophet

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [103]:
def trainProphetModel(dfFilled):
    dfFilled['Date'] = pd.to_datetime(dfFilled['Date'])
    models = {}
    for category in dfFilled['Product_Category'].unique():
        categoryData = dfFilled[dfFilled['Product_Category'] == category]
        prophetData = categoryData[['Date', 'Total_Purchases']].rename(columns={'Date': 'ds', 'Total_Purchases': 'y'})
        model = Prophet()
        model.fit(prophetData)
        models[category] = model
    return models

In [None]:
def predictNext30And7Days(models, dfFilled):
    timeSeriesData['Date'] = pd.to_datetime(timeSeriesData['Date'])
    predictions = {}
    for category, model in models.items():
        categoryData = timeSeriesData[timeSeriesData['Product_Category'] == category]
        prophetData = categoryData[['Date', 'Total_Purchases']].rename(columns={'Date': 'ds', 'Total_Purchases': 'y'})
        lastDate = prophetData['ds'].max()
        future = model.make_future_dataframe(periods=30, include_history=False)
        future = future[future['ds'] > lastDate]
        forecast = model.predict(future)
        predictions[category] = forecast[['ds', 'yhat']].assign(Product_Category=category)
    
    allPredictions = pd.concat(predictions.values())
    totalSales30Days = allPredictions.groupby('Product_Category')['yhat'].sum().round().astype(int).reset_index()
    totalSales30Days.columns = ['Product_Category', 'Total_Predicted_Sales_30Days']
    
    next7Days = allPredictions[allPredictions['ds'] <= (allPredictions['ds'].min() + pd.Timedelta(days=6))]
    totalSales7Days = next7Days.groupby('Product_Category')['yhat'].sum().round().astype(int).reset_index()
    totalSales7Days.columns = ['Product_Category', 'Total_Predicted_Sales_7Days']
    
    return allPredictions, totalSales30Days, totalSales7Days

In [105]:
dfFilled = dfFilled.toPandas()
models = trainProphetModel(dfFilled)
predictions, totalSales30Days, totalSales7Days = predictNext30And7Days(models, dfFilled)

print("\nTotal predicted sales for the next 30 days:")
print(totalSales30Days)
print("\nTotal predicted sales for the next 7 days:")
print(totalSales7Days)

11:46:05 - cmdstanpy - INFO - Chain [1] start processing
11:46:06 - cmdstanpy - INFO - Chain [1] done processing
11:46:07 - cmdstanpy - INFO - Chain [1] start processing
11:46:09 - cmdstanpy - INFO - Chain [1] done processing
11:46:09 - cmdstanpy - INFO - Chain [1] start processing
11:46:10 - cmdstanpy - INFO - Chain [1] done processing
11:46:11 - cmdstanpy - INFO - Chain [1] start processing
11:46:12 - cmdstanpy - INFO - Chain [1] done processing
11:46:13 - cmdstanpy - INFO - Chain [1] start processing
11:46:14 - cmdstanpy - INFO - Chain [1] done processing



Total predicted sales for the next 30 days:
  Product_Category  Total_Predicted_Sales_30Days
0            Books                          9325
1         Clothing                          9218
2      Electronics                         12000
3          Grocery                         11004
4       Home Decor                          9121

Total predicted sales for the next 7 days:
  Product_Category  Total_Predicted_Sales_7Days
0            Books                         2155
1         Clothing                         2122
2      Electronics                         2763
3          Grocery                         2542
4       Home Decor                         2103


In [None]:
allDates = df.select("Date").distinct()
allProducts = df.select("Product_Type").distinct()

completeDf = allDates.crossJoin(allProducts)
dfComplete = completeDf.join(df, on=["Date", "Product_Type"], how="left").na.fill(0)
pandas_df = df_complete.toPandas()

In [None]:

def forecast_sales(product_df, periods):
    product_df = product_df.rename(columns={'Date': 'ds', 'Total_Purchases': 'y'})
    model = Prophet()
    model.fit(product_df)
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)
    return forecast[['ds', 'yhat']]


product_types = pandas_df['Product_Type'].unique()


forecasts_7d = {}
forecasts_30d = {}

for product in product_types:
    product_df = pandas_df[pandas_df['Product_Type'] == product]
    forecast_7d = forecast_sales(product_df, periods=7)
    forecast_30d = forecast_sales(product_df, periods=30)
    forecasts_7d[product] = forecast_7d
    forecasts_30d[product] = forecast_30d

14:05:16 - cmdstanpy - INFO - Chain [1] start processing
14:05:18 - cmdstanpy - INFO - Chain [1] done processing
14:05:19 - cmdstanpy - INFO - Chain [1] start processing
14:05:21 - cmdstanpy - INFO - Chain [1] done processing
14:05:23 - cmdstanpy - INFO - Chain [1] start processing
14:05:28 - cmdstanpy - INFO - Chain [1] done processing
14:05:33 - cmdstanpy - INFO - Chain [1] start processing
14:05:39 - cmdstanpy - INFO - Chain [1] done processing
14:05:39 - cmdstanpy - INFO - Chain [1] start processing
14:05:40 - cmdstanpy - INFO - Chain [1] done processing
14:05:41 - cmdstanpy - INFO - Chain [1] start processing
14:05:42 - cmdstanpy - INFO - Chain [1] done processing
14:05:42 - cmdstanpy - INFO - Chain [1] start processing
14:05:43 - cmdstanpy - INFO - Chain [1] done processing
14:05:43 - cmdstanpy - INFO - Chain [1] start processing
14:05:44 - cmdstanpy - INFO - Chain [1] done processing
14:05:45 - cmdstanpy - INFO - Chain [1] start processing
14:05:45 - cmdstanpy - INFO - Chain [1]

In [None]:
def get_top_and_lowest_products(forecasts):
    total_sales = {}
    for product, forecast in forecasts.items():
        total_sales[product] = forecast['yhat'].sum()
    
    sorted_highest = sorted(total_sales.items(), key=lambda x: x[1], reverse=True)
    top_3_highest = [product for product, sales in sorted_highest[:3]]
    
    sorted_lowest = sorted(total_sales.items(), key=lambda x: x[1])
    top_3_lowest = [product for product, sales in sorted_lowest[:3]]
    
    return top_3_highest, top_3_lowest

top_3_highest_7d, top_3_lowest_7d = get_top_and_lowest_products(forecasts_7d)
top_3_highest_30d, top_3_lowest_30d = get_top_and_lowest_products(forecasts_30d)

print("Top 3 highest-selling products for the next 7 days:", top_3_highest_7d)
print("Top 3 lowest-selling products for the next 7 days:", top_3_lowest_7d)
print("Top 3 highest-selling products for the next 30 days:", top_3_highest_30d)
print("Top 3 lowest-selling products for the next 30 days:", top_3_lowest_30d)

Top 3 highest-selling products for the next 7 days: ['Shorts', 'Tools', 'T-shirt']
Top 3 lowest-selling products for the next 7 days: ["Children's", 'BlueStar AC', 'Jacket']
Top 3 highest-selling products for the next 30 days: ['Shorts', 'Tools', 'T-shirt']
Top 3 lowest-selling products for the next 30 days: ["Children's", 'BlueStar AC', 'Thriller']
