In [None]:
# Updated version of cumulative method originally developed by Sarah Kim

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
import sqlite3

In [2]:
# Load the retail pricing CSV into a DataFrame.
retail_df_csv = pd.read_csv(filepath_or_buffer="Resources/retail_price_cleaned.csv")

# Print the column names, non-null counts and dtypes of the loaded frame.
retail_df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   product_id             676 non-null    object 
 1   product_category_name  676 non-null    object 
 2   month_year             676 non-null    object 
 3   qty_sold               676 non-null    int64  
 4   total_price            676 non-null    float64
 5   freight_price          676 non-null    float64
 6   unit_price             676 non-null    float64
 7   product_rating         676 non-null    float64
 8   no_customers           676 non-null    int64  
 9   month                  676 non-null    int64  
 10  year                   676 non-null    int64  
 11  seasonality            676 non-null    float64
 12  volume                 676 non-null    int64  
 13  comp1_price            676 non-null    float64
 14  comp1_prod_rating      676 non-null    float64
 15  comp1_

In [3]:
# Connect to the SQLite database file used by this notebook.
conn = sqlite3.connect("Resources/pricing_opt.db")
# Store the CSV data as the table "retail": any existing table of that name is
# replaced, and the DataFrame index is not written as a column.
retail_df_csv.to_sql(name='retail', con=conn, if_exists='replace', index=False)

In [4]:
# Read the whole "retail" table back from SQLite into a DataFrame.
query = "SELECT * FROM retail;"
try:
    retail_df = pd.read_sql(query, conn)
finally:
    # Close the connection even when the read raises, so a failed run does not
    # leave the database connection open in the kernel.
    conn.close()
retail_df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
0,bed1,bed_bath_table,01-05-2017,1,45.95,15.1,45.95,4.0,57,5,...,89.9,3.9,15.011897,215.0,4.4,8.76,45.95,4.0,15.1,45.9
1,bed1,bed_bath_table,01-06-2017,3,137.85,12.933333,45.95,4.0,61,6,...,89.9,3.9,14.769216,209.0,4.4,21.322,45.95,4.0,12.933333,45.95
2,bed1,bed_bath_table,01-07-2017,6,275.7,14.84,45.95,4.0,123,7,...,89.9,3.9,13.993833,205.0,4.4,22.195932,45.95,4.0,14.84,45.95
3,bed1,bed_bath_table,01-08-2017,4,183.8,14.2875,45.95,4.0,90,8,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.2875,45.95
4,bed1,bed_bath_table,01-09-2017,2,91.9,15.1,45.95,4.0,54,9,...,89.9,3.9,18.776522,163.39871,4.4,24.324687,45.95,4.0,15.1,45.95


In [5]:
# Show the column labels of the frame that was read back from the "retail" table.
retail_df.columns

Index(['product_id', 'product_category_name', 'month_year', 'qty_sold',
       'total_price', 'freight_price', 'unit_price', 'product_rating',
       'no_customers', 'month', 'year', 'seasonality', 'volume', 'comp1_price',
       'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
       'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
       'comp3_prod_rating', 'comp3_freight_price', 'lag_price'],
      dtype='object')

In [6]:
# Derive a first-of-month timestamp from the 'year' and 'month' columns
# (day is fixed to 1), store it as 'date', and order the rows by it.
retail_df = (
    retail_df
    .assign(date=pd.to_datetime(retail_df[['year', 'month']].assign(day=1)))
    .sort_values('date')
)

retail_df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price,date
389,health7,health_beauty,01-01-2017,1,64.99,11.06,64.99,3.9,9,1,...,3.9,11.06,64.99,3.9,11.06,64.99,3.9,11.06,64.94,2017-01-01
339,health5,health_beauty,01-01-2017,8,2799.2,22.90125,349.9,4.3,9,1,...,4.3,22.90125,349.9,4.3,22.90125,64.99,3.9,11.06,349.85,2017-01-01
438,bed2,bed_bath_table,01-02-2017,2,179.8,13.02,89.9,3.9,2,2,...,3.9,13.02,89.9,3.9,13.02,89.9,3.9,13.02,89.85,2017-02-01
236,garden8,garden_tools,01-02-2017,1,179.99,33.54,179.99,4.2,3,2,...,4.2,33.54,179.99,4.2,33.54,179.99,4.2,33.54,179.94,2017-02-01
58,health9,health_beauty,01-02-2017,11,219.89,11.750909,19.99,4.3,19,2,...,4.3,11.750909,19.99,4.3,11.750909,64.99,3.9,15.348,19.94,2017-02-01


In [7]:
# Expanding-window ("cumulative") training settings.
init_train_length = 5   # rows in the first training window
expand_size = 1         # rows added to the training window on every loop cycle
test_length = 1         # rows in each test window

# Result containers: table rows, per-fit MSE values, predicted unit prices.
table_data, mse_scores, predicted_unit_prices = [], [], []

# One group of rows per product_id.
grouped = retail_df.groupby('product_id')

In [8]:

# Expanding-window training of one linear model per product.
#
# For every product the model is first fitted on init_train_length rows and
# evaluated on the next test_length rows; the training window then grows by
# expand_size rows and the process repeats until the product's rows run out.

# Reset the accumulators here so that re-running this cell on its own does not
# append a second copy of every row to the results of a previous run.
table_data = []
mse_scores = []
predicted_unit_prices = []

# Columns used as model inputs; 'unit_price' is the target.
# NOTE(review): 'total_price' and 'qty_sold' are same-month inputs, and in the
# rows displayed by retail_df.head() total_price == unit_price * qty_sold, so
# the target may be recoverable from the features -- confirm this is intended.
feature_columns = ['qty_sold', 'total_price', 'freight_price', 'product_rating',
                   'no_customers', 'seasonality', 'volume', 'comp1_price',
                   'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                   'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                   'comp3_prod_rating', 'comp3_freight_price', 'lag_price']

for group_key, group_data in grouped:
    # Extract the group's features and target variable
    features = group_data[feature_columns]
    target = group_data['unit_price']

    # debug
    print(f"Product {group_key} - Features Length: {len(features)}, Target Length: {len(target)}")

    # Skip products that cannot supply one training window plus one test window.
    if len(features) < init_train_length + test_length:
        print(f"Product {group_key} - Not enough data for cumulative training")
        continue

    i = 0       # i is the number of cumulative trainings done for this product
    while init_train_length + test_length + i * expand_size <= len(features):
        # Define the periods for training and testing
        train_end = init_train_length + i * expand_size
        test_start = train_end
        test_end = train_end + test_length
        print(f"i = {i}, train_end = {train_end}, test_start = {test_start}, test_end = {test_end}")

        # Positional slices: rows are taken in the order they appear in the
        # group, the training window always starting at the group's first row.
        features_train = features.iloc[0:train_end]
        target_train = target.iloc[0:train_end]
        features_test = features.iloc[test_start:test_end]
        target_test = target.iloc[test_start:test_end]

        # Train the model
        # NOTE(review): while train_end is smaller than the number of feature
        # columns (17) the fit has fewer rows than features -- confirm the
        # early-period predictions are meaningful.
        model = LinearRegression()
        model.fit(features_train, target_train)

        # Make predictions
        y_pred = model.predict(features_test)

        # Evaluate the predictions using Mean Squared Error
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)

        # Only the first predicted row of the test window is recorded.
        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)

        print(f"Product {group_key}, Period {i} - MSE: {mse:.2f}, Predicted Unit Price: {predicted_unit_price}")
        # "Sample" in the table is 1-based, whereas "Period" printed above is 0-based.
        table_data.append([group_key, i + 1, predicted_unit_price, mse])
        i += 1

# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))

# Both averages are taken over every prediction made for every product:
# predicted_unit_prices is no longer cleared per product, so the average price
# covers the same set of fits as the average MSE.
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")


Product bed1 - Features Length: 16, Target Length: 16
i = 0, train_end = 5, test_start = 5, test_end = 6
Product bed1, Period 0 - MSE: 0.00, Predicted Unit Price: 45.95
i = 1, train_end = 6, test_start = 6, test_end = 7
Product bed1, Period 1 - MSE: 20.85, Predicted Unit Price: 45.09824123767232
i = 2, train_end = 7, test_start = 7, test_end = 8
Product bed1, Period 2 - MSE: 3.85, Predicted Unit Price: 41.95176167518695
i = 3, train_end = 8, test_start = 8, test_end = 9
Product bed1, Period 3 - MSE: 7.55, Predicted Unit Price: 42.73724953155537
i = 4, train_end = 9, test_start = 9, test_end = 10
Product bed1, Period 4 - MSE: 0.95, Predicted Unit Price: 40.964100279889095
i = 5, train_end = 10, test_start = 10, test_end = 11
Product bed1, Period 5 - MSE: 0.00, Predicted Unit Price: 40.03180072427811
i = 6, train_end = 11, test_start = 11, test_end = 12
Product bed1, Period 6 - MSE: 0.52, Predicted Unit Price: 39.26888157185424
i = 7, train_end = 12, test_start = 12, test_end = 13
Produc

Product cool3, Period 0 - MSE: 720.35, Predicted Unit Price: 38.16067011482375
i = 1, train_end = 6, test_start = 6, test_end = 7
Product cool3, Period 1 - MSE: 16.86, Predicted Unit Price: 60.89419825704608
Product cool4 - Features Length: 9, Target Length: 9
i = 0, train_end = 5, test_start = 5, test_end = 6
Product cool4, Period 0 - MSE: 2579.50, Predicted Unit Price: 116.20119204745521
i = 1, train_end = 6, test_start = 6, test_end = 7
Product cool4, Period 1 - MSE: 501.37, Predicted Unit Price: 154.59871352154977
i = 2, train_end = 7, test_start = 7, test_end = 8
Product cool4, Period 2 - MSE: 2089.71, Predicted Unit Price: 128.09485743688683
i = 3, train_end = 8, test_start = 8, test_end = 9
Product cool4, Period 3 - MSE: 124.06, Predicted Unit Price: 158.85159442096392
Product cool5 - Features Length: 13, Target Length: 13
i = 0, train_end = 5, test_start = 5, test_end = 6
Product cool5, Period 0 - MSE: 0.00, Predicted Unit Price: 99.99
i = 1, train_end = 6, test_start = 6, test

Product garden6, Period 7 - MSE: 0.00, Predicted Unit Price: 53.41515152000017
i = 8, train_end = 13, test_start = 13, test_end = 14
Product garden6, Period 8 - MSE: 0.00, Predicted Unit Price: 49.899999999999665
i = 9, train_end = 14, test_start = 14, test_end = 15
Product garden6, Period 9 - MSE: 0.00, Predicted Unit Price: 49.909999999999705
i = 10, train_end = 15, test_start = 15, test_end = 16
Product garden6, Period 10 - MSE: 0.00, Predicted Unit Price: 49.909999999999734
Product garden7 - Features Length: 16, Target Length: 16
i = 0, train_end = 5, test_start = 5, test_end = 6
Product garden7, Period 0 - MSE: 0.00, Predicted Unit Price: 59.9
i = 1, train_end = 6, test_start = 6, test_end = 7
Product garden7, Period 1 - MSE: 73.58, Predicted Unit Price: 59.9
i = 2, train_end = 7, test_start = 7, test_end = 8
Product garden7, Period 2 - MSE: 1.46, Predicted Unit Price: 55.77054658969668
i = 3, train_end = 8, test_start = 8, test_end = 9
Product garden7, Period 3 - MSE: 8.26, Predi

Product health5, Period 1 - MSE: 0.00, Predicted Unit Price: 349.90000000000003
i = 2, train_end = 7, test_start = 7, test_end = 8
Product health5, Period 2 - MSE: 0.00, Predicted Unit Price: 349.90000000000003
i = 3, train_end = 8, test_start = 8, test_end = 9
Product health5, Period 3 - MSE: 0.00, Predicted Unit Price: 349.9
i = 4, train_end = 9, test_start = 9, test_end = 10
Product health5, Period 4 - MSE: 0.00, Predicted Unit Price: 349.9
i = 5, train_end = 10, test_start = 10, test_end = 11
Product health5, Period 5 - MSE: 770.82, Predicted Unit Price: 349.9
i = 6, train_end = 11, test_start = 11, test_end = 12
Product health5, Period 6 - MSE: 152.93, Predicted Unit Price: 337.533519977638
i = 7, train_end = 12, test_start = 12, test_end = 13
Product health5, Period 7 - MSE: 20.52, Predicted Unit Price: 345.36989103868643
i = 8, train_end = 13, test_start = 13, test_end = 14
Product health5, Period 8 - MSE: 0.00, Predicted Unit Price: 349.899999999974
i = 9, train_end = 14, test_

Product watches6, Period 2 - MSE: 9.80, Predicted Unit Price: 131.37202093549575
i = 3, train_end = 8, test_start = 8, test_end = 9
Product watches6, Period 3 - MSE: 1.22, Predicted Unit Price: 118.54745903145452
i = 4, train_end = 9, test_start = 9, test_end = 10
Product watches6, Period 4 - MSE: 22.52, Predicted Unit Price: 100.9026815264162
i = 5, train_end = 10, test_start = 10, test_end = 11
Product watches6, Period 5 - MSE: 0.21, Predicted Unit Price: 103.33449816416575
i = 6, train_end = 11, test_start = 11, test_end = 12
Product watches6, Period 6 - MSE: 0.00, Predicted Unit Price: 119.04915717500796
i = 7, train_end = 12, test_start = 12, test_end = 13
Product watches6, Period 7 - MSE: 0.00, Predicted Unit Price: 118.36363640000054
i = 8, train_end = 13, test_start = 13, test_end = 14
Product watches6, Period 8 - MSE: 0.00, Predicted Unit Price: 111.99999999999997
Product watches7 - Features Length: 12, Target Length: 12
i = 0, train_end = 5, test_start = 5, test_end = 6
Produ

In [9]:
# Save every per-sample prediction together with its MSE.
table_columns = ["Product ID", "Sample", "Predicted Price", "MSE"]
table_df = pd.DataFrame(data=table_data, columns=table_columns)
table_df.to_csv(path_or_buf="Output/SW_predicted_prices_mse_cumulative.csv", index=False)

In [10]:
from collections import defaultdict

# Gather every (predicted price, MSE, sample number) triple per product.
# These are individual per-sample results, not averages; the container keeps
# its original name so that anything else referring to it still works.
# The loop variables are deliberately not called avg_mse / group_key, so the
# overall average MSE computed by the training cell is not overwritten here.
product_avg_predicted_prices = defaultdict(list)
for product_id, sample_no, sample_price, sample_mse in table_data:
    product_avg_predicted_prices[product_id].append((sample_price, sample_mse, sample_no))

# For each product keep the sample whose test MSE is lowest (the earliest
# sample wins when several share the minimum).
# NOTE(review): the "optimal price" reported here is the model's predicted price
# for the best-fitting sample, not the result of a price optimisation --
# confirm that this label is what is intended.
optimal_prices = {
    product_id: min(samples, key=lambda sample: sample[1])
    for product_id, samples in product_avg_predicted_prices.items()
}

# Prepare the final table data: one [product, price, MSE, sample] row per product
optimal_table_data = [
    [product_id, best_price, best_mse, best_sample]
    for product_id, (best_price, best_mse, best_sample) in optimal_prices.items()
]

# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))


Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    45.95       0.00                      1
bed2                    89.90       0.00                      1
bed3                    84.89       0.00                      4
bed4                    47.76       0.02                      2
computers1              98.72       0.03                      1
computers2              77.03       0.76                      5
computers3             135.98       3.67                      1
computers4             119.99       0.00                     13
computers5              91.69      13.59                      1
computers6             149.94       0.00                      1
consoles1               36.20       0.00                      7
consoles2               33.42       0.46                      4
cool1                   99.99       0.00                      6
cool2                  129.99       0.00

In [11]:
# Save the one-row-per-product table of lowest-MSE predictions.
# NOTE(review): the "RF_" file prefix differs from the "SW_" prefix used for
# the per-sample CSV -- confirm the file name is intentional.
optimal_columns = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
optimal_table_df = pd.DataFrame(data=optimal_table_data, columns=optimal_columns)
optimal_table_df.to_csv(path_or_buf="Output/RF_optimal_prices_cumulative.csv", index=False)