## Packages needed for ML

In [1]:
#pip install tabulate

In [2]:
#pip install seaborn

In [3]:
#pip install xgboost

## Import Modules

In [4]:
import sqlite3

In [5]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

## Importing data and creating SQLite database

In [6]:
# Load the cleaned retail data (NOTE: this file is the output of the ETL step)
retail_df_csv = pd.read_csv(filepath_or_buffer="Resources/retail_price_cleaned.csv")

In [7]:
# Connect to the SQLite database file used by this notebook
conn = sqlite3.connect("Resources/pricing_opt.db")
# Write the dataframe to table "retail", replacing the table if it already exists;
# the dataframe index is not stored
retail_df_csv.to_sql(name='retail', con=conn, if_exists='replace', index=False)

In [8]:
# Read the whole "retail" table back into a dataframe, then release the connection
query = "SELECT * FROM retail;"
retail_df = pd.read_sql(sql=query, con=conn)
conn.close()
retail_df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
0,bed1,bed_bath_table,01-05-2017,1,45.95,15.1,45.95,4.0,57,5,...,89.9,3.9,15.011897,215.0,4.4,8.76,45.95,4.0,15.1,45.9
1,bed1,bed_bath_table,01-06-2017,3,137.85,12.933333,45.95,4.0,61,6,...,89.9,3.9,14.769216,209.0,4.4,21.322,45.95,4.0,12.933333,45.95
2,bed1,bed_bath_table,01-07-2017,6,275.7,14.84,45.95,4.0,123,7,...,89.9,3.9,13.993833,205.0,4.4,22.195932,45.95,4.0,14.84,45.95
3,bed1,bed_bath_table,01-08-2017,4,183.8,14.2875,45.95,4.0,90,8,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.2875,45.95
4,bed1,bed_bath_table,01-09-2017,2,91.9,15.1,45.95,4.0,54,9,...,89.9,3.9,18.776522,163.39871,4.4,24.324687,45.95,4.0,15.1,45.95


In [13]:
# retail_df = pd.read_csv("Resources/retail_price_cleaned.csv")
#retail_df.info()


In [10]:
# Show the column labels of the dataframe read back from the "retail" table
retail_df.columns

Index(['product_id', 'product_category_name', 'month_year', 'qty_sold',
       'total_price', 'freight_price', 'unit_price', 'product_rating',
       'no_customers', 'month', 'year', 'seasonality', 'volume', 'comp1_price',
       'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
       'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
       'comp3_prod_rating', 'comp3_freight_price', 'lag_price'],
      dtype='object')

In [11]:
# Add a 'date' column (first day of each row's year/month) and order the rows by it
retail_df = (
    retail_df
    .assign(date=lambda frame: pd.to_datetime(frame[['year', 'month']].assign(day=1)))
    .sort_values(by='date')
)

retail_df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price,date
389,health7,health_beauty,01-01-2017,1,64.99,11.06,64.99,3.9,9,1,...,3.9,11.06,64.99,3.9,11.06,64.99,3.9,11.06,64.94,2017-01-01
339,health5,health_beauty,01-01-2017,8,2799.2,22.90125,349.9,4.3,9,1,...,4.3,22.90125,349.9,4.3,22.90125,64.99,3.9,11.06,349.85,2017-01-01
438,bed2,bed_bath_table,01-02-2017,2,179.8,13.02,89.9,3.9,2,2,...,3.9,13.02,89.9,3.9,13.02,89.9,3.9,13.02,89.85,2017-02-01
236,garden8,garden_tools,01-02-2017,1,179.99,33.54,179.99,4.2,3,2,...,4.2,33.54,179.99,4.2,33.54,179.99,4.2,33.54,179.94,2017-02-01
58,health9,health_beauty,01-02-2017,11,219.89,11.750909,19.99,4.3,19,2,...,4.3,11.750909,19.99,4.3,11.750909,64.99,3.9,15.348,19.94,2017-02-01


# Linear Regression Model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Number of consecutive rows (per product) used to fit each model
window_size = 5

# Predictor columns; the target is 'unit_price'.
# NOTE(review): in the retail_df.head() output earlier in this notebook,
# total_price / qty_sold equals unit_price, so these two features may leak the
# target -- confirm before relying on the reported errors.
feature_columns = ['qty_sold', 'total_price', 'freight_price', 'product_rating',
                   'no_customers', 'seasonality', 'volume', 'comp1_price',
                   'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                   'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                   'comp3_prod_rating', 'comp3_freight_price', 'lag_price']

# Rows inside each group keep the order they have in retail_df, so the windows
# below follow the 'date' ordering applied earlier in the notebook.
grouped = retail_df.groupby('product_id')

table_data = []             # rows of [product_id, sample, predicted price, mse]
mse_scores = []             # squared error of every prediction, all products
predicted_unit_prices = []  # every predicted unit price, all products

for group_key, group_data in grouped:
    # set features and target variable
    features = group_data[feature_columns]
    target = group_data['unit_price']
    num_samples = len(features)

    # predicted_unit_prices is deliberately NOT reset per product: it accumulates
    # across all products so the average printed at the end covers every
    # prediction rather than only those of the last product iterated.

    # Sliding window: fit on rows [i - window_size, i) and predict row i.
    # Products with window_size rows or fewer produce no predictions.
    for i in range(window_size, num_samples):
        features_train = features.iloc[i - window_size:i]
        target_train = target.iloc[i - window_size:i]
        features_test = features.iloc[i:i + 1]
        target_test = target.iloc[i:i + 1]

        # Train the model
        model = LinearRegression()
        model.fit(features_train, target_train)

        # Make predictions
        y_pred = model.predict(features_test)

        # With a single test row the MSE is the squared error of that prediction
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)

        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)

        # 'Sample' is the 1-based position of the predicted row within the product
        table_data.append([group_key, i + 1, predicted_unit_price, mse])

# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))

# Summary statistics over every prediction made above
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")


        
#         print(f"Product {group_key}, Sample {i+1} - Predicted Price: {predicted_unit_price:.2f}, MSE: {mse:.2f}")

# avg_mse = np.mean(mse_scores)
# avg_predicted_unit_price = np.mean(predicted_unit_prices)
# print(f"Average MSE: {avg_mse:.2f}")
# print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")

Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 6              45.95      0.00
bed1                 7              45.95     29.36
bed1                 8              41.98      3.98
bed1                 9              22.08    320.69
bed1                10              40.42      0.18
bed1                11              39.32      0.45
bed1                12              39.89      0.01
bed1                13              39.99      0.00
bed1                14              39.99      0.00
bed1                15              39.99      0.00
bed1                16              39.99      0.56
bed2                 6              89.90      0.00
bed2                 7              89.90      0.00
bed2                 8              89.90      0.00
bed2                 9              89.90      0.00
bed2                10              89.90      0.00
bed2                11              89.90      1.99
bed2        

In [14]:
from collections import defaultdict

# Group the individual predictions by product.
# Each table_data row is [product_id, sample, predicted_price, mse]; the stored
# tuples are (predicted_price, mse, sample). Despite its name, this mapping
# holds per-sample values, not averages.
# The loop variables use their own names so the notebook-level `avg_mse`
# summary computed by the model cell is not overwritten.
product_avg_predicted_prices = defaultdict(list)
for product_id, sample_no, predicted_price, sample_mse in table_data:
    product_avg_predicted_prices[product_id].append((predicted_price, sample_mse, sample_no))

# For each product keep the prediction with the smallest error as its
# "optimal" price. min() returns the first minimum, so ties go to the
# earliest sample. Values are (optimal_price, min_mse, min_mse_sample).
optimal_prices = {
    product_id: min(candidates, key=lambda candidate: candidate[1])
    for product_id, candidates in product_avg_predicted_prices.items()
}

# Prepare the final table data for optimal prices
optimal_table_data = [
    [product_id, best_price, best_mse, best_sample]
    for product_id, (best_price, best_mse, best_sample) in optimal_prices.items()
]

# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))


Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    45.95       0.00                      6
bed2                    89.90       0.00                      6
bed3                    85.01       0.01                      9
bed4                    48.18       0.08                      7
computers1              98.71       0.04                      6
computers2              78.22       0.10                      9
computers3             137.33       0.33                      7
computers4             155.10       0.16                      8
computers5              91.69      13.59                      6
computers6             149.90       0.00                      7
consoles1               36.49       0.08                     11
consoles2               33.05       0.09                      9
cool1                   99.99       0.00                      7
cool2                  129.99       0.00

In [15]:
# Export the per-sample predictions and errors held in table_data to CSV
table_df_1 = pd.DataFrame(
    data=table_data,
    columns=["Product ID", "Sample", "Predicted Price", "MSE"],
)
table_df_1.to_csv(path_or_buf="Output/SW_predicted_prices_mse.csv", index=False)

In [16]:
# Export the per-product optimal prices held in optimal_table_data to CSV
optimal_table_df_1 = pd.DataFrame(
    data=optimal_table_data,
    columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"],
)
optimal_table_df_1.to_csv(path_or_buf="Output/SW_optimal_prices.csv", index=False)

# Random Forest Model

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Number of consecutive rows (per product) used to fit each model
window_size = 5

# Predictor columns; the target is 'unit_price'.
# NOTE(review): in the retail_df.head() output earlier in this notebook,
# total_price / qty_sold equals unit_price, so these two features may leak the
# target -- confirm before relying on the reported errors.
feature_columns = ['qty_sold', 'total_price', 'freight_price', 'product_rating',
                   'no_customers', 'seasonality', 'volume', 'comp1_price',
                   'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                   'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                   'comp3_prod_rating', 'comp3_freight_price', 'lag_price']

# Rows inside each group keep the order they have in retail_df, so the windows
# below follow the 'date' ordering applied earlier in the notebook.
grouped = retail_df.groupby('product_id')

table_data = []             # rows of [product_id, sample, predicted price, mse]
mse_scores = []             # squared error of every prediction, all products
predicted_unit_prices = []  # every predicted unit price, all products

for group_key, group_data in grouped:
    # set features and target variable
    features = group_data[feature_columns]
    target = group_data['unit_price']
    num_samples = len(features)

    # predicted_unit_prices is deliberately NOT reset per product: it accumulates
    # across all products so the average printed at the end covers every
    # prediction rather than only those of the last product iterated.

    # Sliding window: fit on rows [i - window_size, i) and predict row i.
    # Products with window_size rows or fewer produce no predictions.
    for i in range(window_size, num_samples):
        features_train = features.iloc[i - window_size:i]
        target_train = target.iloc[i - window_size:i]
        features_test = features.iloc[i:i + 1]
        target_test = target.iloc[i:i + 1]

        # Train the model (fixed random_state keeps the forest reproducible)
        model = RandomForestRegressor(n_estimators=50, random_state=78)
        model.fit(features_train, target_train)

        # Make predictions
        y_pred = model.predict(features_test)

        # With a single test row the MSE is the squared error of that prediction
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)

        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)

        # 'Sample' is the 1-based position of the predicted row within the product
        table_data.append([group_key, i + 1, predicted_unit_price, mse])

# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))

# Summary statistics over every prediction made above
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")


        
#         print(f"Product {group_key}, Sample {i+1} - Predicted Price: {predicted_unit_price:.2f}, MSE: {mse:.2f}")

# avg_mse = np.mean(mse_scores)
# avg_predicted_unit_price = np.mean(predicted_unit_prices)
# print(f"Average MSE: {avg_mse:.2f}")
# print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")

Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 6              45.95      0.00
bed1                 7              45.95     29.36
bed1                 8              45.73     32.99
bed1                 9              41.07      1.17
bed1                10              41.96      3.89
bed1                11              40.26      0.07
bed1                12              40.09      0.01
bed1                13              39.99      0.00
bed1                14              39.99      0.00
bed1                15              39.99      0.00
bed1                16              39.99      0.56
bed2                 6              89.90      0.00
bed2                 7              89.90      0.00
bed2                 8              89.90      0.00
bed2                 9              89.90      0.00
bed2                10              89.90      0.00
bed2                11              89.90      1.99
bed2        

In [18]:
# Group the individual predictions by product.
# Each table_data row is [product_id, sample, predicted_price, mse]; the stored
# tuples are (predicted_price, mse, sample). Despite its name, this mapping
# holds per-sample values, not averages.
# The loop variables use their own names so the notebook-level `avg_mse`
# summary computed by the model cell is not overwritten.
product_avg_predicted_prices = defaultdict(list)
for product_id, sample_no, predicted_price, sample_mse in table_data:
    product_avg_predicted_prices[product_id].append((predicted_price, sample_mse, sample_no))

# For each product keep the prediction with the smallest error as its
# "optimal" price. min() returns the first minimum, so ties go to the
# earliest sample. Values are (optimal_price, min_mse, min_mse_sample).
optimal_prices = {
    product_id: min(candidates, key=lambda candidate: candidate[1])
    for product_id, candidates in product_avg_predicted_prices.items()
}

# Prepare the final table data for optimal prices
optimal_table_data = [
    [product_id, best_price, best_mse, best_sample]
    for product_id, (best_price, best_mse, best_sample) in optimal_prices.items()
]

# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))


Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    39.99       0.00                     13
bed2                    89.90       0.00                      6
bed3                    88.24      11.15                     11
bed4                    47.90       0.00                     10
computers1              99.82       0.85                      6
computers2              78.98       1.17                     10
computers3             135.55       6.68                     10
computers4             120.03       0.00                     18
computers5              89.22       0.46                      7
computers6             149.90       0.00                      7
consoles1               35.66       0.29                     12
consoles2               22.15       5.50                      6
cool1                   99.99       0.00                      7
cool2                  129.99       0.00

In [19]:
# Export the random-forest per-sample predictions and errors to CSV
table_df_2 = pd.DataFrame(
    data=table_data,
    columns=["Product ID", "Sample", "Predicted Price", "MSE"],
)
table_df_2.to_csv(path_or_buf="Output/RF_predicted_prices_mse.csv", index=False)

In [20]:
# Export the random-forest per-product optimal prices to CSV
optimal_table_df_2 = pd.DataFrame(
    data=optimal_table_data,
    columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"],
)
optimal_table_df_2.to_csv(path_or_buf="Output/RF_optimal_prices.csv", index=False)


# XGBoost Model

In [21]:
## use XGBOOST model
import xgboost as xgb

# Number of consecutive rows (per product) used to fit each model
window_size = 5

# Predictor columns; the target is 'unit_price'.
# NOTE(review): in the retail_df.head() output earlier in this notebook,
# total_price / qty_sold equals unit_price, so these two features may leak the
# target -- confirm before relying on the reported errors.
feature_columns = ['qty_sold', 'total_price', 'freight_price', 'product_rating',
                   'no_customers', 'seasonality', 'volume', 'comp1_price',
                   'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                   'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                   'comp3_prod_rating', 'comp3_freight_price', 'lag_price']

# Rows inside each group keep the order they have in retail_df, so the windows
# below follow the 'date' ordering applied earlier in the notebook.
grouped = retail_df.groupby('product_id')

table_data = []             # rows of [product_id, sample, predicted price, mse]
mse_scores = []             # squared error of every prediction, all products
predicted_unit_prices = []  # every predicted unit price, all products

for group_key, group_data in grouped:
    # set features and target variable
    features = group_data[feature_columns]
    target = group_data['unit_price']
    num_samples = len(features)

    # predicted_unit_prices is deliberately NOT reset per product: it accumulates
    # across all products so the average printed at the end covers every
    # prediction rather than only those of the last product iterated.

    # Sliding window: fit on rows [i - window_size, i) and predict row i.
    # Products with window_size rows or fewer produce no predictions.
    for i in range(window_size, num_samples):
        features_train = features.iloc[i - window_size:i]
        target_train = target.iloc[i - window_size:i]
        features_test = features.iloc[i:i + 1]
        target_test = target.iloc[i:i + 1]

        # Train the model (fixed random_state for reproducibility)
        model = xgb.XGBRegressor(n_estimators=50, random_state=78)
        model.fit(features_train, target_train)

        # Make predictions
        y_pred = model.predict(features_test)

        # With a single test row the MSE is the squared error of that prediction
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)

        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)

        # 'Sample' is the 1-based position of the predicted row within the product
        table_data.append([group_key, i + 1, predicted_unit_price, mse])

# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))

# Summary statistics over every prediction made above
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")

Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 6              45.95      0.00
bed1                 7              45.95     29.36
bed1                 8              45.95     35.52
bed1                 9              40.53      0.29
bed1                10              39.99      0.00
bed1                11              39.99      0.00
bed1                12              39.99      0.00
bed1                13              39.99      0.00
bed1                14              39.99      0.00
bed1                15              39.99      0.00
bed1                16              39.99      0.56
bed2                 6              89.90      0.00
bed2                 7              89.90      0.00
bed2                 8              89.90      0.00
bed2                 9              89.90      0.00
bed2                10              89.90      0.00
bed2                11              89.90      1.99
bed2        

In [22]:
# Group the individual predictions by product.
# Each table_data row is [product_id, sample, predicted_price, mse]; the stored
# tuples are (predicted_price, mse, sample). Despite its name, this mapping
# holds per-sample values, not averages.
# The loop variables use their own names so the notebook-level `avg_mse`
# summary computed by the model cell is not overwritten.
product_avg_predicted_prices = defaultdict(list)
for product_id, sample_no, predicted_price, sample_mse in table_data:
    product_avg_predicted_prices[product_id].append((predicted_price, sample_mse, sample_no))

# For each product keep the prediction with the smallest error as its
# "optimal" price. min() returns the first minimum, so ties go to the
# earliest sample. Values are (optimal_price, min_mse, min_mse_sample).
optimal_prices = {
    product_id: min(candidates, key=lambda candidate: candidate[1])
    for product_id, candidates in product_avg_predicted_prices.items()
}

# Prepare the final table data for optimal prices
optimal_table_data = [
    [product_id, best_price, best_mse, best_sample]
    for product_id, (best_price, best_mse, best_sample) in optimal_prices.items()
]

# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))

Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    39.99       0.00                     11
bed2                    89.90       0.00                      6
bed3                    92.00       0.00                      6
bed4                    47.90       0.00                     10
computers1              98.90       0.00                      8
computers2              77.90       0.00                     10
computers3             133.68       0.50                     10
computers4             149.99       0.00                     10
computers5              88.07      84.05                      8
computers6             149.90       0.00                      7
consoles1               36.20       0.00                     12
consoles2               32.74       2.14                     10
cool1                   99.99       0.00                      7
cool2                  129.99       0.00

In [23]:
# Export the XGBoost per-sample predictions and errors to CSV
table_df_3 = pd.DataFrame(
    data=table_data,
    columns=["Product ID", "Sample", "Predicted Price", "MSE"],
)
table_df_3.to_csv(path_or_buf="Output/XGB_predicted_prices_mse.csv", index=False)

In [24]:
# Export the XGBoost per-product optimal prices to CSV
optimal_table_df_3 = pd.DataFrame(
    data=optimal_table_data,
    columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"],
)
optimal_table_df_3.to_csv(path_or_buf="Output/XGB_optimal_prices.csv", index=False)

# Linear Regression Cumulative Model

In [25]:
# Settings for cumulative (expanding) training:
#   init_train_length - size of the first training set; it grows by
#                       expand_size at every loop cycle
#   test_length       - size of the testing data
#   expand_size       - step size for the expansion
init_train_length, test_length, expand_size = 5, 1, 1

# Per-product view of the retail data
grouped = retail_df.groupby(by='product_id')

# Result accumulators filled by the cumulative training loop
table_data, mse_scores, predicted_unit_prices = [], [], []

In [26]:
# For cumulative model

# Start the accumulators in this cell so that re-running it does not append
# duplicate rows to results left over from a previous run.
table_data = []             # rows of [product_id, period (1-based), predicted price, mse]
mse_scores = []             # error of every prediction, all products
predicted_unit_prices = []  # every predicted unit price, all products

# Predictor columns; the target is 'unit_price'.
# NOTE(review): in the retail_df.head() output earlier in this notebook,
# total_price / qty_sold equals unit_price, so these two features may leak the
# target -- confirm before relying on the reported errors.
feature_columns = ['qty_sold', 'total_price', 'freight_price', 'product_rating',
                   'no_customers', 'seasonality', 'volume', 'comp1_price',
                   'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                   'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                   'comp3_prod_rating', 'comp3_freight_price', 'lag_price']

for group_key, group_data in grouped:
    # Extract the group's features and target variable
    features = group_data[feature_columns]
    target = group_data['unit_price']

    # debug
    print(f"Product {group_key} - Features Length: {len(features)}, Target Length: {len(target)}")

    # Check if there's enough data for cumulative training
    if len(features) < init_train_length + test_length:
        print(f"Product {group_key} - Not enough data for cumulative training")
        continue

    # predicted_unit_prices is deliberately NOT reset per product: it accumulates
    # across all products so the average printed at the end covers every
    # prediction rather than only those of the last product iterated.

    # Cumulative training: the training set always starts at row 0 and grows by
    # expand_size rows per period; the test_length rows right after it are
    # predicted. range() raises on a zero step instead of looping forever.
    last_train_end = len(features) - test_length
    for i, train_end in enumerate(range(init_train_length, last_train_end + 1, expand_size)):
        test_start = train_end
        test_end = train_end + test_length
        print(f"i = {i}, train_end = {train_end}, test_start = {test_start}, test_end = {test_end}")

        # Split data into training and testing sets
        features_train = features.iloc[0:train_end]
        target_train = target.iloc[0:train_end]
        features_test = features.iloc[test_start:test_end]
        target_test = target.iloc[test_start:test_end]

        # Train the model
        model = LinearRegression()
        model.fit(features_train, target_train)

        # Make predictions
        y_pred = model.predict(features_test)

        # Evaluate the predictions using Mean Squared Error
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)

        # Only the first predicted row of each test slice is recorded
        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)

        print(f"Product {group_key}, Period {i} - MSE: {mse:.2f}, Predicted Unit Price: {predicted_unit_price}")
        # The table uses a 1-based period number (the print above is 0-based)
        table_data.append([group_key, i + 1, predicted_unit_price, mse])

# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))

# Summary statistics over every prediction made above
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")


Product bed1 - Features Length: 16, Target Length: 16
i = 0, train_end = 5, test_start = 5, test_end = 6
Product bed1, Period 0 - MSE: 0.00, Predicted Unit Price: 45.95
i = 1, train_end = 6, test_start = 6, test_end = 7
Product bed1, Period 1 - MSE: 19.78, Predicted Unit Price: 44.97894074509583
i = 2, train_end = 7, test_start = 7, test_end = 8
Product bed1, Period 2 - MSE: 3.85, Predicted Unit Price: 41.951761675186944
i = 3, train_end = 8, test_start = 8, test_end = 9
Product bed1, Period 3 - MSE: 7.55, Predicted Unit Price: 42.7372495315555
i = 4, train_end = 9, test_start = 9, test_end = 10
Product bed1, Period 4 - MSE: 0.95, Predicted Unit Price: 40.96410027988912
i = 5, train_end = 10, test_start = 10, test_end = 11
Product bed1, Period 5 - MSE: 0.00, Predicted Unit Price: 40.03180072427809
i = 6, train_end = 11, test_start = 11, test_end = 12
Product bed1, Period 6 - MSE: 0.52, Predicted Unit Price: 39.268881571854294
i = 7, train_end = 12, test_start = 12, test_end = 13
Produc

Product computers4, Period 10 - MSE: 0.00, Predicted Unit Price: 114.49115380000045
i = 11, train_end = 16, test_start = 16, test_end = 17
Product computers4, Period 11 - MSE: 0.00, Predicted Unit Price: 119.98999999999998
i = 12, train_end = 17, test_start = 17, test_end = 18
Product computers4, Period 12 - MSE: 0.00, Predicted Unit Price: 119.99000000000004
Product computers5 - Features Length: 8, Target Length: 8
i = 0, train_end = 5, test_start = 5, test_end = 6
Product computers5, Period 0 - MSE: 13.59, Predicted Unit Price: 91.68659066136146
i = 1, train_end = 6, test_start = 6, test_end = 7
Product computers5, Period 1 - MSE: 36.03, Predicted Unit Price: 95.90230466961891
i = 2, train_end = 7, test_start = 7, test_end = 8
Product computers5, Period 2 - MSE: 212.13, Predicted Unit Price: 93.46454466987052
Product computers6 - Features Length: 8, Target Length: 8
i = 0, train_end = 5, test_start = 5, test_end = 6
Product computers6, Period 0 - MSE: 0.00, Predicted Unit Price: 149.

Product furniture3, Period 5 - MSE: 3.87, Predicted Unit Price: 36.96657030565571
i = 6, train_end = 11, test_start = 11, test_end = 12
Product furniture3, Period 6 - MSE: 0.04, Predicted Unit Price: 35.19719396794565
Product furniture4 - Features Length: 10, Target Length: 10
i = 0, train_end = 5, test_start = 5, test_end = 6
Product furniture4, Period 0 - MSE: 0.00, Predicted Unit Price: 99.9
i = 1, train_end = 6, test_start = 6, test_end = 7
Product furniture4, Period 1 - MSE: 0.00, Predicted Unit Price: 99.89999999999999
i = 2, train_end = 7, test_start = 7, test_end = 8
Product furniture4, Period 2 - MSE: 0.00, Predicted Unit Price: 99.89999999999999
i = 3, train_end = 8, test_start = 8, test_end = 9
Product furniture4, Period 3 - MSE: 456.43, Predicted Unit Price: 99.9
i = 4, train_end = 9, test_start = 9, test_end = 10
Product furniture4, Period 4 - MSE: 0.01, Predicted Unit Price: 83.93204763451995
Product garden1 - Features Length: 18, Target Length: 18
i = 0, train_end = 5, t

Product garden4, Period 8 - MSE: 0.00, Predicted Unit Price: 49.89999999999997
Product garden5 - Features Length: 14, Target Length: 14
i = 0, train_end = 5, test_start = 5, test_end = 6
Product garden5, Period 0 - MSE: 3.81, Predicted Unit Price: 99.28549906459261
i = 1, train_end = 6, test_start = 6, test_end = 7
Product garden5, Period 1 - MSE: 9.92, Predicted Unit Price: 102.14987243046886
i = 2, train_end = 7, test_start = 7, test_end = 8
Product garden5, Period 2 - MSE: 130.54, Predicted Unit Price: 110.42534837239147
i = 3, train_end = 8, test_start = 8, test_end = 9
Product garden5, Period 3 - MSE: 3050.02, Predicted Unit Price: 154.2270033348667
i = 4, train_end = 9, test_start = 9, test_end = 10
Product garden5, Period 4 - MSE: 34.59, Predicted Unit Price: 104.88104116833952
i = 5, train_end = 10, test_start = 10, test_end = 11
Product garden5, Period 5 - MSE: 16978.61, Predicted Unit Price: -30.80200796682827
i = 6, train_end = 11, test_start = 11, test_end = 12
Product gard

Product health5, Period 4 - MSE: 0.00, Predicted Unit Price: 349.9
i = 5, train_end = 10, test_start = 10, test_end = 11
Product health5, Period 5 - MSE: 770.82, Predicted Unit Price: 349.9
i = 6, train_end = 11, test_start = 11, test_end = 12
Product health5, Period 6 - MSE: 152.93, Predicted Unit Price: 337.5335199776381
i = 7, train_end = 12, test_start = 12, test_end = 13
Product health5, Period 7 - MSE: 20.52, Predicted Unit Price: 345.3698910386866
i = 8, train_end = 13, test_start = 13, test_end = 14
Product health5, Period 8 - MSE: 0.00, Predicted Unit Price: 349.89999999997485
i = 9, train_end = 14, test_start = 14, test_end = 15
Product health5, Period 9 - MSE: 0.00, Predicted Unit Price: 349.9000000000586
i = 10, train_end = 15, test_start = 15, test_end = 16
Product health5, Period 10 - MSE: 0.00, Predicted Unit Price: 349.89999999999253
i = 11, train_end = 16, test_start = 16, test_end = 17
Product health5, Period 11 - MSE: 1.14, Predicted Unit Price: 351.1818181834534
i =

Product perfumery2, Period 0 - MSE: 130.68, Predicted Unit Price: 129.33134859287154
i = 1, train_end = 6, test_start = 6, test_end = 7
Product perfumery2, Period 1 - MSE: 41.90, Predicted Unit Price: 111.41606973394046
i = 2, train_end = 7, test_start = 7, test_end = 8
Product perfumery2, Period 2 - MSE: 0.03, Predicted Unit Price: 117.86427769376148
i = 3, train_end = 8, test_start = 8, test_end = 9
Product perfumery2, Period 3 - MSE: 0.01, Predicted Unit Price: 117.89934906313219
i = 4, train_end = 9, test_start = 9, test_end = 10
Product perfumery2, Period 4 - MSE: 0.00, Predicted Unit Price: 119.88902258814431
i = 5, train_end = 10, test_start = 10, test_end = 11
Product perfumery2, Period 5 - MSE: 0.00, Predicted Unit Price: 119.17857192908437
i = 6, train_end = 11, test_start = 11, test_end = 12
Product perfumery2, Period 6 - MSE: 0.00, Predicted Unit Price: 118.69999999999999
i = 7, train_end = 12, test_start = 12, test_end = 13
Product perfumery2, Period 7 - MSE: 0.00, Predict

Product watches8, Period 2 - MSE: 7999.24, Predicted Unit Price: 63.19487254945324
i = 3, train_end = 8, test_start = 8, test_end = 9
Product watches8, Period 3 - MSE: 1518.47, Predicted Unit Price: 118.17531908447145
i = 4, train_end = 9, test_start = 9, test_end = 10
Product watches8, Period 4 - MSE: 549.44, Predicted Unit Price: 134.50524851539117
Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 1              45.95      0.00
bed1                 2              44.98     19.78
bed1                 3              41.95      3.85
bed1                 4              42.74      7.55
bed1                 5              40.96      0.95
bed1                 6              40.03      0.00
bed1                 7              39.27      0.52
bed1                 8              40.01      0.00
bed1                 9              39.99      0.00
bed1                10              39.99      0.00
bed1                11 

In [27]:
from collections import defaultdict

# Group the individual predictions by product.
# Each table_data row is [product_id, sample, predicted_price, mse]; the stored
# tuples are (predicted_price, mse, sample). Despite its name, this mapping
# holds per-sample values, not averages.
# The loop variables use their own names so the notebook-level `avg_mse`
# summary computed by the model cell is not overwritten.
product_avg_predicted_prices = defaultdict(list)
for product_id, sample_no, predicted_price, sample_mse in table_data:
    product_avg_predicted_prices[product_id].append((predicted_price, sample_mse, sample_no))

# For each product keep the prediction with the smallest error as its
# "optimal" price. min() returns the first minimum, so ties go to the
# earliest sample. Values are (optimal_price, min_mse, min_mse_sample).
optimal_prices = {
    product_id: min(candidates, key=lambda candidate: candidate[1])
    for product_id, candidates in product_avg_predicted_prices.items()
}

# Prepare the final table data for optimal prices
optimal_table_data = [
    [product_id, best_price, best_mse, best_sample]
    for product_id, (best_price, best_mse, best_sample) in optimal_prices.items()
]

# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))


Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    45.95       0.00                      1
bed2                    89.90       0.00                      1
bed3                    84.89       0.00                      4
bed4                    47.76       0.02                      2
computers1              98.71       0.04                      1
computers2              77.03       0.76                      5
computers3             135.98       3.67                      1
computers4             119.99       0.00                     12
computers5              91.69      13.59                      1
computers6             149.94       0.00                      1
consoles1               36.20       0.00                      7
consoles2               33.42       0.46                      4
cool1                   99.99       0.00                      6
cool2                  129.99       0.00

In [28]:
# Export the cumulative-model per-period predictions and errors to CSV
table_df_4 = pd.DataFrame(
    data=table_data,
    columns=["Product ID", "Sample", "Predicted Price", "MSE"],
)
table_df_4.to_csv(path_or_buf="Output/SW_predicted_prices_mse_cumulative.csv", index=False)

In [30]:
# Export the cumulative linear-regression optimal prices to CSV.
# The file uses the same "SW_" prefix as the cumulative predictions file
# (SW_predicted_prices_mse_cumulative.csv); an "RF_" prefix would mislabel
# these linear-regression results as random-forest output.
# NOTE(review): update any downstream reader that expects the old
# "Output/RF_optimal_prices_cumulative.csv" name.
optimal_table_df_4 = pd.DataFrame(optimal_table_data, columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"])
optimal_table_df_4.to_csv("Output/SW_optimal_prices_cumulative.csv", index=False)
