In [None]:
pip install tabulate

In [None]:
pip install seaborn

In [None]:
pip install xgboost

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

In [2]:
retail_df = pd.read_csv("Resources/retail_price_cleaned.csv")

retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   product_id             676 non-null    object 
 1   product_category_name  676 non-null    object 
 2   month_year             676 non-null    object 
 3   qty_sold               676 non-null    int64  
 4   total_price            676 non-null    float64
 5   freight_price          676 non-null    float64
 6   unit_price             676 non-null    float64
 7   product_rating         676 non-null    float64
 8   no_customers           676 non-null    int64  
 9   month                  676 non-null    int64  
 10  year                   676 non-null    int64  
 11  seasonality            676 non-null    float64
 12  volume                 676 non-null    int64  
 13  comp1_price            676 non-null    float64
 14  comp1_prod_rating      676 non-null    float64
 15  comp1_

In [3]:
retail_df.columns

Index(['product_id', 'product_category_name', 'month_year', 'qty_sold',
       'total_price', 'freight_price', 'unit_price', 'product_rating',
       'no_customers', 'month', 'year', 'seasonality', 'volume', 'comp1_price',
       'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
       'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
       'comp3_prod_rating', 'comp3_freight_price', 'lag_price'],
      dtype='object')

In [4]:
# combining 'year' and 'month' to datetime
retail_df['date'] = pd.to_datetime(retail_df[['year', 'month']].assign(day=1))

# Sort the DataFrame by the new datetime column
retail_df = retail_df.sort_values('date')

retail_df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price,date
389,health7,health_beauty,01-01-2017,1,64.99,11.06,64.99,3.9,9,1,...,3.9,11.06,64.99,3.9,11.06,64.99,3.9,11.06,64.94,2017-01-01
339,health5,health_beauty,01-01-2017,8,2799.2,22.90125,349.9,4.3,9,1,...,4.3,22.90125,349.9,4.3,22.90125,64.99,3.9,11.06,349.85,2017-01-01
438,bed2,bed_bath_table,01-02-2017,2,179.8,13.02,89.9,3.9,2,2,...,3.9,13.02,89.9,3.9,13.02,89.9,3.9,13.02,89.85,2017-02-01
236,garden8,garden_tools,01-02-2017,1,179.99,33.54,179.99,4.2,3,2,...,4.2,33.54,179.99,4.2,33.54,179.99,4.2,33.54,179.94,2017-02-01
58,health9,health_beauty,01-02-2017,11,219.89,11.750909,19.99,4.3,19,2,...,4.3,11.750909,19.99,4.3,11.750909,64.99,3.9,15.348,19.94,2017-02-01


In [5]:
# Define the number and length of periods
num_periods = 5
period_length = 5  # Months per period

grouped = retail_df.groupby('product_id')

mse_scores = []

for group_key, group_data in grouped:
    # Extract the group's features and target variable
    features = group_data[['qty_sold', 'total_price', 'freight_price', 'product_rating',
                           'no_customers', 'seasonality', 'volume', 'comp1_price',
                           'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                           'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                           'comp3_prod_rating', 'comp3_freight_price', 'lag_price']]

    target = group_data['unit_price']
    
    # debug
    print(f"Product {group_key} - Features Length: {len(features)}, Target Length: {len(target)}")
    
    # Check if there's enough data for cumulative training
    if len(features) >= num_periods * period_length:
        # cumulative training for each group
        for i in range(1, num_periods + 1):
            # Define the periods for training and testing
            train_end = i * period_length
            test_start = train_end
            test_end = (i + 1) * period_length
            
            # Split data into training and testing sets
            features_train = features[features.index < train_end]
            target_train = target[target.index < train_end]
            features_test = features[(features.index >= test_start) & (features.index < test_end)]
            target_test = target[(target.index >= test_start) & (target.index < test_end)]
            
            # Train the model
            model = LinearRegression()
            model.fit(features_train, target_train)
            
            # Make predictions
            y_pred = model.predict(features_test)
            
            # Evaluate the predictions using Mean Squared Error
            mse = mean_squared_error(target_test, y_pred)
            mse_scores.append(mse)
            
            print(f"Product {group_key}, Period {i} - MSE: {mse:.2f}")
    else:
        print(f"Product {group_key} - Not enough data for cumulative training")

avg_mse = np.mean(mse_scores)
print(f"Average MSE: {avg_mse:.2f}")

Product bed1 - Features Length: 16, Target Length: 16
Product bed1 - Not enough data for cumulative training
Product bed2 - Features Length: 19, Target Length: 19
Product bed2 - Not enough data for cumulative training
Product bed3 - Features Length: 11, Target Length: 11
Product bed3 - Not enough data for cumulative training
Product bed4 - Features Length: 10, Target Length: 10
Product bed4 - Not enough data for cumulative training
Product bed5 - Features Length: 5, Target Length: 5
Product bed5 - Not enough data for cumulative training
Product computers1 - Features Length: 15, Target Length: 15
Product computers1 - Not enough data for cumulative training
Product computers2 - Features Length: 10, Target Length: 10
Product computers2 - Not enough data for cumulative training
Product computers3 - Features Length: 10, Target Length: 10
Product computers3 - Not enough data for cumulative training
Product computers4 - Features Length: 18, Target Length: 18
Product computers4 - Not enough da

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Define the window size
window_size = 5

grouped = retail_df.groupby('product_id')

table_data = []
mse_scores = []
predicted_unit_prices = []

for group_key, group_data in grouped:
    # set features and target variable
    features = group_data[['qty_sold', 'total_price', 'freight_price', 'product_rating',
                           'no_customers', 'seasonality', 'volume', 'comp1_price',
                           'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                           'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                           'comp3_prod_rating', 'comp3_freight_price', 'lag_price']]

    target = group_data['unit_price']
    
    num_samples = len(features)
    predicted_unit_prices = []
    
    # sliding window training and testing
    for i in range(window_size, num_samples):
        # Define the periods for training and testing
        train_start = i - window_size
        train_end = i
        test_start = i
        test_end = i + 1
        
        # Split data into training and testing sets
        features_train = features.iloc[train_start:train_end]
        target_train = target.iloc[train_start:train_end]
        features_test = features.iloc[test_start:test_end]
        target_test = target.iloc[test_start:test_end]
        
        # Train the model
        model = LinearRegression()
        model.fit(features_train, target_train)
        
        # Make predictions
        y_pred = model.predict(features_test)
        
        # Evaluate using Mean Squared Error
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)
        
        # Calculate and store the predicted unit prices
        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)
        
        table_data.append([group_key, i+1, predicted_unit_price, mse])

# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))

avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")


Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 6              45.95      0.00
bed1                 7              45.95     29.36
bed1                 8              41.98      3.98
bed1                 9              22.08    320.69
bed1                10              40.42      0.18
bed1                11              39.32      0.45
bed1                12              39.89      0.01
bed1                13              39.99      0.00
bed1                14              39.99      0.00
bed1                15              39.99      0.00
bed1                16              39.99      0.56
bed2                 6              89.90      0.00
bed2                 7              89.90      0.00
bed2                 8              89.90      0.00
bed2                 9              89.90      0.00
bed2                10              89.90      0.00
bed2                11              89.90      1.99
bed2        

In [7]:
table_df = pd.DataFrame(table_data, columns=["Product ID", "Sample", "Predicted Price", "MSE"])
table_df.to_csv("Output/SW_predicted_prices_mse.csv", index=False)

In [8]:
from collections import defaultdict

# Calculate the average predicted unit price and MSE for each product ID
product_avg_predicted_prices = defaultdict(list)
for entry in table_data:
    group_key = entry[0]
    avg_predicted_price = entry[2]
    avg_mse = entry[3]
    product_avg_predicted_prices[group_key].append((avg_predicted_price, avg_mse, entry[1]))

# Find the optimal prices for each product ID based on minimum average MSE
optimal_prices = {}
for group_key, avg_mse_list in product_avg_predicted_prices.items():
    min_avg_mse_entry = min(avg_mse_list, key=lambda x: x[1])
    optimal_price = min_avg_mse_entry[0]
    min_mse = min_avg_mse_entry[1]
    min_mse_sample = min_avg_mse_entry[2]
    optimal_prices[group_key] = (optimal_price, min_mse, min_mse_sample)

# Prepare the final table data for optimal prices
optimal_table_data = []
for group_key, (optimal_price, min_mse, min_mse_sample) in optimal_prices.items():
    optimal_table_data.append([group_key, optimal_price, min_mse, min_mse_sample])

# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))


Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    45.95       0.00                      6
bed2                    89.90       0.00                      6
bed3                    85.01       0.01                      9
bed4                    48.18       0.08                      7
computers1              98.72       0.03                      6
computers2              78.22       0.10                      9
computers3             137.33       0.33                      7
computers4             155.10       0.16                      8
computers5              91.69      13.59                      6
computers6             149.90       0.00                      7
consoles1               36.49       0.08                     11
consoles2               33.05       0.09                      9
cool1                   99.99       0.00                      7
cool2                  129.99       0.00

In [9]:
# for group_key, group_data in grouped:
#     plt.figure(figsize=(10, 6))

#     sns.regplot(x='unit_price', y='predicted_unit_price', data=group_data,
#                 scatter_kws={'s': 20}, line_kws={'color': 'red'})

#     plt.title(f'Scatter Plot with Regression Line - Product ID: {group_key}')
#     plt.xlabel('Actual Unit Price')
#     plt.ylabel('Predicted Unit Price')
#     plt.show()

In [14]:
scaler = StandardScaler()

NameError: name 'StandardScaler' is not defined

In [13]:
# accuracy score of SW with LR
reshaped_target_test = target_test.array.reshape(-1,1)
score_sw_lr = model.score(reshaped_target_test, y_pred, sample_weight=None)
print(score_sw_lr)

  "X does not have valid feature names, but"


ValueError: X has 1 features, but LinearRegression is expecting 17 features as input.

In [10]:
optimal_table_df = pd.DataFrame(optimal_table_data, columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"])
optimal_table_df.to_csv("Output/SW_optimal_prices.csv", index=False)

In [11]:
from sklearn.ensemble import RandomForestRegressor
# Define the window size
window_size = 5
grouped = retail_df.groupby('product_id')
table_data = []
mse_scores = []
predicted_unit_prices = []
for group_key, group_data in grouped:
    # set features and target variable
    features = group_data[['qty_sold', 'total_price', 'freight_price', 'product_rating',
                           'no_customers', 'seasonality', 'volume', 'comp1_price',
                           'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                           'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                           'comp3_prod_rating', 'comp3_freight_price', 'lag_price']]
                           
    target = group_data['unit_price']
    num_samples = len(features)
    predicted_unit_prices = []
    # sliding window training and testing
    for i in range(window_size, num_samples):
        # Define the periods for training and testing
        train_start = i - window_size
        train_end = i
        test_start = i
        test_end = i + 1
        # Split data into training and testing sets
        features_train = features.iloc[train_start:train_end]
        target_train = target.iloc[train_start:train_end]
        features_test = features.iloc[test_start:test_end]
        target_test = target.iloc[test_start:test_end]
        # Train the model
        model = RandomForestRegressor(n_estimators=50, random_state=78)
        model.fit(features_train, target_train)
        # Make predictions
        y_pred = model.predict(features_test)
        # Evaluate using Mean Squared Error
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)
        # Calculate and store the predicted unit prices
        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)
        table_data.append([group_key, i+1, predicted_unit_price, mse])
# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")

Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 6              45.95      0.00
bed1                 7              45.95     29.36
bed1                 8              45.73     32.99
bed1                 9              41.07      1.17
bed1                10              41.96      3.89
bed1                11              40.26      0.07
bed1                12              40.09      0.01
bed1                13              39.99      0.00
bed1                14              39.99      0.00
bed1                15              39.99      0.00
bed1                16              39.99      0.56
bed2                 6              89.90      0.00
bed2                 7              89.90      0.00
bed2                 8              89.90      0.00
bed2                 9              89.90      0.00
bed2                10              89.90      0.00
bed2                11              89.90      1.99
bed2        

In [12]:
table_df = pd.DataFrame(table_data, columns=["Product ID", "Sample", "Predicted Price", "MSE"])
table_df.to_csv("Output/RF_predicted_prices_mse.csv", index=False)

In [13]:
# Calculate the average predicted unit price and MSE for each product ID
product_avg_predicted_prices = defaultdict(list)
for entry in table_data:
    group_key = entry[0]
    avg_predicted_price = entry[2]
    avg_mse = entry[3]
    product_avg_predicted_prices[group_key].append((avg_predicted_price, avg_mse, entry[1]))
# Find the optimal prices for each product ID based on minimum average MSE
optimal_prices = {}
for group_key, avg_mse_list in product_avg_predicted_prices.items():
    min_avg_mse_entry = min(avg_mse_list, key=lambda x: x[1])
    optimal_price = min_avg_mse_entry[0]
    min_mse = min_avg_mse_entry[1]
    min_mse_sample = min_avg_mse_entry[2]
    optimal_prices[group_key] = (optimal_price, min_mse, min_mse_sample)
# Prepare the final table data for optimal prices
optimal_table_data = []
for group_key, (optimal_price, min_mse, min_mse_sample) in optimal_prices.items():
    optimal_table_data.append([group_key, optimal_price, min_mse, min_mse_sample])
# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))

Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    39.99       0.00                     13
bed2                    89.90       0.00                      6
bed3                    88.24      11.15                     11
bed4                    47.90       0.00                     10
computers1              99.82       0.85                      6
computers2              78.98       1.17                     10
computers3             135.55       6.68                     10
computers4             150.29       0.09                     10
computers5              89.22       0.46                      7
computers6             149.90       0.00                      7
consoles1               35.66       0.29                     12
consoles2               22.15       5.50                      6
cool1                   99.99       0.00                      7
cool2                  129.99       0.00

In [14]:
optimal_table_df = pd.DataFrame(optimal_table_data, columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"])
optimal_table_df.to_csv("Output/RF_optimal_prices.csv", index=False)

In [15]:
## use XGBOOST model
import xgboost as xgb
# Define the window size
window_size = 5
grouped = retail_df.groupby('product_id')
table_data = []
mse_scores = []
predicted_unit_prices = []
for group_key, group_data in grouped:
    # set features and target variable
    features = group_data[['qty_sold', 'total_price', 'freight_price', 'product_rating',
                           'no_customers', 'seasonality', 'volume', 'comp1_price',
                           'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
                           'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
                           'comp3_prod_rating', 'comp3_freight_price', 'lag_price']]
                           
    target = group_data['unit_price']
    num_samples = len(features)
    predicted_unit_prices = []
    # sliding window training and testing
    for i in range(window_size, num_samples):
        # Define the periods for training and testing
        train_start = i - window_size
        train_end = i
        test_start = i
        test_end = i + 1
        # Split data into training and testing sets
        features_train = features.iloc[train_start:train_end]
        target_train = target.iloc[train_start:train_end]
        features_test = features.iloc[test_start:test_end]
        target_test = target.iloc[test_start:test_end]
        # Train the model
        model = xgb.XGBRegressor(n_estimators=50, random_state=78)
        model.fit(features_train, target_train)
        # Make predictions
        y_pred = model.predict(features_test)
        # Evaluate using Mean Squared Error
        mse = mean_squared_error(target_test, y_pred)
        mse_scores.append(mse)
        # Calculate and store the predicted unit prices
        predicted_unit_price = y_pred[0]
        predicted_unit_prices.append(predicted_unit_price)
        table_data.append([group_key, i+1, predicted_unit_price, mse])
# Print table
table_headers = ["Product ID", "Sample", "Predicted Price", "MSE"]
print(tabulate(table_data, headers=table_headers, floatfmt=(".0f", ".0f", ".2f", ".2f")))
avg_mse = np.mean(mse_scores)
avg_predicted_unit_price = np.mean(predicted_unit_prices)
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average Predicted Unit Price: {avg_predicted_unit_price:.2f}")

Product ID      Sample    Predicted Price       MSE
------------  --------  -----------------  --------
bed1                 6              45.95      0.00
bed1                 7              45.95     29.36
bed1                 8              45.95     35.52
bed1                 9              40.53      0.29
bed1                10              39.99      0.00
bed1                11              39.99      0.00
bed1                12              39.99      0.00
bed1                13              39.99      0.00
bed1                14              39.99      0.00
bed1                15              39.99      0.00
bed1                16              39.99      0.56
bed2                 6              89.90      0.00
bed2                 7              89.90      0.00
bed2                 8              89.90      0.00
bed2                 9              89.90      0.00
bed2                10              89.90      0.00
bed2                11              89.90      1.99
bed2        

In [16]:
table_df = pd.DataFrame(table_data, columns=["Product ID", "Sample", "Predicted Price", "MSE"])
table_df.to_csv("Output/XGB_predicted_prices_mse.csv", index=False)

In [17]:
# Calculate the average predicted unit price and MSE for each product ID
product_avg_predicted_prices = defaultdict(list)
for entry in table_data:
    group_key = entry[0]
    avg_predicted_price = entry[2]
    avg_mse = entry[3]
    product_avg_predicted_prices[group_key].append((avg_predicted_price, avg_mse, entry[1]))
# Find the optimal prices for each product ID based on minimum average MSE
optimal_prices = {}
for group_key, avg_mse_list in product_avg_predicted_prices.items():
    min_avg_mse_entry = min(avg_mse_list, key=lambda x: x[1])
    optimal_price = min_avg_mse_entry[0]
    min_mse = min_avg_mse_entry[1]
    min_mse_sample = min_avg_mse_entry[2]
    optimal_prices[group_key] = (optimal_price, min_mse, min_mse_sample)
# Prepare the final table data for optimal prices
optimal_table_data = []
for group_key, (optimal_price, min_mse, min_mse_sample) in optimal_prices.items():
    optimal_table_data.append([group_key, optimal_price, min_mse, min_mse_sample])
# Print the table for optimal prices
optimal_table_headers = ["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"]
print(tabulate(optimal_table_data, headers=optimal_table_headers, floatfmt=(".0f", ".2f", ".2f", ".0f")))

Product ID      Optimal Price    Min MSE    Sample with Min MSE
------------  ---------------  ---------  ---------------------
bed1                    39.99       0.00                     11
bed2                    89.90       0.00                      6
bed3                    92.00       0.00                      6
bed4                    47.90       0.00                     10
computers1              98.90       0.00                      8
computers2              77.90       0.00                     10
computers3             133.68       0.50                     10
computers4             149.99       0.00                     10
computers5              88.07      84.05                      8
computers6             149.90       0.00                      7
consoles1               36.20       0.00                     12
consoles2               32.74       2.14                     10
cool1                   99.99       0.00                      7
cool2                  129.99       0.00

In [18]:
optimal_table_df = pd.DataFrame(optimal_table_data, columns=["Product ID", "Optimal Price", "Min MSE", "Sample with Min MSE"])
optimal_table_df.to_csv("Output/XGB_optimal_prices.csv", index=False)

#### Rearranging the 4 predicted tables

In [8]:
sw_pred_df = pd.read_csv("Output/SW_predicted_prices_mse.csv")
sw_op_df = pd.read_csv("Output/SW_optimal_prices.csv")

In [13]:
no_sample_per_id = sw_pred_df.groupby(by=["Product ID"])["Sample"].count()
no_sample_per_id

Product ID
bed1          11
bed2          14
bed3           6
bed4           5
computers1    10
computers2     5
computers3     5
computers4    13
computers5     3
computers6     3
consoles1      7
consoles2      5
cool1         10
cool2          8
cool3          2
cool4          4
cool5          8
furniture1     8
furniture2     8
furniture3     7
furniture4     5
garden1       13
garden10      11
garden2       12
garden3       13
garden4        9
garden5        9
garden6       11
garden7       11
garden8        9
garden9       12
health1        4
health10       2
health2        8
health3        3
health4        6
health5       15
health6        2
health7       15
health8       12
health9       13
perfumery1     8
perfumery2     8
watches1      12
watches2      10
watches3      10
watches4       5
watches5       5
watches6       9
watches7       7
watches8       5
Name: Sample, dtype: int64

In [16]:
sw_op_df["Number of Samples"] = list(no_sample_per_id)
sw_op_df.head()

Unnamed: 0,Product ID,Optimal Price,Min MSE,Sample with Min MSE,Number of Samples
0,bed1,45.95,0.0,6,11
1,bed2,89.9,0.0,6,14
2,bed3,85.014052,0.013008,9,6
3,bed4,48.184328,0.080843,7,5
4,computers1,98.715605,0.034002,6,10


In [17]:
sw_op_df.to_csv("Output/SW_optimal_with_no_samples.csv", index=False)

In [18]:
rf_pred_df = pd.read_csv("Output/RF_predicted_prices_mse.csv")
rf_op_df = pd.read_csv("Output/RF_optimal_prices.csv")

In [19]:
no_sample_per_id_rf = rf_pred_df.groupby(by=["Product ID"])["Sample"].count()
no_sample_per_id_rf

Product ID
bed1          11
bed2          14
bed3           6
bed4           5
computers1    10
computers2     5
computers3     5
computers4    13
computers5     3
computers6     3
consoles1      7
consoles2      5
cool1         10
cool2          8
cool3          2
cool4          4
cool5          8
furniture1     8
furniture2     8
furniture3     7
furniture4     5
garden1       13
garden10      11
garden2       12
garden3       13
garden4        9
garden5        9
garden6       11
garden7       11
garden8        9
garden9       12
health1        4
health10       2
health2        8
health3        3
health4        6
health5       15
health6        2
health7       15
health8       12
health9       13
perfumery1     8
perfumery2     8
watches1      12
watches2      10
watches3      10
watches4       5
watches5       5
watches6       9
watches7       7
watches8       5
Name: Sample, dtype: int64

In [20]:
rf_op_df["Number of Samples"] = list(no_sample_per_id_rf)
rf_op_df.head()

Unnamed: 0,Product ID,Optimal Price,Min MSE,Sample with Min MSE,Number of Samples
0,bed1,39.99,0.0,13,11
1,bed2,89.9,2.019484e-28,6,14
2,bed3,88.2391,11.14959,11,6
3,bed4,47.9,2.4738680000000002e-27,10,5
4,computers1,99.82,0.8464,6,10


In [21]:
rf_op_df.to_csv("Output/RF_optimal_with_no_samples.csv", index=False)

In [22]:
xgb_pred_df = pd.read_csv("Output/XGB_predicted_prices_mse.csv")
xgb_op_df = pd.read_csv("Output/XGB_optimal_prices.csv")

In [23]:
no_sample_per_id_xgb = xgb_pred_df.groupby(by=["Product ID"])["Sample"].count()
no_sample_per_id_xgb

Product ID
bed1          11
bed2          14
bed3           6
bed4           5
computers1    10
computers2     5
computers3     5
computers4    13
computers5     3
computers6     3
consoles1      7
consoles2      5
cool1         10
cool2          8
cool3          2
cool4          4
cool5          8
furniture1     8
furniture2     8
furniture3     7
furniture4     5
garden1       13
garden10      11
garden2       12
garden3       13
garden4        9
garden5        9
garden6       11
garden7       11
garden8        9
garden9       12
health1        4
health10       2
health2        8
health3        3
health4        6
health5       15
health6        2
health7       15
health8       12
health9       13
perfumery1     8
perfumery2     8
watches1      12
watches2      10
watches3      10
watches4       5
watches5       5
watches6       9
watches7       7
watches8       5
Name: Sample, dtype: int64

In [24]:
xgb_op_df["Number of Samples"] = list(no_sample_per_id_xgb)
xgb_op_df.head()

Unnamed: 0,Product ID,Optimal Price,Min MSE,Sample with Min MSE,Number of Samples
0,bed1,39.989983,3.025867e-10,11,11
1,bed2,89.89995,2.691522e-09,6,14
2,bed3,91.99944,3.101886e-07,6,6
3,bed4,47.899742,6.649876e-08,10,5
4,computers1,98.89987,1.642853e-08,8,10


In [25]:
xgb_op_df.to_csv("Output/XGB_optimal_with_no_samples.csv", index=False)

In [4]:
cum_pred_df = pd.read_csv("Output/Cum_predicted_prices_mse.csv")
cum_op_df = pd.read_csv("Output/Cum_optimal_prices.csv")

In [5]:
no_sample_per_id_cum = cum_pred_df.groupby(by=["Product ID"])["Sample"].count()
no_sample_per_id_cum

Product ID
bed1          11
bed2          14
bed3           6
bed4           5
computers1    10
computers2     5
computers3     5
computers4    13
computers5     3
computers6     3
consoles1      7
consoles2      5
cool1         10
cool2          8
cool3          2
cool4          4
cool5          8
furniture1     8
furniture2     8
furniture3     7
furniture4     5
garden1       13
garden10      11
garden2       12
garden3       13
garden4        9
garden5        9
garden6       11
garden7       11
garden8        9
garden9       12
health1        4
health10       2
health2        8
health3        3
health4        6
health5       15
health6        2
health7       15
health8       12
health9       13
perfumery1     8
perfumery2     8
watches1      12
watches2      10
watches3      10
watches4       5
watches5       5
watches6       9
watches7       7
watches8       5
Name: Sample, dtype: int64

In [6]:
cum_op_df["Number of Samples"] = list(no_sample_per_id_cum)
cum_op_df.head()

Unnamed: 0,Product ID,Optimal Price,Min MSE,Sample with Min MSE,Number of Samples
0,bed1,45.95,0.0,1,11
1,bed2,89.9,0.0,1,14
2,bed3,84.890277,9.5e-05,4,6
3,bed4,47.764527,0.018353,2,5
4,computers1,98.715605,0.034002,1,10


In [7]:
cum_op_df.to_csv("Output/Cum_optimal_with_no_samples.csv", index=False)