# Model to predict Quantity(Demand)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# label encoder
from sklearn.preprocessing import LabelEncoder
# random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Cross validation k fold
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv("retail_price.csv")


In [3]:
# we are also removing total_price as it will leak information about qty(our target variable)
useful_col = ['product_id', 'product_category_name', 'month_year', 'qty',
       'freight_price', 'unit_price', 'comp_1', 'ps1', 'fp1', 
       'comp_2', 'ps2', 'fp2', 'comp_3', 'ps3', 'fp3', 'lag_price']
df = df[useful_col].copy(deep=True)
df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty,freight_price,unit_price,comp_1,ps1,fp1,comp_2,ps2,fp2,comp_3,ps3,fp3,lag_price
0,bed1,bed_bath_table,01-05-2017,1,15.1,45.95,89.9,3.9,15.011897,215.0,4.4,8.76,45.95,4.0,15.1,45.9
1,bed1,bed_bath_table,01-06-2017,3,12.933333,45.95,89.9,3.9,14.769216,209.0,4.4,21.322,45.95,4.0,12.933333,45.95
2,bed1,bed_bath_table,01-07-2017,6,14.84,45.95,89.9,3.9,13.993833,205.0,4.4,22.195932,45.95,4.0,14.84,45.95
3,bed1,bed_bath_table,01-08-2017,4,14.2875,45.95,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.2875,45.95
4,bed1,bed_bath_table,01-09-2017,2,15.1,45.95,89.9,3.9,18.776522,163.39871,4.4,24.324687,45.95,4.0,15.1,45.95


**Label Encoding for categorical variables**

In [4]:
df.dtypes

product_id                object
product_category_name     object
month_year                object
qty                        int64
freight_price            float64
unit_price               float64
comp_1                   float64
ps1                      float64
fp1                      float64
comp_2                   float64
ps2                      float64
fp2                      float64
comp_3                   float64
ps3                      float64
fp3                      float64
lag_price                float64
dtype: object

In [5]:
# Convert to datetime
df['month_year'] = pd.to_datetime(df['month_year'], format='%d-%m-%Y')

In [6]:
df.dtypes

product_id                       object
product_category_name            object
month_year               datetime64[ns]
qty                               int64
freight_price                   float64
unit_price                      float64
comp_1                          float64
ps1                             float64
fp1                             float64
comp_2                          float64
ps2                             float64
fp2                             float64
comp_3                          float64
ps3                             float64
fp3                             float64
lag_price                       float64
dtype: object

In [7]:
# Extract numeric features from datetime
df['year'] = df['month_year'].dt.year
df['month'] = df['month_year'].dt.month

In [8]:
df.drop(['month_year'], axis=1, inplace=True)

In [9]:
df.dtypes

product_id                object
product_category_name     object
qty                        int64
freight_price            float64
unit_price               float64
comp_1                   float64
ps1                      float64
fp1                      float64
comp_2                   float64
ps2                      float64
fp2                      float64
comp_3                   float64
ps3                      float64
fp3                      float64
lag_price                float64
year                       int32
month                      int32
dtype: object

In [10]:
le = LabelEncoder()

df['product_id'] = le.fit_transform(df['product_id'])
df['product_category_name'] = le.fit_transform(df['product_category_name'])

In [11]:
df.dtypes

product_id                 int32
product_category_name      int32
qty                        int64
freight_price            float64
unit_price               float64
comp_1                   float64
ps1                      float64
fp1                      float64
comp_2                   float64
ps2                      float64
fp2                      float64
comp_3                   float64
ps3                      float64
fp3                      float64
lag_price                float64
year                       int32
month                      int32
dtype: object

**Train-test split**

In [12]:
X = df.drop("qty", axis=1)
y = df.qty

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Print the shapes of xtrain,xtest,ytrain,ytest like shape of xtrain is 

print(f"xtrain shape: {xtrain.shape}")
print(f"xtest shape: {xtest.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"ytest shape: {ytest.shape}")

xtrain shape: (540, 16)
xtest shape: (136, 16)
ytrain shape: (540,)
ytest shape: (136,)


In [14]:
rf_regressor = RandomForestRegressor(random_state=42)

In [15]:
xtrain_kf = pd.DataFrame(xtrain)
ytrain_kf = pd.DataFrame(ytrain)

In [16]:
# Run k-fold cross vaildation on xtrain, ytrain
# Define the number of folds
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Perform K-Fold Cross-Validation
mse_scores = []

for train_index, test_index in kf.split(xtrain_kf):
    # Split the data into training and testing sets
    X_train, X_test = xtrain_kf.iloc[train_index], xtrain_kf.iloc[test_index]
    y_train, y_test = ytrain_kf.iloc[train_index], ytrain_kf.iloc[test_index]
    
    # Ensure y is 1D
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

    # Train the Random Forest Regressor
    rf_regressor.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = rf_regressor.predict(X_test)
    
    # Calculate the Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)
    rmse_scores = np.sqrt(mse_scores)

# Calculate and print the average MSE across all folds
average_rmse = np.mean(rmse_scores)
print(f"Root Mean Squared Errors for each fold: {rmse_scores}")
print(f"Average Root Mean Squared Error: {average_rmse}")

Root Mean Squared Errors for each fold: [ 7.46884864 14.61432624 13.02319889 17.23885002 14.79745373 13.58225592
 16.03728006 12.77293275 12.13670306 11.20629296]
Average Root Mean Squared Error: 13.287814227664347


**Training on whole train data**

In [17]:
rf_regressor.fit(xtrain,ytrain)

In [18]:
y_pred_train = rf_regressor.predict(xtrain)
mse_train = mean_squared_error(ytrain, y_pred_train)
rmse_train = np.sqrt(mse_train)
print(f"Root Mean Squared Error for train data: {rmse_train}")

y_pred_test = rf_regressor.predict(xtest)
mse_test = mean_squared_error(ytest, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f"Root Mean Squared Error for test data: {rmse_test}")

Root Mean Squared Error for train data: 5.074770091490006
Root Mean Squared Error for test data: 12.73600682525497


## Bayesian Optimization

In [19]:
xtrain.head()

Unnamed: 0,product_id,product_category_name,freight_price,unit_price,comp_1,ps1,fp1,comp_2,ps2,fp2,comp_3,ps3,fp3,lag_price,year,month
218,45,8,17.003333,149.0,149.0,4.2,19.1,108.0,4.2,16.653846,185.96,3.8,18.686,140.56,2017,12
18,27,5,39.8975,97.588235,59.9,4.1,19.7325,59.9,4.2,26.246667,97.588235,4.1,39.8975,82.633333,2017,5
567,22,5,34.083333,98.323333,49.91,4.1,20.328333,49.9,4.2,36.442,49.9,4.1,32.32,99.99,2018,8
408,39,6,12.083333,58.99,23.99,4.3,16.77,82.821429,4.1,12.418571,58.99,3.9,12.083333,58.99,2018,8
657,6,1,14.414545,79.8,119.0,4.2,39.2172,149.9,4.3,18.943077,79.8,3.5,14.414545,80.666667,2018,5


In [20]:
# min and max values of df['unit_price]
min_price = df['unit_price'].min()
max_price = df['unit_price'].max()

print(f"Minimum unit_price: {min_price}")
print(f"Maximum unit_price: {max_price}")


Minimum unit_price: 19.9
Maximum unit_price: 364.0


In [21]:
predicted_qty = rf_regressor.predict(np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]).reshape(1,-1))
predicted_qty[0]



57.03

In [26]:
from skopt import gp_minimize
from skopt.space import Real

def optimize_price(*features):

    
    features = np.array([val for val in features]).reshape(1,-1)
    predicted_qty = rf_regressor.predict(features)
    print(f"predicted quantity is {predicted_qty[0]}")
    
    def objective_function(price):
        print(f"price is {price[0]}")
        # Calculate profit: (Price - Cost) * Predicted Quantity
        avg_freight_price= np.mean(xtest['freight_price'])
        cost = avg_freight_price # Replace with your actual average cost
        profit = (price - cost) * predicted_qty[0]
        print(f"profit is {profit}")
        profit = -(profit[0])
        
        return profit

    # Define search space for unit_price
    search_space = [Real(10, 400, name="unit_price")]

    # Run Bayesian Optimization
    result = gp_minimize(
        func=objective_function,            # Profit function
        dimensions=search_space,
        acq_func="EI",          # Expected Improvement
        n_calls=10,             # Number of iterations
        random_state=42
    )
    
    optimal_price = result.x[0]
    return optimal_price
# sample values for features 'product_id', 'product_category_name', 'freight_price',
 #      'unit_price', 'comp_1', 'ps1', 'fp1', 'comp_2', 'ps2', 'fp2',
  #     'comp_3', 'ps3', 'fp3', 'lag_price', 'year', 'month'


optimal_price = optimize_price(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
print(optimal_price)

predicted quantity is 55.34
price is 320.65176487549087
profit is [16669.25289723]
price is 81.5395680478039
profit is [3436.78392479]
price is 314.0794901063801
profit is [16305.54321151]
price is 242.77156159912997
profit is [12359.36244792]
price is 183.87477361290058
profit is [9100.01420076]
price is 48.990217169021136
profit is [1635.50284715]
price is 189.10706786668823
profit is [9389.56936476]
price is 140.14635834421853
profit is [6680.08369979]
price is 65.71805898955691
profit is [2561.2216135]
price is 263.84650445005263
profit is [13525.64978529]
320.65176487549087




In [24]:
rf_regressor.feature_names_in_

array(['product_id', 'product_category_name', 'freight_price',
       'unit_price', 'comp_1', 'ps1', 'fp1', 'comp_2', 'ps2', 'fp2',
       'comp_3', 'ps3', 'fp3', 'lag_price', 'year', 'month'], dtype=object)

In [51]:
optimize_price(45,	8,	4,	149.000000,	149.00,	4.2,	19.100000,	108.000000,	4.2,	16.653846,	185.960000,	3.8,	18.686000,	140.560000,	2017,	1)



320.65176487549087

In [67]:
from skopt import gp_minimize
from skopt.space import Real
import numpy as np

def optimize_price(*features):
    # Reshape features for model prediction
    features = np.array([val for val in features]).reshape(1, -1)
    predicted_qty = rf_regressor.predict(features)[0]  # Extract scalar quantity

    # Calculate average freight price (ensure xtest is defined globally or replace it)
    avg_freight_price = np.mean(xtest['freight_price'])  # Replace with appropriate logic
    cost = avg_freight_price

    # Define the objective function
    def objective_function(price):
        # Price is passed as a list/array; extract price[0]
        profit = (price - cost) * predicted_qty
        return -profit  # Negative because gp_minimize minimizes

    # Define the search space for unit_price
    search_space = [Real(10, 400, name="unit_price")]

    # Run Bayesian Optimization
    result = gp_minimize(
        func=objective_function,   # Profit function
        dimensions=search_space,
        acq_func="EI",             # Expected Improvement
        n_calls=10,                # Number of iterations
        random_state=42
    )
    
    optimal_price = result.x[0]
    return optimal_price

# Sample values for features
optimal_price = optimize_price(1, 2, 30, 4, 5, 10, 7, 20, 9, 10, 11, 12, 13, 14, 15, 16)
print("Optimal Price:", optimal_price)




ValueError: `func` should return a scalar

In [60]:
rf_regressor.predict([[1, 2, 30, 4, 40, 1000, 7, 20, 9, 10, 11, 12, 13, 14, 15, 16]])



array([24.75])

In [None]:
rf_regressor.predict()