Dynamic Pricing with Deep Q-Network

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Load the dataset
orders = pd.read_csv('/content/olist_orders_dataset.csv')
order_items = pd.read_csv('/content/olist_order_items_dataset.csv')
products = pd.read_csv('/content/olist_products_dataset.csv')

# Merge the datasets to get a complete view
df = pd.merge(order_items, orders, on="order_id")
df = pd.merge(df, products, on="product_id")

# Convert order_purchase_timestamp to datetime
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

# Feature engineering: Extracting year, month, and day from order_purchase_timestamp
df['year'] = df['order_purchase_timestamp'].dt.year
df['month'] = df['order_purchase_timestamp'].dt.month
df['day'] = df['order_purchase_timestamp'].dt.day

# Convert product_id and order_item_id to numerical values
df['product_id'] = df['product_id'].astype('category').cat.codes
df['order_item_id'] = df['order_item_id'].astype('category').cat.codes

# Check for missing values
df = df.dropna()

# Select relevant features for the RL environment
df = df[['order_id', 'product_id', 'price', 'freight_value', 'order_item_id', 'year', 'month', 'day']]

# Normalize the price and freight_value columns
df['price'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())
df['freight_value'] = (df['freight_value'] - df['freight_value'].min()) / (df['freight_value'].max() - df['freight_value'].min())

class PriceOptimizerEnv(gym.Env):
    def __init__(self, df):
        super(PriceOptimizerEnv, self).__init__()
        self.df = df
        self.current_step = 0
        self.action_space = spaces.Discrete(10)  # 10 possible price levels
        self.observation_space = spaces.Box(low=0, high=1, shape=(9,), dtype=np.float32)  # year, month, day, product_id, order_item_id, price, freight_value, stock_quantity, demand

    def reset(self):
        self.current_step = 0
        self.current_data = self.df.sample()
        self.state = self._get_state()
        return self.state

    def _get_state(self):
        stock_quantity = np.random.rand()
        demand = np.random.rand()
        state = self.current_data[['year', 'month', 'day', 'product_id', 'order_item_id', 'price', 'freight_value']].values.flatten().astype(np.float32)
        state = np.append(state, [stock_quantity, demand])
        return state

    def step(self, action):
        reward = self._get_reward(action)
        self.current_step += 1
        done = self.current_step >= len(self.df)
        self.state = self._get_state()
        return self.state, reward, done, {}

    def _get_reward(self, action):
        reward = np.random.rand()  # Mock reward
        return reward

# Create the environment
env = PriceOptimizerEnv(df)

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = np.random.choice(len(self.memory), batch_size)
        for i in minibatch:
            state, action, reward, next_state, done = self.memory[i]
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Initialize the agent with the updated state size
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Train the agent
episodes = 100
batch_size = 32

for e in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    total_reward = 0

    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if done:
            print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2}")
            break

    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

# Save the model
model_filename = 'dqn_price_optimizer_model.h5'
agent.model.save(model_filename)
print(f"Model saved to {model_filename}")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Model saved to dqn_price_optimizer_model.h5


  saving_api.save_model(
