set path because otherwise it does not find price_simulator packages

In [2]:
import os
import sys
filepath_pc = r"C:\Users\Thomas Gausmann\sciebo - Gausmann, Thomas (t_gaus04@uni-muenster.de)@uni-muenster.sciebo.de\Masterarbeit\price_simulator"
filepath_laptop = r"C:\Users\gausm\sciebo - Gausmann, Thomas (t_gaus04@uni-muenster.de)@uni-muenster.sciebo.de\Masterarbeit\price_simulator"
os.chdir(filepath_laptop)
sys.path.append(filepath_laptop)
import numpy as np
import attr


packages packages (homemade) packages

Note: do not run tf and pyTorch together, calamity ensues

In [5]:
import random
from typing import List, Tuple

import attr

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from price_simulator.src.algorithm.agents.simple import AgentStrategy
from price_simulator.src.algorithm.policies import EpsilonGreedy, ExplorationStrategy


The LSTM model setup

In [6]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc1(out[:, -1, :])
        out = self.fc2(out)
        return out


create the LSTM agent and its procedures

Kept close to DQNAgent in approximate.py, less target network and replay buffer

In [None]:
@attr.s
class SimpleLSTMAgent(AgentStrategy):
    """Simplified LSTM Agent using sequences of past states"""

    # LSTM Network
    lstm: LSTMModel = attr.ib(default=None)
    hidden_nodes: int = attr.ib(default=32)
    sequence_length: int = attr.ib(default=5)  # Number of past states to use

    # General
    decision: ExplorationStrategy = attr.ib(factory=EpsilonGreedy)
    discount: float = attr.ib(default=0.95)
    learning_rate: float = attr.ib(default=0.1)

    @discount.validator
    def check_discount(self, attribute, value):
        if not 0 <= value <= 1:
            raise ValueError("Discount factor must lie in [0,1]")

    @learning_rate.validator
    def check_learning_rate(self, attribute, value):
        if not 0 <= value < 1:
            raise ValueError("Learning rate must lie in [0,1)")

    def who_am_i(self) -> str:
        return type(self).__name__ + " (gamma: {}, alpha: {}, policy: {}, quality: {}, mc: {})".format(
            self.discount, self.learning_rate, self.decision.who_am_i(), self.quality, self.marginal_cost
        )

    def play_price(self, states: List[Tuple[float]], action_space: List[float], n_period: int, t: int) -> float:
        """Returns an action by either following greedy policy or experimentation."""

        # init LSTM network if necessary
        if not self.lstm:
            self.lstm = self.initialize_network(len(states[0]), len(action_space))

        # play action
        if self.decision.explore(n_period, t):
            # if exploration:
            # select random action from the action space
            return random.choice(action_space) 
        else:
            # otherwise exploit
            # scale input sequence between [0,1]. cast to floating point tnesor, add new dimension at position 0, indicating bacth size
            states_input = torch.tensor(self.scale_sequence(states, action_space)).float().unsqueeze(0)
            # predict action values (Q-Values): detach to ensure no gradients are tracked (i.e. computations below do not affect original tensor),
            # returns numpy array
            # action values of form [1, num_actions]
            action_values = self.lstm(states_input).detach().numpy()
            # check for ties
            if sum(np.isclose(action_values[0], action_values[0].max())) > 1:
                #in case of ties, select randomly
                optimal_action_index = np.random.choice(
                    np.flatnonzero(np.isclose(action_values[0], action_values[0].max()))
                )
            else:
                # otherwise select maximum action value straight up
                optimal_action_index = np.argmax(action_values[0])
            return action_space[optimal_action_index]

    def learn(
        self,
        previous_rewards: List[float],
        rewards: List[float],
        previous_actions: List[float],
        actions: List[float],
        action_space: List,
        previous_states: List[Tuple],
        states: List[Tuple],
        next_states: List[Tuple],
    ):
        # Prepare the current state and next state sequences
        # scale them [0,1], make them a float and add batch dimension
        states_input = torch.tensor(self.scale_sequence(states, action_space)).float().unsqueeze(0)
        next_states_input = torch.tensor(self.scale_sequence(next_states, action_space)).float().unsqueeze(0)

        # Get max predicted Q values (for next state) from the local model
        # item() to select value from tensor
        next_optimal_q = self.lstm(next_states_input).max().item()

        # Compute Q targets for the current state (Bellman equation)
        targets = rewards[-1] + self.discount * next_optimal_q

        # Get current Q values from the local model ...
        local_estimates = self.lstm(states_input)
        # ... and update them with better estimates
        action_idx = np.atleast_1d(action_space == actions[-1]).nonzero()[0] # determine index of the action; instead of np.where to avoid deprecated warning
        local_estimates[0, action_idx] = targets

        # Perform gradient descent step on the local network
        optimizer = optim.Adam(self.lstm.parameters(), lr=self.learning_rate)
        # due to gradient accumulation, reset if not using batch updates
        optimizer.zero_grad()
        # compute lossfunction
        loss = nn.MSELoss()(local_estimates, local_estimates.clone().detach()) # instead of loss = nn.MSELoss()(local_estimates, torch.tensor(local_estimates).float()) to avoid warning
        # backward propagation
        loss.backward()
        optimizer.step()

    def initialize_network(self, n_agents: int, n_actions: int):
        """Create a neural network with one output node per possible action"""
        return LSTMModel(input_size=n_agents, hidden_size=self.hidden_nodes, output_size=n_actions)

    def scale_sequence(self, sequences: List[Tuple], action_space: List) -> np.array:
        """Scale float input sequences to range from 0 to 1."""
        max_action = max(action_space)
        min_action = min(action_space)
        return np.array([
            np.multiply(np.divide(np.array(seq) - min_action, max_action - min_action), 1) for seq in sequences
        ])


Lets test play price with example data

In [8]:
def test_play_price():
    # Define the possible action space
    possible_prices = [1.0, 2.0, 3.0]

    # Create an instance of SimpleLSTMAgent with EpsilonGreedy strategy
    agent = SimpleLSTMAgent(decision=EpsilonGreedy(eps=0.0))

    # Define a sample state sequence with sequence length of 5
    state_sequence = [
        (1.0, 2.0),
        (2.0, 2.0),
        (2.0, 2.0),
        (1.0, 2.0),
        (2.0, 2.0)
    ]

    # Call the play_price function to get an action
    action = agent.play_price(state_sequence, possible_prices, 0, 0)

    # Verify that the action is in the possible action space
    assert action in possible_prices, f"Action {action} is not in the possible action space {possible_prices}"

    print(f"play_price result: {action}")

# Run the test
test_play_price()


play_price result: 2.0


Lets test wether learning function runs

In [9]:
def test_learn():
    # Define the possible action space
    possible_prices = [1.0, 2.0, 3.0]

    # Create an instance of SimpleLSTMAgent with EpsilonGreedy strategy, no exploration
    agent = SimpleLSTMAgent(decision=EpsilonGreedy(eps=0.0))

    # Initialize the LSTM network
    agent.lstm = agent.initialize_network(2, len(possible_prices))

    # Define a sample state sequence
    state_sequence = [
        (1.0, 2.0),
        (2.0, 3.0),
        (3.0, 1.0),
        (2.0, 1.0),
        (1.0, 3.0)
    ]
    
    # Define the next state sequence
    next_state_sequence = [
        (2.0, 1.0),
        (3.0, 2.0),
        (1.0, 2.0),
        (3.0, 1.0),
        (2.0, 3.0)
    ]

    # Sample actions and rewards
    previous_rewards = [1.0]
    rewards = [10.0]
    previous_actions = [0.0]
    actions = [1.0]

    # Call the learn function to update Q-values
    agent.learn(
        previous_rewards=previous_rewards, # not used
        rewards=rewards,
        previous_actions=previous_actions, # not used
        actions=actions,
        action_space=possible_prices,
        previous_states=state_sequence, # not used
        states=state_sequence,
        next_states=next_state_sequence
    )
    
    # Check the Q-values
    state_input = torch.tensor(agent.scale_sequence(state_sequence, possible_prices)).float().unsqueeze(0)
    action_values = agent.lstm(state_input).detach().numpy()

    print(f"Q-values after learning: {action_values}")

# Run the test
test_learn()


Q-values after learning: [[-0.09762758 -0.12373158 -0.09351548]]


In [3]:

from price_simulator.src.algorithm import main
main.run()



Agent                                                                                            Average Price    Nash Price    Monopoly Price    Average Profit Gain    Nash Profit    Monopoly Profit
---------------------------------------------------------------------------------------------  ---------------  ------------  ----------------  ---------------------  -------------  -----------------
AlwaysDefectAgent                                                                                      1.42772       1.47293           1.92498               0.283858       0.222927            0.33749
Qlearning (gamma: 0.95, alpha: 0.125, policy: DecreasingEpsilonGreedy, quality: 2.0, mc: 1.0)          1.57339       1.47293           1.92498              -0.285828       0.222927            0.33749


In [4]:
import price_simulator.src.utils.analyzer as Analyzer
from price_simulator.src.algorithm.agents.approximate import DiffDQN
from price_simulator.src.algorithm.agents.simple import AlwaysDefectAgent
from price_simulator.src.algorithm.agents.tabular import Qlearning
from price_simulator.src.algorithm.demand import LogitDemand
from price_simulator.src.algorithm.environment import DiscreteSynchronEnvironment
from price_simulator.src.algorithm.policies import DecreasingEpsilonGreedy
from price_simulator.src.algorithm.agents.lstm_agent import LSTM_Agent  # Import the new LSTM agent

In [9]:
def run():
    dqn_env = DiscreteSynchronEnvironment(
        markup=0.1,
        n_periods=100000,
        possible_prices=[],
        n_prices=15,
        demand=LogitDemand(outside_quality=0.0, price_sensitivity=0.25),
        history_after=50,
        agents=[
            Qlearning(
                discount=0.95, learning_rate=0.125, decision=DecreasingEpsilonGreedy(), marginal_cost=1.0, quality=2.0,
            ),
            Qlearning(
                discount=0.95, learning_rate=0.125, decision=DecreasingEpsilonGreedy(), marginal_cost=1.0, quality=2.0,
            )
        ],
    )
    dqn_env.play_game()
    Analyzer.analyze(dqn_env)

run()

Agent                                                                                            Average Price    Nash Price    Monopoly Price    Average Profit Gain    Nash Profit    Monopoly Profit
---------------------------------------------------------------------------------------------  ---------------  ------------  ----------------  ---------------------  -------------  -----------------
Qlearning (gamma: 0.95, alpha: 0.125, policy: DecreasingEpsilonGreedy, quality: 2.0, mc: 1.0)          1.67401       1.47293           1.92498               0.41837        0.222927            0.33749
Qlearning (gamma: 0.95, alpha: 0.125, policy: DecreasingEpsilonGreedy, quality: 2.0, mc: 1.0)          1.65593       1.47293           1.92498               0.508553       0.222927            0.33749
