In [26]:
import pandas as pd
import numpy as np
from collections import defaultdict
from gym import Env, spaces
import random
import itertools
import pickle

In [27]:
# Demand realizations and candidate stock levels both span the integers 50..150.
potential_demand = np.arange(50, 151)
stock_list = np.arange(50, 151)

# expected_sales[stock] is the mean of min(demand, stock) taken over every
# demand realization, each realization weighted equally.
expected_sales = {
    stock: np.minimum(potential_demand, stock).sum() / len(potential_demand)
    for stock in stock_list
}

In [28]:
class SupplyChainEnv(Env):
    """Two-player manufacturer/retailer contract game scored on expected sales.

    A round consists of two half-steps:

    1. ``manufacturer_step`` turns the manufacturer's action into the contract
       terms ``[w, b, r]`` (wholesale price, buyback price, revenue share) and
       stores them in ``self.state``.
    2. ``retailer_step`` turns the retailer's action into an order quantity
       (a multiple of the reference stock from ``get_optimal_stock``), looks up
       the expected sales for that quantity and returns both players' profits.

    Args:
        contract_type: ``"wholesale"``, ``"buyback"`` or ``"revenue-sharing"``.
        expected_sales: Mapping from integer stock level to expected units
            sold. Required by ``retailer_step``.
        maxRounds: Number of ``retailer_step`` calls after which ``done`` is
            reported as ``True``.

    Raises:
        ValueError: If ``contract_type`` is not one of the supported values.
    """

    # Order quantity as a multiple of the reference stock, keyed by retailer
    # action. Any other action value falls back to a multiplier of 1.0.
    ORDER_MULTIPLIERS = {0: 0.8, 1: 0.9, 2: 1.0, 3: 1.1, 4: 1.2}

    def __init__(self, contract_type="wholesale", expected_sales=None, maxRounds=1000):
        super().__init__()
        self.contract_type = contract_type
        self.max_stock = 150
        self.min_w = 3
        self.max_price = 12
        self.max_rounds = maxRounds
        self.current_round = 0
        self.demand = 0
        self.expected_sales = expected_sales

        # Wholesale prices range over min_w..max_price inclusive.
        n_wholesale_prices = self.max_price - self.min_w + 1
        if self.contract_type == "wholesale":
            self.manufacturer_action_space = spaces.Discrete(n_wholesale_prices)
        elif self.contract_type in ("buyback", "revenue-sharing"):
            # Second component (buyback price or revenue share) ranges over 0..max_price.
            self.manufacturer_action_space = spaces.MultiDiscrete(
                [n_wholesale_prices, self.max_price + 1]
            )
        else:
            raise ValueError("Invalid contract type.")
        self.retailer_action_space = spaces.Discrete(len(self.ORDER_MULTIPLIERS))

        self.observation_space = spaces.Box(
            low=0, high=150, shape=(3,), dtype=np.float32
        )
        self.reset()

    def reset(self):
        """Zero the contract terms and the round counter; return the state array."""
        self.state = np.array([0, 0, 0])
        self.current_round = 0
        self.demand = 0
        return self.state

    def manufacturer_step(self, action):
        """Decode the manufacturer's action into ``[w, b, r]`` and store it as the state.

        Args:
            action: A scalar index for ``"wholesale"`` contracts; an indexable
                pair ``(wholesale_index, second_term)`` for the other contracts.

        Returns:
            The new state array ``[w, b, r]``.
        """
        if self.contract_type == "wholesale":
            w = action + self.min_w
            b = 0
            r = 0
        elif self.contract_type == "buyback":
            w = action[0] + self.min_w
            b = action[1]
            r = 0
        elif self.contract_type == "revenue-sharing":
            w = action[0] + self.min_w
            r = action[1]
            b = 0

        self.state = np.array([w, b, r])
        return self.state

    def get_optimal_stock(self):
        """Return the reference stock level implied by the current contract terms.

        The result is not clamped or rounded here; ``retailer_step`` does that
        after applying the retailer's multiplier.
        """
        w, b, r = self.state
        if self.contract_type == "wholesale":
            optimal_stock = 100 * ((12 - w) / 12) + 50
        elif self.contract_type == "buyback":
            if b == 12:
                # Avoids dividing by zero in the general formula.
                optimal_stock = 150
            else:
                optimal_stock = 100 * ((12 - w) / (12 - b)) + 50
        elif self.contract_type == "revenue-sharing":
            if r == 12:
                # Avoids dividing by zero in the general formula.
                optimal_stock = 0
            else:
                optimal_stock = 100 * ((12 - w - r) / (12 - r)) + 50
        else:
            optimal_stock = 100
        return optimal_stock

    def _lookup_expected_sales(self, quantity):
        """Return the expected units sold for an order of ``quantity`` units.

        Quantities present in ``expected_sales`` are looked up directly. A
        quantity below the smallest tabulated stock level sells out completely
        whenever that smallest level itself sells out completely (its expected
        sales equal its stock, so demand never falls below it). Any other
        missing quantity yields 0.

        Raises:
            ValueError: If no ``expected_sales`` table was supplied.
        """
        table = self.expected_sales
        if table is None:
            raise ValueError("expected_sales must be provided to compute profits.")
        if quantity in table:
            return table[quantity]
        if table:
            lowest_stock = min(table)
            if quantity < lowest_stock and table[lowest_stock] == lowest_stock:
                return quantity
        return 0

    def retailer_step(self, action):
        """Place the retailer's order and score the round for both players.

        Args:
            action: Retailer action; see ``ORDER_MULTIPLIERS`` for the mapping
                to a multiple of the reference stock.

        Returns:
            ``(state, (manufacturer_profit, retailer_profit), done, info)``
            where ``info`` is an empty dict.
        """
        w, b, r = self.state
        optimal_stock = self.get_optimal_stock()

        multiplier = self.ORDER_MULTIPLIERS.get(action, 1.0)
        Q = int(round(optimal_stock * multiplier))
        Q = max(0, min(Q, self.max_stock))

        sales = self._lookup_expected_sales(Q)
        leftovers = Q - sales
        c = 3   # manufacturer's unit cost
        p = 12  # retail price per unit sold

        if self.contract_type == "wholesale":
            retailer_profit = p * sales - w * Q
            manufacturer_profit = (w - c) * Q
        elif self.contract_type == "buyback":
            # The buyback price is capped at the wholesale price.
            # NOTE(review): the reference stock above was computed from the
            # uncapped b - confirm that ordering is intended.
            if b > w:
                b = w
            retailer_profit = p * sales - w * Q + b * leftovers
            manufacturer_profit = (w - c) * Q - b * leftovers
        elif self.contract_type == "revenue-sharing":
            # The revenue share is capped at the retailer's per-unit margin.
            max_revenue_share = p - w
            if r > max_revenue_share:
                r = max_revenue_share
            retailer_profit = (p - r) * sales - w * Q
            manufacturer_profit = (w - c) * Q + r * sales

        self.current_round += 1
        done = self.current_round >= self.max_rounds
        return self.state, (manufacturer_profit, retailer_profit), done, {}

In [29]:
# Q-Learning Agent
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent for a Discrete or MultiDiscrete action space.

    Q-values are stored in ``q_table``, a ``defaultdict(float)`` keyed by
    ``(state_key, action)``. ``_state_to_key`` maps every state to the same
    key, so Q-values are effectively indexed by action only.

    Args:
        action_space: Space exposing ``sample()`` and either ``n`` (Discrete)
            or ``nvec`` (MultiDiscrete).
        learning_rate: Step size of the Q-update.
        discount_factor: Weight of the best next-state Q-value in the update.
        epsilon: Initial exploration probability.
        epsilon_decay: Amount subtracted from ``epsilon`` on every decay.
        min_epsilon: Lower bound for ``epsilon``.
    """

    def __init__(
        self,
        action_space,
        learning_rate=0.05,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_decay=0.002,
        min_epsilon=0.00,
    ):
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.initial_epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.q_table = defaultdict(float)

        # Not read anywhere in this class; kept for external users of the agent.
        self.human_actions = []
        self.opponent_action_history = []
        self.behavior_change_threshold = 2

    def _state_to_key(self, state):
        """Collapse every state onto the single key ``0``."""
        return 0

    @staticmethod
    def _hashable_action(action):
        """Return ``action`` in a form that can be part of a ``q_table`` key.

        MultiDiscrete samples are ndarrays, which cannot be hashed; they are
        converted to tuples. Scalars and tuples pass through unchanged.
        """
        if isinstance(action, np.ndarray):
            return tuple(action.tolist()) if action.ndim else action.item()
        if isinstance(action, list):
            return tuple(action)
        return action

    def get_possible_actions(self):
        """Enumerate every action of the action space.

        Returns:
            A ``range`` for Discrete spaces, or a list of tuples (the cartesian
            product of the per-dimension ranges) for MultiDiscrete spaces.

        Raises:
            TypeError: If the action space is neither Discrete nor MultiDiscrete.
        """
        if isinstance(self.action_space, spaces.Discrete):
            return range(self.action_space.n)
        if isinstance(self.action_space, spaces.MultiDiscrete):
            ranges = [range(n) for n in self.action_space.nvec]
            return list(itertools.product(*ranges))
        raise TypeError(
            f"Unsupported action space type: {type(self.action_space).__name__}"
        )

    def get_frozen_action_manufacturer(self, state, opponent_traits=None):
        """
        Return one fixed manufacturer action on every call.

        The action is sampled from the action space on the first call and
        cached on the instance; later calls return the cached value.

        Args:
            state: The current state of the environment (unused in this function).
            opponent_traits: Traits of the opponent (unused in this function).

        Returns:
            The cached action, in hashable form (a tuple for MultiDiscrete spaces).
        """
        if not hasattr(self, 'frozen_action1'):
            self.frozen_action1 = self._hashable_action(self.action_space.sample())

        return self.frozen_action1

    def get_frozen_action_retailer(self, state, opponent_traits=None):
        """
        Return one fixed retailer action on every call.

        The action is sampled from the action space on the first call and
        cached on the instance; later calls return the cached value.

        Args:
            state: The current state of the environment (unused in this function).
            opponent_traits: Traits of the opponent (unused in this function).

        Returns:
            The cached action, in hashable form (a tuple for MultiDiscrete spaces).
        """
        if not hasattr(self, 'frozen_action2'):
            self.frozen_action2 = self._hashable_action(self.action_space.sample())

        return self.frozen_action2

    def get_action(self, state):
        """Decay epsilon once, then pick an action epsilon-greedily.

        With probability ``epsilon`` a random action is sampled; otherwise one
        of the actions with the highest Q-value is chosen, ties broken at random.
        """
        self.decay_epsilon()

        if np.random.rand() < self.epsilon:
            return self._hashable_action(self.action_space.sample())

        state_key = self._state_to_key(state)
        possible_actions = self.get_possible_actions()
        q_values = [self.q_table[(state_key, a)] for a in possible_actions]
        max_q = max(q_values)
        max_actions = [a for a, q in zip(possible_actions, q_values) if q == max_q]
        return random.choice(max_actions)

    def update_q_table(self, state, action, reward, next_state):
        """Alias of ``update_q_value``, kept for backward compatibility."""
        self.update_q_value(state, action, reward, next_state)

    def save_agent(self, filename):
        """Pickle the Q-table and hyperparameters to ``filename``."""
        agent_data = {
            "q_table": self.q_table,
            "learning_rate": self.learning_rate,
            "discount_factor": self.discount_factor,
            "epsilon": self.epsilon,
            "initial_epsilon": self.initial_epsilon,
            "epsilon_decay": self.epsilon_decay,
            "min_epsilon": self.min_epsilon,
        }
        with open(filename, "wb") as f:
            pickle.dump(agent_data, f)

    def load_agent(self, filename):
        """Restore the Q-table and hyperparameters written by ``save_agent``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load files
        from a trusted source.
        """
        with open(filename, "rb") as f:
            agent_data = pickle.load(f)
        self.q_table = agent_data["q_table"]
        self.learning_rate = agent_data["learning_rate"]
        self.discount_factor = agent_data["discount_factor"]
        self.epsilon = agent_data["epsilon"]
        self.initial_epsilon = agent_data["initial_epsilon"]
        self.epsilon_decay = agent_data["epsilon_decay"]
        self.min_epsilon = agent_data["min_epsilon"]

    def update_q_value(self, state, action, reward, next_state):
        """
        Standard Q-Learning update:
        ``Q(s, a) += lr * (reward + gamma * max_a' Q(s', a') - Q(s, a)``.

        The entry updated is the one for ``state`` (the state the action was
        taken in); ``next_state`` is only used for the bootstrap term.
        """
        state_key = self._state_to_key(state)
        next_state_key = self._state_to_key(next_state)
        action = self._hashable_action(action)

        max_next_q = max(
            (self.q_table[(next_state_key, a)] for a in self.get_possible_actions()),
            default=0.0,
        )

        old_q = self.q_table[(state_key, action)]
        self.q_table[(state_key, action)] = old_q + self.learning_rate * (
            reward + self.discount_factor * max_next_q - old_q
        )

    def decay_epsilon(self):
        """
        Subtract ``epsilon_decay`` from epsilon, never going below ``min_epsilon``.
        """
        self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_decay)


In [30]:
def train_from_scratch_each_episode(
    num_episodes=100,
    rounds_per_episode=1000,
    contract_type="wholesale",
    expected_sales=expected_sales,
    learning_rate=0.05,
    discount_factor=0.95,
    epsilon=1.0,
    epsilon_decay=0.002,
    min_epsilon=0.00,
    log_path="actions_log.csv",
):
    """
    Train a retailer against a frozen manufacturer, restarting from scratch each episode.

    For each episode:
      - Create a new environment
      - Initialize new manufacturer & retailer agents with fresh Q-tables
      - Run up to ``rounds_per_episode`` rounds; the manufacturer replays one
        frozen action while the retailer acts epsilon-greedily
      - Snapshot both Q-tables after every round
      - Log states, actions, rewards and epsilons of every round

    Args:
        num_episodes: Number of independent episodes.
        rounds_per_episode: Maximum rounds per episode (also the env's round limit).
        contract_type: Contract type passed to ``SupplyChainEnv``.
        expected_sales: Stock -> expected sales table passed to ``SupplyChainEnv``.
        learning_rate, discount_factor, epsilon, epsilon_decay, min_epsilon:
            Hyperparameters passed to both ``QLearningAgent`` instances.
        log_path: CSV file the per-round log is written to; ``None`` skips writing.

    Returns:
        A list with one entry per episode; each entry is a list of per-round
        dicts with keys ``'round'``, ``'manufacturer_q_table'`` and
        ``'retailer_q_table'`` (plain-dict copies of the Q-tables).
    """
    def _csv_friendly(value):
        # ndarrays (states, MultiDiscrete actions) are stored as plain lists.
        return value.tolist() if isinstance(value, np.ndarray) else value

    q_value_history = []
    action_log = []

    for ep in range(num_episodes):
        # Fresh environment
        env = SupplyChainEnv(
            contract_type=contract_type,
            expected_sales=expected_sales,
            maxRounds=rounds_per_episode,
        )
        state = env.reset()

        # Fresh agents (both manufacturer & retailer)
        manufacturer_agent = QLearningAgent(
            env.manufacturer_action_space,
            learning_rate, discount_factor, epsilon, epsilon_decay, min_epsilon,
        )
        retailer_agent = QLearningAgent(
            env.retailer_action_space,
            learning_rate, discount_factor, epsilon, epsilon_decay, min_epsilon,
        )

        # One Q-table snapshot per round; memory grows linearly with rounds.
        q_tables_per_round = []

        for round_i in range(rounds_per_episode):
            # 1) Manufacturer replays its frozen action (no exploration, so its
            #    epsilon never decays).
            mfg_action = manufacturer_agent.get_frozen_action_manufacturer(state)
            state_mfg = env.manufacturer_step(mfg_action)

            # 2) Retailer picks an action epsilon-greedily
            ret_action = retailer_agent.get_action(state_mfg)
            next_state, (mfg_reward, ret_reward), done, _ = env.retailer_step(ret_action)

            # 3) Update Q-values for both agents
            manufacturer_agent.update_q_value(state, mfg_action, mfg_reward, next_state)
            retailer_agent.update_q_value(state_mfg, ret_action, ret_reward, next_state)

            # 4) Log the round
            action_log.append({
                "Episode": ep,
                "Round": round_i,
                "CurrentState": _csv_friendly(state),
                "ManufacturerAction": _csv_friendly(mfg_action),
                "RetailerAction": _csv_friendly(ret_action),
                "ManufacturerReward": mfg_reward,
                "RetailerReward": ret_reward,
                "NextState": _csv_friendly(next_state),
                "ManufacturerEpsilon": manufacturer_agent.epsilon,
                "RetailerEpsilon": retailer_agent.epsilon
            })

            # 5) Move to next state
            state = next_state

            # 6) Snapshot the Q-tables for later analysis
            q_tables_per_round.append({
                'round': round_i,
                'manufacturer_q_table': dict(manufacturer_agent.q_table),
                'retailer_q_table': dict(retailer_agent.q_table),
            })

            if done:
                break

        q_value_history.append(q_tables_per_round)
        print(f"Episode {ep+1}/{num_episodes} completed. "
              f"Final manufacturer epsilon={manufacturer_agent.epsilon:.4f}, "
              f"retailer epsilon={retailer_agent.epsilon:.4f}")

    # After all episodes, write the log to a CSV (unless disabled).
    if log_path is not None:
        df_log = pd.DataFrame(action_log)
        df_log.to_csv(log_path, index=False)
        print(f"All actions logged to {log_path}")

    return q_value_history

# One long wholesale-contract episode; the per-round Q-table snapshots are kept
# in q_value_evolution for plotting further down.
q_value_evolution = train_from_scratch_each_episode(
    contract_type="wholesale",
    expected_sales=expected_sales,
    num_episodes=1,
    rounds_per_episode=10_000,
    learning_rate=0.05,
    discount_factor=0.9,
    epsilon=1.0,
    epsilon_decay=0.0002,
    min_epsilon=0.0,
)

Episode 1/1 completed. Final manufacturer epsilon=1.0000, retailer epsilon=0.0000
All actions logged to actions_log.csv


In [31]:
from collections import Counter

def train_and_track_results(
    num_runs=1,  # Number of times to run the training
    num_episodes=100,
    rounds_per_episode=1000,
    contract_type="wholesale",
    expected_sales=expected_sales,
    learning_rate=0.05,
    discount_factor=0.90,
    epsilon=1.0,
    epsilon_decay=0.001,
    min_epsilon=0.00,
    output_path="summary_results.csv",
):
    """
    Run the training repeatedly and summarize the last action of every episode.

    Each episode uses a fresh environment and fresh agents built with the given
    hyperparameters. The manufacturer replays one frozen action; the retailer
    acts epsilon-greedily and learns from the profits returned by the
    environment.

    Args:
        num_runs: Number of times the whole set of episodes is repeated.
        num_episodes: Episodes per run.
        rounds_per_episode: Maximum rounds per episode (also the env's round limit).
        contract_type: Contract type passed to ``SupplyChainEnv``.
        expected_sales: Stock -> expected sales table passed to ``SupplyChainEnv``.
        learning_rate, discount_factor, epsilon, epsilon_decay, min_epsilon:
            Hyperparameters passed to both ``QLearningAgent`` instances.
        output_path: CSV file the summary is written to; ``None`` skips writing.

    Returns:
        A DataFrame with one row per (run, episode) holding the last
        manufacturer and retailer actions, followed by one extra row labelled
        ``"Frequency"`` whose action columns hold ``{action: count}`` dicts.
    """
    def _as_hashable(action):
        # MultiDiscrete samples are ndarrays, which Counter cannot hash.
        return tuple(action.tolist()) if isinstance(action, np.ndarray) else action

    result_columns = ["Run", "Episode", "LastMfgAction", "LastRetAction"]
    agent_kwargs = dict(
        learning_rate=learning_rate,
        discount_factor=discount_factor,
        epsilon=epsilon,
        epsilon_decay=epsilon_decay,
        min_epsilon=min_epsilon,
    )

    all_runs_results = []

    for run in range(num_runs):
        print(f"Starting Run {run + 1}/{num_runs}")

        for ep in range(num_episodes):
            env = SupplyChainEnv(
                contract_type=contract_type,
                expected_sales=expected_sales,
                maxRounds=rounds_per_episode,
            )
            state = env.reset()
            manufacturer_agent = QLearningAgent(env.manufacturer_action_space, **agent_kwargs)
            retailer_agent = QLearningAgent(env.retailer_action_space, **agent_kwargs)

            last_mfg_action = None
            last_ret_action = None

            for _round in range(rounds_per_episode):
                mfg_action = manufacturer_agent.get_frozen_action_manufacturer(state)
                state_mfg = env.manufacturer_step(mfg_action)
                ret_action = retailer_agent.get_action(state_mfg)
                next_state, (mfg_reward, ret_reward), done, _ = env.retailer_step(ret_action)

                # Learn from the actual profits; with a constant reward every
                # Q-value would stay at zero and the greedy action would be a
                # random tie-break.
                manufacturer_agent.update_q_value(state, mfg_action, mfg_reward, next_state)
                retailer_agent.update_q_value(state_mfg, ret_action, ret_reward, next_state)

                last_mfg_action = mfg_action
                last_ret_action = ret_action
                state = next_state

                if done:
                    break

            all_runs_results.append({
                "Run": run + 1,
                "Episode": ep + 1,
                "LastMfgAction": _as_hashable(last_mfg_action),
                "LastRetAction": _as_hashable(last_ret_action),
            })

    # Explicit columns keep the frame well-formed even when no episode ran.
    df_results = pd.DataFrame(all_runs_results, columns=result_columns)

    # Calculate frequencies of last actions across all episodes and runs.
    mfg_action_counts = Counter(df_results["LastMfgAction"])
    ret_action_counts = Counter(df_results["LastRetAction"])

    frequency_data = {
        "Run": "Frequency",
        "Episode": "Frequency",
        "LastMfgAction": dict(mfg_action_counts),
        "LastRetAction": dict(ret_action_counts),
    }

    df_results = pd.concat([df_results, pd.DataFrame([frequency_data])], ignore_index=True)
    if output_path is not None:
        df_results.to_csv(output_path, index=False)
        print(f"Summary results saved to {output_path}")

    return df_results


# Example usage:
summary_df = train_and_track_results(
    num_runs=1, # one training run
    num_episodes=50, # 50 episodes in that run
    rounds_per_episode=1010, # 1010 rounds in each episode
    expected_sales=expected_sales
)

# Show the per-episode rows plus the trailing "Frequency" summary row.
print(summary_df)

Starting Run 1/1
Summary results saved to summary_results.csv
          Run    Episode                                      LastMfgAction  \
0           1          1                                                  6   
1           1          2                                                  7   
2           1          3                                                  1   
3           1          4                                                  5   
4           1          5                                                  0   
5           1          6                                                  5   
6           1          7                                                  4   
7           1          8                                                  8   
8           1          9                                                  1   
9           1         10                                                  8   
10          1         11                                             

In [32]:
import matplotlib.pyplot as plt

def plot_average_q_values(q_value_evolution, episode_index=0, agent="manufacturer"):
    """
    Draw the mean Q-value after each round of one episode for one agent.

    Args:
        q_value_evolution: Per-episode lists of per-round snapshots, each a
            dict holding 'manufacturer_q_table' and 'retailer_q_table'.
        episode_index: Index of the episode to plot.
        agent: 'manufacturer' selects the manufacturer's table; any other
            value selects the retailer's.
    """
    table_key = "manufacturer_q_table" if agent == "manufacturer" else "retailer_q_table"
    snapshots = (snapshot[table_key] for snapshot in q_value_evolution[episode_index])

    # An empty Q-table contributes an average of zero for that round.
    avg_q_values = [
        sum(table.values()) / len(table) if len(table) != 0 else 0
        for table in snapshots
    ]

    agent_label = agent.capitalize()
    rounds = range(len(avg_q_values))

    plt.figure(figsize=(8, 5))
    plt.plot(rounds, avg_q_values, label=f'{agent_label} Avg Q')
    plt.xlabel("Round")
    plt.ylabel("Average Q-value")
    plt.title(f"Episode {episode_index} - {agent_label} Average Q-value Over Rounds")
    plt.legend()
    plt.show()

# Example usage: 
#   Plot average manufacturer Q for Episode 0.
#   Then do the same for retailer or another episode if you like.



#plot_average_q_values(q_value_evolution, episode_index=0, agent="manufacturer")
#plot_average_q_values(q_value_evolution, episode_index=0, agent="retailer")