In [19]:
import pandas as pd
import numpy as np

In [20]:
historical_data_wholesale = pd.read_csv('../Notebooks/experiment/curr_data.csv')
historical_data_buyback = pd.read_csv('../Notebooks/experiment/curr_data_bb.csv')
historical_data_revenue_sharing = pd.read_csv('../Notebooks/experiment/curr_data_rs.csv')
behavioral_data = pd.read_csv('../adjusted_reponse_survey.csv')

In [21]:
behavioral_data.columns = behavioral_data.columns.str.strip()
historical_data_wholesale.columns = historical_data_wholesale.columns.str.strip()
historical_data_buyback.columns = historical_data_buyback.columns.str.strip()
historical_data_revenue_sharing.columns = historical_data_revenue_sharing.columns.str.strip()

In [22]:
historical_data_wholesale.columns = historical_data_wholesale.columns.str.replace(' ', '_')
historical_data_buyback.columns = historical_data_buyback.columns.str.replace(' ', '_')
historical_data_revenue_sharing.columns = historical_data_revenue_sharing.columns.str.replace(' ', '_')

In [23]:
# Function to categorize retailer behavior based on stock and optimal stock
def categorize_behavior(row):
    """
    Categorizes retailer behavior as understocking, optimal, or overstocking.

    Args:
        row: A row from the historical data DataFrame.

    Returns:
        An integer representing the behavioral category:
        0: Understocking (Stock < Optimal * 0.8)
        1: Optimal Stocking (Optimal * 0.8 <= Stock <= Optimal * 1.2)
        2: Overstocking (Stock > Optimal * 1.2)
    """
    stock = row['Stock']
    optimal_stock = row["Optimal_Stock"]

    try:
        optimal_stock = float(optimal_stock)  # Try converting to float
    except ValueError:
        optimal_stock = 0  # Set to 0 if conversion fails

    if stock < optimal_stock * 0.8:
        return 0  # Understocking
    
    elif stock <= optimal_stock * 1.2:
        return 1  # Optimal Stocking
    
    else:
        return 2  # Overstocking

# Apply categorization to each dataset
historical_data_wholesale['Behavioral_Category'] = historical_data_wholesale.apply(categorize_behavior, axis=1)
historical_data_buyback['Behavioral_Category'] = historical_data_buyback.apply(categorize_behavior, axis=1)
historical_data_revenue_sharing['Behavioral_Category'] = historical_data_revenue_sharing.apply(categorize_behavior, axis=1)

# 2. Remove Extra 'Behavioral_Category' Columns
def remove_extra_columns(df, column_name):
    """Removes extra columns with the given name, keeping only the last one."""
    cols = df.columns.tolist()
    count = cols.count(column_name)
    if count > 1:
        indices = [i for i, x in enumerate(cols) if x == column_name]
        for i in indices[:-1]:  # Remove all but the last one
            df.drop(df.columns[i], axis=1, inplace=True)

remove_extra_columns(historical_data_wholesale, "Behavioral_Category")
remove_extra_columns(historical_data_buyback, "Behavioral_Category")
remove_extra_columns(historical_data_revenue_sharing, "Behavioral_Category")

# Save the updated data
historical_data_wholesale.to_csv('../Notebooks/experiment/curr_data.csv', index=False)
historical_data_buyback.to_csv('../Notebooks/experiment/curr_data_bb.csv', index=False)
historical_data_revenue_sharing.to_csv('../Notebooks/experiment/curr_data_rs.csv', index=False)

In [24]:
# --- Trait Extraction ---
def extract_traits(behavioral_data):
    """
    Extract traits for all manufacturers and retailers from the behavioral data.
    Returns dictionaries of traits for manufacturers and retailers indexed by PLAYER NAME.
    """
    manufacturers = behavioral_data[[
        'Manufacturer_Self Esteem Average',
        'Manufacturer_Regret Scale Average',
        'Manufacturer_Risk Averse Coefficient',
        'Manufacturer_Fairness Index'
    ]].rename(lambda col: col.replace('Manufacturer_', ''), axis=1)

    retailers = behavioral_data[[
        'Retailer_Self Esteem Average',
        'Retailer_Regret Scale Average',
        'Retailer_Risk Averse Coefficient',
        'Retailer_Fairness Index'
    ]].rename(lambda col: col.replace('Retailer_', ''), axis=1)

    # Return as dictionaries indexed by PLAYER NAME
    manufacturer_traits = manufacturers.to_dict(orient='index')
    retailer_traits = retailers.to_dict(orient='index')

    return manufacturer_traits, retailer_traits

# Extract Traits
manufacturer_traits_dict, retailer_traits_dict = extract_traits(behavioral_data)

In [25]:
potential_demand = np.arange(50, 151)  # Possible demand values from 50 to 150
stock_list = np.arange(50, 151)       # Possible stock values from 50 to 150
expected_sales = {}                   # Dictionary to store expected sales


for stock in stock_list:
    cumulative_expected_sales = 0
    for demand_realization in potential_demand:
        sales = min(demand_realization, stock)
        cumulative_expected_sales += sales

    # Calculate the average sales and store it in the dictionary
    expected_sales[stock] = cumulative_expected_sales / len(potential_demand)

In [26]:
import numpy as np
import pandas as pd
from collections import defaultdict
from gym import Env, spaces
import random
import itertools
import pickle

# --- Environment Definition ---

class SupplyChainEnv(Env):
    """
    Custom Environment for the supply chain game.
    """

    def __init__(self, contract_type="wholesale", expected_sales=None):
        super(SupplyChainEnv, self).__init__()
        self.contract_type = contract_type
        self.max_stock = 150
        self.min_w = 3  # Minimum wholesale price
        self.max_price = 12
        self.max_rounds = 40
        self.current_round = 0
        self.demand = 0  # Initialize demand
        self.expected_sales = expected_sales  # Initialize expected sales

        if self.contract_type == "wholesale":
            self.manufacturer_action_space = spaces.Discrete(
                self.max_price - self.min_w + 1
            )
            self.retailer_action_space = spaces.Discrete(3)  # Under, Optimal, Over
        elif self.contract_type == "buyback":
            self.manufacturer_action_space = spaces.MultiDiscrete(
                [self.max_price - self.min_w + 1, self.max_price + 1]
            )
            self.retailer_action_space = spaces.Discrete(3)  # Under, Optimal, Over
        elif self.contract_type == "revenue-sharing":
            self.manufacturer_action_space = spaces.MultiDiscrete(
                [self.max_price - self.min_w + 1, self.max_price + 1]
            )
            self.retailer_action_space = spaces.Discrete(3)  # Under, Optimal, Over
        else:
            raise ValueError("Invalid contract type.")

        # The state does not include the demand since it's unknown before decisions are made
        self.observation_space = spaces.Box(
            low=0, high=150, shape=(3,), dtype=np.float32
        )
        self.reset()

    def reset(self):
        # At reset, we do not generate demand or include it in the state
        self.state = np.array([0, 0, 0])  # State: [Wholesale Price, Buyback Price, Revenue Share]
        self.current_round = 0
        self.demand = 0
        return self.state

    def manufacturer_step(self, action):
        # Unpack actions based on contract type
        if self.contract_type == "wholesale":
            w = action + self.min_w  # Adjust w to be in the correct range
            b = 0
            r = 0
        elif self.contract_type == "buyback":
            w = action[0] + self.min_w  # Adjust w
            b = action[1]
            r = 0
        elif self.contract_type == "revenue-sharing":
            w = action[0] + self.min_w  # Adjust w
            r = action[1]
            b = 0

        # Update state with manufacturer's action
        self.state = np.array([w, b, r])

        return self.state
    
    def get_optimal_stock(self):
        # Helper function to calculate optimal stock based on the current state
        w, b, r = self.state

        if self.contract_type == "wholesale":
            optimal_stock = 100 * ((12 - w) / 12) + 50
        elif self.contract_type == "buyback":
            if b == 12:
                optimal_stock = 150
            else:
                optimal_stock = 100 * ((12 - w) / (12 - b)) + 50
        elif self.contract_type == "revenue-sharing":
            if r == 12:
                optimal_stock = 0
            else:
                optimal_stock = 100 * ((12 - w - r) / (12 - r)) + 50
        else:
            optimal_stock = 100

        return optimal_stock

    def retailer_step(self, action):
        # Determine optimal stock based on contract type and current w, b, r
        w, b, r = self.state

        if self.contract_type == "wholesale":
            optimal_stock = 100 * ((12 - w) / 12) + 50
        elif self.contract_type == "buyback":
            if b == 12:  # Division by zero
                optimal_stock = 150  # Set to maximum
            else:
                optimal_stock = 100 * ((12 - w) / (12 - b)) + 50
        elif self.contract_type == "revenue-sharing":
            if r == 12:  # Division by zero
                optimal_stock = 0  # Set to minimum
            else:
                optimal_stock = 100 * ((12 - w - r) / (12 - r)) + 50
        else:
            optimal_stock = 100  # default

        # Determine retailer's order quantity based on action (0: under, 1: optimal, 2: over)
        if action == 0:
            Q = optimal_stock * 0.8
        elif action == 1:
            Q = optimal_stock
        elif action == 2:
            Q = optimal_stock * 1.2
        else:
            Q = optimal_stock  # default

        Q = int(round(Q))  # Round Q to the nearest integer
        Q = max(0, min(Q, self.max_stock))  # Ensure Q is within bounds

        # Use expected sales for profit calculation
        sales = self.expected_sales.get(Q, 0)
        leftovers = Q - sales
        c = 3  # Manufacturer's production cost
        p = 12  # Retail price

        if self.contract_type == "wholesale":
            # Retailer expected profit
            retailer_profit = p * sales - w * Q

            # Manufacturer expected profit
            manufacturer_profit = (w - c) * Q

        elif self.contract_type == "buyback":
            # Enforce constraint: buyback price must not exceed wholesale price
            if b > w:
                b = w

            # Retailer expected profit
            retailer_profit = p * sales - w * Q + b * leftovers

            # Manufacturer expected profit
            manufacturer_profit = (w - c) * Q - b * leftovers

        elif self.contract_type == "revenue-sharing":
            # Enforce constraint: revenue share must not exceed (retail price - wholesale price)
            max_revenue_share = p - w
            if r > max_revenue_share:
                r = max_revenue_share

            # Retailer expected profit
            retailer_profit = (p - r) * sales - w * Q

            # Manufacturer expected profit
            manufacturer_profit = (w - c) * Q + r * sales



        # Generate demand
        # self.demand = np.random.randint(50, 151)

        # # Calculate sales and leftovers based on actual demand
        # sales = min(Q, self.demand)
        # leftovers = Q - sales

        # if self.contract_type == "wholesale":
        #     # Retailer and manufacturer actual profit
        #     realized_retailer_profit = p * sales - w * Q
        #     realized_manufacturer_profit = (w - c) * Q

        # elif self.contract_type == "buyback":
        #     # Retailer and manufacturer actual profit
        #     realized_retailer_profit = p * sales - w * Q + b * leftovers
        #     realized_manufacturer_profit = (w - c) * Q - b * leftovers

        # elif self.contract_type == "revenue-sharing":
        #     # Retailer and manufacturer actual profit
        #     realized_retailer_profit = (p - r) * sales - w * Q
        #     realized_manufacturer_profit = (w - c) * Q + r * sales
        # else:
        #     realized_retailer_profit = 0
        #     realized_manufacturer_profit = 0

        self.current_round += 1
        done = self.current_round >= self.max_rounds

        # Return next_state, rewards (expected profits), done, info
        return self.state, (manufacturer_profit, retailer_profit), done, {}

In [27]:
# --- Agent Definition ---

class QLearningAgent:
    def __init__(
        self,
        action_space,
        personality_traits=None,
        learning_rate=0.1,
        discount_factor=0.99,
        epsilon=1.0,
        epsilon_decay=0.995,
        min_epsilon=0.01,
    ):
        self.action_space = action_space
        self.personality_traits = personality_traits or {}
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.initial_epsilon = epsilon  # Store initial epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.q_table = defaultdict(float)
        self.estimated_human_traits = {
            "Risk Averse Coefficient": 0.5,
            "Fairness Index": 0.5,
            "Regret Scale Average": 0.0,
            "Self Esteem Average": 0.0,
        }
        self.human_actions = []
        self.behavior_change_threshold = 2
        self.opponent_trait_estimates = {
            "Risk Averse Coefficient": 0.5,  # Initial guess
            "Fairness Index": 0.5,  # Initial guess
            "Regret Scale Average": 0.0,
            "Self Esteem Average": 0.0,
        }
        self.opponent_action_history = []  # Keep track of opponent actions
        self.last_round_prof = 0

    def _state_to_key(self, state):
        trait_values = tuple(self.personality_traits.values())
        return tuple(state) + trait_values

    def get_action(self, state):
        if self.detect_behavior_change():
            self.epsilon = min(1.0, self.epsilon * 1.1)  # Increase epsilon to promote exploration
        else:
            # Risk-averse agents explore less
            risk_aversion = self.personality_traits.get("Risk Averse Coefficient", 0.5)
            self.epsilon = max(
                self.min_epsilon,
                self.epsilon * self.epsilon_decay * (1 - risk_aversion),
            )

        if np.random.rand() < self.epsilon:
            # Explore: choose a random action
            if isinstance(self.action_space, spaces.Discrete):
                return self.action_space.sample()
            elif isinstance(self.action_space, spaces.MultiDiscrete):
                return tuple(self.action_space.sample())
        else:
            # Exploit: choose the action with the highest Q-value
            state_key = self._state_to_key(state)
            possible_actions = self.get_possible_actions()
            q_values = [self.q_table[(state_key, a)] for a in possible_actions]
            max_q = max(q_values)
            max_actions = [a for a, q in zip(possible_actions, q_values) if q == max_q]
            chosen_action = random.choice(max_actions)
            return chosen_action

    def update_q_table(self, state, action, reward, next_state):
        state_key = self._state_to_key(state)
        next_state_key = self._state_to_key(next_state)

        # Get possible actions for next state
        possible_actions = self.get_possible_actions()
        max_next_q = max(
            [self.q_table[(next_state_key, a)] for a in possible_actions], default=0
        )

        # Adjust the reward based on estimated human traits and opponent traits
        adjusted_reward = self.adjust_reward_based_on_traits(reward)

        # Update Q-value using the adjusted reward
        self.q_table[(state_key, action)] += self.learning_rate * (
            adjusted_reward
            + self.discount_factor * max_next_q
            - self.q_table[(state_key, action)]
        )

    def update_estimated_human_traits(self, human_action):
        self.human_actions.append(human_action)
        if len(self.human_actions) > 10:
            recent_actions = self.human_actions[-10:]
            action_variance = np.var(recent_actions)
            max_variance = (
                (self.action_space.n - 1) ** 2 / 12
            )  # Variance of a uniform distribution
            self.estimated_human_traits["Risk Averse Coefficient"] = max(
                0, min(1, 1 - action_variance / max_variance)
            )

    def adjust_reward_based_on_traits(self, reward, other_agent_reward=0):
        risk_aversion = self.personality_traits.get("Risk Averse Coefficient", 0.5)
        fairness = self.personality_traits.get("Fairness Index", 0.0)
        regret_scale = self.personality_traits.get("Regret Scale Average", 0.0)
        self_esteem = self.personality_traits.get("Self Esteem Average", 0.0)

        # Use opponent trait estimates in reward shaping
        opponent_risk_aversion = self.opponent_trait_estimates["Risk Averse Coefficient"]

        # Risk-based adjustment
        risk_adjusted_reward = reward * (
            1.0 + (0.5 - risk_aversion)
        ) - (1.0 * opponent_risk_aversion)

        # Fairness-based adjustment
        fairness_adjusted_reward = risk_adjusted_reward + (fairness * other_agent_reward)

        # Regret-based adjustment
        regret_adjusted_reward = fairness_adjusted_reward - (regret_scale * (self.last_round_prof))

        # Self-esteem-based adjustment (example)
        # High self-esteem might lead to overconfidence, so slightly reduce reward
        self_esteem_adjusted_reward = regret_adjusted_reward * (
            1.0 - (self_esteem * 0.1)
        )

        self.last_round_prof = reward

        return self_esteem_adjusted_reward

    def detect_behavior_change(self):
        if len(self.human_actions) < 20:
            return False
        recent_actions = self.human_actions[-10:]
        previous_actions = self.human_actions[-20:-10]
        recent_mean = np.mean(recent_actions)
        previous_mean = np.mean(previous_actions)
        if abs(recent_mean - previous_mean) > self.behavior_change_threshold:
            return True
        return False

    def get_possible_actions(self):
        if isinstance(self.action_space, spaces.Discrete):
            return range(self.action_space.n)
        elif isinstance(self.action_space, spaces.MultiDiscrete):
            ranges = [range(n) for n in self.action_space.nvec]
            possible_actions = list(itertools.product(*ranges))
            return possible_actions

    def update_opponent_model(self, opponent_action, state, contract_type):
        """Updates estimates of the opponent's traits based on observed actions."""
        self.opponent_action_history.append((state, opponent_action))

        # Heuristic for estimating risk aversion:
        if contract_type == "wholesale":
            optimal_stock = 100 * ((12 - state[0]) / 12) + 50
        elif contract_type == "buyback":
            optimal_stock = 100 * ((12 - state[0]) / (12 - state[1])) + 50
        elif contract_type == "revenue-sharing":
            optimal_stock = 100 * ((12 - state[0] - state[2]) / (12 - state[2])) + 50
        else:
            optimal_stock = 100

        if opponent_action == 0:  # Understocking
            self.opponent_trait_estimates["Risk Averse Coefficient"] = min(
                1.0, self.opponent_trait_estimates["Risk Averse Coefficient"] + 0.1
            )
        elif opponent_action == 2:  # Overstocking
            self.opponent_trait_estimates["Risk Averse Coefficient"] = max(
                0.0, self.opponent_trait_estimates["Risk Averse Coefficient"] - 0.1
            )

        # Heuristics for other traits (examples):
        # Fairness:
        if contract_type == "wholesale":
            if state[0] > 8:  # Manufacturer sets a high wholesale price
                self.opponent_trait_estimates["Fairness Index"] = max(
                    0.0, self.opponent_trait_estimates["Fairness Index"] - 0.1
                )
            elif state[0] < 5:  # Manufacturer sets a low wholesale price
                self.opponent_trait_estimates["Fairness Index"] = min(
                    1.0, self.opponent_trait_estimates["Fairness Index"] + 0.1
                )

        # Regret: (This is more complex, needs tracking of past actions and outcomes)
        # You could, for example, increase the opponent's estimated regret if they repeat an action that previously led to a low reward.

        # Self-Esteem: (This is also tricky)
        # You could, for example, decrease the opponent's estimated self-esteem if they frequently change their actions,
        # or increase it if they stick to the same action for several rounds.

    def save_agent(self, filename):
        """Saves the agent's Q-table, opponent model parameters, and other attributes to a file."""
        agent_data = {
            "q_table": self.q_table,
            "opponent_trait_estimates": self.opponent_trait_estimates,
            "personality_traits": self.personality_traits,
            "learning_rate": self.learning_rate,
            "discount_factor": self.discount_factor,
            "epsilon": self.epsilon,
            "initial_epsilon": self.initial_epsilon,
            "epsilon_decay": self.epsilon_decay,
            "min_epsilon": self.min_epsilon,
        }
        with open(filename, "wb") as f:
            pickle.dump(agent_data, f)

    def load_agent(self, filename):
        """Loads the agent's Q-table, opponent model parameters, and other attributes from a file."""
        with open(filename, "rb") as f:
            agent_data = pickle.load(f)

        self.q_table = agent_data["q_table"]
        self.opponent_trait_estimates = agent_data["opponent_trait_estimates"]
        self.personality_traits = agent_data["personality_traits"]
        self.learning_rate = agent_data["learning_rate"]
        self.discount_factor = agent_data["discount_factor"]
        self.epsilon = agent_data["epsilon"]
        self.initial_epsilon = agent_data["initial_epsilon"]
        self.epsilon_decay = agent_data["epsilon_decay"]
        self.min_epsilon = agent_data["min_epsilon"]

In [28]:
# --- Historical Data Preprocessing ---

def preprocess_historical_data(df, contract_type):
    data = []
    num_rows = len(df)
    rows_per_game = 40

    for start_row in range(0, num_rows, rows_per_game):
        game_data = df.iloc[start_row : start_row + rows_per_game]
        previous_state = None

        for _, row in game_data.iterrows():
            wholesale_price = float(row["Wholesale_p."])
            buyback_price = (
                float(row["Buyback_p."])
                if "Buyback_p." in row and not pd.isnull(row["Buyback_p."])
                else 0.0
            )
            revenue_share = (
                float(row["Revenue_Share"])
                if "Revenue_Share" in row and not pd.isnull(row["Revenue_Share"])
                else 0.0
            )

            # State variables: No previous retailer profit
            state = [
                wholesale_price,
                buyback_price,
                revenue_share
            ]

            # Actions
            if contract_type == "wholesale":
                manufacturer_action = wholesale_price - 3
            elif contract_type == "buyback":
                manufacturer_action = (wholesale_price - 3, buyback_price)
            elif contract_type == "revenue-sharing":
                manufacturer_action = (wholesale_price - 3, revenue_share)
            else:
                manufacturer_action = (wholesale_price - 3, 0.0)

            retailer_action = int(row["Behavioral_Category"])  # Convert to int

            # Rewards (Using realized profits instead of expected)
            try:
                manufacturer_reward = float(row["Realized_Mfg_Profit"])
            except ValueError:
                manufacturer_reward = 0

            try:
                retailer_reward = float(row["Realized_Retailer_Profit"])
            except ValueError:
                retailer_reward = 0

            # Next state: Use current values as next state for the next round
            next_state = [
                wholesale_price,
                buyback_price,
                revenue_share
            ]

            data.append(
                (
                    state,
                    manufacturer_action,
                    retailer_action,
                    (manufacturer_reward, retailer_reward),
                    next_state,
                )
            )

    return data

# Process Historical Data
historical_data_wholesale = preprocess_historical_data(
    historical_data_wholesale, "wholesale"
)
historical_data_buyback = preprocess_historical_data(
    historical_data_buyback, "buyback"
)
historical_data_revenue_sharing = preprocess_historical_data(
    historical_data_revenue_sharing, "revenue-sharing"
)

# Combine Historical Data
historical_data = (
    historical_data_wholesale
    + historical_data_buyback
    + historical_data_revenue_sharing
)

In [29]:
# --- Agent Initialization ---

# Create an environment for each contract type
env_wholesale = SupplyChainEnv(contract_type="wholesale", expected_sales=expected_sales)
env_buyback = SupplyChainEnv(contract_type="buyback", expected_sales=expected_sales)
env_revenue_sharing = SupplyChainEnv(
    contract_type="revenue-sharing", expected_sales=expected_sales
)

# Create 6 agents with specific traits
# For simplicity, let's just use the first 6 from each trait dictionary
manufacturer_traits = list(manufacturer_traits_dict.values())[:3]
retailer_traits = list(retailer_traits_dict.values())[:3]

manufacturer_agent_wholesale = QLearningAgent(
    env_wholesale.manufacturer_action_space, personality_traits=manufacturer_traits[0]
)
retailer_agent_wholesale = QLearningAgent(
    env_wholesale.retailer_action_space, personality_traits=retailer_traits[0]
)

manufacturer_agent_buyback = QLearningAgent(
    env_buyback.manufacturer_action_space, personality_traits=manufacturer_traits[1]
)
retailer_agent_buyback = QLearningAgent(
    env_buyback.retailer_action_space, personality_traits=retailer_traits[1]
)

manufacturer_agent_revenue_sharing = QLearningAgent(
    env_revenue_sharing.manufacturer_action_space,
    personality_traits=manufacturer_traits[2],
)
retailer_agent_revenue_sharing = QLearningAgent(
    env_revenue_sharing.retailer_action_space, personality_traits=retailer_traits[2]
)


In [30]:
# --- Agent Training ---
num_episodes = 1000  # You can increase this for more training

# Train Wholesale Agents
for episode in range(num_episodes):
    state = env_wholesale.reset()
    done = False
    total_manufacturer_profit = 0
    total_retailer_profit = 0

    while not done:
        manufacturer_action = manufacturer_agent_wholesale.get_action(state)
        state = env_wholesale.manufacturer_step(manufacturer_action)

        retailer_action = retailer_agent_wholesale.get_action(state)
        next_state, rewards, done, _ = env_wholesale.retailer_step(retailer_action)

        manufacturer_reward, retailer_reward = rewards
        total_manufacturer_profit += manufacturer_reward
        total_retailer_profit += retailer_reward

        manufacturer_agent_wholesale.update_q_table(
            state, manufacturer_action, manufacturer_reward, next_state
        )
        retailer_agent_wholesale.update_q_table(
            state, retailer_action, retailer_reward, next_state
        )

        manufacturer_agent_wholesale.update_opponent_model(
            retailer_action, state, env_wholesale.contract_type
        )
        retailer_agent_wholesale.update_opponent_model(
            manufacturer_action, state, env_wholesale.contract_type
        )

        state = next_state

    if episode % 100 == 0:
        print(f"Wholesale - Episode {episode + 1}:")
        print(f"  Total Manufacturer Profit: {total_manufacturer_profit}")
        print(f"  Total Retailer Profit: {total_retailer_profit}")

# Train Buyback Agents
for episode in range(num_episodes):
    state = env_buyback.reset()
    done = False
    total_manufacturer_profit = 0
    total_retailer_profit = 0

    while not done:
        manufacturer_action = manufacturer_agent_buyback.get_action(state)
        state = env_buyback.manufacturer_step(manufacturer_action)

        retailer_action = retailer_agent_buyback.get_action(state)
        next_state, rewards, done, _ = env_buyback.retailer_step(retailer_action)

        manufacturer_reward, retailer_reward = rewards
        total_manufacturer_profit += manufacturer_reward
        total_retailer_profit += retailer_reward

        manufacturer_agent_buyback.update_q_table(
            state, manufacturer_action, manufacturer_reward, next_state
        )
        retailer_agent_buyback.update_q_table(
            state, retailer_action, retailer_reward, next_state
        )

        manufacturer_agent_buyback.update_opponent_model(
            retailer_action, state, env_buyback.contract_type
        )
        retailer_agent_buyback.update_opponent_model(
            manufacturer_action, state, env_buyback.contract_type
        )

        state = next_state

    if episode % 100 == 0:
        print(f"Buyback - Episode {episode + 1}:")
        print(f"  Total Manufacturer Profit: {total_manufacturer_profit}")
        print(f"  Total Retailer Profit: {total_retailer_profit}")

# Train Revenue Sharing Agents
for episode in range(num_episodes):
    state = env_revenue_sharing.reset()
    done = False
    total_manufacturer_profit = 0
    total_retailer_profit = 0

    while not done:
        manufacturer_action = manufacturer_agent_revenue_sharing.get_action(state)
        state = env_revenue_sharing.manufacturer_step(manufacturer_action)

        retailer_action = retailer_agent_revenue_sharing.get_action(state)
        next_state, rewards, done, _ = env_revenue_sharing.retailer_step(
            retailer_action
        )

        manufacturer_reward, retailer_reward = rewards
        total_manufacturer_profit += manufacturer_reward
        total_retailer_profit += retailer_reward

        manufacturer_agent_revenue_sharing.update_q_table(
            state, manufacturer_action, manufacturer_reward, next_state
        )
        retailer_agent_revenue_sharing.update_q_table(
            state, retailer_action, retailer_reward, next_state
        )

        manufacturer_agent_revenue_sharing.update_opponent_model(
            retailer_action, state, env_revenue_sharing.contract_type
        )
        retailer_agent_revenue_sharing.update_opponent_model(
            manufacturer_action, state, env_revenue_sharing.contract_type
        )

        state = next_state

    if episode % 100 == 0:
        print(f"Revenue Sharing - Episode {episode + 1}:")
        print(f"  Total Manufacturer Profit: {total_manufacturer_profit}")
        print(f"  Total Retailer Profit: {total_retailer_profit}")

# --- Save Trained Agents ---

manufacturer_agent_wholesale.save_agent("manufacturer_agent_wholesale.pkl")
retailer_agent_wholesale.save_agent("retailer_agent_wholesale.pkl")
manufacturer_agent_buyback.save_agent("manufacturer_agent_buyback.pkl")
retailer_agent_buyback.save_agent("retailer_agent_buyback.pkl")
manufacturer_agent_revenue_sharing.save_agent(
    "manufacturer_agent_revenue_sharing.pkl"
)
retailer_agent_revenue_sharing.save_agent(
    "retailer_agent_revenue_sharing.pkl"
)

Wholesale - Episode 1:
  Total Manufacturer Profit: 14950
  Total Retailer Profit: 9399.633663366338
Wholesale - Episode 101:
  Total Manufacturer Profit: 12599
  Total Retailer Profit: 8728.980198019803
Wholesale - Episode 201:
  Total Manufacturer Profit: 12046
  Total Retailer Profit: 12714.930693069302
Wholesale - Episode 301:
  Total Manufacturer Profit: 12929
  Total Retailer Profit: 13576.237623762372
Wholesale - Episode 401:
  Total Manufacturer Profit: 13181
  Total Retailer Profit: 10629.881188118814
Wholesale - Episode 501:
  Total Manufacturer Profit: 13339
  Total Retailer Profit: 7059.90099009901
Wholesale - Episode 601:
  Total Manufacturer Profit: 12914
  Total Retailer Profit: 8815.118811881192
Wholesale - Episode 701:
  Total Manufacturer Profit: 13537
  Total Retailer Profit: 9577.94059405941
Wholesale - Episode 801:
  Total Manufacturer Profit: 11801
  Total Retailer Profit: 13406.960396039603
Wholesale - Episode 901:
  Total Manufacturer Profit: 12749
  Total Retai

  optimal_stock = 100 * ((12 - state[0]) / (12 - state[1])) + 50
  optimal_stock = 100 * ((12 - state[0]) / (12 - state[1])) + 50


Buyback - Episode 101:
  Total Manufacturer Profit: 2318.8910891089145
  Total Retailer Profit: 27604.86138613862
Buyback - Episode 201:
  Total Manufacturer Profit: -674.4851485148515
  Total Retailer Profit: 30380.18811881188
Buyback - Episode 301:
  Total Manufacturer Profit: -1636.455445544554
  Total Retailer Profit: 31445.554455445545
Buyback - Episode 401:
  Total Manufacturer Profit: -4377.178217821782
  Total Retailer Profit: 34410.44554455446
Buyback - Episode 501:
  Total Manufacturer Profit: 4199.544554455445
  Total Retailer Profit: 24900.544554455446
Buyback - Episode 601:
  Total Manufacturer Profit: 11179.0
  Total Retailer Profit: 17380.58415841584
Buyback - Episode 701:
  Total Manufacturer Profit: 811.3663366336632
  Total Retailer Profit: 28213.099009900987
Buyback - Episode 801:
  Total Manufacturer Profit: 4313.425742574256
  Total Retailer Profit: 24347.920792079207
Buyback - Episode 901:
  Total Manufacturer Profit: 11502.267326732672
  Total Retailer Profit: 17

  optimal_stock = 100 * ((12 - state[0] - state[2]) / (12 - state[2])) + 50


Revenue Sharing - Episode 101:
  Total Manufacturer Profit: 7680.346534653466
  Total Retailer Profit: 2171.653465346535
Revenue Sharing - Episode 201:
  Total Manufacturer Profit: 10015.118811881188
  Total Retailer Profit: 2867.742574257425
Revenue Sharing - Episode 301:
  Total Manufacturer Profit: 5483.039603960396
  Total Retailer Profit: -930.0198019801982
Revenue Sharing - Episode 401:
  Total Manufacturer Profit: 7396.178217821783
  Total Retailer Profit: 467.38613861386125
Revenue Sharing - Episode 501:
  Total Manufacturer Profit: 7425.247524752474
  Total Retailer Profit: 4447.297029702971
Revenue Sharing - Episode 601:
  Total Manufacturer Profit: 7689.257425742574
  Total Retailer Profit: 2153.7722772277225
Revenue Sharing - Episode 701:
  Total Manufacturer Profit: 6534.198019801979
  Total Retailer Profit: 2619.0099009900987
Revenue Sharing - Episode 801:
  Total Manufacturer Profit: 7092.425742574256
  Total Retailer Profit: 851.811881188119
Revenue Sharing - Episode 90

In [None]:
# --- Game with User ---

# Load trained agents
manufacturer_agent_wholesale = QLearningAgent(env_wholesale.manufacturer_action_space)
retailer_agent_wholesale = QLearningAgent(env_wholesale.retailer_action_space)
manufacturer_agent_buyback = QLearningAgent(env_buyback.manufacturer_action_space)
retailer_agent_buyback = QLearningAgent(env_buyback.retailer_action_space)
manufacturer_agent_revenue_sharing = QLearningAgent(
    env_revenue_sharing.manufacturer_action_space
)
retailer_agent_revenue_sharing = QLearningAgent(
    env_revenue_sharing.retailer_action_space
)

manufacturer_agent_wholesale.load_agent("manufacturer_agent_wholesale.pkl")
retailer_agent_wholesale.load_agent("retailer_agent_wholesale.pkl")
manufacturer_agent_buyback.load_agent("manufacturer_agent_buyback.pkl")
retailer_agent_buyback.load_agent("retailer_agent_buyback.pkl")
manufacturer_agent_revenue_sharing.load_agent(
    "manufacturer_agent_revenue_sharing.pkl"
)
retailer_agent_revenue_sharing.load_agent(
    "retailer_agent_revenue_sharing.pkl"
)

# Get user input
contract_type = input(
    "Choose a contract type (wholesale, buyback, revenue-sharing): "
).lower()

while contract_type not in ["wholesale", "buyback", "revenue-sharing"]:
    print("Invalid contract type. Please choose from 'wholesale', 'buyback', or 'revenue-sharing'.")
    contract_type = input(
        "Choose a contract type (wholesale, buyback, revenue-sharing): "
    ).lower()
    
role = input("Do you want to be the manufacturer or the retailer? ").lower()

while role not in ["manufacturer", "retailer"]:
    print("Invalid role. Please choose either 'manufacturer' or 'retailer'.")
    role = input("Do you want to be the manufacturer or the retailer? ").lower()

# Select the environment and agents based on contract type
if contract_type == "wholesale":
    env = env_wholesale
    manufacturer_agent = manufacturer_agent_wholesale
    retailer_agent = retailer_agent_wholesale
elif contract_type == "buyback":
    env = env_buyback
    manufacturer_agent = manufacturer_agent_buyback
    retailer_agent = retailer_agent_buyback
elif contract_type == "revenue-sharing":
    env = env_revenue_sharing
    manufacturer_agent = manufacturer_agent_revenue_sharing
    retailer_agent = retailer_agent_revenue_sharing
else:
    raise ValueError("Invalid contract type")

# Initialize the environment
state = env.reset()
done = False
total_human_profit = 0
total_agent_profit = 0

# ... (The rest of the code up to the game loop remains the same)

# --- Game with User ---

# Load trained agents (assuming you have already trained and saved them)
manufacturer_agent_wholesale = QLearningAgent(env_wholesale.manufacturer_action_space)
retailer_agent_wholesale = QLearningAgent(env_wholesale.retailer_action_space)
manufacturer_agent_buyback = QLearningAgent(env_buyback.manufacturer_action_space)
retailer_agent_buyback = QLearningAgent(env_buyback.retailer_action_space)
manufacturer_agent_revenue_sharing = QLearningAgent(
    env_revenue_sharing.manufacturer_action_space
)
retailer_agent_revenue_sharing = QLearningAgent(
    env_revenue_sharing.retailer_action_space
)

manufacturer_agent_wholesale.load_agent("manufacturer_agent_wholesale.pkl")
retailer_agent_wholesale.load_agent("retailer_agent_wholesale.pkl")
manufacturer_agent_buyback.load_agent("manufacturer_agent_buyback.pkl")
retailer_agent_buyback.load_agent("retailer_agent_buyback.pkl")
manufacturer_agent_revenue_sharing.load_agent(
    "manufacturer_agent_revenue_sharing.pkl"
)
retailer_agent_revenue_sharing.load_agent(
    "retailer_agent_revenue_sharing.pkl"
)

# Get user input
contract_type = input(
    "Choose a contract type (wholesale, buyback, revenue-sharing): "
).lower()

while contract_type not in ["wholesale", "buyback", "revenue-sharing"]:
    print("Invalid contract type. Please choose from 'wholesale', 'buyback', or 'revenue-sharing'.")
    contract_type = input(
        "Choose a contract type (wholesale, buyback, revenue-sharing): "
    ).lower()

role = input("Do you want to be the manufacturer or the retailer? ").lower()
while role not in ["manufacturer", "retailer"]:
    print("Invalid role. Please choose either 'manufacturer' or 'retailer'.")
    role = input("Do you want to be the manufacturer or the retailer? ").lower()

# Select the environment and agents based on contract type
if contract_type == "wholesale":
    env = env_wholesale
    manufacturer_agent = manufacturer_agent_wholesale
    retailer_agent = retailer_agent_wholesale
elif contract_type == "buyback":
    env = env_buyback
    manufacturer_agent = manufacturer_agent_buyback
    retailer_agent = retailer_agent_buyback
elif contract_type == "revenue-sharing":
    env = env_revenue_sharing
    manufacturer_agent = manufacturer_agent_revenue_sharing
    retailer_agent = retailer_agent_revenue_sharing
else:
    raise ValueError("Invalid contract type")

# Initialize the environment
state = env.reset()
done = False
total_human_profit = 0
total_agent_profit = 0

# Game loop
for round_num in range(1, env.max_rounds + 1):
    print(f"\n--- Round {round_num} ---")

    if role == "manufacturer":
        # Human's turn (manufacturer)
        valid_action = False
        while not valid_action:
            try:
                if contract_type == "wholesale":
                    w = int(
                        input(
                            f"Enter wholesale price (integer between {env.min_w} and {env.max_price}): "
                        )
                    )
                    human_action = w - env.min_w  # Adjust action to be in the correct range for the agent
                    if 0 <= human_action <= env.manufacturer_action_space.n - 1:
                        valid_action = True
                        b = 0
                        r = 0
                    else:
                        print("Invalid wholesale price. ", end="")

                elif contract_type == "buyback":
                    w = int(
                        input(
                            f"Enter wholesale price (integer between {env.min_w} and {env.max_price}): "
                        )
                    )
                    b = int(
                        input(
                            f"Enter buyback price (integer between 0 and {env.max_price}): "
                        )
                    )
                    human_action = (w - env.min_w, b)  # Adjust action to be in the correct range for the agent
                    if (
                        0 <= human_action[0] <= env.manufacturer_action_space.nvec[0] - 1
                        and 0 <= human_action[1] <= env.manufacturer_action_space.nvec[1] - 1
                    ):
                        valid_action = True
                        r = 0
                    else:
                        print("Invalid wholesale or buyback price. ", end="")

                elif contract_type == "revenue-sharing":
                    w = int(
                        input(
                            f"Enter wholesale price (integer between {env.min_w} and {env.max_price}): "
                        )
                    )
                    r = int(
                        input(
                            f"Enter revenue share (integer between 0 and {env.max_price}): "
                        )
                    )
                    human_action = (w - env.min_w, r)  # Adjust action to be in the correct range for the agent
                    if (
                        0 <= human_action[0] <= env.manufacturer_action_space.nvec[0] - 1
                        and 0 <= human_action[1] <= env.manufacturer_action_space.nvec[1] - 1
                    ):
                        valid_action = True
                        b = 0
                    else:
                        print("Invalid wholesale price or revenue share. ", end="")

                else:
                    raise ValueError("Invalid contract type")

            except ValueError:
                print("Invalid input. Please enter an integer. ", end="")

        # Update the environment based on the human's action
        state = env.manufacturer_step(human_action)

        # Agent's turn (retailer)
        agent_action = retailer_agent.get_action(state)

        # Determine the agent's stock quantity based on the action
        optimal_stock = env.get_optimal_stock()
        if agent_action == 0:
            agent_stock_choice = int(round(optimal_stock * 0.8))  # Understock
        elif agent_action == 1:
            agent_stock_choice = int(round(optimal_stock))  # Optimal
        elif agent_action == 2:
            agent_stock_choice = int(round(optimal_stock * 1.2))  # Overstock
        else:
            agent_stock_choice = int(round(optimal_stock))

        # Update the environment based on the agent's action
        next_state, rewards, done, _ = env.retailer_step(agent_action)
        _, _ = rewards

        # Update agent (only opponent model, not Q-table during human interaction)
        retailer_agent.update_opponent_model(human_action, state, contract_type)

        # Generate random demand for the round
        env.demand = random.randint(50, 150)

        # Calculate realized profits for human (manufacturer) and agent (retailer)
        if contract_type == "wholesale":
            human_profit = (w - env.min_w) * min(agent_stock_choice, env.demand)
            agent_profit = (
                12 * min(agent_stock_choice, env.demand) - w * agent_stock_choice
            )

        elif contract_type == "buyback":
            human_profit = (
                (w - env.min_w) * min(agent_stock_choice, env.demand)
                - b * (agent_stock_choice - min(agent_stock_choice, env.demand))
            )
            agent_profit = (
                12 * min(agent_stock_choice, env.demand)
                - w * agent_stock_choice
                + b * (agent_stock_choice - min(agent_stock_choice, env.demand))
            )

        elif contract_type == "revenue-sharing":
            human_profit = (
                (w - env.min_w) * min(agent_stock_choice, env.demand)
                + r * (agent_stock_choice - min(agent_stock_choice, env.demand))
            )
            agent_profit = (
                12 * min(agent_stock_choice, env.demand)
                - w * agent_stock_choice
                - r * (agent_stock_choice - min(agent_stock_choice, env.demand))
            )

        # Update total profits
        total_human_profit += human_profit
        total_agent_profit += agent_profit

    else:  # role == "retailer"
        # Agent's turn (manufacturer)
        manufacturer_action = manufacturer_agent.get_action(state)

        # Update the environment based on the agent's action
        state = env.manufacturer_step(manufacturer_action)

        # Get the manufacturer's decision variables from the state
        w, b, r = state

        # Human's turn (retailer)
        # Provide the manufacturer's decision to the human player
        if contract_type == "wholesale":
            print(f"  Manufacturer's wholesale price: {w}")
        elif contract_type == "buyback":
            print(f"  Manufacturer's wholesale price: {w}, Buyback price: {b}")
        elif contract_type == "revenue-sharing":
            print(f"  Manufacturer's wholesale price: {w}, Revenue share: {r}")

        # Get human's stock decision
        valid_stock = False
        while not valid_stock:
            try:
                stock_choice = int(
                    input(
                        f"Enter retailer stock (integer between 50 and {env.max_stock}): "
                    )
                )
                if 50 <= stock_choice <= env.max_stock:
                    valid_stock = True
                else:
                    print("Invalid stock. ", end="")
            except ValueError:
                print("Invalid input. Please enter an integer between 50 and 150. ", end="")

        # Convert stock choice to action (0, 1, or 2)
        optimal_stock = env.get_optimal_stock()
        if stock_choice <= optimal_stock * 0.8:
            human_action = 0  # Understocking
        elif stock_choice <= optimal_stock * 1.2:
            human_action = 1  # Optimal
        else:
            human_action = 2  # Overstocking

        # Update the environment based on the human's action
        # Note: The environment still uses the 0, 1, 2 representation for internal calculations
        next_state, rewards, done, _ = env.retailer_step(human_action)
        _, _ = rewards

        # Update agent (only opponent model, not Q-table during human interaction)
        manufacturer_agent.update_opponent_model(human_action, state, contract_type)

        # Generate random demand for the round
        env.demand = random.randint(50, 150)

        # Calculate realized profits for human (retailer) and agent (manufacturer)
        if contract_type == "wholesale":
            human_profit = 12 * min(stock_choice, env.demand) - w * stock_choice
            agent_profit = (w - env.min_w) * min(stock_choice, env.demand)
        elif contract_type == "buyback":
            human_profit = (
                12 * min(stock_choice, env.demand)
                - w * stock_choice
                + b * (stock_choice - min(stock_choice, env.demand))
            )
            agent_profit = (
                (w - env.min_w) * min(stock_choice, env.demand)
                - b * (stock_choice - min(stock_choice, env.demand))
            )
        elif contract_type == "revenue-sharing":
            human_profit = (
                12 * min(stock_choice, env.demand)
                - w * stock_choice
                - r * (stock_choice - min(stock_choice, env.demand))
            )
            agent_profit = (
                (w - env.min_w) * min(stock_choice, env.demand)
                + r * (stock_choice - min(stock_choice, env.demand))
            )
        else:
            human_profit = 0
            agent_profit = 0

        # Update total profits
        total_human_profit += human_profit
        total_agent_profit += agent_profit

    print(f"  Demand in this round: {env.demand}")
    print(f"  Your profit this round: {human_profit}")
    print(f"  Agent's profit this round: {agent_profit}")
    print(f"  Your cumulative profit: {total_human_profit}")
    print(f"  Agent's cumulative profit: {total_agent_profit}")

    state = next_state  # Update state for the next round

print("\nGame Over!")
print(f"Final Score - You: {total_human_profit}, Agent: {total_agent_profit}")


--- Round 1 ---
  Manufacturer's wholesale price: 6
  Demand in this round: 120
  Your profit this round: 300
  Agent's profit this round: 150
  Your cumulative profit: 300
  Agent's cumulative profit: 150

--- Round 2 ---
  Manufacturer's wholesale price: 5
  Demand in this round: 133
  Your profit this round: 350
  Agent's profit this round: 100
  Your cumulative profit: 650
  Agent's cumulative profit: 250

--- Round 3 ---
  Manufacturer's wholesale price: 4
  Demand in this round: 107
  Your profit this round: 400
  Agent's profit this round: 50
  Your cumulative profit: 1050
  Agent's cumulative profit: 300

--- Round 4 ---
  Manufacturer's wholesale price: 6
  Demand in this round: 118
  Your profit this round: 480
  Agent's profit this round: 240
  Your cumulative profit: 1530
  Agent's cumulative profit: 540

--- Round 5 ---
  Manufacturer's wholesale price: 6
Invalid input. Please enter an integer between 50 and 150. Invalid input. Please enter an integer between 50 and 150. 