In [None]:
import numpy as np
import random

In [None]:
# The two moves available to each player in every round.
COOPERATE, DEFECT = 0, 1

# Action space, ordered so that each action's value is also its list index.
actions = [COOPERATE, DEFECT]

In [None]:

# Payoff table for the learning agent, keyed by a tuple:
# payoff[(player_action, opponent_action)] -> reward received by the player.
payoff = {
    (COOPERATE, COOPERATE): 3,
    (COOPERATE, DEFECT): 0,
    (DEFECT, COOPERATE): 5,
    # NOTE(review): mutual defection (-1) pays less than being exploited (0).
    # The classic Prisoner's Dilemma ordering has mutual defection above the
    # exploited payoff (commonly 1) — confirm that -1 is intentional.
    (DEFECT, DEFECT): -1
}

In [None]:

# Q-learning parameters (read as globals by choose_action and update_q)
alpha = 0.1      # learning rate: step size applied to the TD error in update_q
gamma = 0.99      # discount factor: weight on the best next-state value in update_q
epsilon = 0.1    # exploration rate: probability that choose_action picks a random action

In [None]:

# Q-table of action-value estimates, indexed as Q[state][action].
# The training loop passes the agent's own previous action as `state`, so
# row 0 is "agent last cooperated" and row 1 is "agent last defected".
Q = np.zeros((2, 2))  # Q[state][action]

def opponent_strategy(last_move):
    """Return the opponent's move in reply to the other player's previous move.

    The reply depends only on the single most recent move: DEFECT is answered
    with DEFECT, anything else (COOPERATE, or None before the first round) is
    answered with COOPERATE. This is Tit-for-Tat behaviour; no history is
    kept, so it does not keep defecting forever the way Grim Trigger would.
    """
    return DEFECT if last_move == DEFECT else COOPERATE

def choose_action(state):
    """Pick an action for `state` using an epsilon-greedy policy.

    With probability `epsilon` a uniformly random element of `actions` is
    returned; otherwise the action with the highest value in Q[state].

    Args:
        state: row index into the global Q-table.

    Returns:
        The chosen action as a plain Python int on both branches.
    """
    if random.random() < epsilon:
        return random.choice(actions)
    # np.argmax yields a numpy integer; cast it so the greedy branch returns
    # the same type as the exploratory one. Ties resolve to the lowest index.
    return int(np.argmax(Q[state]))

def update_q(state, action, reward, next_state):
    """Apply one Q-learning update to the global Q-table in place.

    Moves Q[state, action] a fraction `alpha` of the way towards the
    bootstrap target `reward + gamma * max(Q[next_state])`.
    """
    td_target = reward + gamma * Q[next_state].max()
    td_error = td_target - Q[state, action]
    Q[state, action] += alpha * td_error

In [None]:

# Training loop
# Each iteration plays one round of a single continuing game; nothing is
# reset between iterations.
episodes = 5000
opponent_last = None     # the agent's previous move as fed to the opponent; None before round 1
agent_last = COOPERATE   # initial state: treat the agent as having cooperated before round 1

for _ in range(episodes):
    # The state is the agent's own previous action. The opponent's move this
    # round is computed from that same action via opponent_strategy.
    state = agent_last
    action = choose_action(state)
    opponent_action = opponent_strategy(opponent_last)

    reward = payoff[(action, opponent_action)]
    # The next iteration sets state = agent_last = action, so the successor
    # state must be the action just taken. Using opponent_action here made
    # the update bootstrap from the current state's own row, so the delayed
    # cost of defecting was never propagated into the Q-values.
    next_state = action

    update_q(state, action, reward, next_state)

    opponent_last = action
    agent_last = action

In [None]:

# Results
print("Learned Q-table:")
print(Q)

# Rows of Q are indexed by the state used in the training loop, which is the
# agent's own previous action, so the summary is labelled accordingly.
action_names = {COOPERATE: "Cooperate", DEFECT: "Defect"}

print("\nLearned strategy:")
for state in actions:
    move = "Cooperate" if np.argmax(Q[state]) == COOPERATE else "Defect"
    print(f"If agent last played {action_names[state]}: {move}")

Learned Q-table:
[[365.65784391 399.45163247]
 [  0.          -0.9962429 ]]

Learned strategy:
If opponent last played 0: Defect
If opponent last played 1: Cooperate
