In [1]:
from tournament.agents.constant import AllC, AllD
from tournament.agents.pavlov import Pavlov
from tournament.agents.q_learning.dqn import DeepQLearner
from tournament.agents.tft import TFTT, GenerousTFT, TitForTat, OmegaTFT, TTFT
from tournament.environments.single import SingleRuleBasedAgentEnvironment
from tournament.environments.multiple import MultipleRuleBasedAgentEnvironment
from tournament.action import Action
from tournament.agents.agents import AGENTS
from tournament.agents.axelrod_first import (
    Davis,
    Downing,
    Feld,
    Grofman,
    Grudger,
    Joss,
    Nydegger,
    Shubik,
    SteinAndRapoport,
    TidemanAndChieruzzi,
    Tullock,
)
from tournament.agents.axelrod_second import (
    Borufsen,
    Champion,
    Leyvraz,
    SecondByBlackK83R,
    Cave,
    GraaskampAndKatzen,
    Harrington,
    TidemanAndChieruzzi2,
    Weiner,
    White,
    Adams,
)
from tournament.tournament import RoundRobinTournament

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

# plt.rcParams["figure.figsize"] = (20, 12)
plt.rcParams["figure.figsize"] = (10, 6)

In [2]:
# agents = [TitForTat, Joss, Downing]
# ['TitForTat', 'TFTT', 'TTFT', 'GenerousTFT', 'OmegaTFT', 'Davis', 'Grofman', 'Leyvraz']
agents = [TitForTat, TFTT, TTFT, GenerousTFT, OmegaTFT, Davis, Grofman, Leyvraz]

env = MultipleRuleBasedAgentEnvironment(agents)

In [3]:

class QNetwork(nn.Module):
    def __init__(self, lookback):
        super().__init__()

        self.flatten = nn.Flatten()
        self.gru = nn.GRU(2, 2, 1, batch_first=True)
        self.linear1 = nn.Linear(2 * lookback, 2 * lookback)
        self.linear2 = nn.Linear(2 * lookback, 2)

        nn.init.kaiming_uniform_(self.linear1.weight, mode="fan_in", nonlinearity="relu")
        nn.init.kaiming_uniform_(self.linear2.weight, mode="fan_in", nonlinearity="relu")

    def forward(self, x):
        x = x.unsqueeze(dim=0)
        x, hn = self.gru(x)
        x = self.flatten(x)
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))

        return x

class QLearningTest(DeepQLearner):
    def __init__(self):
        super().__init__()

        self.lookback = 4
        self.epsilon = 0.2

        self._learning_rate = 0.05
        self._discount_rate = 0.99
        self._q_network = QNetwork(self.lookback)


agent = QLearningTest()

In [4]:
env.train(
    trainee=agent,
    limit=100,
    epochs=10,
)

[00:29:20] Commencement of training.
BEFORE: tensor([[0.0000, 0.0176]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.2574]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.2574]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.3970]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.3741]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.4362]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.4007]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.4230]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.5107]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.5004]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.5734]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.5629]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.5693]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.5754]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.5754]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.5992]], grad_fn=<ReluBackward0>)
BEFORE: ten

KeyboardInterrupt: 

In [None]:
print(agent._q_network)
print(env.counts)

s = sum(env.counts.values())
print({a: env.counts[a] / s for a in env.counts})

In [None]:
plt.title("Loss")
plt.plot(env.metric_history)
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

In [None]:
plt.title("Rewards")
plt.plot(env.rewards)
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

In [None]:
N = 20

plt.title("Rolling average of rewards")
plt.plot(np.convolve(env.rewards, np.ones(N), mode='valid') / N)
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

In [None]:
plt.title("Cumulative reward")
plt.plot(np.cumsum(env.rewards))
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

In [None]:
plt.title("Cumulative regret (vs cooperating)")
plt.plot(np.cumsum(3 - np.array(env.rewards)))
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

In [None]:
plt.title("Cooperation over the epochs")
print(env.normalised_epoch_counts)
plt.plot([x[Action.COOPERATE] for x in env.normalised_epoch_counts])
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

In [None]:
# torch.save(agent._q_network.state_dict(), "models/drqn/gru-1.pt")

In [None]:
agent._q_network.eval()

In [None]:
tournament = RoundRobinTournament(AGENTS, [agent])

scores, times = tournament.play(
    continuation_probability=0.99654, repetitions=20, jobs=12
)

results = [
    (a, round(sum(scores[a]) / len(scores[a])), sum(times[a]))
    for a in scores
]
results.sort(key=lambda x: x[1], reverse=True)

for c, score, time in results:
    print(f"{c.__name__:<30} {score:<20} {time:<20}")


In [None]:
from tournament.agent import Agent
from tournament.match import Match

class ManualAgent(Agent):
    def play_move(self, history, opp_history):
        move = input("Move: ")
        if move == "C":
            return Action.COOPERATE
        elif move == "D":
            return Action.DEFECT
        elif move == "Q":
            raise RuntimeError()

manual_agent = ManualAgent()

match = Match(agent, manual_agent)

for i, ((move1, move2), (score1, score2), (reward1, reward2)) in enumerate(
    match.play_moves(continuation_probability=0.99654, limit=1000, noise=0)
):
    print(agent._q_network(agent._prev_state), agent._q_network(agent._state))
    print(
        f"{i:<4} | \t {move1:<20} {score1:<8} {f'({reward1})':<20} {move2:<20} {score2:<8} {f'({reward2})':<10}"
    )