In [1]:
from tournament.agents.constant import AllC, AllD
from tournament.agents.pavlov import Pavlov
from tournament.agents.q_learning.dqn import DeepQLearner
from tournament.agents.tft import TFTT, GenerousTFT, TitForTat, OmegaTFT
from tournament.environments.single import SingleRuleBasedAgentEnvironment
from tournament.environments.multiple import MultipleRuleBasedAgentEnvironment
from tournament.action import Action
from tournament.agents.agents import AGENTS
from tournament.agents.axelrod_first import (
    Davis,
    Downing,
    Feld,
    Grofman,
    Grudger,
    Joss,
    Nydegger,
    Shubik,
    SteinAndRapoport,
    TidemanAndChieruzzi,
    Tullock,
)
from tournament.agents.axelrod_second import (
    Borufsen,
    Champion,
    Leyvraz,
    SecondByBlackK83R,
    SecondByCave,
    SecondByGraaskampKatzen,
    SecondByHarrington,
    SecondByTidemanAndChieruzzi,
    SecondByWeiner,
    SecondByWhiteK72R,
    SecondByWmAdams,
)
from tournament.tournament import RoundRobinTournament

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

# Default size (width, height) applied to every figure created in this notebook.
# A larger alternative is kept below for reference:
# plt.rcParams["figure.figsize"] = (20, 12)
plt.rcParams["figure.figsize"] = (10, 6)

In [2]:
# Opponent strategies handed to the training environment, one entry per line.
agents = [
    TitForTat,
    Pavlov,
    SecondByWeiner,
    Borufsen,
    Leyvraz,
    SecondByWhiteK72R,
]

# Environment built from the opponent list above; later cells train and plot from it.
env = MultipleRuleBasedAgentEnvironment(agents)

In [3]:
class QNetwork(nn.Module):
    """Small fully connected network: ``2 * lookback`` inputs -> 10 -> 10 -> 2 outputs.

    The two hidden layers use tanh; the output layer is passed through ReLU, so
    every returned value is >= 0.

    NOTE(review): the outputs are presumably one Q-value per action. The final
    ReLU clamps them at zero (and blocks the gradient for a clamped output), and
    the Kaiming gain is the ReLU one even though the hidden layers use tanh --
    confirm both choices are intended.
    """

    def __init__(self, lookback):
        """Create and initialise the layers.

        Args:
            lookback: window length; the first linear layer expects
                ``2 * lookback`` input features.
        """
        super().__init__()

        self.flatten = nn.Flatten()
        self.layer1 = nn.Linear(2 * lookback, 10)
        self.layer2 = nn.Linear(10, 10)
        # There is no ``layer3``; the output layer keeps the name ``layer4`` so
        # the state-dict keys stay the same.
        self.layer4 = nn.Linear(10, 2)

        # Re-initialise the weights (biases keep the nn.Linear default), in
        # definition order so random-number consumption is unchanged.
        for linear in (self.layer1, self.layer2, self.layer4):
            nn.init.kaiming_uniform_(linear.weight, mode="fan_in", nonlinearity="relu")

    def forward(self, x):
        """Evaluate the network on a single observation.

        Args:
            x: tensor holding ``2 * lookback`` elements in total; a leading
                dimension of size 1 is added and the rest is flattened.

        Returns:
            Tensor of shape ``(1, 2)`` with non-negative entries.
        """
        flat = self.flatten(torch.unsqueeze(x, 0))
        hidden = self.layer1(flat).tanh()
        hidden = self.layer2(hidden).tanh()
        return self.layer4(hidden).relu()

class QLearningTest(DeepQLearner):
    """DeepQLearner subclass carrying this notebook's hyperparameters and Q-network."""

    def __init__(self) -> None:
        """Set lookback, epsilon, learning rate, discount rate and the Q-network."""
        super().__init__()

        # The same value sizes the Q-network's input layer below.
        self.lookback = 4
        # presumably the exploration rate of an epsilon-greedy policy -- confirm in DeepQLearner
        self.epsilon = 0.2

        # NOTE(review): the underscore-prefixed attributes are assumed to be the
        # ones DeepQLearner reads during training -- confirm against the base class.
        self._learning_rate = 0.0001
        self._discount_rate = 0.99
        self._q_network = QNetwork(self.lookback)


# Single learner instance used by the training, inspection and tournament cells.
agent = QLearningTest()

In [4]:
# Train the learner in the environment (meaning of limit/epochs is defined by env.train).
env.train(trainee=agent, limit=200, epochs=100)

[00:39:59] Commencement of training.
BEFORE: tensor([[0.0000, 0.1909]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.1897]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0000, 0.1897]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0000, 0.1889]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0256, 0.1904]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0246, 0.1903]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0393, 0.2195]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0378, 0.2199]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.2883, 0.0000]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.2864, 0.0000]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.2107, 0.1564]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.2085, 0.1573]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.2085, 0.1573]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.2063, 0.1582]], grad_fn=<ReluBackward0>)
BEFORE: tensor([[0.0478, 0.0945]], grad_fn=<ReluBackward0>)
AFTER: tensor([[0.0461, 0.0954]], grad_fn=<ReluBackward0>)
BEFORE: ten

KeyboardInterrupt: 

In [None]:
# Show the network architecture followed by the raw counts recorded by the environment.
print(agent._q_network, env.counts, sep="\n")

# Express each count as a fraction of the total.
s = sum(env.counts.values())
print({key: count / s for key, count in env.counts.items()})

In [None]:
# Training metric history, drawn on the current axes with both axes anchored at zero.
ax = plt.gca()
ax.set_title("Loss")
ax.plot(env.metric_history)
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Raw reward series recorded by the environment, both axes anchored at zero.
ax = plt.gca()
ax.set_title("Rewards")
ax.plot(env.rewards)
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Window length of the moving average.
N = 20

# Sum over each full window of N rewards, then divide by N to get the mean.
window_sums = np.convolve(env.rewards, np.ones(N), mode="valid")

ax = plt.gca()
ax.set_title("Rolling average of rewards")
ax.plot(window_sums / N)
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Running total of the recorded rewards.
cumulative_reward = np.cumsum(env.rewards)

ax = plt.gca()
ax.set_title("Cumulative reward")
ax.plot(cumulative_reward)
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Shortfall of each reward against a baseline of 3 (the title treats 3 as the
# reward for cooperating), accumulated over time.
regret = 3 - np.array(env.rewards)

ax = plt.gca()
ax.set_title("Cumulative regret (vs cooperating)")
ax.plot(regret.cumsum())
ax.set_xlim(left=0)
# NOTE(review): any stretch where the running regret is negative is cut off by this limit.
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Per-epoch value stored under Action.COOPERATE in the normalised counts.
ax = plt.gca()
ax.set_title("Cooperation over the epochs")
print(env.normalised_epoch_counts)
ax.plot([epoch[Action.COOPERATE] for epoch in env.normalised_epoch_counts])
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# torch.save(agent._q_network.state_dict(), "model.pt")

In [None]:
# Put the Q-network into evaluation mode before the tournament cell; as the
# cell's last expression, the returned module is also displayed.
agent._q_network.eval()

In [None]:
# Round-robin tournament built from the AGENTS collection plus the trained learner.
tournament = RoundRobinTournament(AGENTS, [agent])

# `scores` and `times` are indexed by competitor; each entry is a sequence that
# is averaged / summed below.
scores, times = tournament.play(
    continuation_probability=0.99654, repetitions=20, jobs=12
)

# One row per competitor: (competitor, mean score rounded to an integer, total time).
results = [
    (
        competitor,
        round(sum(scores[competitor]) / len(scores[competitor])),
        sum(times[competitor]),
    )
    for competitor in scores
]
results.sort(key=lambda x: x[1], reverse=True)

for c, score, time in results:
    # The learner is passed to the tournament as an instance, and instances have
    # no __name__; fall back to the class name so the table prints either way.
    name = getattr(c, "__name__", type(c).__name__)
    print(f"{name:<30} {score:<20} {time:<20}")
