In [1]:
from utils.card_engine import Card_Game, Card_Env, random_agent

In [2]:
import math
import random
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt

from itertools import count

# Replay Memory

In [4]:
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        # save a transition
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

# Q network

In [6]:
class DQN(nn.Module):

    # n_input: the current state
    #  (1x52)    +  (56x52)       +       (1x52): the current state
    #    ^hand       ^who plays each card  ^cards not seen yet
    #                       + cards played
    # n_output: probability of playing each card
    #   (1x52)
    def __init__(self, n_input, n_output):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_input, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_output)

    def forward(self, x):
        if x.dtype == torch.float32:
            x = F.relu(self.layer1(x))
            x = F.relu(self.layer2(x))
        else:
            x=x.to(torch.float32)
            x = F.relu(self.layer1(x))
            x = F.relu(self.layer2(x))
        return self.layer3(x)

# Training

### The network agent
Selects a move according to epsilon-greedy policy:
sometimes uses the model to select move, sometimes just select one randomally

In [9]:
'''
A single step optimization of the model using Deep Q-Learning
1) samples a batch from memory, concatenates all the tensors into a single one
2) computes Q(s_t, a_t) and V(s_{t+1}) = max_a Q(s_{t+1}, a), where s_t --(a_t)--> s_{t+1}
3) computes the loss
4) updates the target network (which is computing V(s_{t+1})) at every step with soft update
'''
def optimize_model():
    transitions = []
    for turn, mem in memory.items():
        if len(mem) >= BATCH_SIZE:
            transitions += mem.sample(BATCH_SIZE)
    if transitions == []:
        return

    # transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # mask the non final states and find the corresponding next states
    # We need an illegal move to be a non-final state
    # Right now, we are throwing out all the final states which include the case when
    # the agent ends the game prematurely after playing an illegal move

    
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    # print('non_final_mask is', non_final_mask)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    non_final_next_states = torch.cat(non_final_next_states)
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    
    # print('reward_batch is', reward_batch)
    
    
    # compute Q(s_t, a)
    # for each state in the batch, find the value of the corresponding action
    state_action_values = policy_net(state_batch.to(torch.float)).gather(1, action_batch)
    
    # compute V(s_{t+1}) = max_a Q(s_{t+1}, a) for the next_states using the target_net
    next_state_values = torch.zeros(len(transitions), device=device)
    # next_state_values = -10.0 * torch.ones(len(transitions), device=device)

    if non_final_next_states != []:
        
        with torch.no_grad():
            # print(non_final_mask.shape, target_net(non_final_next_states).max(1).values)
            next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values  # these will all be negative!

    # R + \gamma max_a Q(s', a)
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # compute the Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # back propagate
    optimizer.zero_grad()
    loss.backward()

    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


In [10]:


# if GPU is to be used
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)
if torch.cuda.is_available():
    print("CUDA is available. GPU can be used.")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Using CPU.")

env = Card_Env()

CUDA is available. GPU can be used.
Device name: NVIDIA GeForce GTX 1650


In [11]:
def move_to_card(move):
    return ['2','3','4','5','6','7','8','9','10','J','Q','K','A'][move % 13], ['C', 'D', 'H', 'S'][int(move / 13)]

def simulate_game(policy, verbose=False, from_move=0):
    with torch.no_grad():
        moves_played = 0
        active_player = from_move % 4
        if verbose:
            print(f"Starting new game as player {active_player} from turn {from_move}.")
        test_game = Card_Game()
        for turn in range(52):
            if test_game.current_player != active_player or turn < from_move:
                move = test_game.sample_legal_move()
            else:
                # print(policy_net(test_game.get_network_input().to(device)))
                moves_played += 1
                move = policy_net(test_game.get_network_input().to(device)).argmax().item()
                if not(move in test_game.get_legal_moves()):
                    if verbose:
                        print(f"Tried to play illegal move {move_to_card(move)}")
                    return moves_played
            if verbose:
                print(f"Player {test_game.current_player} plays {move_to_card(move)}")
            test_game.play_card(move)
            if turn % 4 == 3 and verbose:
                print()
        return moves_played

In [27]:
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000

# I am making batch_size small here so that we can test if this goes through in shorter time
BATCH_SIZE = 100
MEMORY_SIZE = 10000

# Learning rate of the optimizer
LR = 1e-4

# soft update rate
TAU = 0.005

# future discount
# GAMMA = 1.0
GAMMA = 0


state = env.game.get_network_input()

n_input = len(state)
n_actions = 52

policy_net = DQN(n_input, n_actions).to(device)
# use a target network to prevent oscillation or divergence
target_net = DQN(n_input, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
# target_net.load_state_dict(torch.load('ev_working_function.pth'))



optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = {turn: ReplayMemory(MEMORY_SIZE) for turn in range(13)}

steps_done = 0

'''
Given the game state, select an action by the epsilon-greedy policy
'''
def select_action(game):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    # epsilon-greedy choice
        
    if sample > eps_threshold:
        with torch.no_grad():
            # return the index of the card with highest probability
            # predicted from the policy net
            # print(policy_net(game.get_network_input().to(torch.float32).to(device)))
            return policy_net(game.get_network_input().to(torch.float32).to(device)).max(0).indices.view(1,1)
    else:
        # random select a legal action
        return torch.tensor([[game.sample_legal_move()]], device=device, dtype=torch.long) #changed from long



In [31]:
with torch.no_grad():
    print(policy_net(env.game.get_network_input().to(device)))
    print(target_net(env.game.get_network_input().to(device)))

tensor([ 0.0407, -0.0332, -0.0790,  0.0465, -0.0660, -0.0715, -0.0573,  0.0063,
         0.0153,  0.0771,  0.0285, -0.0536,  0.0822, -0.0819,  0.0109, -0.1007,
        -0.0319,  0.0219,  0.0108,  0.0045,  0.0022,  0.0050,  0.1069,  0.0508,
         0.0795, -0.0333,  0.0504,  0.0288, -0.0312, -0.0985, -0.0405, -0.0056,
         0.0174,  0.0303, -0.0761,  0.0066,  0.0535, -0.0558, -0.0275,  0.0384,
         0.0345,  0.0211, -0.0121, -0.0511, -0.0630,  0.0259,  0.0545,  0.0538,
        -0.0268,  0.0404, -0.0394,  0.0149], device='cuda:0')
tensor([ 0.0407, -0.0332, -0.0790,  0.0465, -0.0660, -0.0715, -0.0573,  0.0063,
         0.0153,  0.0771,  0.0285, -0.0536,  0.0822, -0.0819,  0.0109, -0.1007,
        -0.0319,  0.0219,  0.0108,  0.0045,  0.0022,  0.0050,  0.1069,  0.0508,
         0.0795, -0.0333,  0.0504,  0.0288, -0.0312, -0.0985, -0.0405, -0.0056,
         0.0174,  0.0303, -0.0761,  0.0066,  0.0535, -0.0558, -0.0275,  0.0384,
         0.0345,  0.0211, -0.0121, -0.0511, -0.0630,  0.02

In [None]:
# fp =  DQN(n_input, 52).to(device)
# fp.load_state_dict(torch.load('latest_q_function.pth', map_location=torch.device('cpu')))

update_ind = 5000

rewards_list = []
benchmark = (-9.6, 1)

for i_episode in count():

    if i_episode % update_ind == 0 and i_episode != 0:
        print(f"Trained {i_episode} episodes")
        avg_reward = sum(rewards_list) / len(rewards_list)
        print(f'Average reward per episode: {avg_reward}.')
        simul_results = []
        simul_dist = [0 for i in range(13)]
        for g in range(100):
            res = simulate_game(policy_net, verbose=False, from_move=random.randint(0,3))
            simul_results.append(res)
            simul_dist[res-1] += 1
        print(f"Average simulated game duration: {sum(simul_results) / 100}")
        print(f"Distribution of game lengths: {simul_dist}")
        print(f"Memory bank: {[len(mem) for i, mem in memory.items()]}")
        if benchmark[0] < avg_reward and benchmark[1] < sum(simul_results) / 100 and i_episode >= 10000:
            print("New benchmark set.")
            torch.save(policy_net.state_dict(), 'ev_q_function_output.pth')
            benchmark = (avg_reward, sum(simul_results) / 100)
        rewards_list = []
        print()

    
    env.reset()
    state = torch.tensor(env.game.get_network_input(), dtype=torch.float32, device=device).unsqueeze(0)

    player_ind = random.randint(0, 3)
    while env.game.current_player != player_ind:
        move = env.game.sample_legal_move()
        env.game.play_card(move)
    
    for t in count():
        # Select action based on policy network
        
        with torch.no_grad():
            q_values = policy_net(state)
            action = q_values.max(1)[1].view(1, 1)

        # Perform action in the environment
        observation, reward, terminated = env.step(action.item(),fp=None)
        rewards_list.append(reward)
        reward = torch.tensor([reward], device=device)
        done = terminated

        # Compute next state
        if not terminated:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        else:
            next_state = None

        # Store transition in memory
        # int(env.game.turn_counter / 4)
        memory[t].push(state, action, next_state, reward)

        # Move to next state
        state = next_state

        # Perform optimization step
        optimize_model()

        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            # print(f'Episode {i_episode} ended in {t} steps.')
            break



  state = torch.tensor(env.game.get_network_input(), dtype=torch.float32, device=device).unsqueeze(0)
  next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)


Trained 5000 episodes
Average reward per episode: -0.34120171673819744.
Average simulated game duration: 1.79
Distribution of game lengths: [62, 12, 15, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0]
Memory bank: [10000, 10000, 9140, 3479, 844, 121, 10, 1, 0, 0, 0, 0, 0]

Trained 10000 episodes
Average reward per episode: -0.3051422605063952.
Average simulated game duration: 1.84
Distribution of game lengths: [63, 12, 10, 10, 3, 2, 0, 0, 0, 0, 0, 0, 0]
Memory bank: [10000, 10000, 10000, 3923, 982, 151, 13, 2, 0, 0, 0, 0, 0]
New benchmark set.

Trained 15000 episodes
Average reward per episode: -0.29819550824354146.
Average simulated game duration: 1.97
Distribution of game lengths: [53, 18, 15, 9, 3, 2, 0, 0, 0, 0, 0, 0, 0]
Memory bank: [10000, 10000, 10000, 4376, 1136, 180, 18, 3, 0, 0, 0, 0, 0]
New benchmark set.

Trained 20000 episodes
Average reward per episode: -0.3007284079084287.
Average simulated game duration: 1.78
Distribution of game lengths: [66, 10, 10, 9, 4, 1, 0, 0, 0, 0, 0, 0, 0]
Memory

In [35]:
with torch.no_grad():
    env.game.reset()
    net_input = env.game.get_network_input().to(device)
    hand = net_input[:52]
    cards = torch.arange(52).to(device)
    q_func = policy_net(net_input)
    print(f'{q_func.argmax()}')
    print(torch.stack((cards, hand, q_func)).transpose(0, 1))

32
tensor([[ 0.0000,  1.0000, -1.1869],
        [ 1.0000,  0.0000, -1.3314],
        [ 2.0000,  0.0000, -1.3131],
        [ 3.0000,  0.0000, -2.1086],
        [ 4.0000,  0.0000, -3.4949],
        [ 5.0000,  0.0000, -3.0317],
        [ 6.0000,  0.0000, -1.5275],
        [ 7.0000,  0.0000, -2.9876],
        [ 8.0000,  0.0000, -2.5245],
        [ 9.0000,  0.0000, -2.5576],
        [10.0000,  0.0000, -1.5012],
        [11.0000,  0.0000, -1.8418],
        [12.0000,  0.0000, -0.9287],
        [13.0000,  1.0000, -1.7412],
        [14.0000,  0.0000, -1.1265],
        [15.0000,  0.0000, -2.1763],
        [16.0000,  1.0000, -1.9561],
        [17.0000,  0.0000, -2.4696],
        [18.0000,  0.0000, -2.5106],
        [19.0000,  0.0000, -1.3729],
        [20.0000,  1.0000, -1.1327],
        [21.0000,  0.0000, -1.1238],
        [22.0000,  0.0000, -2.3514],
        [23.0000,  1.0000, -2.1013],
        [24.0000,  1.0000, -0.9404],
        [25.0000,  0.0000, -2.3779],
        [26.0000,  0.0000, -1.5517]

In [27]:
torch.save(policy_net.state_dict(), 'ev_q_function_output.pth')

In [31]:
# simulate_game(policy_net, verbose=True, from_move=random.randint(0,3))
simul_results = []
with torch.no_grad():
    simul_dist = [0 for i in range(13)]
    for g in range(1000):
        res = simulate_game(policy_net, verbose=False, from_move=random.randint(0,3))
        simul_results.append(res)
        simul_dist[res-1] += 1
    print(f"Average simulated game duration: {sum(simul_results) / 1000}")
    print(f"Distribution of game lengths: {simul_dist}")


Average simulated game duration: 1.111
Distribution of game lengths: [891, 107, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [163]:
simulate_game(policy_net, verbose=True, from_move=random.randint(0, 3))

Starting new game as player 2 from turn 2.
Player 0 plays ('10', 'D')
Player 1 plays ('8', 'D')
Player 2 plays ('J', 'D')
Player 3 plays ('3', 'D')

Tried to play illegal move ('J', 'D')


2