In [1]:
import torch
from torch import nn
import random,numpy
import numpy as np
from copy import deepcopy
from game import Game, Move, RandomPlayer, MyPlayer, TrainedPlayer, translate_number_to_position_direction, translate_number_to_position
import torch.nn.init as init
from tqdm import tqdm

In [2]:
class QuixoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(5*5, 100),
            nn.ReLU(),
            nn.Linear(100, 100),
            nn.ReLU(),
            nn.Linear(100, 44),
        )
        for layer in self.linear_relu_stack:
                if isinstance(layer, nn.Linear):
                    init.xavier_uniform_(layer.weight)
                    layer.bias.data.fill_(0.01) # bias is initialized to 0.01 

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
    

## Training the agent

In [3]:
## Constants definition
num_iterations = 500
num_matches = 10
max_dim_replay_buff = 10_000
time_to_update = 100
gamma = 0.99
batch_size=50

device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using {device} device")
torch.device(device)


Using cpu device


  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [4]:
class ReplayBuffer:    
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size        
        self.buffer = []
        self.position = 0
        
    def add_experience(self, experience):        
        if len(self.buffer) < self.buffer_size:
            self.buffer.append(experience)        
        else:
            self.buffer[self.position] = experience        
            self.position = (self.position + 1) % self.buffer_size
            
    def sample_batch(self, batch_size):
        batch_indices = np.random.choice(len(self.buffer), batch_size, replace=True)
        batch = [self.buffer[i] for i in batch_indices]
        return batch

In [5]:
from game import Player
from tqdm import tqdm


def training(player1: 'Player', player2: 'Player'):
    loss_tot = []
   
    replay_buff = ReplayBuffer(max_dim_replay_buff) # replay buffer, from which we sample for BATCH learning
    # inizializza i modelli -> passali per parametro
    torch.set_grad_enabled(True)

    
    for step in tqdm(range(num_iterations), desc="Training Iterations"):  
        # we fill the replay buffer with experiences made with matches
    
        for match in range(num_matches):
            # lo uso solo in inferenza
            # gioca una intera partita qui dentro e per ogni mossa, metti l'experience dentro il ReplayBuffer
            # ...
            # ... qui descrivere come viene effettivamente usata la rete neurale (magari impacchettarla in qualche funzione)
            winner = -1
            g = Game()
            #player1 = MyPlayer()
            #player2 = RandomPlayer()
            players = [player1, player2]
            
            while winner < 0:
              
                    g.current_player_idx += 1
                    g.current_player_idx %= len(players)
                    #print(g.current_player_idx)
                    prev_state=deepcopy(g)
                    from_pos, slide = players[g.current_player_idx].make_move(g)
                    
                    if g.current_player_idx==0:
                        ##l'azione è il valore del q-value quindi un numero? o la posizione in questo caso  
                        ##in quel caso replay_row=(prev_state.get_board,GeneratorNet((prev_state.get_board),g.get_board,reward)
                        reward=g.compute_reward(from_pos, slide)
                        #io qua dentro x ogni row voglio i 44 output della mia rete
                        
                        replay_row=(prev_state.get_flat_board(),player1.last_action_number, deepcopy(g), reward)
                        #eliminato last_action da replay row?? Non dovrebbe servire
                        replay_buff.add_experience(replay_row)
                        
                    
                    if g.check_winner() != -1:
                        break
            
            # replay_buffer.push(...) per ogni transizione di stato osservata ()
            # restituisci il risultato sotto forma di tupla (st, a, st+1, r) - (State, action, next_state, reward)
            
        # Now we sample a batch of data from the ReplayBuffer in order to train the Agent
        batch_to_train = replay_buff.sample_batch(batch_size)
      
        #divide the batch
        state_batch,action_num, next_state_batch, reward = zip(*batch_to_train) 

        #forward the Generator
        q_values = player1.GeneratorNet(torch.tensor(state_batch, dtype=torch.float32).to(player1.device))
        q_values_target = torch.zeros(batch_size, 44).to(player1.device)


        #update q_values target by using Bellman Equation  
        for i in range(batch_size): 
            if reward[i]!=1:
                q_values_target[i, action_num[i]] = reward[i] + gamma * torch.tensor(player1.compute_target(next_state_batch[i])).to(player1.device).item()
            else:
               q_values_target[i, action_num[i]] =reward[i]
          
        player1.optimizer.zero_grad()
        loss_curr=player1.criterion(q_values,q_values_target)
        loss_curr.backward()
    
        player1.optimizer.step()

        loss_tot.append(loss_curr)
      
        if (step % time_to_update) == 0:
            # update the parameter of the TargetNet
            player1.copy_params_TargetNet()
            
       
        #printa come varia 
   
    return player1.GeneratorNet.state_dict()
            
      

In [6]:
player1 = MyPlayer()
player2 = RandomPlayer()
#TrainedGeneratorNet=MyPlayer()


trained_model_params = training(player1, player2)
trained_player = TrainedPlayer(trained_model_params)
#TrainedGeneratorNet.GeneratorNet.load_state_dict(trained_model_params) # TRAINED NETWORK to use in INFERENCE PHASE


  q_values = player1.GeneratorNet(torch.tensor(state_batch, dtype=torch.float32).to(player1.device))
Training Iterations: 100%|██████████| 500/500 [00:32<00:00, 15.45it/s]


## Inference

In [15]:
wins = 0
losts = 0
draws = 0
num_match_test = 10

for _ in tqdm(range(num_match_test)):
    player=RandomPlayer()
    g=Game()
    #winner=g.play(TrainedGeneratorNet,player)
    winner=g.play(trained_player,player)

    if winner==0:
        wins+=1
        print("sii")
       
    if winner==1:
        print("noo")
        losts+=1
        
    if winner==-2:
        print("bho")
        draws+=1

print(f"Accuracy: {(wins/num_match_test)*100}")
print(f"Wins: {wins} - Losts: {losts} - Draws {draws}")



100%|██████████| 10/10 [00:00<00:00, 355.26it/s]

sii
noo
noo
noo
sii
sii
sii
sii
sii
sii
Accuracy: 70.0
Wins: 7 - Losts: 3 - Draws 0



