In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import deque
import random
from tqdm.notebook import tqdm

np.random.seed(42)

In [None]:
def random_distribution(width,height,snake_list,k=1):
    """Draw one random cell of a ``width`` x ``height`` grid, avoiding given cells.

    Every cell that is not listed in ``snake_list`` is equally likely; listed
    cells get probability zero.

    Args:
        width: number of columns (x runs over ``0 .. width-1``).
        height: number of rows (y runs over ``0 .. height-1``).
        snake_list: iterable of ``(x, y)`` cells that must not be drawn.
        k: not used by the implementation; kept for existing callers.

    Returns:
        A length-2 NumPy integer array ``[x, y]``.
    """
    # Row r of `cells` holds the (x, y) pair whose flat index is y * width + x.
    xs = np.arange(width)
    ys = np.arange(height)
    cells = np.column_stack((np.tile(xs, ys.size), np.repeat(ys, xs.size)))

    # Free-cell indicator laid out as [y, x] so flattening lines up with `cells`.
    free = np.ones((height, width))
    for col, row in snake_list:
        free[row, col] = 0
    free = free.reshape(-1)

    # Normalise the indicator into a probability vector over all cells.
    probabilities = free / free.sum(axis=0, keepdims=1)

    picked = np.random.choice(len(cells), 1, p=probabilities)
    return cells[picked[0]]

In [None]:
# Maps an action index to the (dx, dy) offset that is added to the snake head.
switch_actions = [(0,1),(0,-1),(-1,0),(1,0)]
#0 = UP    -> (0, 1):  y + 1
#1 = DOWN  -> (0, -1): y - 1
#2 = LEFT  -> (-1, 0): x - 1
#3 = RIGHT -> (1, 0):  x + 1

In [None]:
def generate_fruit_certain_distance(width,height,snake_list,distance=2):
    """Pick a fruit position close to the snake's head.

    The head is the last element of ``snake_list``. On each attempt the head is
    shifted by a random integer in ``[-distance, distance]`` on both axes, the
    result is clamped to the board, and the first candidate that differs from
    the head is returned. Only the head is excluded; other body cells are not
    checked.

    Returns:
        The chosen cell as a two-element list ``[x, y]``.
    """
    head = snake_list[-1]

    def clamp(value, upper):
        # Force `value` into the inclusive range 0 .. upper.
        return min(max(0, value), upper)

    while True:
        # The x offset is drawn before the y offset on every attempt.
        cx = clamp(head[0] + random.randint(-distance, distance), width - 1)
        cy = clamp(head[1] + random.randint(-distance, distance), height - 1)
        if head != (cx, cy):
            return [cx, cy]

In [None]:
def angle_between(p1, p2):
    """Angle of ``p1`` minus angle of ``p2``, in degrees within ``[0, 360)``.

    Both arguments are ``(x, y)`` vectors; each angle is measured with
    ``arctan2(y, x)`` and the difference is wrapped into one full turn.
    """
    def polar_angle(point):
        # arctan2 expects (y, x) while points are stored as (x, y).
        return np.arctan2(point[1], point[0])

    delta = polar_angle(p1) - polar_angle(p2)
    full_turn = 2 * np.pi
    return np.rad2deg(np.mod(delta, full_turn))

In [None]:
def mean_results(all_data):
  """Return the arithmetic mean of ``all_data`` truncated to an int.

  Args:
    all_data: sized collection of numbers.

  Returns:
    ``int(sum / len)`` (truncation toward zero), or ``0`` when ``all_data`` is
    empty so that summaries of an empty result set do not raise
    ``ZeroDivisionError``.
  """
  size = len(all_data)
  if size == 0:
    return 0
  return int(sum(all_data)/size)

In [None]:
def switch_matrix(X,Y):
    """Augment (state, Q-value) pairs with their mirrored versions.

    For every input pair four entries are produced, in this order:

    1. the original state and Q-values;
    2. rows reversed (axis 0), with Q-values 0 and 1 swapped;
    3. columns reversed (axis 1), with Q-values 2 and 3 swapped;
    4. rows and columns reversed, with both pairs swapped.

    Args:
        X: iterable of states; each is expected to be array-like with shape
            ``(rows, cols, 1)``.
        Y: iterable of length-4 Q-value vectors aligned with ``X``.

    Returns:
        ``(aug_x, aug_y)`` as NumPy arrays holding four entries per input pair.
    """
    aug_x = []
    aug_y = []
    for (a,q) in tqdm(tuple(zip(X,Y))):
        # Work on a float array so every variant shares one dtype.
        grid = np.asarray(a, dtype=float)

        aug_x.append(grid)
        aug_y.append(q)

        # Slicing mirrors every row/column, including the middle one of an
        # odd-sized axis.
        aug_x.append(grid[::-1])
        aug_y.append(np.array([q[1],q[0],q[2],q[3]]))

        aug_x.append(grid[:, ::-1])
        aug_y.append(np.array([q[0],q[1],q[3],q[2]]))

        aug_x.append(grid[::-1, ::-1])
        aug_y.append(np.array([q[1],q[0],q[3],q[2]]))

    return np.array(aug_x),np.array(aug_y)

In [None]:
class SnakeEnv:
    """Snake game played on a ``width`` x ``height`` grid.

    Per-episode state:
      * ``snake_body``  - list of ``(x, y)`` cells ordered from tail to head
      * ``snake_head``  - ``(x, y)`` of the head (the last body cell)
      * ``fruit``       - ``(x, y)`` of the current fruit
      * ``field``       - array of shape ``(height, width, 1)`` indexed
                          ``[y, x, 0]``: 1 = empty, 10 = body, 20 = head,
                          30 = fruit
      * ``score``, ``total_reward``, ``transitions`` - episode statistics
    """

    def __init__(self,width,height):
        self.width = width
        self.height = height

        # All per-episode state (snake, fruit, counters, field) is created here.
        self.reset()

    def reward(self,act=-1):
        """Immediate reward of each move from the current state.

        A move is worth -1 by default, -100 if it runs into the body or a
        wall, and 100 if it lands on the fruit.

        Args:
            act: action index, or -1 to get the rewards of all four actions.

        Returns:
            ``rewards[act]`` when ``act != -1``, otherwise the length-4 array.
        """
        rewards = np.zeros((4))
        for action in range(4):
            new_head,new_body,_ = self.move_snake(action)
            rewards[action] = -1
            if self.hit_body(new_head,new_body) or self.hit_border(new_head,new_body):
                rewards[action] = -100
            if self.hit_fruit(new_head,new_body):
                rewards[action] = 100

        if act != -1:
            return rewards[act]

        return rewards

    def hit_border(self,new_head,new_body):
        """True when ``new_head`` lies outside the board (``new_body`` is unused)."""
        return new_head[0] >= self.width or new_head[0] < 0 or new_head[1] >= self.height or new_head[1] < 0

    def hit_body(self,new_head,new_body):
        """Return 1 if ``new_head`` equals any body cell except the last one, else 0."""
        # The last element of new_body is the head itself, so it is skipped.
        return int(any(cell == new_head for cell in new_body[:-1]))

    def hit_fruit(self,new_head,new_body):
        """True when ``new_head`` is on the fruit (``new_body`` is unused)."""
        return new_head == self.fruit

    def stop_game(self):
        """Truthy when the game is over.

        The game ends when the head overlaps the body, leaves the board, or the
        snake reaches a length of ``width * height - 5``.
        """
        return self.hit_body(self.snake_head,self.snake_body) or self.hit_border(self.snake_head,self.snake_body) or len(self.snake_body) == (self.width*self.height -5)

    def move_snake(self,action):
        """Project one step without modifying the environment.

        Returns:
            ``(new_head, new_body, dropped_tail)`` where ``new_body`` is the
            current body shifted by one cell and ``dropped_tail`` is the cell
            that was removed from the tail.
        """
        new_head = tuple(map(sum, zip(self.snake_head, switch_actions[action])))
        new_body = self.snake_body[1:] + [new_head]
        return new_head,new_body,self.snake_body[0]

    def next_state(self,action):
        """Apply ``action``; on a fruit hit grow the snake and spawn a new fruit."""
        self.snake_head,self.snake_body,start = self.move_snake(action)
        if self.hit_fruit(self.snake_head,self.snake_body):
            self.score += 1
            # Put the dropped tail cell back so the snake grows by one.
            self.snake_body = [start] + self.snake_body
            self.gen_fruit()

    def transition(self,action):
        """Play ``action`` and return the resulting replay transition.

        Returns:
            ``[state, action, reward, next_state, is_terminal]`` where
            ``is_terminal`` is 1 when this move ate the fruit or ended the
            game. The flag is evaluated for the move that was just played,
            not for the state the move started from.
        """
        step_reward = self.reward(action)
        self.total_reward += step_reward

        # next_state() replaces the fruit after a pickup, so the pickup has to
        # be detected on the projected head before the move is applied.
        new_head,new_body,_ = self.move_snake(action)
        ate_fruit = self.hit_fruit(new_head,new_body)

        # update_field() builds a fresh array, so this reference stays intact.
        state_before = self.field
        self.next_state(action)
        self.update_field()

        is_terminal = int(bool(ate_fruit or self.stop_game()))
        self.transitions.append(self.snake_head)
        return [state_before,action,step_reward,self.field,is_terminal]

    def _begin_episode(self,head):
        """Start a new episode with a one-cell snake located at ``head``."""
        self.snake_head = head
        self.snake_body = [head]

        self.gen_fruit()
        self.score = 0
        self.total_reward = 0
        self.transitions= []
        self.update_field()

    def random_start(self):
        """Start a new episode with the snake on a random cell."""
        [sx,sy] = random_distribution(self.width,self.height,[])
        self._begin_episode((int(sx),int(sy)))

    def reset(self):
        """Start a new episode with the snake at the centre of the board."""
        self._begin_episode((int(self.width/2),int(self.height/2)))

    def gen_fruit(self):
        """Place the fruit on a random cell that is not part of the snake."""
        [x,y] = random_distribution(self.width,self.height,self.snake_body)
        # Plain ints keep the fruit comparable with the integer head tuples.
        self.fruit = (int(x),int(y))

    def update_field(self):
        """Rebuild ``field`` from the current snake and fruit.

        When the game is over the field is left filled with ones, because the
        head may then lie outside the board.
        """
        # Rows are y and columns are x, matching the [y, x] indexing below.
        field = np.ones((self.height,self.width,1))
        if not self.stop_game():
            for x,y in self.snake_body:
                field[y,x,0] = 10
            field[self.snake_head[1],self.snake_head[0],0] = 20
            field[self.fruit[1],self.fruit[0],0] = 30
        self.field = field

    def print_field(self):
        """Print the field as a 2-D grid with one row per y value."""
        print(self.field.reshape(self.height,self.width))

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import deque
import random

np.random.seed(42)

class DQM():
    """Deep Q-learning helper with an online network and a target network.

    ``train_model`` is fitted on replayed transitions; ``predict_model`` is a
    copy whose weights are synchronised every ``equalize_term`` training calls
    and which scores the next states when building the targets.

    Transitions stored in ``buffer`` are sequences
    ``(state, action, reward, next_state, done)``.
    """

    def __init__(self,width,height,size_buffer=100000,update_term=2,batch_size = 5000, discount=0.5,equalize_term=4):
        """
        Args:
            width: board width in cells.
            height: board height in cells.
            size_buffer: maximum number of transitions kept for replay.
            update_term: stored on the instance; not read by this class.
            batch_size: transitions sampled per ``train()`` call; training is
                skipped until the buffer holds at least this many.
            discount: factor applied to the best next-state Q-value.
            equalize_term: number of ``train()`` calls between target syncs.
        """
        self.width = width
        self.height = height

        self.train_model = self.create_graph()
        self.predict_model = self.create_graph()

        # Both networks start from identical weights.
        self.equalize_model()

        self.size_buffer = size_buffer
        self.buffer = deque(maxlen=size_buffer)
        self.update_counter = 0
        self.update_term = update_term
        self.batch_size = batch_size
        self.discount = discount

        self.equalize_counter = 0
        self.equalize_term = equalize_term

    def create_graph(self):
        """Build and compile the fully connected Q-network (4 linear outputs).

        The input shape comes from this instance's own ``width``/``height``
        rather than from module-level globals. It is laid out as
        ``(height, width, 1)`` because SnakeEnv indexes its field as
        ``field[y][x]``; for square boards this is the same shape as before.
        """
        model = keras.models.Sequential([
            keras.layers.InputLayer(input_shape=[self.height,self.width,1]),
            #keras.layers.Conv2D(16,(3,3),activation="tanh"),
            keras.layers.Flatten(),
            keras.layers.Dense(512,activation="tanh"),
            keras.layers.Dense(256,activation="tanh"),
            keras.layers.Dense(128,activation="tanh"),
            keras.layers.Dense(64,activation="tanh"),
            keras.layers.Dense(32,activation="tanh"),
            keras.layers.Dense(4,activation="linear")
        ])
        model.compile(loss="mse", optimizer="adam", metrics=['mean_absolute_percentage_error'])
        return model

    def add_train_data(self, transition):
        """Append one transition to the replay buffer."""
        self.buffer.append(transition)

    def train(self):
        """Fit the online network on one random replay batch.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. The target for the taken action is ``reward`` for
        terminal transitions, otherwise
        ``reward + discount * max(target_network(next_state))``; the other
        three outputs keep the online network's current predictions.
        """
        if len(self.buffer) < self.batch_size:
            return
        # The guard above guarantees the buffer can supply a full batch.
        batch = random.sample(self.buffer, self.batch_size)

        X_states = tf.convert_to_tensor(np.array([t[0] for t in batch]), np.float32)
        X_next_states = tf.convert_to_tensor(np.array([t[3] for t in batch]), np.float32)

        Y_current_q_values = self.train_model.predict(X_states, verbose=0)
        Y_future_q_values = self.predict_model.predict(X_next_states, verbose=0)

        for index, (_, action, reward, _, done) in enumerate(batch):
            if not done:
                max_future_q_value = np.max(Y_future_q_values[index])
                Y_current_q_values[index][action] = reward + self.discount * max_future_q_value
            else:
                Y_current_q_values[index][action] = reward

        #X_states,Y_current_q_values = switch_matrix(X_states,Y_current_q_values)

        Y_q_values = tf.convert_to_tensor(Y_current_q_values, np.float32)

        self.train_model.fit(x=X_states, y=Y_q_values, batch_size=self.batch_size,epochs=10,shuffle=True ,verbose=0)

        # Refresh the target network every `equalize_term` training calls.
        self.equalize_counter += 1
        if self.equalize_counter == self.equalize_term:
            print("update target model")
            self.equalize_counter = 0
            self.equalize_model()

    def predict_q_values(self, state):
        """Return the online network's Q-values for one state, shape ``(1, 4)``."""
        # Wrap the single state into a batch of one.
        X = np.zeros((1,self.height,self.width,1))
        X[0] = state
        X_tf = tf.convert_to_tensor(X, np.float32)
        return self.train_model.predict(X_tf, verbose=0)

    def equalize_model(self):
        """Copy the online network's weights into the target network."""
        self.predict_model.set_weights(self.train_model.get_weights())

    def load_trained_model(self,name):
        """Load weights from file ``name`` into the online network and sync the target."""
        self.train_model.load_weights(name)
        self.equalize_model()

In [None]:
def choose_action(env,model,eps):
  """Pick the next action: heuristic with probability ``eps``, else the model.

  The heuristic takes the action with the best immediate reward (ties broken
  at random). When that best reward is only the plain step reward (-1), it
  prefers the direction pointing toward the fruit, unless that direction is
  worth -100.

  Args:
    env: environment exposing ``reward()``, ``fruit``, ``snake_head`` and
      ``field``.
    model: object exposing ``predict_q_values(field)``.
    eps: probability of using the heuristic; values outside ``[0, 1]`` are
      clamped.

  Returns:
    The chosen action index (0..3).
  """
  # Heuristic action: best immediate reward, ties broken at random.
  rewards = env.reward()
  best_reward = rewards[np.argmax(rewards)]
  candidates = [i for i, r in enumerate(rewards) if r == best_reward]
  action = (random.sample(candidates,k=1))[0]

  if rewards[action] == -1:
    # No fruit in reach: steer along the head -> fruit direction.
    degrees = angle_between((0,0),(env.fruit[0]-env.snake_head[0],env.fruit[1]-env.snake_head[1]))
    # On the shared boundaries 135 maps to action 2 and 225 to action 0.
    if 225 <= degrees <= 315:
      vector_choice = 0
    elif 135 <= degrees < 225:
      vector_choice = 2
    elif 45 <= degrees < 135:
      vector_choice = 1
    else:
      vector_choice = 3
    if rewards[vector_choice] != -100:
      action = vector_choice

  # Clamp so an out-of-range eps cannot produce a negative probability.
  p_heuristic = min(max(eps, 0), 1)
  if random.random() < p_heuristic:
    return action

  # Ask the model only when its suggestion is actually used; this avoids a
  # network forward pass on every heuristic step.
  q_values = model.predict_q_values(env.field)
  return np.argmax(q_values[0])

In [None]:
def test(model,env,verbose=0,games=20):
    """Play greedy games with ``q_game`` and print a summary line.

    Games that hit the move limit (``out`` flag set) are only counted; the
    averages cover the remaining games.

    Args:
        model: Q-value model passed on to ``q_game``.
        env: environment passed on to ``q_game``.
        verbose: when 1, also print ``moves reward score out`` for every game.
        games: number of games to play.
    """
    tmosse = []
    trewards = []
    tscore = []
    out=0

    for _ in tqdm(range(games)):
        m,r,s,o = q_game(model,env)
        if o == 0:
            tmosse.append(m)
            trewards.append(r)
            tscore.append(s)
        else:
            out+=1
        if verbose==1:
            print(m,r,s,o)

    def average(values):
        # If every game timed out the lists are empty; report 0 instead of
        # dividing by zero.
        return mean_results(values) if values else 0

    print("test",average(tmosse),average(trewards),average(tscore),out)

In [None]:
# Board size in cells; later cells pass these to SnakeEnv and DQM.
width = 8
height = 8

In [None]:
# Shared environment and Q-network wrapper used by the training and test cells.
env = SnakeEnv(width,height)
model = DQM(width,height)

In [None]:
def q_game(model,env,max_moves=100):
  """Play one game, always taking the action with the highest predicted Q-value.

  Args:
    model: object exposing ``predict_q_values(field)``.
    env: environment exposing ``random_start()``, ``stop_game()``, ``field``,
      ``transition(action)``, ``total_reward`` and ``score``.
    max_moves: the game is cut off once this many moves have been played.

  Returns:
    ``(moves, total_reward, score, out)`` where ``out`` is 1 if the game was
    cut off at ``max_moves`` and 0 otherwise.
  """
  env.random_start()
  mosse = 0
  out = 0
  while not env.stop_game():
    q_values = model.predict_q_values(env.field)
    env.transition(np.argmax(q_values[0]))
    mosse += 1
    if mosse >= max_moves:
      out = 1
      break
  return (mosse,env.total_reward,env.score,out)

In [None]:
# Resume from previously saved weights (trainIA() saves to this same file name).
# NOTE(review): presumably this fails on a first run when the file does not exist yet - confirm.
model.load_trained_model("new_new_model88_512.h5")

In [None]:
def trainIA(games_per_iter=200,train_rounds=6,eps_start=0.8,eps_end=0.5,eps_step=0.005,max_moves=100,weights_path="new_new_model88_512.h5"):
    """Run the training schedule on the global ``env`` and ``model``.

    Each iteration plays ``games_per_iter`` games with ``choose_action``,
    stores every transition in the model's replay buffer, lowers ``eps`` by
    ``eps_step``, calls ``model.train()`` ``train_rounds`` times, prints the
    statistics, runs ``test`` and saves the weights.

    Args:
        games_per_iter: games played per iteration.
        train_rounds: ``model.train()`` calls per iteration.
        eps_start: heuristic probability used in the first iteration.
        eps_end: the last iteration is the one played with ``eps == eps_end``.
        eps_step: amount ``eps`` decreases per iteration; must be positive.
        max_moves: move limit of a single game.
        weights_path: file the online network's weights are saved to.

    Raises:
        ValueError: if ``eps_step`` is not positive (the loop would never end).
    """
    if eps_step <= 0:
        raise ValueError("eps_step must be positive")

    global_score = 0
    it = 0
    eps = eps_start
    while eps >= eps_end:
        print("ITERAZIONE:",it)
        reward_medio = []
        mosse_medie = []
        score_medio = []

        for _ in tqdm(range(games_per_iter)):
            env.random_start()
            mosse = 0
            while env.stop_game() == False:
                action = choose_action(env,model,eps)
                transition = env.transition(action)
                model.add_train_data(transition)
                global_score = max(global_score,env.score)
                mosse += 1
                if mosse >= max_moves:
                    break

            mosse_medie.append(mosse)
            reward_medio.append(env.total_reward)
            score_medio.append(env.score)

        it += 1
        # Derive eps from the iteration count instead of subtracting repeatedly:
        # accumulated float error (0.4999999999999998 instead of 0.5) used to
        # end the schedule one iteration early.
        eps = round(eps_start - it * eps_step, 10)

        print("TRAINING PHASE")
        for i in range(train_rounds):
            print("training:",i)
            model.train()
        print("TEST PHASE")
        print("mosse",mean_results(mosse_medie),"reward",mean_results(reward_medio),"buffer",len(model.buffer),"score",mean_results(score_medio),"global_score",global_score,"eps",eps)
        test(model,env)
        model.train_model.save_weights(weights_path)

In [None]:
# Run the full training schedule; per-iteration statistics are printed below.
trainIA()

ITERAZIONE: 0


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 40 reward 488 buffer 8123 score 6 global_score 16 eps 0.795


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 10 50 1 8
ITERAZIONE: 1


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 41 reward 497 buffer 16364 score 6 global_score 16 eps 0.79


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 10 50 1 8
ITERAZIONE: 2


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 42 reward 526 buffer 24876 score 6 global_score 18 eps 0.785


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 14 73 1 7
ITERAZIONE: 3


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 40 reward 480 buffer 32997 score 6 global_score 18 eps 0.78


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 62 1 9
ITERAZIONE: 4


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 40 reward 475 buffer 41020 score 6 global_score 18 eps 0.775


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 9 16 1 7
ITERAZIONE: 5


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 38 reward 462 buffer 48639 score 5 global_score 18 eps 0.77


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 7 -5 1 4
ITERAZIONE: 6


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 39 reward 477 buffer 56514 score 6 global_score 18 eps 0.765


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 17 147 2 7
ITERAZIONE: 7


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 41 reward 505 buffer 64749 score 6 global_score 18 eps 0.76


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 123 2 8
ITERAZIONE: 8


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 40 reward 491 buffer 72882 score 6 global_score 18 eps 0.755


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 12 51 1 7
ITERAZIONE: 9


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 39 reward 468 buffer 80869 score 6 global_score 18 eps 0.75


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 12 69 1 5
ITERAZIONE: 10


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 40 reward 499 buffer 88957 score 6 global_score 18 eps 0.745


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 11 91 2 7
ITERAZIONE: 11


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 37 reward 452 buffer 96440 score 5 global_score 18 eps 0.74


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 10 58 1 8
ITERAZIONE: 12


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 40 reward 481 buffer 100000 score 6 global_score 18 eps 0.735


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 18 138 2 5
ITERAZIONE: 13


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 40 reward 484 buffer 100000 score 6 global_score 18 eps 0.73


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 108 2 9
ITERAZIONE: 14


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 36 reward 438 buffer 100000 score 5 global_score 18 eps 0.725


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 16 118 2 7
ITERAZIONE: 15


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 38 reward 443 buffer 100000 score 5 global_score 18 eps 0.72


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 115 2 8
ITERAZIONE: 16


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 35 reward 410 buffer 100000 score 5 global_score 18 eps 0.715


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 42 1 5
ITERAZIONE: 17


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 36 reward 439 buffer 100000 score 5 global_score 18 eps 0.71


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 112 2 11
ITERAZIONE: 18


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 35 reward 408 buffer 100000 score 5 global_score 18 eps 0.705


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 15 103 2 7
ITERAZIONE: 19


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 37 reward 437 buffer 100000 score 5 global_score 19 eps 0.7


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 9 43 1 6
ITERAZIONE: 20


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 36 reward 422 buffer 100000 score 5 global_score 19 eps 0.695


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 12 40 1 8
ITERAZIONE: 21


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 35 reward 412 buffer 100000 score 5 global_score 19 eps 0.69


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 11 64 1 5
ITERAZIONE: 22


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 38 reward 450 buffer 100000 score 5 global_score 19 eps 0.6849999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 10 39 1 3
ITERAZIONE: 23


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 35 reward 412 buffer 100000 score 5 global_score 19 eps 0.6799999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 10 65 1 5
ITERAZIONE: 24


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 37 reward 436 buffer 100000 score 5 global_score 19 eps 0.6749999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 136 2 5
ITERAZIONE: 25


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 32 reward 368 buffer 100000 score 4 global_score 19 eps 0.6699999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 14 135 2 5
ITERAZIONE: 26


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 34 reward 399 buffer 100000 score 5 global_score 19 eps 0.6649999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 16 94 2 6
ITERAZIONE: 27


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 36 reward 403 buffer 100000 score 5 global_score 19 eps 0.6599999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 111 2 6
ITERAZIONE: 28


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 33 reward 393 buffer 100000 score 5 global_score 19 eps 0.6549999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 17 92 2 5
ITERAZIONE: 29


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 34 reward 397 buffer 100000 score 5 global_score 19 eps 0.6499999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 16 179 2 8
ITERAZIONE: 30


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 32 reward 370 buffer 100000 score 4 global_score 19 eps 0.6449999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 107 2 9
ITERAZIONE: 31


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 33 reward 377 buffer 100000 score 5 global_score 19 eps 0.6399999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 15 103 2 7
ITERAZIONE: 32


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 32 reward 368 buffer 100000 score 4 global_score 19 eps 0.6349999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 17 140 2 7
ITERAZIONE: 33


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 34 reward 374 buffer 100000 score 5 global_score 19 eps 0.6299999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 16 136 2 8
ITERAZIONE: 34


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 33 reward 380 buffer 100000 score 5 global_score 19 eps 0.6249999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 13 108 2 9
ITERAZIONE: 35


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 33 reward 376 buffer 100000 score 5 global_score 19 eps 0.6199999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 9 23 1 7
ITERAZIONE: 36


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 32 reward 365 buffer 100000 score 4 global_score 19 eps 0.6149999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 14 88 2 11
ITERAZIONE: 37


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 32 reward 384 buffer 100000 score 5 global_score 19 eps 0.6099999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 20 169 2 6
ITERAZIONE: 38


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 34 reward 399 buffer 100000 score 5 global_score 19 eps 0.6049999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 24 240 3 10
ITERAZIONE: 39


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 34 reward 401 buffer 100000 score 5 global_score 19 eps 0.5999999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 14 110 2 6
ITERAZIONE: 40


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 33 reward 391 buffer 100000 score 5 global_score 19 eps 0.5949999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 19 227 3 13
ITERAZIONE: 41


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 33 reward 370 buffer 100000 score 4 global_score 19 eps 0.5899999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 19 176 2 8
ITERAZIONE: 42


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 31 reward 361 buffer 100000 score 4 global_score 19 eps 0.5849999999999999


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 16 101 2 7
ITERAZIONE: 43


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 33 reward 395 buffer 100000 score 5 global_score 19 eps 0.5799999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 17 135 2 8
ITERAZIONE: 44


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 29 reward 314 buffer 100000 score 4 global_score 19 eps 0.5749999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 16 124 2 12
ITERAZIONE: 45


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 32 reward 380 buffer 100000 score 5 global_score 19 eps 0.5699999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 19 208 3 7
ITERAZIONE: 46


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 34 reward 396 buffer 100000 score 5 global_score 19 eps 0.5649999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 18 106 2 6
ITERAZIONE: 47


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 33 reward 386 buffer 100000 score 5 global_score 19 eps 0.5599999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 12 98 2 6
ITERAZIONE: 48


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 32 reward 351 buffer 100000 score 4 global_score 19 eps 0.5549999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 15 140 2 3
ITERAZIONE: 49


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 32 reward 363 buffer 100000 score 4 global_score 19 eps 0.5499999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 20 212 3 6
ITERAZIONE: 50


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 34 reward 409 buffer 100000 score 5 global_score 19 eps 0.5449999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 19 199 3 7
ITERAZIONE: 51


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 30 reward 345 buffer 100000 score 4 global_score 19 eps 0.5399999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 21 182 3 9
ITERAZIONE: 52


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 32 reward 370 buffer 100000 score 4 global_score 19 eps 0.5349999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 22 232 3 6
ITERAZIONE: 53


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 31 reward 348 buffer 100000 score 4 global_score 19 eps 0.5299999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 11 58 1 8
ITERAZIONE: 54


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 30 reward 321 buffer 100000 score 4 global_score 19 eps 0.5249999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 22 194 3 4
ITERAZIONE: 55


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 29 reward 311 buffer 100000 score 4 global_score 19 eps 0.5199999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 20 163 2 10
ITERAZIONE: 56


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 30 reward 331 buffer 100000 score 4 global_score 19 eps 0.5149999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 12 97 2 6
ITERAZIONE: 57


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 29 reward 300 buffer 100000 score 4 global_score 19 eps 0.5099999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 18 147 2 7
ITERAZIONE: 58


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
training: 2
training: 3
update target model
training: 4
training: 5
TEST PHASE
mosse 30 reward 335 buffer 100000 score 4 global_score 19 eps 0.5049999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 19 198 3 6
ITERAZIONE: 59


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


TRAINING PHASE
training: 0
training: 1
update target model
training: 2
training: 3
training: 4
training: 5
update target model
TEST PHASE
mosse 31 reward 325 buffer 100000 score 4 global_score 19 eps 0.4999999999999998


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


test 17 194 3 6


In [None]:
# Evaluate the greedy policy, printing per game: moves, total reward, score, timed-out flag.
test(model,env,1)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

29 175.0 3 0
100 -100.0 0 1
24 281.0 4 0
4 -103.0 0 0
100 -100.0 0 1
13 90.0 2 0
100 -100.0 0 1
100 -100.0 0 1
14 89.0 2 0
2 -101.0 0 0
8 95.0 2 0
100 1.0 1 1
100 203.0 3 1
100 -100.0 0 1
23 181.0 3 0
13 90.0 2 0
5 -3.0 1 0
20 285.0 4 0
100 -100.0 0 1
100 -100.0 0 1

test 14 98 2 9
