In [1]:
import backgammon
import random
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU, BatchNormalization

# if GPU is available this code will state 1
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

class BGBoard:
    def __init__(self):
        self.board = backgammon.Backgammon()
        self.board.start()
        self.first_turn = self.board.match.player
        self.winner = -1
        self.state_players = []
        
    def get_moves_and_positions(self,swap=False):
        possible_moves_positions = {}
        possible_moves = []
        for play in self.board.generate_plays():
            pair = []
            for move in play.moves:
                pair.append((move.source,move.destination))
            pair = tuple(pair)
            turn = self.board.match.turn.value
            if turn == 0:
                current_player = 1
            else:
                current_player = 0
            if swap == False:
                position = play.position
            else:
                position = swap_position(play.position)
            possible_moves_positions[pair] = self.state(position,current_player)
            possible_moves.append(pair)
        return possible_moves_positions,possible_moves

    # def get_possible_moves(self, possible_moves_positions:dict):
    #     possible_moves = possible_moves_positions.keys()
    #     return list(possible_moves)
    
    def state(self,position,current_player):
        # position = self.board.position
        board_points = list(position.board_points)
        p_bar = [position.player_bar]
        p_off = [position.player_off]
        op_bar = [position.opponent_bar]
        op_off = [position.opponent_off]
        # die_1 = [self.board.match.dice[0]]
        # die_2 = [self.board.match.dice[1]]
        current_player = [current_player]
        
        state_list = board_points+p_bar+p_off+op_bar+op_off+current_player
        state_array = np.array(state_list)/15
        
        return state_array

    def make_move(self, move):
        '''
        move is tuple of tuples. e.g ((22,19),(23,21),...)
        '''
        if self.board.match.player.value == self.first_turn:
            self.state_players.append(0)
        else:
            self.state_players.append(1)
        
        if move == None:
            self.board.skip()
        else:
            self.board.play(move)
        
        
        if self.is_over():
            return self.winner
            
        if self.board.match.dice == (0, 0):
            self.board.roll()
        
    def is_over(self):
        p0_score = self.board.match.player_0_score
        p1_score = self.board.match.player_1_score
        
        if p0_score > p1_score:
            self.winner = 0
            return True
        elif p0_score < p1_score:
            self.winner = 1
            return True
        else:
            return False

    # def get_reward(self):
    #     player = self.board.match.player
    #     if winner != -1:
    #         if winner == player:
    #             return torch.tensor(1,device=device, dtype=torch.float32)
    #         elif winner != player:
    #             return torch.tensor(-1,device=device, dtype=torch.float32)
        

Num GPUs Available:  1


In [2]:
model = Sequential([
    Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.3), input_shape=(29,)), # 29 features for board state
    BatchNormalization(),
    Dense(64, activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    BatchNormalization(),
    Dropout(0.4),
    Dense(1, activation="tanh")  # Output: Estimated value of the state
])
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)

# Compile the model
model.compile(optimizer=optimizer,
              loss='mean_squared_error')



        
    
losses = []

def td_lambda_update(states, rewards, model, eligibility_traces, alpha=learning_rate, gamma=0.99, lambda_=0.8):
    """
    Perform TD(λ) updates with eligibility traces.

    Args:
        states: List of game states (numpy arrays).
        rewards: List of rewards for each state.
        model: The neural network model.
        eligibility_traces: Persistent eligibility traces for the model's parameters.
        alpha: Learning rate.
        gamma: Discount factor.
        lambda_: Eligibility trace decay factor.
        
    Returns:
        Average TD error loss for the episode.
    """
    total_loss = 0

    for t in range(len(states) - 1):
        with tf.GradientTape() as tape:
            # Predict current and next state values
            V_t = model(tf.convert_to_tensor([states[t]], dtype=tf.float32))  # Shape: (1, 1)
            V_t_plus_1 = model(tf.convert_to_tensor([states[t + 1]], dtype=tf.float32))  # Shape: (1, 1)

            # Compute TD error (delta_t)
            delta_t = rewards[t] + gamma * tf.squeeze(V_t_plus_1) - tf.squeeze(V_t)

        # Calculate gradients for the current state's value prediction
        gradients = tape.gradient(V_t, model.trainable_variables)

        # Update eligibility traces and model weights
        for i, gradient in enumerate(gradients):
            if gradient is not None:
                eligibility_traces[i] = gamma * lambda_ * eligibility_traces[i] + gradient
                model.trainable_variables[i].assign_add(alpha * delta_t * eligibility_traces[i])

        total_loss += delta_t ** 2

    return total_loss / (len(states) - 1)






In [3]:
from backgammon.position import Position

def swap_position(position) -> "Position":
        return Position(
            tuple(map(lambda n: -n, position.board_points[::-1])),
            position.opponent_bar,
            position.opponent_off,
            position.player_bar,
            position.player_off,
        )

In [4]:
def play_game(model,random_threshold=0.75):
    states = []
    b = BGBoard()
    first_player_turn = b.board.match.turn.value
    
    while not b.is_over():
        turn = b.board.match.turn.value
        position = b.board.position

        # Append the current state (for the current player)
        if turn == first_player_turn:
            states.append(b.state(position,turn))
            swap = False
        else:
            current_state_swapped = b.state(swap_position(position),turn)
            states.append(current_state_swapped)
            swap = True
        
        moves_positions,possible_moves = b.get_moves_and_positions(swap=swap)

        move_values = []
        
        if moves_positions != {}:
            if random.uniform(0,1)>random_threshold:
                best_move = random.choice(possible_moves)
                
            else:
                # Create a batch of next states for all possible moves
                moves_positions_list = [moves_positions[move] for move in possible_moves]
                
                next_states = np.array(moves_positions_list)
                
                # Predict values for all possible moves in a single batch
                move_values = model.predict(next_states).flatten()  # Flatten to 1D array
                
                best_move = possible_moves[np.argmax(move_values)]
                
        else:
            best_move = None

        b.make_move(best_move)

    
    states = np.array(states)
    
    rewards = []
    for idx,state in enumerate(states):
        if idx != len(states)
        if states[idx+1][25] 
    if states[-1][-1] * 15 == first_player_turn:
        rewards[-1] = 1
        
    else:
        rewards[-1] = -1
       
    return states, rewards

In [7]:
last_epoch = 0
epochs = 40000
random_threshold = 0.1
learning_rate = 0.01

In [8]:
print(last_epoch, random_threshold ,learning_rate)

0 0.1 0.01


In [31]:
learning_rate = 0.01
for epoch in range(last_epoch, epochs):  # Number of training games
    # print(f'progress: {epoch}/{epochs}, threshold: {random_threshold} ',end='\r')
    # Generate a game through self-play
    states,rewards = play_game(model,random_threshold)
    
    for state in states:
        print(state*15)
    break
    if random_threshold < 0.85:
        random_threshold += 0.00003
    eligibility_traces = [tf.zeros_like(var) for var in model.trainable_variables]

    # Perform TD(λ) updates
    loss = td_lambda_update(states, rewards, model,eligibility_traces, alpha=learning_rate)
    losses.append(loss)
    # Evaluate the model every 100 games
    if epoch % 100 == 0:
        model.save(f"backgammon_RLmodel_new_03_29_128_64_{epoch}.h5")
        
    if epoch % 300 == 0 and epoch != 0:
        learning_rate *= 0.65
    last_epoch = epoch
    if len(losses) % 10 == 0:
        clear_output(wait=True)  # Clear the output of the cell
        
        # Plot the losses
        plt.plot(losses, label='TD Error (Loss)')
        
        # Calculate the mean of the last 20 losses (if there are at least 20 values)
        if len(losses) >= 100:
            rolling_mean = np.convolve(losses, np.ones(100)/100, mode='valid')
            plt.plot(range(99, len(losses)), rolling_mean, label='Mean of last 100', color='orange', linestyle='--')
    
    
        plt.xlabel('Games')
        plt.ylabel('TD Error (Loss)')
        plt.legend()
        plt.show()  # Display the updated plot
    

[-2.  0.  0.  0.  0.  5.  0.  3.  0.  0.  0. -5.  5.  0.  0.  0. -3.  0.
 -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.  1.]
[-2.  0.  0.  0.  1.  5.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.  0.
 -5.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
[-2.  0.  0.  0.  1.  5.  0.  2.  0.  0.  0. -4.  5.  0.  0.  0. -3.  0.
 -6.  1.  0.  0.  0.  1.  0.  0.  0.  0.  1.]
[-2.  0.  1.  0.  1.  5.  0.  1.  0.  0.  0. -4.  5.  0.  0.  0. -3.  0.
 -6.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
[-2.  0.  1.  0.  1.  5.  0.  1.  0.  0.  0. -4.  5.  0.  0.  0. -2. -1.
 -5.  1.  0. -1.  1.  0.  0.  0.  0.  0.  1.]
[-2.  0.  1.  1.  1.  4.  0.  1.  0.  0.  0. -4.  5.  0.  1.  0. -2. -1.
 -5.  0.  0. -1.  1.  0.  0.  0.  0.  0.  0.]
[-2.  0.  1.  1.  1.  4.  0.  1.  0.  0.  0. -4.  5.  0.  1.  0. -1. -2.
 -5.  0.  0.  0.  1. -1.  0.  0.  0.  0.  1.]
[-2.  0.  1.  1.  1.  5.  0.  1.  0.  0.  0. -4.  5.  0.  0.  0. -1. -2.
 -5.  0.  0.  0.  1. -1.  0.  0.  0.  0.  0.]
[-1.  0.  1.  1. -1.  5.  0.  1.  0.  0.  0. -4.

In [42]:
rewards = [0]
for idx,state in enumerate(states):
    if idx != len(states) - 1:
        current_state = states[idx]
        next_state = states[idx+1]
        if next_state[25] > current_state
    # print(idx )
    # if states[idx+1][25] 

if states[-1][-1] * 15 == first_player_turn:
    rewards[-1] = 1
else:
    rewards[-1] = -1

[-2.  0.  0.  0.  0.  5.  0.  3.  0.  0.  0. -5.  5.  0.  0.  0. -3.  0.
 -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.  1.]
[-2.  0.  0.  0.  1.  5.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.  0.
 -5.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
[-2.  0.  0.  0.  1.  5.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.  0.
 -5.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
[-2.  0.  0.  0.  1.  5.  0.  2.  0.  0.  0. -4.  5.  0.  0.  0. -3.  0.
 -6.  1.  0.  0.  0.  1.  0.  0.  0.  0.  1.]
[-2.  0.  0.  0.  1.  5.  0.  2.  0.  0.  0. -4.  5.  0.  0.  0. -3.  0.
 -6.  1.  0.  0.  0.  1.  0.  0.  0.  0.  1.]
[-2.  0.  1.  0.  1.  5.  0.  1.  0.  0.  0. -4.  5.  0.  0.  0. -3.  0.
 -6.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
[-2.  0.  1.  0.  1.  5.  0.  1.  0.  0.  0. -4.  5.  0.  0.  0. -3.  0.
 -6.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
[-2.  0.  1.  0.  1.  5.  0.  1.  0.  0.  0. -4.  5.  0.  0.  0. -2. -1.
 -5.  1.  0. -1.  1.  0.  0.  0.  0.  0.  1.]
[-2.  0.  1.  0.  1.  5.  0.  1.  0.  0.  0. -4.

In [36]:
print(len(states))

193


In [37]:
states[193]

IndexError: index 193 is out of bounds for axis 0 with size 193

In [25]:
import time
def evaluate_play_game(model_1,model_2):
    b = BGBoard()
    first_player_turn = b.board.match.turn.value
    players_dict = {0:model_1, 1:model_2}
    while not b.is_over():
        turn = b.board.match.turn.value
        position = b.board.position
        if turn == first_player_turn:
            swap = False
        else:
            swap = True

        moves_positions,possible_moves = b.get_moves_and_positions(swap=swap)
        
        move_values = []
        
        if moves_positions != {}:

            # Create a batch of next states for all possible moves
            moves_positions_list = [moves_positions[move] for move in possible_moves]
            
            next_states = np.array(moves_positions_list)
            move_values = players_dict[turn].predict(next_states).flatten()
            # if turn == 0:
            #     # print(turn, "dumb")
            #     move_values = model_1.predict(next_states).flatten()  # Flatten to 1D array
            # else:
            #     # print(turn, "smart")
            #     move_values = model_2.predict(next_states).flatten()  # Flatten to 1D array
            # Predict values for all possible moves in a single batch
            
            
            best_move = possible_moves[np.argmax(move_values)]
        else:
            best_move = None
        b.make_move(best_move)
        
        # print(f'{b.board}', end='\r')
        # time.sleep(1)
        # input()
    p0_score = b.board.match.player_0_score
    p1_score = b.board.match.player_1_score
        # print(f'{p0_score}/{p1_score} game: {game}', end='\r')
            # print(b.board)
            
    
    # print(f"dumb's score : {p0_score},smart's score: {p1_score}")
    
    return p0_score,p1_score




In [26]:
model_dumb = tf.keras.models.load_model('./backgammon_RLmodel_new_03_29_128_64_32_500.h5')
model_smart = tf.keras.models.load_model('./backgammon_RLmodel_new_03_29_128_64_32_900.h5')
p0_scores = []
p1_scores = []
def calculate_wins(p):
    all_wins = []
    for score in p:
        if score > 0:
            all_wins.append(score)
    return len(all_wins)
for i in range(400):
    p0,p1 = evaluate_play_game(model_dumb,model_smart)
    # print(f'{p0}/{p1}')
    p0_scores.append(p0)
    p1_scores.append(p1)
    print(f"{calculate_wins(p0_scores)}/{calculate_wins(p1_scores)}", end='\r')
    
    


29/23


KeyboardInterrupt



In [31]:
print(sum(p0_scores),sum(p1_scores))
all_wins_0 = []
all_wins_1 =  []
def calculate_wins(p):
    all_wins = []
    for score in p:
        if score > 0:
            all_wins.append(score)
    return len(all_wins)

102 140


In [68]:
model_dumb = None
model_smart = None 
del model_dumb,model_smart

In [33]:
print(calculate_wins(p0_scores))
print(calculate_wins(p1_scores))


49
64


In [21]:
b = BGBoard()
# b.start()


In [31]:
print(b.board)

                 Position ID: 4HPwATDgc/ABMA
                 Match ID   : cIlsAAAAAAAA
 +13-14-15-16-17-18------19-20-21-22-23-24-+
 | X           O    |   | O              X |
 | X           O    |   | O              X |
 | X           O    |   | O                |
 | X                |   | O                |
 | X                |   | O                |
v|                  |BAR|                  |
 | O                |   | X                |
 | O                |   | X                |
 | O           X    |   | X                |
 | O           X    |   | X              O |
 | O           X    |   | X              O |
 +12-11-10--9--8--7-------6--5--4--3--2--1-+



In [20]:
k = 0.31
for i in range(15000):
    k += 0.00003
print(k)

0.7599999999996174


In [29]:
random_threshold

0.3809499999999489

In [17]:
learning_rate = 0.03

In [18]:
for i in range(10):
    learning_rate *= 0.70
    print(learning_rate)

0.020999999999999998
0.014699999999999998
0.010289999999999997
0.007202999999999998
0.005042099999999998
0.0035294699999999985
0.002470628999999999
0.001729440299999999
0.0012106082099999993
0.0008474257469999995


In [39]:
model1 = tf.keras.models.load_model('./backgammon_RLmodel05_64_64_100.h5')
model2 = tf.keras.models.load_model('./backgammon_RLmodel05_64_64_10000.h5')

In [40]:
model1.trainable_variables

[<tf.Variable 'dense/kernel:0' shape=(31, 64) dtype=float32, numpy=
 array([[ 0.24865772, -0.10203935, -0.15886983, ..., -0.22225152,
          0.02141946, -0.11422437],
        [-0.12545873,  0.02884449,  0.22662452, ..., -0.24045987,
          0.19724675, -0.20846383],
        [-0.05151866, -0.03881101, -0.08221499, ..., -0.14624432,
          0.20926365, -0.14613411],
        ...,
        [-0.16588624, -0.1387194 ,  0.07627302, ..., -0.12613666,
         -0.15953946,  0.06095667],
        [ 0.14565748,  0.00089765, -0.0970078 , ...,  0.06033377,
          0.24657752, -0.05173407],
        [ 0.14460222, -0.01814933, -0.04799673, ...,  0.07900107,
          0.2433995 , -0.10517941]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(64,) dtype=float32, numpy=
 array([ 0.02782262,  0.01611478,  0.05063552,  0.02329272, -0.00166099,
         0.00719492, -0.00584468, -0.01495863, -0.01518282,  0.01080851,
         0.0078951 ,  0.03142129, -0.01899794,  0.06092185,  0.0067733 ,
       

In [41]:
model2.trainable_variables

[<tf.Variable 'dense/kernel:0' shape=(31, 64) dtype=float32, numpy=
 array([[ 0.25598252, -0.10503063, -0.05014001, ..., -0.21671061,
          0.05244816, -0.11172025],
        [-0.07616533,  0.0381564 ,  0.2235378 , ..., -0.236818  ,
          0.21805777, -0.21494205],
        [-0.01008032, -0.04090825, -0.04120066, ..., -0.15100454,
          0.2351648 , -0.14327295],
        ...,
        [-0.16425456, -0.12189455,  0.06000654, ..., -0.1337059 ,
         -0.16070904,  0.05958709],
        [ 0.13544248,  0.00185266, -0.07016875, ...,  0.05273537,
          0.25023785, -0.04626461],
        [ 0.13141398, -0.03379642, -0.06835079, ...,  0.08885832,
          0.20393403, -0.09868057]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(64,) dtype=float32, numpy=
 array([ 1.6066642e-04,  6.9072351e-02,  1.5786183e-01,  2.0008404e-02,
        -1.4154663e-02, -9.2420364e-03, -2.3906535e-02, -2.8751912e-02,
        -4.7398051e-03,  9.7969763e-02,  2.2650191e-03, -1.1017655e-02,
        -5