In [1]:
import backgammon
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout

# Print how many GPU devices TensorFlow reports (the count is 0 when it finds none).
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))

class BGBoard:
    """Wrapper around a ``backgammon.Backgammon`` game used for self-play.

    Attributes:
        board: The underlying ``backgammon.Backgammon`` instance, already started.
        turn: ``board.match.player`` as read at construction time; this class
            never refreshes it afterwards.
        winner: ``-1`` until ``is_over`` detects a result, then ``0`` or ``1``.
        state_players: One entry per ``make_move`` call, holding
            ``int(board.match.turn.value)`` at the moment the call was made.
    """

    def __init__(self):
        self.board = backgammon.Backgammon()
        self.board.start()
        self.turn = self.board.match.player
        self.winner = -1
        self.state_players = []

    def get_moves_and_positions(self):
        """Map every play the game currently generates to its encoded position.

        Returns:
            dict: Keys are tuples of ``(source, destination)`` pairs, one pair
            per move in the play; values are ``self.state(play.position)``.
            Plays that produce the same key overwrite each other, the last
            one generated wins.
        """
        return {
            tuple((move.source, move.destination) for move in play.moves): self.state(play.position)
            for play in self.board.generate_plays()
        }

    def get_possible_moves(self, possible_moves_positions:dict):
        """Return the keys of ``possible_moves_positions`` as a list."""
        return list(possible_moves_positions.keys())

    def state(self,position):
        """Encode ``position`` plus the current dice and player as a feature vector.

        Feature order: ``position.board_points``, player bar, player off,
        opponent bar, opponent off, die 1, die 2, ``match.player.value``.
        The dice and the player are always read from ``self.board.match``,
        even when ``position`` is a candidate position rather than the
        current one.

        Returns:
            numpy.ndarray: The concatenated features, each divided by 15.
        """
        match = self.board.match
        features = (
            list(position.board_points)
            + [position.player_bar, position.player_off]
            + [position.opponent_bar, position.opponent_off]
            + [match.dice[0], match.dice[1]]
            + [match.player.value]
        )
        return np.array(features) / 15

    def make_move(self, move):
        """Apply ``move`` (or pass) and roll again when the dice are cleared.

        Args:
            move: Tuple of ``(source, destination)`` tuples, e.g.
                ``((22, 19), (23, 21))``, or ``None`` to skip the turn.

        Returns:
            ``self.winner`` when the game is over after the move, otherwise
            ``None``.
        """
        # Remember who acted, so recorded states can later be split by player.
        self.state_players.append(int(self.board.match.turn.value))

        # Identity check: only a real None means "skip". An equality test can
        # be hijacked by a custom __eq__ or become ambiguous for array-likes.
        if move is None:
            self.board.skip()
        else:
            self.board.play(move)

        if self.is_over():
            return self.winner

        # Dice equal to (0, 0) are treated as "not rolled yet" — roll for the next decision.
        if self.board.match.dice == (0, 0):
            self.board.roll()
        return None

    def is_over(self):
        """Return True once the two match scores differ, recording the winner.

        ``self.winner`` becomes ``0`` when player 0 has the higher score and
        ``1`` when player 1 does; it is left untouched while scores are equal.
        """
        p0_score = self.board.match.player_0_score
        p1_score = self.board.match.player_1_score

        if p0_score == p1_score:
            return False

        self.winner = 0 if p0_score > p1_score else 1
        return True
        

Num GPUs Available:  1


In [2]:
# Value network: 31 board-state features in, a single tanh-squashed value
# estimate out.
model = Sequential()
model.add(Dense(96, activation='relu', input_shape=(31,)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation="tanh"))

# Compile with Adam and a mean-squared-error loss.
optimizer = Adam(learning_rate=0.00002)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# TD(lambda) hyperparameters.
lambda_ = 0.7  # Eligibility trace decay
# NOTE(review): td_lambda_update declares its own default alpha=0.00002 and the
# training loop never passes this value, so it appears to be unused — confirm.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor

        

def td_lambda_update(states, rewards, model, alpha=0.00002, gamma=gamma, lambda_=lambda_):
    """
    Perform online TD(λ) updates on the model over one trajectory.

    The function is called as ``td_lambda_update(states, rewards, model)``, so
    it takes no ``self`` parameter. Every state in the trajectory is updated,
    including the last one: its successor is treated as terminal with value 0,
    which is what lets the final reward in ``rewards`` reach the network.

    Args:
        states: Sequence of game states (numpy arrays), in visiting order.
        rewards: Sequence with one reward per state; ``rewards[t]`` is the
            reward received after leaving ``states[t]``.
        model: The neural network model; called directly on a batch of one
            state and expected to expose ``trainable_variables``.
        alpha: Learning rate.
        gamma: Discount factor.
        lambda_: Eligibility trace decay factor.

    Raises:
        ValueError: If ``rewards`` has fewer entries than ``states``.
    """
    n_steps = len(states)
    if len(rewards) < n_steps:
        raise ValueError(
            f"expected one reward per state, got {len(rewards)} rewards for {n_steps} states"
        )

    # Initialize eligibility traces with the same shapes as trainable variables
    eligibility_traces = [tf.zeros_like(var) for var in model.trainable_variables]

    for t in range(n_steps):
        state_t = tf.convert_to_tensor([states[t]], dtype=tf.float32)

        # Only V(s_t) needs to be differentiated, so only it is taped.
        with tf.GradientTape() as tape:
            V_t = model(state_t)  # Shape: (1, 1)

        if t + 1 < n_steps:
            next_state = tf.convert_to_tensor([states[t + 1]], dtype=tf.float32)
            V_next = tf.squeeze(model(next_state))
        else:
            # Past the last recorded state the episode has ended: terminal value is 0.
            V_next = 0.0

        # Compute TD error (delta_t)
        delta_t = rewards[t] + gamma * V_next - tf.squeeze(V_t)

        # Gradient of the current state's value prediction w.r.t. each weight
        gradients = tape.gradient(V_t, model.trainable_variables)

        # Update eligibility traces and model weights
        for i, variable in enumerate(model.trainable_variables):
            if gradients[i] is None:  # Some variables may not contribute to the gradient
                continue
            eligibility_traces[i] = gamma * lambda_ * eligibility_traces[i] + gradients[i]

            # Perform the TD(λ) weight update
            variable.assign_add(alpha * delta_t * eligibility_traces[i])


In [3]:

def play_game(model,random_threshold=0.75):
    """Play one self-play game and split the visited states by outcome.

    Before every move the current position is encoded and recorded. A move is
    then picked: with probability ``random_threshold`` a uniformly random
    legal move, otherwise the move whose resulting position gets the highest
    ``model.predict`` value. When no move is available the turn is skipped.

    Args:
        model: Object whose ``predict`` accepts a 2-D array of encoded
            positions and returns one value per row.
        random_threshold: Probability of choosing a random legal move.

    Returns:
        tuple: ``(states_winner, states_loser, final_rewards_winner,
        final_rewards_loser)``. The state arrays hold the recorded states of
        the turns taken by the winner and by the loser; the reward lists are
        all zeros except for a final ``1`` (winner) or ``-1`` (loser).
    """
    game = BGBoard()
    visited_states = []

    while not game.is_over():
        # Record the pre-move encoding for whoever is about to act.
        visited_states.append(game.state(game.board.position))

        candidates = game.get_moves_and_positions()
        legal_moves = game.get_possible_moves(candidates)

        if candidates == {}:
            # Nothing playable: make_move(None) skips the turn.
            chosen_move = None
        elif random.uniform(0,1)<random_threshold:
            chosen_move = random.choice(legal_moves)
        else:
            # Score every resulting position in a single batch and take the best.
            after_states = np.array([candidates[move] for move in legal_moves])
            predicted_values = model.predict(after_states).flatten()
            chosen_move = legal_moves[np.argmax(predicted_values)]

        game.make_move(chosen_move)

    # state_players[i] is the player who acted from visited_states[i].
    movers = np.array(game.state_players)
    all_states = np.array(visited_states)
    states_winner = all_states[movers == game.winner]
    states_loser = all_states[movers != game.winner]

    winner_turns = game.state_players.count(game.winner)
    final_rewards_winner = [0] * winner_turns
    final_rewards_loser = [0] * (len(game.state_players) - winner_turns)
    # Only the last turn of each side carries the game outcome.
    final_rewards_winner[-1] = 1
    final_rewards_loser[-1] = -1

    return states_winner, states_loser, final_rewards_winner, final_rewards_loser

    

In [5]:
# Training driver: one self-play game per epoch, followed by TD(λ) updates.
# The loop covers range(last_epoch, epochs), so raising last_epoch skips the
# earlier epoch numbers.
last_epoch = 0
epochs = 2500
# Probability, passed to play_game, of picking a random legal move.
random_threshold = 0.3
for epoch in range(last_epoch, epochs):  # Number of training games
    # end='\r' rewrites the same output line instead of printing a new one per game.
    print(f'progress: {epoch}/{epochs}, threshold: {random_threshold} ',end='\r')
    # Generate a game through self-play
    states_winner,states_loser, rewards_winner, rewards_loser = play_game(model,random_threshold)
    # NOTE(review): this raises the random-move probability after every game,
    # so exploration grows during training — confirm a decay was not intended.
    random_threshold += 0.000002
    
    # Perform TD(λ) updates: one pass over the winner's trajectory, one over the loser's
    td_lambda_update(states_winner, rewards_winner, model)
    td_lambda_update(states_loser,rewards_loser,model)

    # Save a checkpoint every 100 games (epoch 0 included); nothing is evaluated here
    if epoch % 100 == 0:
        model.save(f"backgammon_RLmodel_64_02_{epoch}.h5")
    

progress: 909/2500, threshold: 0.48179999999997997 


KeyboardInterrupt

