In [49]:
import ludopy
import numpy as np
import random
from tqdm import tqdm
import timeit

from collections.abc import Iterable

def flatten(l):
    return [item for sublist in l for item in sublist]

In [94]:
import matplotlib.pyplot as plt

In [50]:
states = {}

In [51]:
learning_rate = 0.1
discount_factor = 0.9

In [52]:
game = ludopy.Game()

In [47]:
def gen_state_key(player_pieces, enemy_pieces):
    state_key = ",".join([str(el) for el in player_pieces + flatten(enemy_pieces)])
    return state_key

def make_state_action(player_pieces, enemy_pieces, q_init = 0):
    p = ludopy.player.Player()
    p.set_pieces(player_pieces.copy())
    
    dice_list = np.arange(1,6+1)
    
    new_states = [{} for _ in dice_list]
    
    for dice in dice_list:    
        move_pices = p.get_pieces_that_can_move(dice)
        for pices in move_pices:
            new_enemys = p.move_piece(pices, dice, enemy_pieces)
            
            new_states[dice-1][pices] = [q_init, gen_state_key(p.get_pieces(), new_enemys)]
            
            p.set_pieces(player_pieces.copy())
    return new_states

def get_q_from_state(states, state, q_init=0):
    action_q_s = []
    
    if state in states:
        action_list = states[state]
        for dice_action in action_list:
            for pice in dice_action:
                action_q, action_state = dice_action[pice]
                action_q_s.append(action_q)
    else:
        action_q_s.append(q_init)
    return action_q_s

def get_max_q(states, state):
    qs = get_q_from_state(states, state)
    return max(qs)
    
def update_Q(states, state, dice, pice, future_state, reward, learning_rate, discount_factor):
    old_value, action_state = states[state][dice - 1][pice]

    new_value = reward + discount_factor * get_max_q(states, future_state)
    temporal_difference = new_value - old_value

    new_update_q = old_value + learning_rate * temporal_difference

    states[state][dice - 1][pice] = [new_update_q, action_state]
    
def get_best_action(states, state_key, dice):
    actions = states[state_key][dice - 1]
    pices = list(actions.keys())
    
    max_action = actions[pices[0]][0]
    max_pice = pices[0]
    if len(pices) > 1:
        for pice in pices[1:]:
            action_q = actions[pice][0]
            if action_q > max_action:
                max_pice = pice
                max_action = action_q
    return max_pice

In [53]:
for _ in tqdm(range(10000)):
    there_is_a_winner = False

    game.reset()

    while not there_is_a_winner:
        (dice, move_pieces, player_pieces, enemy_pieces, player_is_a_winner, there_is_a_winner), player_i = game.get_observation()

        if len(move_pieces):
            pice_to_move = random.choice(move_pieces)
        else:
            pice_to_move = -1

        state_key = gen_state_key(player_pieces, enemy_pieces)
        if state_key not in states:
            states[state_key] = make_state_action(player_pieces, enemy_pieces)


        (new_dice, new_move_pieces, new_player_pieces, new_enemy_pieces, player_is_a_winner, there_is_a_winner) = game.answer_observation(pice_to_move)

        future_state = gen_state_key(new_player_pieces, new_enemy_pieces)
        if player_is_a_winner:
            reward = 100
        else:
            reward = 0
        if pice_to_move != -1:
            update_Q(states, state_key, dice, pice_to_move, future_state, reward, learning_rate, discount_factor)

        #states, state, dice, pice, future_state, reward, learning_rate, discount_factor
   

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [38:24<00:00,  4.34it/s]


In [93]:
winners = []

for _ in tqdm(range(10000)):
    game.reset()

    there_is_a_winner = False
    while not there_is_a_winner:
        (dice, move_pieces, player_pieces, enemy_pieces, player_is_a_winner, there_is_a_winner), player_i = game.get_observation()

        state_key = gen_state_key(player_pieces, enemy_pieces)

        if len(move_pieces):
            if player_i != 1:
                pice_to_move = random.choice(move_pieces)
            else:
                if state_key in states:
                    pice_to_move = get_best_action(states, state_key, dice)
                else:
                    pice_to_move = random.choice(move_pieces)
        else:
            pice_to_move = -1

        (new_dice, new_move_pieces, new_player_pieces, new_enemy_pieces, player_is_a_winner, there_is_a_winner) = game.answer_observation(pice_to_move)
    
    winners.append(game.get_winner_of_game())


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:26<00:00, 48.51it/s]


In [103]:
hex(19)

'0x13'

In [105]:
3*16**0 + 1 * 16**1

19

In [None]:
np.bas

In [None]:
def gen_state_key_v2(player_pieces, enemy_pieces):
    

In [97]:
player_pieces + flatten(enemy_pieces)


[47, 59, 59, 59, 0, 4, 7, 59, 59, 59, 0, 59, 59, 0, 22, 0]

In [109]:
def to_base_to_int(numbers, base):
    int_number = np.array([0], dtype=np.uint64)
    for i, num in enumerate(numbers):
        int_number[0] += num * np.power(base, i)
    return int_number
    

In [111]:
to_base_to_int(player_pieces + flatten(enemy_pieces), 60)

  after removing the cwd from sys.path.


array([18446744073508577280], dtype=uint64)

In [113]:
60 ** 16

28211099074560000000000000000