In [1]:
import gym
import itertools
import gym_tic_tac_toe
import plotting
from plotting import EpisodeStats
from collections import defaultdict 
from copy import deepcopy
import numpy as np 
import operator

In [2]:
# Example of Q dictionary used in q_learn function
#
# Q = {
#     "0000000001": {
#         0: 0,
#         1: 0,
#         2: 0,
#         # ...
#         8: 0
#     },
#     # ...
#     "1-11-1-11-11-10-1": {
#         8: 0
#     },
# }

In [3]:
def hash_state(state):
    board = state['board']
    move = state['on_move']
    return ''.join(str(b) for b in board)+str(move)
    
def hash_action(action):
    return action[1]

In [4]:
def get_best_action_idx(Q, state_hash, action_hashes):
    if state_hash in Q:
        best_action_idx = np.argmax(Q[state_hash]) 
    else:
        Q[state_hash] = dict((ah, 0) for ah in action_hashes)
        best_action_idx = np.random.choice(len(action_hashes))
        
    return best_action_idx

In [5]:
def create_policy(Q, epsilon):

    def get_action_probs(state_hash, action_hashes):
        num_actions = len(action_hashes)
        action_probs = np.ones(num_actions, 
                dtype = float) * epsilon / num_actions 

        best_action_idx = get_best_action_idx(Q, state_hash, action_hashes)
        action_probs[best_action_idx] += (1.0 - epsilon)
        return action_probs
   
    return get_action_probs 

In [56]:
def q_learn(init_state, num_episodes, discount_factor = 1.0, alpha = 0.6, epsilon = 0.1, print_log = False): 
    env = gym.make('tic_tac_toe-v1')
 
    Q = {}
    
    stats = plotting.EpisodeStats( 
        episode_lengths = np.zeros(num_episodes), 
        episode_rewards = np.zeros(num_episodes))
    
    policy = create_policy(Q, epsilon) 
       
    for ith_episode in range(num_episodes): 
           
        env.set_state(deepcopy(init_state))
        state = deepcopy(env.state)
        for t in itertools.count(): 
               
            state_hash = hash_state(state)
            actions = env.move_generator()
            action_hashes = [hash_action(act) for act in actions]
            action_probabilities = policy(state_hash, action_hashes)
            
            action_idx = np.random.choice(np.arange(len(actions)), p=action_probabilities)
            action = actions[action_idx]
            action_hash = hash_action(action)
                           
            next_state, reward, done, _ = env.step(action)
   
            stats.episode_rewards[ith_episode] += reward 
            stats.episode_lengths[ith_episode] = t 
            
            next_state_hash = hash_state(next_state)
            next_action_hashes = [hash_action(act) for act in env.move_generator()]
            
            if len(next_action_hashes) == 0:
                next_action_score = 0
            else:   
                best_next_action_idx = get_best_action_idx(Q, next_state_hash, next_action_hashes)
                best_next_action_hash = next_action_hashes[best_next_action_idx]
                next_action_score = Q[next_state_hash][best_next_action_hash]
                
            td_target = reward + discount_factor * next_action_score
            td_delta = td_target - Q[state_hash][action_hash] 
            Q[state_hash][action_hash] += alpha * td_delta
            
            if print_log:
                print('\n\n----------- STATE -------------')
                env.render()
                print(state_hash)
                print(actions)
                print(action_hashes)
                print(action_probabilities)
                print(action_idx)
                print(action)
                print(action_hash)
                print(reward)
                print(td_target)
                print(td_delta)
                print(done)
            
            if done: 
                break
                   
            state = deepcopy(next_state)
       
    return Q, stats 

In [52]:
def play_game(player = -1):
    env = gym.make('tic_tac_toe-v1')
    state = env.reset()
    env.render()
    
    on_move = state['on_move']
    reward = 0
    done = False
    
    while not done:
        on_move = state['on_move']
        
        if player == on_move:
            print('Pick a move index')
            moves = env.move_generator()
            print(list(enumerate(moves)))
            idx = int(input())
            action = moves[idx]
        else:
            (Q, stats) = q_learn(state, 1000, print_log=False)
            actions = Q[hash_state(state)].items()
            print(actions)
            best_action_hash = max(actions, key=operator.itemgetter(1))
            print(best_action_hash)
            best_action_hash = best_action_hash[0]
            action = [on_move, best_action_hash]
        
        state, reward, done, _ = env.step(action) 

        env.render()
    
    if reward == 0:
        print("Draw!")
    elif on_move == player:
        print('You won!')
    else:
        print('AI won!')
        
    return env

In [53]:
play_game()

on move:  X
      
      
      
dict_items([(0, 1.9692939614240935e+188), (1, 6802350.519654975), (2, 495911.9935164463), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, -0.04367001599999999)])
(0, 1.9692939614240935e+188)
on move:  O
X     
      
      
Pick a move index
[(0, [-1, 1]), (1, [-1, 2]), (2, [-1, 3]), (3, [-1, 4]), (4, [-1, 5]), (5, [-1, 6]), (6, [-1, 7]), (7, [-1, 8])]


KeyboardInterrupt: 