# 1.0 Install gym environment

Run this cell then restart the runtime


In [0]:
!git clone https://github.com/SafiDewshi/tic_tac_toe_gym_env.git
!pip install -e tic_tac_toe_gym_env

Cloning into 'tic_tac_toe_gym_env'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 92 (delta 28), reused 86 (delta 23), pack-reused 0[K
Unpacking objects: 100% (92/92), done.
Obtaining file:///content/tic_tac_toe_gym_env
Installing collected packages: gym-tictactoe
  Running setup.py develop for gym-tictactoe
Successfully installed gym-tictactoe


# 2.0 Import libraries

In [0]:
import gym
import gym_tictactoe
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


# 3.0 REINFORCE Algorithm

In [0]:
def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        # Call the model, give it single observation, outputs probability of playing each square
        prob = model(obs.reshape(9)[np.newaxis])

        # find the max probability spot
        max_prob = tf.math.reduce_max(prob)
        
        take_best = (max_prob > tf.random.uniform([1,1]))

        if take_best:
          x = tf.argmax(tf.reshape(tf.squeeze(prob), 9))
          action = (x//3, x%3)
        else:
          mov = env.observation_space.sample()
          action = (mov[0], mov[1])

        mean_prob = tf.math.reduce_mean(prob)

        confidence = tf.math.subtract(max_prob, mean_prob)
        
        # find the difference between the highest probability and the lowest 
        loss = tf.reduce_mean(loss_fn(tf.constant([[1.]]), confidence))

        # the higher the difference between the most confident play and 
        # the second most confident play, the better
    
    grads = tape.gradient(loss, model.trainable_variables)

    #find the gradients that reduce that loss
    obs, reward, done = env.step(action)
    
    return obs, reward, done, grads

In [0]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model1, model2, loss_fn):
    model1_all_rewards = []
    model1_all_grads = []
    model2_all_rewards = []
    model2_all_grads = []

    # Plays multiple episodes
    for episode in range(n_episodes):
        model1_current_rewards = []
        model1_current_grads = []
        model2_current_rewards = []
        model2_current_grads = []
        obs = env.reset()

        # Plays an episode
        for step in range(n_max_steps):

            obs, reward, done, grads = play_one_step(env, obs, model1, loss_fn)
            model1_current_rewards.append(reward)
            model1_current_grads.append(grads)

            if done:
                break

            obs, reward, done, grads = play_one_step(env, obs, model2, loss_fn)
            model2_current_rewards.append(reward)
            model2_current_grads.append(grads)

            if done:
                break

        model1_all_rewards.append(model1_current_rewards)
        model1_all_grads.append(model1_current_grads)

        model2_all_rewards.append(model2_current_rewards)
        model2_all_grads.append(model2_current_grads)
        
    # Returns list of reward lists (one per episode, containing one reward per step) and
    # a list of gradient lists (one per episode, one tuple of gradients per step, each tuple containing one gradient
    # tensor per trainable variable)
    return model1_all_rewards, model1_all_grads, model2_all_rewards, model2_all_grads

In [0]:
# Computes the discounted rewards
def discount_rewards(rewards, discount_rate):
    discounted = np.array(rewards)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

# Discounts and normalizes rewards
def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards] 
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards]

# 4.0 Train Algorithm

In [0]:
# Hyperparameters
n_iterations = 10000
n_episodes_per_update = 10
n_max_steps = 10
discount_rate = 0.95
optimizer = keras.optimizers.Adam(lr=0.01)
loss_fn = keras.losses.binary_crossentropy

In [0]:
def model():
  """model = Sequential()
  model.add(Dense(72, activation='relu', input_shape=[9]))
  model.add(Dense(36, activation='relu'))
  model.add(Dense(18, activation='relu'))
  model.add(Dense(9, activation = 'sigmoid'))"""

  model = keras.models.Sequential([
    keras.layers.Dense(72, activation='relu', input_shape=[9]),
    keras.layers.Dense(36, activation='relu'),
    keras.layers.Dense(18, activation='relu'),
    keras.layers.Dense(9, activation='sigmoid'),                             
  ])
  return model

In [0]:
keras.backend.clear_session()

model1 = model()
model2 = model()

In [0]:
with tf.device('/device:GPU:0'):

  env = gym.make("tictactoe-v0")

  for iteration in range(n_iterations):
    
      # Plays the game n times and returns the rewards and gradients for every episode and step
      model1_all_rewards, model1_all_grads, model2_all_rewards, model2_all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model1, model2, loss_fn)
      
      model1_total_rewards = sum(map(sum, model1_all_rewards))
      model2_total_rewards = sum(map(sum, model2_all_rewards))
      model1_mean = model1_total_rewards / n_episodes_per_update
      model2_mean = model2_total_rewards / n_episodes_per_update
      print("\rIteration: {}, model 1 mean rewards: {:.1f}, model 2 mean rewards: {:.1f}, sum of both:{:.1f}".format(iteration, model1_mean, model2_mean, model1_mean+model2_mean), end="")
      
      # Computes each action's normalized advantage, provides measure of how good each action was
      model1_all_final_rewards = discount_and_normalize_rewards(model1_all_rewards, discount_rate)
      model2_all_final_rewards = discount_and_normalize_rewards(model2_all_rewards, discount_rate)


      model1_all_mean_grads = []
      model2_all_mean_grads = []
      for var_index in range(len(model1.trainable_variables)):

          model1_mean_grads = tf.reduce_mean([model1_final_reward * model1_all_grads [episode_index][step][var_index] 
                                      for episode_index, model1_final_rewards in enumerate(model1_all_final_rewards)
                                      for step, model1_final_reward in enumerate(model1_final_rewards)], axis=0)
          model1_all_mean_grads.append(model1_mean_grads)

          model2_mean_grads = tf.reduce_mean([model2_final_reward * model2_all_grads [episode_index][step][var_index] 
                                      for episode_index, model2_final_rewards in enumerate(model2_all_final_rewards)
                                      for step, model2_final_reward in enumerate(model2_final_rewards)], axis=0)
          model2_all_mean_grads.append(model2_mean_grads)

      optimizer.apply_gradients(zip(model1_all_mean_grads, model1.trainable_variables))
      optimizer.apply_gradients(zip(model2_all_mean_grads, model2.trainable_variables))
      
  env.close()

Iteration: 73, model 1 mean rewards: -18.0, model 2 mean rewards: 2.0, sum of both:-16.0

  del sys.path[0]


Iteration: 9999, model 1 mean rewards: -6.7, model 2 mean rewards: -6.1, sum of both:-12.8

# 5.0 Play Tic-Tac-Toe

In [0]:
obs = env.reset()
obs

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [0]:
prob = model2(obs.reshape(9)[np.newaxis])
max_prob = tf.math.reduce_max(prob)
x = tf.argmax(tf.reshape(tf.squeeze(prob), 9))
action = (x//3, x%3)
obs, reward, done = env.step(action)
print(prob, obs, reward, done)

tf.Tensor([[nan nan nan nan nan nan nan nan nan]], shape=(1, 9), dtype=float32) [[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]] -20 True


In [0]:
prob = model1(obs.reshape(9)[np.newaxis])
max_prob = tf.math.reduce_max(prob)
x = tf.argmax(tf.reshape(tf.squeeze(prob), 9))
action = (x//3, x%3)
obs, reward, done = env.step(action)
print(prob, obs, reward, done)

tf.Tensor([[nan nan nan nan nan nan nan nan nan]], shape=(1, 9), dtype=float32) [[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]] 1 False
