# Temporal Difference Learning for 3D Tic Tac Toe

This notebook contains the implementation of a Temporal Difference (TD) learning model using a Deep Q-Network (DQN) for playing 3D 4x4x4 Tic Tac Toe. The implementation is based on the approach outlined in the provided paper.


In [136]:
# Import necessary libraries
import numpy as np
import os
import random

# Setting Directory
os.chdir('C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/')

from python_scripts import state_formulation, utils, algorithm, tictactoe_4x4
import torch
import torch.nn as nn
import torch.optim as optim
from torchinfo import summary
import torch.nn.init as init


In [255]:
def get_rows(input_tensor):
    # Get diagonals (across 2 faces),digonals (across 3 faces) and horizontal and vertical rows
    diag_two_faces = []
    diag_two_faces.extend(
        [torch.diagonal(input_tensor[i, :, :]), torch.diagonal(input_tensor[:, i, :]), torch.diagonal(input_tensor[:, :, i]), 
        torch.diagonal(torch.fliplr(input_tensor)[i, :, :]), torch.diagonal(torch.fliplr(input_tensor)[:, i, :]), torch.diagonal(torch.fliplr(input_tensor)[:, :, i])] 
        for i in range(input_tensor.shape[0]))
    diag_two_faces = [item for sublist in diag_two_faces for item in sublist]
    
    diag_three_faces = []
    diag_three_faces = [[[[input_tensor[i, i, i], input_tensor[3 - i, i, i], input_tensor[i, 3 - i, i], input_tensor[i, i, 3 - i]] 
                          for i in range(4)][k][j] for j in range(4) for k in range(4)][l:l + 4] for l in range(0, 16, 4)]
    diag_three_faces = [torch.tensor([t.item() for t in row]) for row in diag_three_faces]

    horizontal_and_vertical_rows = []
    horizontal_and_vertical_rows.extend([input_tensor[i, j, :], input_tensor[i, :, j], input_tensor[:, i, j]]
                                        for i in range(input_tensor.shape[0]) for j in range(input_tensor.shape[0]))
    horizontal_and_vertical_rows = [item for sublist in horizontal_and_vertical_rows for item in sublist]
    
    return horizontal_and_vertical_rows + diag_two_faces + diag_three_faces

In [256]:
input_tensor = torch.arange(64).view(4, 4, 4)
overall_rows = get_rows(input_tensor)
print(f'Number of rows: {len(overall_rows)}')

Number of rows: 76


In [310]:
class customDotProduct(nn.Module):
    def __init__(self, structure_weight, block_size):
        super(customDotProduct, self).__init__()
        self.block_size = block_size
        # Convert structure_weight to nn.Parameter
        self.structure_weight = self.get_block_weights(structure_weight, block_size)
        self.structure_weight = nn.ParameterList([nn.Parameter(sw.float()) for sw in self.structure_weight])

    def get_block_weights(self, weight_list, block_size):
        for i in range(0, 304, block_size):
            weight_list[i: i + block_size, i: i + block_size] = torch.ones(block_size, block_size)
        learnable_blocks = [weight_list[i:i + block_size, i:i + block_size] for i in range(0, weight_list.shape[0], block_size)]
        updated = [block for block in learnable_blocks]
        return updated
    
    def forward(self, feature_map):
        self.feature_map = [fm.float() for fm in feature_map]
        # Calculate dot products and concatenate along dim=1
        concatenated_products = torch.cat([torch.matmul(fm.unsqueeze(0), sw) for fm, sw in zip(self.feature_map, self.structure_weight)], dim = 1)
        return concatenated_products


In [309]:
# Testing Code
# weights = torch.zeros((304, 304))

# block_size = 4
# for i in range(0, 304, block_size):
#     weights[i: i + block_size, i: i + block_size] = torch.ones(block_size, block_size)
# learnable_blocks = [weights[i:i + block_size, i:i + block_size] for i in range(0, weights.shape[0], block_size)]
# weights = [init.xavier_normal_(block) for block in learnable_blocks]

# print(f'Before Update: Weights = {weights[0]} \n')

# # Assume some loss function and optimizer have been defined
# custom_dot_product_module = customDotProduct(weights, 4)
# loss_function = nn.MSELoss()
# optimizer = torch.optim.SGD(custom_dot_product_module.parameters(), lr = 0.01)

# # Example training loop iteration
# optimizer.zero_grad()  # Clear gradients
# output = custom_dot_product_module(overall_rows)  # Perform forward pass
# loss = loss_function(output, torch.randn(1, 304))  # Compute loss
# loss.backward()  # Perform backward pass
# optimizer.step()  # Update weights
# print(f'After Update: Weights = {custom_dot_product_module.structure_weight[0]}')


In [324]:
class StructuredLinear(nn.Module):
    def __init__(self, input_size, output_size, block_size = 4):
        super(StructuredLinear, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.block_size = block_size

        # Initialize weights and biases
        self.weights = nn.init.xavier_normal_(torch.zeros((output_size, input_size)))
        self.act = nn.Tanh()

    def get_rows(self, input_tensor):
        # Get diagonals (across 2 faces),digonals (across 3 faces) and horizontal and vertical rows
        diag_two_faces = []
        diag_two_faces.extend(
            [torch.diagonal(input_tensor[i, :, :]), torch.diagonal(input_tensor[:, i, :]), torch.diagonal(input_tensor[:, :, i]), 
            torch.diagonal(torch.fliplr(input_tensor)[i, :, :]), torch.diagonal(torch.fliplr(input_tensor)[:, i, :]), torch.diagonal(torch.fliplr(input_tensor)[:, :, i])] 
            for i in range(input_tensor.shape[0]))
        diag_two_faces = [item for sublist in diag_two_faces for item in sublist]
        
        diag_three_faces = []
        diag_three_faces = [[[[input_tensor[i, i, i], input_tensor[3 - i, i, i], input_tensor[i, 3 - i, i], input_tensor[i, i, 3 - i]] 
                            for i in range(4)][k][j] for j in range(4) for k in range(4)][l:l + 4] for l in range(0, 16, 4)]
        diag_three_faces = [torch.tensor([t.item() for t in row]) for row in diag_three_faces]

        horizontal_and_vertical_rows = []
        horizontal_and_vertical_rows.extend([input_tensor[i, j, :], input_tensor[i, :, j], input_tensor[:, i, j]]
                                            for i in range(input_tensor.shape[0]) for j in range(input_tensor.shape[0]))
        horizontal_and_vertical_rows = [item for sublist in horizontal_and_vertical_rows for item in sublist]
        
        return horizontal_and_vertical_rows + diag_two_faces + diag_three_faces

    def forward(self, x):
        print("RR")
        rows = self.get_rows(x)
        print("GG")
        custom_init = customDotProduct(self.weights, self.block_size)
        print("!!")
        result = custom_init.forward(rows)
        print("#$#")
        return result

class MyNeuralNetwork(nn.Module):
    def __init__(self):
        super(MyNeuralNetwork, self).__init__()
        self.structured_layer = StructuredLinear(304, 304)
        self.second_layer = nn.Linear(304, 32, bias = False)
        init.xavier_normal_(self.second_layer.weight)
        self.output_layer = nn.Linear(32, 1, bias = False)

    def forward(self, x):
        x = self.structured_layer(x)
        print(x)
        # x = self.act(x) # --> Tanh
        print("2")
        x = self.second_layer(x)
        print("3")
        # x = self.act(x)
        print("4")
        x = self.output_layer(x)
        print("5")
        return x

# Example usage
model = MyNeuralNetwork()


In [325]:
input_tensor = torch.randn(4, 4, 4)

In [326]:
summary(model, input_data = input_tensor, col_names = ['input_size', 'output_size', 'num_params'])

RR
GG
!!
#$#
tensor([[-0.0462, -0.0462, -0.0462, -0.0462, -0.2330, -0.2330, -0.2330, -0.2330,
          0.9862,  0.9862,  0.9862,  0.9862,  2.2830,  2.2830,  2.2830,  2.2830,
         -0.7087, -0.7087, -0.7087, -0.7087, -0.4895, -0.4895, -0.4895, -0.4895,
          0.4043,  0.4043,  0.4043,  0.4043,  1.8118,  1.8118,  1.8118,  1.8118,
          0.5791,  0.5791,  0.5791,  0.5791,  1.0230,  1.0230,  1.0230,  1.0230,
          2.7939,  2.7939,  2.7939,  2.7939,  0.9883,  0.9883,  0.9883,  0.9883,
          0.5562,  0.5562,  0.5562,  0.5562,  2.5767,  2.5767,  2.5767,  2.5767,
          3.1591,  3.1591,  3.1591,  3.1591,  1.2641,  1.2641,  1.2641,  1.2641,
         -2.7488, -2.7488, -2.7488, -2.7488, -0.2804, -0.2804, -0.2804, -0.2804,
         -0.7418, -0.7418, -0.7418, -0.7418,  0.7623,  0.7623,  0.7623,  0.7623,
          4.1820,  4.1820,  4.1820,  4.1820, -2.2008, -2.2008, -2.2008, -2.2008,
         -1.7125, -1.7125, -1.7125, -1.7125, -0.3943, -0.3943, -0.3943, -0.3943,
          0.591

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
MyNeuralNetwork                          [4, 4, 4]                 [1, 1]                    --
├─StructuredLinear: 1-1                  [4, 4, 4]                 [1, 304]                  --
├─Linear: 1-2                            [1, 304]                  [1, 32]                   9,728
├─Linear: 1-3                            [1, 32]                   [1, 1]                    32
Total params: 9,760
Trainable params: 9,760
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.01
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.04
Estimated Total Size (MB): 0.04

In [322]:
d = StructuredLinear(304, 304, 4)
f = d(input_tensor)

RR
GG
!!
#$#


In [323]:
act = nn.Tanh()
act(f)

tensor([[-0.5902, -0.5902, -0.5902, -0.5902, -0.9876, -0.9876, -0.9876, -0.9876,
          0.7733,  0.7733,  0.7733,  0.7733, -0.7729, -0.7729, -0.7729, -0.7729,
         -0.9977, -0.9977, -0.9977, -0.9977, -0.4742, -0.4742, -0.4742, -0.4742,
         -0.1815, -0.1815, -0.1815, -0.1815,  0.0666,  0.0666,  0.0666,  0.0666,
         -0.1981, -0.1981, -0.1981, -0.1981, -0.8992, -0.8992, -0.8992, -0.8992,
          0.9865,  0.9865,  0.9865,  0.9865, -0.9318, -0.9318, -0.9318, -0.9318,
          0.9077,  0.9077,  0.9077,  0.9077,  0.2671,  0.2671,  0.2671,  0.2671,
         -0.9999, -0.9999, -0.9999, -0.9999, -0.5330, -0.5330, -0.5330, -0.5330,
          0.9852,  0.9852,  0.9852,  0.9852, -0.1272, -0.1272, -0.1272, -0.1272,
          0.9627,  0.9627,  0.9627,  0.9627, -0.0704, -0.0704, -0.0704, -0.0704,
         -0.6843, -0.6843, -0.6843, -0.6843,  0.6885,  0.6885,  0.6885,  0.6885,
          0.7982,  0.7982,  0.7982,  0.7982,  0.7845,  0.7845,  0.7845,  0.7845,
         -0.3858, -0.3858, -

In [3]:
# Define the MLP architecture for the TD learning model
class TDNetwork(nn.Module, tictactoe_4x4.TicTacToe4x4x4):
    def __init__(self, input_size, hidden_sizes, output_size):
        # Note: Hidden Sizes will be a list. According to the paper, it will be 304, 32
        super(TDNetwork, self).__init__()
        layers = []
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(input_size, hidden_size))
            layers.append(nn.ReLU())
            input_size = hidden_size
        layers.append(nn.Linear(hidden_sizes[-1], output_size))
        self.layers = nn.Sequential(*layers)
        
    def forward(self, x):
        feature_map = get_rows(x)
        return self.layers(feature_map)


In [None]:
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

    def forward(self, x):
        # ModuleList can act as an iterable, or be indexed using ints
        for i, l in enumerate(self.linears):
            x = self.linears[i // 2](x) + l(x)
        return x

In [4]:
# Initialize the TD learning model
input_size = 64  # Assuming each space on the 4x4x4 board is represented as a binary (occupied or not)
hidden_sizes = [128, 128]  # Hidden layers sizes as per the paper's experimentation
output_size = 1  # Output size representing the value function
model = TDNetwork(input_size, hidden_sizes, output_size)


In [5]:
# Training parameters
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.MSELoss()


In [6]:
# Placeholder for the training loop
def train_td_model(model, num_episodes):
    for episode in range(num_episodes):
        # The training loop should include:
        # 1. Interacting with the environment
        # 2. Computing TD target and TD error
        # 3. Updating the model using backpropagation
        pass

# Placeholder for saving the model
def save_model(model, path):
    torch.save(model.state_dict(), path)


In [13]:
# Example usage
num_episodes = 1000  # Number of episodes for training
train_td_model(model, num_episodes)  # Train the model

# Save the trained model
os.makedirs('C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/Phase_3_3D_Tic_Tac_Toe/models', exist_ok = True)
model_path = 'C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/Phase_3_3D_Tic_Tac_Toe/models/td_tictactoe_model.pth'
save_model(model, model_path)

model_path

'C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/Phase_3_3D_Tic_Tac_Toe/models/td_tictactoe_model.pth'