# Temporal Difference Learning for 3D Tic Tac Toe

This notebook contains the implementation of a Temporal Difference (TD) learning model using a Deep Q-Network (DQN) for playing 3D 4x4x4 Tic Tac Toe. The implementation is based on the approach outlined in the provided paper.


In [136]:
# Import necessary libraries
import numpy as np
import os
import random

# Setting Directory
# Switches the process working directory before the `python_scripts` import below
# (presumably so that the local package resolves — confirm against the repo layout).
# NOTE(review): this is a machine-specific absolute path; on any other machine the call
# raises until the path is adjusted. Consider a configurable project root instead.
os.chdir('C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/')

from python_scripts import state_formulation, utils, algorithm, tictactoe_4x4
import torch
import torch.nn as nn
import torch.optim as optim
from torchinfo import summary
import torch.nn.init as init


In [176]:
def get_rows(input_tensor):
    """Collect every straight line of cells of a cubic ``n x n x n`` board tensor.

    The returned list is ordered as follows:

    1. ``3 * n**2`` axis-aligned lines, grouped per ``(i, j)`` as
       ``[i, j, :]``, ``[i, :, j]``, ``[:, i, j]``.
    2. ``6 * n`` face diagonals, grouped per slice index ``i``: the main diagonals of
       ``[i, :, :]``, ``[:, i, :]``, ``[:, :, i]`` followed by the anti-diagonals of the
       same three planes.
    3. The 4 space diagonals (corner to opposite corner).

    For a 4x4x4 board this is 48 + 24 + 4 = 76 distinct lines.

    Args:
        input_tensor: 3-D tensor whose three dimensions all have the same size.

    Returns:
        list[torch.Tensor]: 1-D tensors of length ``n``. Space diagonals are built with
        ``torch.stack`` so they keep the dtype and device of ``input_tensor``.

    Raises:
        ValueError: if ``input_tensor`` is not a 3-D tensor with equal dimensions.
    """
    if input_tensor.dim() != 3 or len(set(input_tensor.shape)) != 1:
        raise ValueError(
            f"get_rows expects a cubic 3-D tensor, got shape {tuple(input_tensor.shape)}"
        )

    n = input_tensor.shape[0]
    last = n - 1

    # Axis-aligned lines: one line along each of the three axes for every (i, j) pair.
    horizontal_and_vertical_rows = []
    for i in range(n):
        for j in range(n):
            horizontal_and_vertical_rows.extend(
                [input_tensor[i, j, :], input_tensor[i, :, j], input_tensor[:, i, j]]
            )

    # Face diagonals (lines spanning two faces). Each 2-D plane is flipped on its own:
    # flipping the whole 3-D tensor with torch.fliplr reverses dim 1, which for the
    # [:, i, :] planes merely selects a different plane and never yields its
    # anti-diagonal.
    diag_two_faces = []
    for i in range(n):
        diag_two_faces.extend([
            torch.diagonal(input_tensor[i, :, :]),
            torch.diagonal(input_tensor[:, i, :]),
            torch.diagonal(input_tensor[:, :, i]),
            torch.diagonal(torch.flip(input_tensor[i, :, :], dims=[0])),
            torch.diagonal(torch.flip(input_tensor[:, i, :], dims=[1])),
            torch.diagonal(torch.flip(input_tensor[:, :, i], dims=[1])),
        ])

    # Space diagonals (lines spanning three faces).
    diag_three_faces = [
        torch.stack([input_tensor[k, k, k] for k in range(n)]),
        torch.stack([input_tensor[last - k, k, k] for k in range(n)]),
        torch.stack([input_tensor[k, last - k, k] for k in range(n)]),
        torch.stack([input_tensor[k, k, last - k] for k in range(n)]),
    ]

    return horizontal_and_vertical_rows + diag_two_faces + diag_three_faces

In [177]:
# Sanity check: number the 64 cells 0..63 so every extracted line can be read off
# directly as cell indices, then count how many lines get_rows returns.
input_tensor = torch.arange(64).view(4, 4, 4)
overall_rows = get_rows(input_tensor)
print(f'Number of rows: {len(overall_rows)}')

Number of rows: 76


In [178]:
# Display the extracted lines for manual inspection.
overall_rows

[tensor([0, 1, 2, 3]),
 tensor([ 0,  4,  8, 12]),
 tensor([ 0, 16, 32, 48]),
 tensor([4, 5, 6, 7]),
 tensor([ 1,  5,  9, 13]),
 tensor([ 1, 17, 33, 49]),
 tensor([ 8,  9, 10, 11]),
 tensor([ 2,  6, 10, 14]),
 tensor([ 2, 18, 34, 50]),
 tensor([12, 13, 14, 15]),
 tensor([ 3,  7, 11, 15]),
 tensor([ 3, 19, 35, 51]),
 tensor([16, 17, 18, 19]),
 tensor([16, 20, 24, 28]),
 tensor([ 4, 20, 36, 52]),
 tensor([20, 21, 22, 23]),
 tensor([17, 21, 25, 29]),
 tensor([ 5, 21, 37, 53]),
 tensor([24, 25, 26, 27]),
 tensor([18, 22, 26, 30]),
 tensor([ 6, 22, 38, 54]),
 tensor([28, 29, 30, 31]),
 tensor([19, 23, 27, 31]),
 tensor([ 7, 23, 39, 55]),
 tensor([32, 33, 34, 35]),
 tensor([32, 36, 40, 44]),
 tensor([ 8, 24, 40, 56]),
 tensor([36, 37, 38, 39]),
 tensor([33, 37, 41, 45]),
 tensor([ 9, 25, 41, 57]),
 tensor([40, 41, 42, 43]),
 tensor([34, 38, 42, 46]),
 tensor([10, 26, 42, 58]),
 tensor([44, 45, 46, 47]),
 tensor([35, 39, 43, 47]),
 tensor([11, 27, 43, 59]),
 tensor([48, 49, 50, 51]),
 tensor([

In [135]:
# Scratch tensors: a (1, 304) row vector and a (304, 304) matrix whose entries equal
# their flat (row-major) index.
test = torch.arange(304).view(1, -1)
test_weight = torch.arange(304 * 304).view(304, -1)

# Peek at the top-left 4x4 corner.
test_weight[:4, :4]

tensor([[  0,   1,   2,   3],
        [304, 305, 306, 307],
        [608, 609, 610, 611],
        [912, 913, 914, 915]])

In [151]:
# Build a 304x304 matrix that is zero everywhere except for 4x4 blocks of ones laid
# along the main diagonal, then show its top-left 12x12 corner.
weights = torch.zeros(304, 304)
for start in range(0, 304, 4):
    stop = start + 4
    weights[start:stop, start:stop] = 1.0
print(weights[0:12, 0:12])

tensor([[1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.]])


In [174]:
class StructuredLinear(nn.Module):
    """Bias-free linear layer initialised to a block-diagonal pattern of ones.

    The weight matrix starts as zeros with ``block_size x block_size`` blocks of ones
    along its main diagonal, so initially every output unit sums the inputs of its own
    block. The weight is a single dense ``nn.Parameter``: the block structure is only
    an initialisation and is not enforced by this layer once the weights are updated.

    Args:
        input_size: number of input features.
        output_size: number of output features.
        block_size: side length of each diagonal block (default 4).

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
    """

    def __init__(self, input_size, output_size, block_size=4):
        super(StructuredLinear, self).__init__()
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")
        self.input_size = input_size
        self.output_size = output_size
        self.block_size = block_size

        # Stored as (input_size, output_size) so that `x @ weights` in forward() is
        # shape-correct for non-square layers too; identical to the previous layout
        # whenever input_size == output_size.
        self.weights = nn.Parameter(torch.zeros((input_size, output_size)))
        self.reset_parameters()

    def reset_parameters(self):
        """Reset the weights to the block-diagonal pattern (zeros elsewhere)."""
        # Blocks can only be placed where both a row range and a column range exist.
        limit = min(self.input_size, self.output_size)
        with torch.no_grad():
            # Clear first so a repeated call also wipes learned off-block values.
            self.weights.zero_()
            for start in range(0, limit, self.block_size):
                stop = min(start + self.block_size, limit)
                self.weights[start:stop, start:stop] = 1.0

    def forward(self, x):
        """Return ``x @ weights`` for ``x`` of shape ``(..., input_size)``."""
        return torch.matmul(x, self.weights)

class MyNeuralNetwork(nn.Module):
    """Three-stage network: StructuredLinear(304, 304) -> Linear(304, 32) + ReLU -> Linear(32, 1).

    Both ``nn.Linear`` layers are created without a bias term.
    """

    def __init__(self):
        super().__init__()
        self.structured_layer = StructuredLinear(304, 304)
        self.second_layer = nn.Linear(304, 32, bias=False)
        self.output_layer = nn.Linear(32, 1, bias=False)

    def forward(self, x):
        """Run ``x`` through the structured layer, the ReLU hidden layer and the output layer."""
        structured = self.structured_layer(x)
        hidden = torch.relu(self.second_layer(structured))
        return self.output_layer(hidden)

# Example usage: instantiate the network defined above.
# NOTE(review): the name `model` is rebound to a TDNetwork in a later cell, so results
# depend on which cell ran last.
model = MyNeuralNetwork()


In [175]:
# Print per-layer input shape, output shape and parameter count for one 304-feature input.
summary(model, input_size = (1, 304), col_names = ['input_size', 'output_size', 'num_params'])

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
MyNeuralNetwork                          [1, 304]                  [1, 1]                    --
├─StructuredLinear: 1-1                  [1, 304]                  [1, 304]                  --
├─Linear: 1-2                            [1, 304]                  [1, 32]                   9,728
├─Linear: 1-3                            [1, 32]                   [1, 1]                    32
Total params: 9,760
Trainable params: 9,760
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.01
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.04
Estimated Total Size (MB): 0.04

In [None]:
class StructuredLinearLayer(nn.Module):
    """Unfinished scaffold for a structured layer; it only stores its configuration.

    Args:
        num_rows: stored as ``self.num_rows``.
        board_size: stored as ``self.board_size``.
    """

    def __init__(self, num_rows, board_size):
        super(StructuredLinearLayer, self).__init__()
        self.num_rows = num_rows
        self.board_size = board_size

    def createlayer(self):
        """Placeholder for the layer construction; does nothing and returns ``None``."""
        # TODO: build the layer from self.num_rows and self.board_size.
        return None

In [3]:
# Define the MLP architecture for the TD learning model
class TDNetwork(nn.Module, tictactoe_4x4.TicTacToe4x4x4):
    """Multi-layer perceptron applied to the line features of a board tensor.

    The network is ``Linear -> ReLU`` for every entry of ``hidden_sizes`` followed by
    a final ``Linear`` to ``output_size``.

    Args:
        input_size: number of features fed to the first layer. ``forward`` feeds the
            flattened output of ``get_rows``; for a 4x4x4 board that is 76 lines of
            4 cells, i.e. 304 values, so ``input_size`` must match that count.
        hidden_sizes: list of hidden layer widths (may be empty).
            Note: according to the paper, it will be 304, 32.
        output_size: width of the final layer.
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super(TDNetwork, self).__init__()
        layers = []
        in_features = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_features, hidden_size))
            layers.append(nn.ReLU())
            in_features = hidden_size
        # Use the running width rather than hidden_sizes[-1] so that an empty
        # hidden_sizes list yields a single Linear(input_size, output_size).
        layers.append(nn.Linear(in_features, output_size))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Extract all board lines from ``x`` and evaluate the MLP on them.

        ``get_rows`` returns a Python list of 1-D tensors, which ``nn.Linear`` cannot
        consume, so the lines are concatenated into one flat feature vector and cast
        to the dtype of the first layer's weights.
        """
        rows = get_rows(x)
        feature_map = torch.cat([row.reshape(-1).to(x.device) for row in rows])
        feature_map = feature_map.to(self.layers[0].weight.dtype)
        return self.layers(feature_map)


In [None]:
class MyModule(nn.Module):
    """Ten ``Linear(10, 10)`` layers held in an ``nn.ModuleList``.

    At step ``i`` the forward pass replaces ``x`` with
    ``linears[i // 2](x) + linears[i](x)``.
    """

    def __init__(self):
        super().__init__()
        stack = [nn.Linear(10, 10) for _ in range(10)]
        self.linears = nn.ModuleList(stack)

    def forward(self, x):
        # A ModuleList supports both integer indexing and len().
        for step in range(len(self.linears)):
            paired_layer = self.linears[step // 2]
            current_layer = self.linears[step]
            x = paired_layer(x) + current_layer(x)
        return x

In [4]:
# Initialize the TD learning model
# NOTE(review): TDNetwork.forward passes the output of get_rows(x) to the layers; for a
# 4x4x4 board get_rows yields 76 lines of 4 cells (304 values), not 64 — confirm the
# intended input_size. The TDNetwork comment also cites 304, 32 as the paper's hidden
# sizes, which differs from the values below — confirm which is intended.
# This cell rebinds `model`, replacing any model created in an earlier cell.
input_size = 64  # Assuming each space on the 4x4x4 board is represented as a binary (occupied or not)
hidden_sizes = [128, 128]  # Hidden layers sizes as per the paper's experimentation
output_size = 1  # Output size representing the value function
model = TDNetwork(input_size, hidden_sizes, output_size)


In [5]:
# Training parameters
learning_rate = 0.001  # step size handed to Adam
# Adam over every parameter of whatever `model` is bound to when this cell runs.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Mean-squared-error criterion; not referenced by the placeholder training loop yet.
loss_function = nn.MSELoss()


In [6]:
# Placeholder for the training loop
def train_td_model(model, num_episodes):
    """Stub training routine: loops ``num_episodes`` times and does no work.

    Each iteration is meant to eventually:
      1. interact with the environment,
      2. compute the TD target and TD error,
      3. update the model via backpropagation.

    Args:
        model: currently unused.
        num_episodes: number of (empty) iterations; passed to ``range``.

    Returns:
        None.
    """
    for _ in range(num_episodes):
        continue

# Placeholder for saving the model
def save_model(model, path):
    """Serialise ``model.state_dict()`` to ``path`` with ``torch.save``.

    Args:
        model: object exposing ``state_dict()``.
        path: destination handed unchanged to ``torch.save``.
    """
    state = model.state_dict()
    torch.save(state, path)


In [13]:
# Example usage: run the (placeholder) training loop, then persist the weights.
num_episodes = 1000  # Number of episodes for training
train_td_model(model, num_episodes)  # Train the model

# Save the trained model; the target folder is derived from the file path and created
# first if it does not exist yet.
model_path = 'C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/Phase_3_3D_Tic_Tac_Toe/models/td_tictactoe_model.pth'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
save_model(model, model_path)

model_path

'C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/Phase_3_3D_Tic_Tac_Toe/models/td_tictactoe_model.pth'