In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [100]:
import pandas as pd
df = pd.read_csv("tictactoemoves.csv")
len(df)

7645

In [101]:
X = df.drop('move', axis=1)
y = df['move']


In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,pos_1_empty,pos_1_x,pos_1_o,pos_2_empty,pos_2_x,pos_2_o,pos_3_empty,pos_3_x,pos_3_o,pos_4_empty,...,pos_6_o,pos_7_empty,pos_7_x,pos_7_o,pos_8_empty,pos_8_x,pos_8_o,pos_9_empty,pos_9_x,pos_9_o
1513,1,0,0,0,1,0,1,0,0,1,...,0,1,0,0,0,0,1,1,0,0
216,0,1,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
7357,1,0,0,1,0,0,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1
5170,1,0,0,0,1,0,0,0,1,1,...,0,0,1,0,0,0,1,0,1,0
421,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0,0,1,1,0,0,1,0,0,1,...,1,0,1,0,0,1,0,0,1,0
5390,0,0,1,0,0,1,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0
860,0,1,0,0,1,0,1,0,0,0,...,1,0,0,1,0,1,0,0,1,0
7603,0,1,0,1,0,0,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0


In [103]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [104]:
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [105]:
X_train_tf = tf.convert_to_tensor(X_train_scaled, dtype=tf.float32)
x_test_tf = tf.convert_to_tensor(X_test_scaled, dtype=tf.float32)
y_train_tf = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_test_tf = tf.convert_to_tensor(y_test, dtype=tf.int32)

In [106]:
X_train_tf

<tf.Tensor: shape=(6116, 27), dtype=float32, numpy=
array([[ 0.9805681 , -0.5985028 , -0.5410637 , ...,  0.99348104,
        -0.6080829 , -0.5415687 ],
       [-1.019817  ,  1.6708359 , -0.5410637 , ...,  0.99348104,
        -0.6080829 , -0.5415687 ],
       [ 0.9805681 , -0.5985028 , -0.5410637 , ..., -1.0065618 ,
        -0.6080829 ,  1.8464878 ],
       ...,
       [-1.019817  ,  1.6708359 , -0.5410637 , ..., -1.0065618 ,
         1.6445125 , -0.5415687 ],
       [-1.019817  ,  1.6708359 , -0.5410637 , ..., -1.0065618 ,
         1.6445125 , -0.5415687 ],
       [-1.019817  , -0.5985028 ,  1.8482112 , ..., -1.0065618 ,
         1.6445125 , -0.5415687 ]], dtype=float32)>

In [107]:
def build_dqn(input_shape, num_actions):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_actions, activation='linear')  # Output layer: Q-values for each action
    ])
    return model

In [108]:
state_size = 9
input_shape = (state_size,)
num_actions = 9

In [109]:
dqn_model = build_dqn(input_shape, num_actions)

In [110]:
dqn_model.summary()

In [111]:
dqn_model.compile(optimizer='adam',loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [112]:
# Initialize environment and DQN model
print('starting...')
import random
from logic import TicTacToeEnvironment
env = TicTacToeEnvironment()
dqn_model = build_dqn(input_shape, num_actions)

# Define hyperparameters
replay_memory = []
num_episodes = 1000
max_steps_per_episode = 100
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
gamma = 0.99
batch_size = 32

for episode in range(num_episodes):
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps_per_episode):

        if np.random.rand() <= epsilon:
            action = np.random.choice(env.get_valid_actions())
        else:
            state = state.reshape(1,-1)
            action = np.argmax(dqn_model.predict(state))


        next_state, reward, done = env.step(action)
        episode_reward += reward


        replay_memory.append((state, action, reward, next_state, done))


        if len(replay_memory) >= batch_size:
            minibatch = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*minibatch))
            print("Original shape of next_states_batch:", next_states_batch.shape)
            next_states_batch = next_states_batch.reshape(next_states_batch.shape[0], -1)
            print("Reshaped shape of next_states_batch:", next_states_batch.shape)

            predicted_Q_values = dqn_model.predict(next_states_batch)
            print("Shapes: reward_batch:", reward_batch.shape, "next_states_batch:", next_states_batch.shape, "done_batch:", done_batch.shape)
            target_Q_values = reward_batch + (gamma * np.max(predicted_Q_values, axis=1)) * (1 - done_batch)





            dqn_model.train_on_batch(states_batch, action_batch, sample_weight=target_Q_values)

        state = next_state

        if done:
            break


    epsilon = max(epsilon_min, epsilon * epsilon_decay)


    print(f"Episode {episode}: Reward = {episode_reward}")

starting...
Episode 0: Reward = 1
Episode 1: Reward = 1
Episode 2: Reward = 1
Episode 3: Reward = 1
Episode 4: Reward = 1
Episode 5: Reward = 1
Original shape of next_states_batch: (32, 27)
Reshaped shape of next_states_batch: (32, 27)


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_66" is incompatible with the layer: expected axis -1 of input shape to have value 9, but received input with shape (32, 27)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 27), dtype=int32)
  • training=False
  • mask=None