# ConnectX Reinforcement Learning (Skeleton)
_Last updated: 2025-08-14 08:34 UTC_

**Goals**
- Define environment interface (state, actions, reward)
- Implement a simple baseline agent (random/heuristic)
- (Stretch) Train a DQN/Policy-Gradient agent and evaluate

## 0. Environment (simplified 6x7 Connect Four)

In [None]:

import numpy as np

# Default board geometry: a standard Connect Four grid is 6 rows by 7 columns.
ROWS = 6
COLS = 7

class ConnectXEnv:
    """Minimal two-player Connect-Four-style environment.

    The board is a ``rows x cols`` integer array where 0 is an empty cell,
    1 is the agent and -1 is the opponent. Row 0 is the top of the board, so
    a dropped piece lands in the highest-index empty row of its column.
    Players alternate automatically after every legal move, starting with
    the agent (1).

    Args:
        rows: Board height; defaults to the module-level ``ROWS``.
        cols: Board width; defaults to the module-level ``COLS``.
        inarow: Number of aligned pieces required to win (4 by default).

    Raises:
        ValueError: If ``rows``, ``cols`` or ``inarow`` is not positive.
    """

    # Unit steps (d_row, d_col): horizontal, vertical, down-right diagonal,
    # up-right diagonal. Scanning only these four covers every line once.
    _DIRECTIONS = ((0, 1), (1, 0), (1, 1), (-1, 1))

    def __init__(self, rows=None, cols=None, inarow=4):
        self.rows = ROWS if rows is None else rows
        self.cols = COLS if cols is None else cols
        self.inarow = inarow
        if self.rows < 1 or self.cols < 1 or self.inarow < 1:
            raise ValueError("rows, cols and inarow must all be positive")
        self.reset()

    def reset(self):
        """Clear the board, hand the move to the agent, return a board copy."""
        self.board = np.zeros((self.rows, self.cols), dtype=int)  # 0 empty, 1 agent, -1 opponent
        self.player = 1
        return self.board.copy()

    def valid_actions(self):
        """Return the columns whose top cell is still empty."""
        return [c for c in range(self.cols) if self.board[0, c] == 0]

    def step(self, action):
        """Drop the current player's piece into column ``action``.

        Returns:
            ``(observation, reward, done, info)``. The reward is always from
            the agent's perspective: +1.0 when player 1 completes a line,
            -1.0 when player -1 does, 0.0 otherwise. An illegal move leaves
            the board untouched and returns ``-10.0, True`` with
            ``{"illegal": True}`` as info.
        """
        if action not in self.valid_actions():
            return self.board.copy(), -10.0, True, {"illegal": True}
        # The top cell of a valid column is empty, so this scan from the
        # bottom always stops on a real row.
        row = self.rows - 1
        while self.board[row, action] != 0:
            row -= 1
        self.board[row, action] = self.player
        reward, done = self._check_terminal(self.player)
        self.player *= -1  # switch player
        return self.board.copy(), reward, done, {}

    def _check_terminal(self, p):
        """Return ``(reward, done)`` after player ``p`` has just moved.

        A completed line for ``p`` yields +1.0 if ``p`` is the agent (1) and
        -1.0 otherwise. A full board without a line is a draw ``(0.0, True)``;
        anything else is ``(0.0, False)``.
        """
        owned = self.board == p
        length = self.inarow
        win_reward = 1.0 if p == 1 else -1.0
        for d_row, d_col in self._DIRECTIONS:
            # Only start columns that leave room for a full line to the right.
            start_cols = range(self.cols - length + 1) if d_col else range(self.cols)
            for r in range(self.rows):
                last_row = r + d_row * (length - 1)
                if not 0 <= last_row < self.rows:
                    continue  # line would leave the board vertically
                for c in start_cols:
                    if all(owned[r + i * d_row, c + i * d_col] for i in range(length)):
                        return win_reward, True
        # draw?
        if (self.board != 0).all():
            return 0.0, True
        return 0.0, False

# Smoke-check the environment: build it, take the initial observation, and
# display the empty board next to the first few playable columns.
env = ConnectXEnv()
state = env.reset()
(state, env.valid_actions()[:5])


## 1. Random / Heuristic Agent (baseline)

In [None]:

import random

def random_agent(obs):
    """Pick a uniformly random legal column for the given board observation.

    Args:
        obs: 2-D board whose row 0 is the top row; a column is playable
            while its top cell is 0.

    Returns:
        A random playable column index, or 0 when every column is full
        (fallback kept so callers always receive an int).
    """
    # Read legal moves from the observation itself instead of the global
    # `env`, so the agent works with any environment instance or board copy.
    acts = [col for col, top_cell in enumerate(obs[0]) if top_cell == 0]
    return random.choice(acts) if acts else 0

# Play one game vs random opponent.
# `moves` counts completed rounds (agent move + opponent move); the cap of 100
# is only a safety net, since the board fills up long before that.
obs = env.reset()
done = False
moves = 0
while not done and moves < 100:
    # agent
    action = random_agent(obs)
    obs, reward, done, info = env.step(action)
    if done:
        break
    # opponent
    action_op = random_agent(obs)
    obs, reward_op, done, info = env.step(action_op)
    if done:
        # The env already reports rewards from the agent's perspective, so an
        # opponent win (-1.0) or a draw (0.0) is the agent's final outcome.
        reward = reward_op
    moves += 1

# Final agent-perspective reward, whether the game finished, rounds played.
reward, done, moves


## 2. (Stretch) DQN Skeleton

In [None]:

# Outline for students to fill in:
# - Build a small CNN/MLP over the 6x7 board state
# - Add an experience replay buffer
# - Act with an epsilon-greedy policy
# - Periodically copy weights into a target network
# Tip: present the board from the current player's perspective (e.g. multiply
# it by the player to move, +1 or -1) so one value function serves both sides.
