In [None]:
import json
import random
import numpy as np
from Pong import PongGame

# Q-Learning for Pong

This notebook demonstrates how Q-learning — a model-free reinforcement learning algorithm — can be applied to train an agent to play a simplified version of Pong.

## What is Q-learning?

Q-learning is an off-policy, value-based reinforcement learning algorithm. Its goal is to learn the optimal action-selection policy that maximizes the expected cumulative reward over time.

The agent learns a function Q(s, a), which estimates the value (expected cumulative discounted reward) of taking action `a` in state `s` and following the optimal policy thereafter. Over time, these Q-values are updated using the temporal-difference rule derived from the Bellman optimality equation:


$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$


Where:
- `α` is the learning rate (how much to update)
- `γ` is the discount factor (how much future rewards matter)
- `r` is the reward received after taking action `a` in state `s`
- `s'` is the next state
- `max_a' Q(s', a')` is the estimated value of the best action in the next state

## How it's applied here

In this project:
- The game environment (Pong) is **discretized** — continuous variables like ball and paddle positions are binned into finite ranges.
- The **state space** consists of: paddle Y position, ball X and Y position, and ball X and Y directions.
- The **action space** is limited to three options: move up, stay, or move down.
- A **Q-table** is used to store and update values for each (state, action) pair.

The agent uses an **epsilon-greedy strategy** to explore:
- With probability ε, it chooses a random action (exploration).
- Otherwise, it selects the action with the highest Q-value (exploitation).

During training:
- The agent interacts with a simplified, discretized simulation of the game (the ball moves one grid cell per step and bounces off the walls) for many episodes.
- Rewards guide the paddle to track and intercept the ball.
- Q-values are updated incrementally based on experience.

At the end of training, the Q-table is converted into a deterministic policy by selecting the best action for each known state. This policy is saved to a file and can be used in the game for evaluation.



In [1]:
# --- Q-learning hyperparameters ---------------------------------------------
LEARNING_RATE = 0.1            # alpha: step size of each Q-value update
DISCOUNT_FACTOR = 0.95         # gamma: weight given to the next state's best Q-value
EPSILON = 0.1                  # probability of picking a random action while training
EPISODES = 200_000             # number of training episodes
MAX_ACTIONS_PER_EPISODE = 2_000  # cap on simulated steps per episode

# --- State discretization (number of bins per variable) ---------------------
NUM_PADDLE_POS = 30
NUM_BALL_X = 50
NUM_BALL_Y = 40

# Paddle moves, expressed as a change of paddle bin per step.
ACTIONS = [-1, 0, 1]

# --- Geometry (presumably pixels, matching the Pong module; confirm there) ---
WIDTH = 800
HEIGHT = 600
PADDLE_WIDTH = 10
PADDLE_HEIGHT = 100

In [2]:
def discretize(value, max_value, bins):
    """Map ``value`` on the scale ``[0, max_value]`` to a bin index in ``[0, bins - 1]``.

    The index is ``int(value / max_value * bins)`` (truncated toward zero),
    clamped so that negative values land in bin 0 and values at or beyond
    ``max_value`` land in the last bin.

    Raises:
        ZeroDivisionError: if ``max_value`` is 0.
    """
    raw_bin = int(value / max_value * bins)
    # Clamp from below first, then from above (same order as min(max(...))).
    floor_clamped = raw_bin if raw_bin > 0 else 0
    top_bin = bins - 1
    return top_bin if top_bin <= floor_clamped else floor_clamped

In [None]:
def calculate_reward(state, action):
    """Score ``action`` taken in the discretized ``state``.

    The reward is the sum of three parts:

    * a tracking penalty, ``-distance / (HEIGHT / 2)``, where ``distance`` is
      the vertical gap between the paddle centre and the ball after both bin
      indices are scaled back by ``HEIGHT``;
    * a ``+0.5`` bonus when the action moves the paddle index toward the ball
      (``+1`` if the ball's y is larger than the paddle centre, ``-1`` if it
      is smaller);
    * only when the ball is in the last x bin and moving in the ``+1``
      x direction: ``+5`` if the gap is within half a paddle height,
      otherwise ``-100``.

    Args:
        state: tuple ``(paddle_y, ball_x, ball_y, ball_dx, ball_dy)`` of bin
            indices and directions; ``ball_dy`` is not used.
        action: one of the paddle moves ``-1``, ``0`` or ``1``.

    Returns:
        The reward as a float.
    """
    paddle_bin, ball_col, ball_row, ball_dx, _unused_ball_dy = state

    # Convert bin indices back to the HEIGHT scale.
    paddle_mid = paddle_bin * (HEIGHT / NUM_PADDLE_POS) + PADDLE_HEIGHT / 2
    ball_scaled_y = ball_row * (HEIGHT / NUM_BALL_Y)
    gap = abs(paddle_mid - ball_scaled_y)

    total = -gap / (HEIGHT / 2)

    ball_has_larger_y = ball_scaled_y > paddle_mid
    ball_has_smaller_y = ball_scaled_y < paddle_mid
    if (ball_has_larger_y and action == 1) or (ball_has_smaller_y and action == -1):
        total += 0.5

    ball_at_last_column = ball_dx == 1 and ball_col == NUM_BALL_X - 1
    if not ball_at_last_column:
        return total

    # Ball is in the last column heading outward: reward a cover, punish a miss.
    if gap <= PADDLE_HEIGHT / 2:
        total += 5
    else:
        total -= 100
    return total

In [None]:
# Maps a state tuple to a list of Q-values, one per entry of ACTIONS.
q_table = {}


def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A state seen for the first time gets a zero-initialised row in the global
    ``q_table``. With probability ``epsilon`` a uniformly random member of
    ``ACTIONS`` is returned; otherwise the action with the largest Q-value
    (``np.argmax`` picks the first one on ties).

    Args:
        state: hashable state key.
        epsilon: exploration probability in ``[0, 1]``.

    Returns:
        A member of ``ACTIONS``.
    """
    q_values = q_table.setdefault(state, [0] * len(ACTIONS))

    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.choice(ACTIONS)
    return ACTIONS[np.argmax(q_values)]

In [None]:
def update_q_table(state, action, reward, next_state):
    """Apply one tabular Q-learning update for ``(state, action) -> next_state``.

    Implements
    ``Q(s, a) += LEARNING_RATE * (reward + DISCOUNT_FACTOR * max Q(s', .) - Q(s, a))``
    on the global ``q_table``.

    Args:
        state: hashable key of the state in which ``action`` was taken.
        action: the action taken; must be a member of ``ACTIONS``.
        reward: scalar reward observed for the transition.
        next_state: hashable key of the resulting state.

    Raises:
        ValueError: if ``action`` is not in ``ACTIONS``.

    Side effects:
        Mutates ``q_table``; rows for states not seen before are created
        zero-initialised.
    """
    n_actions = len(ACTIONS)
    # Create the row for *both* states on first sight. Previously only
    # next_state was initialised, so calling this for a state that had not
    # already been passed through choose_action raised KeyError.
    current_row = q_table.setdefault(state, [0] * n_actions)
    next_row = q_table.setdefault(next_state, [0] * n_actions)

    action_index = ACTIONS.index(action)
    # Bootstrapped target: immediate reward plus discounted best next value.
    td_target = reward + DISCOUNT_FACTOR * max(next_row)
    current_row[action_index] += LEARNING_RATE * (td_target - current_row[action_index])


In [7]:
# Train the Q-table on a simplified grid simulation of Pong, then export the
# greedy policy as JSON.
for episode in range(EPISODES):
    # Every episode starts from a uniformly random grid configuration.
    paddle_y = random.randint(0, NUM_PADDLE_POS - 1)
    ball_x = random.randint(0, NUM_BALL_X - 1)
    ball_y = random.randint(0, NUM_BALL_Y - 1)
    ball_dx = random.choice([-1, 1])
    ball_dy = random.choice([-1, 1])

    # Nothing in this simulation ends an episode early, so each episode runs
    # for exactly MAX_ACTIONS_PER_EPISODE steps.
    for _ in range(MAX_ACTIONS_PER_EPISODE):
        state = (paddle_y, ball_x, ball_y, ball_dx, ball_dy)
        action = choose_action(state, EPSILON)

        # Move the paddle by one bin, clamped to the valid bin range.
        paddle_y = max(0, min(NUM_PADDLE_POS - 1, paddle_y + action))

        # Advance the ball by one grid cell along each axis.
        ball_x += ball_dx
        ball_y += ball_dy

        # Reverse direction whenever the ball is on or past a wall.
        # NOTE(review): the flip ignores the current direction. A ball drawn
        # on a wall cell and already heading outward steps one cell outside
        # the grid and then keeps flipping on that wall for the rest of the
        # episode. That cycle is also the only way the states scored as
        # hit/miss by calculate_reward (ball_x == NUM_BALL_X - 1 with
        # ball_dx == 1) are revisited after the first step, so the dynamics
        # are intentionally left unchanged here; revisit together with the
        # reward design.
        if ball_y <= 0 or ball_y >= NUM_BALL_Y - 1:
            ball_dy = -ball_dy
        if ball_x <= 0:
            ball_dx = -ball_dx
        if ball_x >= NUM_BALL_X - 1:
            ball_dx = -ball_dx

        next_state = (paddle_y, ball_x, ball_y, ball_dx, ball_dy)

        # The reward is computed from the state *before* the step and the
        # action chosen in it.
        reward = calculate_reward(state, action)

        update_q_table(state, action, reward, next_state)

    if (episode + 1) % 25000 == 0:
        print(f"Episode {episode + 1}/{EPISODES}")

# Collapse the Q-table into a deterministic policy: best action per visited
# state. Keys are str(state) because JSON object keys must be strings.
policy = {str(state): ACTIONS[np.argmax(actions)] for state, actions in q_table.items()}

# Write to the same path the evaluation cell passes to PongGame. This used to
# be "pong_q_policy" (no extension), which left the evaluation cell reading a
# missing or stale "pong_q_policy.json" on a fresh run.
with open("pong_q_policy.json", "w") as f:
    json.dump(policy, f)


Episode 25000/200000
Episode 50000/200000
Episode 75000/200000
Episode 100000/200000
Episode 125000/200000
Episode 150000/200000
Episode 175000/200000
Episode 200000/200000


In [4]:
# Evaluate the trained agent: hand the saved policy file to the game and run it.
# This path must match the file written by the training cell.
policy_file = "pong_q_policy.json"
# NOTE(review): PongGame comes from the local Pong module; presumably it loads
# the JSON policy from this path and uses it to drive a paddle (confirm in Pong.py).
game = PongGame(policy_file)
game.run()