In [33]:
eposides = 10000000
zoom_ratio=8

In [22]:
import gym
import cv2
import numpy as np
import random
import math
import time
import pickle
from tqdm import tqdm

env = gym.make('PongDeterministic-v4', render_mode='rgb_array')

In [23]:
observation = env.reset()
cumulated_reward = 0

In [40]:
def discretizer(state,zoom_ratio=zoom_ratio):
    grey = cv2.cvtColor(state[34:-16], cv2.COLOR_BGR2GRAY)
    greyvals = {}
    unique_values = [64, 123, 147, 236]
    for value in unique_values:
        greyvals[value] = np.where(grey == value)
    states = []
    if greyvals[123][0].any():
        states.extend([(greyvals[123][0][0]+greyvals[123][0][-1])//2, (greyvals[123][1][0]+greyvals[123][1][-1])//2])
    else:
        states.extend([0,0])  
    if greyvals[147][0].any():
        states.extend([(greyvals[147][0][0]+greyvals[147][0][-1])//2, (greyvals[147][1][0]+greyvals[147][1][-1])//2])
    else:
        states.extend([0,0])
    if greyvals[236][0].any():
        states.extend([(greyvals[236][0][0]+greyvals[236][0][-1])//2, (greyvals[236][1][0]+greyvals[236][1][-1])//2])
    else:
        states.extend([0,0])
    return tuple([i//zoom_ratio for i in states])

In [25]:
Q_table = np.zeros([160//zoom_ratio, 160//zoom_ratio,160//zoom_ratio,160//zoom_ratio, 160//zoom_ratio, 160//zoom_ratio, 6],dtype=np.uint8)

In [28]:
def policy(state:tuple):
    return np.argmax(Q_table[state])
def new_Q_value(reward: float, state_new: tuple, discount_factor=1):
    future_optimal_value = np.max(Q_table[state_new])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value

def learning_rate(n:int, min_rate=0.01):
    return max(min_rate, min(1.0, 1.0-math.log10((n+1)/25)))

def exploration_rate(n:int, min_rate=0.1):
    return max(min_rate, min(1, 1.0-math.log10((n+1)/25)))

In [41]:
for e in range(eposides):
    current_state, done = discretizer(env.reset()[0]), False
    while done == False:
        action = policy(current_state)
        if np.random.random() < exploration_rate(e):
            action = env.action_space.sample()
        obs, reward, done,tun, _ = env.step(action)
        new_state = discretizer(obs)
        print(new_state)
        lr = learning_rate(e)
        learnt_value = new_Q_value(reward, new_state)
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1-lr)*old_value + lr*learnt_value
        
        current_state = new_state


[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 69, 141, 0, 0]
[0, 0, 8, 17, 0, 0]
(0, 0, 8, 17, 0, 0)
[0, 0, 69, 141, 0, 0]
[0, 0, 8, 17, 0, 0]
(0, 0, 8, 17, 0, 0)
[0, 0, 59, 141, 0, 0]
[0, 0, 7, 17, 0, 0]
(0, 0, 7, 17, 0, 0)
[0, 0, 59, 141, 0, 0]
[0, 0, 7, 17, 0, 0]
(0, 0, 7, 17, 0, 0)
[0, 0, 75, 141, 0, 0]
[0, 0, 9, 17, 0, 0]
(0, 0, 9, 17, 0, 0)
[0, 0, 77, 141, 0, 0]
[0, 0, 9, 17, 0, 0]
(0, 0, 9, 17, 0, 0)
[0, 0, 61, 141, 0, 0]
[0, 0, 7, 17, 0, 0]
(0, 0, 7, 17, 0, 0)
[0, 0, 39, 141, 0, 0]
[0, 0, 4, 17, 0, 0]
(0, 0, 4, 17, 0, 0)
[0, 0, 17, 141, 0, 0]
[0, 0, 2, 17, 0, 0]
(0, 0, 2, 17, 0, 0)
[0, 0, 4, 141, 0, 0]
[0, 0, 0, 17, 0, 0]
(0, 0, 0, 17, 0, 0)
[0, 0, 3, 141, 0, 0]
[0, 0, 0, 17, 0, 0]
(0, 0, 0, 17, 0, 0)
[0, 0, 2, 141, 0, 0]
[0, 0, 0, 17, 0, 0]
(0, 0, 0, 17, 0, 0)
[0, 0, 2, 141, 0, 0]
[0, 0, 0, 17, 0, 0]
(0, 0, 0, 17, 0, 0)
[0, 0, 2, 141, 0, 0]
[0, 0, 0, 17, 0, 0]
(0, 0, 0, 17, 0, 0)
[88, 17, 2, 141, 83, 77]
[11, 2, 0, 17, 10, 9]
(11, 2, 0, 17, 10, 9)
[92, 17, 7, 141, 87, 73]
[11, 

KeyboardInterrupt: 

In [None]:
np.save('Q_table.npy', Q_table)