In [1]:
import numpy as np # used for arrays
import gym # pull the environment
import time # to get the time
import math # needed for calculations
import matplotlib.pyplot as plt
from gym import wrappers  #gymの画像保存
import pickle
import os

In [2]:
# [1] Helpers that discretize the state so the Q function can be stored as a table ------------
# Build the thresholds used to convert an observed value into a discrete index
def bins(clip_min, clip_max, num):
    """Return the interior edges splitting [clip_min, clip_max] into ``num`` equal bins.

    ``num + 1`` evenly spaced edges are generated and the two outer ones
    (``clip_min`` and ``clip_max``) are dropped, so the result is suitable as
    the ``bins`` argument of ``np.digitize``.

    Args:
        clip_min: Lower end of the value range.
        clip_max: Upper end of the value range.
        num: Number of bins the range is divided into.

    Returns:
        1-D numpy array holding the interior bin edges in ascending order.
    """
    edges = np.linspace(clip_min, clip_max, num + 1)
    interior_edges = edges[1:-1]
    return interior_edges

In [3]:
# Convert each observed value into a discrete index
def digitize_state(observation):
    """Map a 4-component observation to a single integer state index.

    Every component is bucketed with ``np.digitize`` against the interior
    edges produced by ``bins`` for its clip range, and the four bucket numbers
    are combined as the digits of a base-``num_dizitized`` number (first
    component is the least significant digit).

    Reads the module-level ``num_dizitized``.

    Args:
        observation: Sequence unpacking to exactly four values:
            cart position, cart velocity, pole angle, pole velocity.

    Returns:
        Integer state index built from the four bucket numbers.
    """
    # Unpacking enforces that the observation has exactly four components.
    cart_pos, cart_v, pole_angle, pole_v = observation

    # (value, lower clip, upper clip) for each component, in digit order.
    components = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )

    state_index = 0
    for place, (value, low, high) in enumerate(components):
        bucket = np.digitize(value, bins=bins(low, high, num_dizitized))
        state_index += bucket * (num_dizitized**place)
    return state_index

In [4]:
# [2] Function that selects the action a(t) -------------------------------------
def get_action(next_state, episode):
    """Choose an action for ``next_state`` with a decaying epsilon-greedy policy.

    Reads the module-level ``q_table`` (it is not modified here).

    Epsilon is ``0.5 / (episode + 1)``, so it DEcreases as training goes on:
    early episodes explore often, later ones act almost purely greedily.

    Args:
        next_state: Row index of the discretized state in ``q_table``.
        episode: Zero-based index of the current episode.

    Returns:
        Index of the chosen action: the argmax of the state's Q-values with
        probability ``1 - epsilon``, otherwise a uniformly random action.
    """
    action_values = q_table[next_state]
    epsilon = 0.5 * (1 / (episode + 1))
    if epsilon <= np.random.uniform(0, 1):
        # Exploit: best known action for this state.
        next_action = np.argmax(action_values)
    else:
        # Explore: draw from every action the table has a column for, rather
        # than a hard-coded [0, 1], so the policy matches the table's width.
        next_action = np.random.choice(len(action_values))
    return next_action
 
 
# [3] Function that updates the Q-table -------------------------------------
def update_Qtable(q_table, state, action, reward, next_state):
    """Apply one Q-learning update to ``q_table[state, action]`` in place.

    Uses the module-level learning rate ``alpha`` and discount factor
    ``gamma``:

        Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a'))

    Args:
        q_table: 2-D array indexed as ``q_table[state, action]``.
        state: Row index of the state the action was taken in.
        action: Column index of the action taken.
        reward: Reward received for the transition.
        next_state: Row index of the resulting state.

    Returns:
        The same ``q_table`` object, after the in-place update.
    """
    # Best value reachable from the next state, taken over ALL action columns
    # (not just the first two) so it agrees with however wide the table is.
    next_Max_Q = np.max(q_table[next_state])

    # Blend the old estimate with the bootstrapped target.
    q_table[state, action] = (1 - alpha) * q_table[state, action] +\
            alpha * (reward + gamma * next_Max_Q)

    return q_table

In [5]:
# ============================================================
# Configuration and shared state for the Q-learning run
# ============================================================

# --- Observation discretization -----------------------------
# Each of the 4 observation components is split into this many bins, so the
# Q-table has num_dizitized ** 4 rows.
num_dizitized = 6

# --- Learning hyperparameters -------------------------------
num_episodes = 2000  # maximum number of training episodes
gamma = 0.99  # discount factor
alpha = 0.5  # learning rate
# NOTE: epsilon is not configured here; get_action derives it from the
# episode index.

# --- Stopping criterion -------------------------------------
# An episode is cut off once it reaches this many steps.
max_number_of_steps = 200

# Number of most recent episodes whose rewards are averaged when deciding
# whether learning has finished (averaging keeps the decision stable).
num_consecutive_iterations = 100

# Learning counts as finished once that average exceeds this value.
goal_average_reward = max_number_of_steps - 5

# --- Environment --------------------------------------------
env = gym.make('CartPole-v0')

# --- Mutable training state ---------------------------------
# Q-table indexed as q_table[state, action], randomly initialized with
# uniform values between -1 and 1.
q_table = np.random.uniform(-1, 1, (num_dizitized**4, env.action_space.n))

# Rolling window of the latest episode rewards; its mean is compared against
# goal_average_reward.
total_reward_vec = np.zeros(num_consecutive_iterations)

# Cart x position at the end of each episode played after learning finished.
final_x = np.zeros(shape=(num_episodes, 1))

islearned = 0  # flag: set to 1 once learning has finished
isrender = 0  # rendering flag

# --- Where the learned Q-table is pickled --------------------
save_pickle_folder = "cartpole_weight"
save_pickle_name = f"{max_number_of_steps}-step-qtable.pkl"
save_pickle_path = os.path.join(save_pickle_folder, save_pickle_name)
os.makedirs(save_pickle_folder, exist_ok=True)

In [6]:
# Q-learning training loop.
#
# Runs at most num_episodes episodes, updating q_table in place after every
# step. Learning counts as finished once the mean of the last
# num_consecutive_iterations episode rewards exceeds goal_average_reward.
# Episodes played after that point are rendered, and the loop stops as soon as
# the mean is still above the goal at the end of a rendered episode.
#
# The whole loop sits in try/finally so that a learned Q-table is written to
# save_pickle_path even when the rendered playback is interrupted
# (e.g. KeyboardInterrupt) or raises.
try:
    for espisode in range(num_episodes):

        # Start a new episode: S is the initial observation S0.
        S = env.reset()
        discretized_S = digitize_state(S)
        is_done = False
        t = 0
        episode_reward = 0

        # Step until the environment reports done or the step cap is reached.
        while t < max_number_of_steps and not is_done:
            t += 1
            # Choose A from Q with the epsilon-greedy policy.
            A = get_action(discretized_S, espisode)

            # The environment's own reward is replaced by the shaped one below.
            S_next, reward, is_done, _ = env.step(A)

            # After learning has finished, show the agent playing.
            if islearned:
                env.render()
                time.sleep(0.1)

            # Reward shaping: an episode that ends before goal_average_reward
            # steps is punished with -max_number_of_steps; every other step,
            # including a late final step, earns +1.
            if is_done and t < goal_average_reward:
                reward = - max_number_of_steps
            else:
                reward = 1

            discretized_S_next = digitize_state(S_next)

            episode_reward += reward

            update_Qtable(q_table, discretized_S, A, reward, discretized_S_next)

            discretized_S = discretized_S_next

        # End-of-episode bookkeeping. This runs after the step loop rather
        # than only when is_done is set, so an episode that was cut off by
        # max_number_of_steps is recorded as well; otherwise the rolling
        # average below would silently skip every capped episode.
        # The window drops its oldest entry and appends the newest reward.
        total_reward_vec = np.hstack((total_reward_vec[1:], episode_reward))

        if islearned:
            # Cart x position of the last observation of this episode.
            final_x[espisode, 0] = S_next[0]

        if np.average(total_reward_vec) > goal_average_reward:
            # Already learned and still above the goal: stop.
            if islearned:
                break

            print("learning completed")
            islearned = 1
finally:
    # Persist the table only if learning actually completed.
    if islearned:
        with open(save_pickle_path, "wb") as f:
            pickle.dump(q_table, f)



learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed
learning completed


KeyboardInterrupt: 