# CSC 580 HW#4 "QLearning.ipynb" -- Q-learning for the Snake Game
# **Initial Code**

In [1]:
# First install this library so that we can import code from other Notebooks
## https://newbedev.com/how-to-import-functions-of-a-jupyter-notebook-into-another-jupyter-notebook-in-google-colab#:~:text=How%20to%20import%20functions%20of%20a%20jupyter%20notebook,mount%20your%20google%20drive%20to%20access%20your%20xxx.ipynb
!pip install import-ipynb
import import_ipynb

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import SnakeEnv as snake_env
import Agent as agent_class

import numpy as np
import matplotlib.pyplot as plt
import random

importing Jupyter notebook from SnakeEnv.ipynb
Defaulting to user installation because normal site-packages is not writeable
importing Jupyter notebook from Agent.ipynb


## Q-Learning -- Off-policy Temporal Difference Control

In [3]:
def q_learning(agent, env, max_steps, train=True):
    """
    Play the game for 'max_steps' timesteps and report summary statistics.

    The play is continuous: the 'done' flag returned by the environment is not
    used to break out of the loop, so the loop always runs for the full
    'max_steps' iterations.

    Parameters:
      agent     -- created by the caller; must provide init_state, select_action,
                   select_greedy, update_Qtable, num_states_visited and a 'gamma'
                   attribute.
      env       -- created by the caller; must provide reset() and step(action),
                   where step returns a 4-tuple (next_state, reward, done, info).
      max_steps -- number of timesteps to play.
      train     -- True for a training run (epsilon-greedy action selection plus
                   a Q-table update every step); False for an evaluation run
                   (greedy selection, Q-table left untouched).

    Returns a 5-tuple:
      (discounted total return, number of steps with reward 10 [apples],
       number of steps with reward -100 [stops], number of steps with reward 1
       [good steps], agent.num_states_visited()).
    """
    # Start from a freshly reset environment and let the agent see the first state.
    state = env.reset()
    agent.init_state(state)  #(A)

    # The selection rule is fixed for the whole run, so pick it once up front:
    # epsilon-greedy while training, purely greedy while evaluating.  #(A)
    choose_action = agent.select_action if train else agent.select_greedy

    # Housekeeping tallies
    discounted_return = 0.0
    apples = stops = good_steps = 0

    for t in range(max_steps):
        action = choose_action(state)

        # The environment executes the action; 'done' and 'info' are not used here.
        next_state, reward, _done, _info = env.step(action)

        # Q-learning update (training runs only)
        if train:
            agent.update_Qtable(state, action, reward, next_state)  #(A)

        # Advance to the next state
        state = next_state

        # Discount by the global timestep index, not by steps since the last stop.
        discounted_return += (agent.gamma ** t) * reward

        # Classify the step by its reward value.
        if reward == -100:
            # An episode stops when the snake curls itself or hits a wall,
            # but the play itself keeps going.
            stops += 1
        elif reward == 1:
            good_steps += 1
        elif reward == 10:
            apples += 1

    return (discounted_return, apples, stops, good_steps,
            agent.num_states_visited())  #(A)


# TD Training/Learning -- If you are **evaluating** the learned policy, do NOT run this cell; instead run the next cell (TD evaluation).
## The cell below *learns* the optimal policy

In [4]:
# Train with Q-learning: call q_learning 'num_runs' times, 'num_steps' steps per run.
# The agent and the environment are created anew, and the parameters are
# (re-)initialized, for every run.
#
# NOTE: Set env.display to True or False to control the graphic visualization.
# After visualizing once, the kernel has to be restarted before running again,
# due to the bug(s) in the turtle library.

num_runs = 1
num_steps = 1000
results_list = []  # one (return, #apples, #stops, #goodsteps, #states) tuple per run

for run in range(num_runs):
    params = dict()
    params['gamma'] = 0.95
    params['alpha'] = 0.7
    params['epsilon'] = 0.6         # exploration probability at start
    params['epsilon_min'] = .01     # minimum epsilon
    params['epsilon_decay'] = .995  # exponential decay rate for epsilon

    # Create an environment and an agent
    env = snake_env.SnakeEnv()
    try:
        agent = agent_class.Agent(env, params)

        env.display = False #True      ## (**) display on/off

        ret = q_learning(agent, env, num_steps, True) # training=True
        results_list.append(ret)
    finally:
        # Always close the environment, even when the run raises, so a failed
        # run does not leave it open.
        env.close()

    print ("* Run {}: Return={:>8.3f}, #Apples={}, #Stops={}, #GoodSteps={}, #StatesVisited={}"
           .format(run, ret[0], ret[1], ret[2], ret[3], ret[4]))
    
# Display the mean of every statistic over all runs (column-wise)
results = np.array(results_list)
cmean = np.mean(results, axis=0)
print ("\n** Mean: Return={:>8.3f}, #Apples={}, #Stops={}, #GoodSteps={}, #StatesVisited={}"
           .format(cmean[0], cmean[1], cmean[2], cmean[3], cmean[4]))

# 'agent' is the agent of the last run, so only that run's Q-table is saved.
agent.write_qtable("qtable.csv") #(A)

* Run 0: Return= -20.146, #Apples=2, #Stops=38, #GoodSteps=477, #StatesVisited=0

** Mean: Return= -20.146, #Apples=2.0, #Stops=38.0, #GoodSteps=477.0, #StatesVisited=0.0


# TD Evaluation -- Run this cell **after** you have finished coding (and running) the training/learning cell above, since it reads the saved q-table.
## Code to *evaluate* the learned policy.  The q-table is read in from the saved csv file.  No learning takes place.

In [None]:
# Evaluate the saved policy: call q_learning with train=False 'num_runs' times,
# 'num_steps' steps per run.  The Q-table is read from "qtable.csv"; with
# train=False, q_learning selects actions greedily and never updates the table.

num_runs = 1
num_steps = 300
results_list = []  # one (return, #apples, #stops, #goodsteps, #states) tuple per run

for run in range(num_runs):
    # The learning parameters are still supplied because Agent takes a params dict.
    params = dict()
    params['gamma'] = 0.95
    params['alpha'] = 0.7
    params['epsilon'] = 0.8         # exploration probability at start
    params['epsilon_min'] = .01     # minimum epsilon
    params['epsilon_decay'] = .995  # exponential decay rate for epsilon

    # Create an environment and an agent
    env = snake_env.SnakeEnv()
    try:
        agent = agent_class.Agent(env, params)

        # Read in the q-table
        agent.read_qtable("qtable.csv")

        env.display = True      ## <== display on/off

        ret = q_learning(agent, env, num_steps, False) # training=False for evaluation
        results_list.append(ret)
    finally:
        # Always close the environment, even when the run raises (e.g. the
        # q-table file is missing), so a failed run does not leave it open.
        env.close()

    print ("* Run {}: Return={:>8.3f}, #Apples={}, #Stops={}, #GoodSteps={}, #StatesVisited={}"
           .format(run, ret[0], ret[1], ret[2], ret[3], ret[4]))
    
# Display the mean of every statistic over all runs (column-wise)
results = np.array(results_list)
cmean = np.mean(results, axis=0)
print ("\n** Mean: Return={:>8.3f}, #Apples={}, #Stops={}, #GoodSteps={}, #StatesVisited={}"
           .format(cmean[0], cmean[1], cmean[2], cmean[3], cmean[4]))