In [1]:
# !pip install gym
# !pip install cmake "gym[atari]" scipy
# !pip install gym[toy_text]

In [2]:
import gym
import numpy as np
import random
from IPython.display import clear_output
import pandas as pd
from time import sleep
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [3]:
class Q_learn:

  def __init__(self,env):
    self.env=env  


  def buildQ_leanTable(self):
    self.q_table = np.zeros([self.env.observation_space.n,self.env.action_space.n])
    return self.q_table


  def train(self,epoch=100001,alpha = 0.1,gamma = 0.6,epsilon = 0.1):
    self.buildQ_leanTable()
    print(epoch)
    for i in range(1, epoch):
        state = self.env.reset()
        epochs, reward, = 0, 0
        done = False

        while not done:
            
            
            if isinstance(state,int):
                state = state
            elif isinstance(state, tuple):
                state = state[0]
            
            if random.uniform(0, 1) < epsilon:
                action = self.env.action_space.sample() # Explore action space
            else:
                action = np.argmax(self.q_table[state]) # Exploit learned values

            next_state = self.env.step(action)
            reward = self.env.step(action)[0]
            done = self.env.step(action)
            info = self.env.step(action) 
            
            old_value = self.q_table[state, action]
            next_max = np.max(self.q_table[next_state[0]])
            
            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            self.q_table[state, action] = new_value

            state = next_state[0]
            epochs += 1
            
        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")

  def trainTune(self,alpha = 0.5,gamma = 0.8,epsilon = 0.6):
    self.buildQ_leanTable()
    for i in range(1, 100001):
        state = self.env.reset()
        epochs, reward, = 0, 0
        done = False

        while not done:
            
            if isinstance(state,int):
                state = state
            elif isinstance(state, tuple):
                state = state[0]
            
            if random.uniform(0, 1) < epsilon:
                action = self.env.action_space.sample() # Explore action space
            else:
                action = np.argmax(self.q_table[state]) # Exploit learned values

            next_state = self.env.step(action) 
            reward  = self.env.step(action)[0]
            done  = self.env.step(action)  
            info = self.env.step(action) 
            
            old_value = self.q_table[state, action]
            next_max = np.max(self.q_table[next_state[0]])
            
            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            self.q_table[state, action] = new_value

            state = next_state[0]
            epochs += 1
            
        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")
        if i % 1000 == 0:
            epsilon-=0.0058
            gamma-=0.0078
            alpha-=0.0048
            print(epsilon,gamma,alpha,sep='\n')

    print("Training finished.\n")
  def trainGraidSearch(self,hyper):
    self.buildQ_leanTable()
    df=pd.DataFrame(columns=['epoch','alpha','gama','epsilon','average Time'])
    epochF=False
    alphaF=False
    gammaF=False
    epsilonF=False
    try:
      epochs=hyper['epochs']
      epochF=True
    except:
      epochs=[100001]
    try:
      alphas=hyper['alphas']
      alphaF=False
    except:
      alphas=[0.1]
    try:
      gammas=hyper['gammas']
      gammaF=True
    except:
      gammas=[0.6]
    try:
      epsilons=hyper['epsilons']
      epsilonF=True
    except:
      epsilons=[0.1]
    first=True
    best_parameter=[]
    for epoch in epochs:
      for alpha in alphas:
        for gamma in gammas:
          for epsilon in epsilons:
            self.train(epoch,alpha,gamma,epsilon)
            average=self.evaloute()
            df.loc[len(df.index)]=[epoch,alpha,gamma,epsilon,average]
            display(df)
    best_parmeter=df.loc[df['average Time'].idxmin()]
    return best_parmeter 


  def evaloute(self,episodes = 100):
    total_epochs= 0
    for _ in range(episodes):
        state = self.env.reset()
        epochs, reward = 0, 0
        done = False
        
        while not done:
            action = np.argmax(self.q_table[state[0]])
            state  = self.env.step(action) 
            reward = self.env.step(action)
            done = self.env.step(action) 
            info = self.env.step(action)
            epochs += 1
        total_epochs += epochs
    average=total_epochs / episodes
    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {average}")
    return average

In [4]:
env = gym.make("CliffWalking-v0",render_mode="ansi")
env.reset()
print(env.render())

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T




In [5]:
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

Action Space Discrete(4)
State Space Discrete(48)


In [6]:
RL_model=Q_learn(env)
RL_model.train()


Episode: 100000
Training finished.



In [7]:
average_time=RL_model.evaloute()

Results after 100 episodes:
Average timesteps per episode: 1.0


In [8]:
RL_model.trainTune()
RL_model.evaloute(episodes=200)

Episode: 100000
0.019999999999998352
0.01999999999999974
0.019999999999998685
Training finished.

Results after 200 episodes:
Average timesteps per episode: 1.0


1.0

In [9]:
hyper={
    'epochs':[100001],
    'alphas':[0.8,0.5,0.3],
    'gammas':[0.8,0.5,0.2],
    'epsilons':[0.7,0.4,0.1,0.06]
}
best_param=RL_model.trainGraidSearch(hyper)
print("best parameters is :",best_param)


Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 1.0


Unnamed: 0,epoch,alpha,gama,epsilon,average Time
0,100001.0,0.8,0.8,0.7,1.0
1,100001.0,0.8,0.8,0.4,1.0
2,100001.0,0.8,0.8,0.1,1.0
3,100001.0,0.8,0.8,0.06,1.0
4,100001.0,0.8,0.5,0.7,1.0
5,100001.0,0.8,0.5,0.4,1.0
6,100001.0,0.8,0.5,0.1,1.0
7,100001.0,0.8,0.5,0.06,1.0
8,100001.0,0.8,0.2,0.7,1.0
9,100001.0,0.8,0.2,0.4,1.0


best parameters is : epoch           100001.0
alpha                0.8
gama                 0.8
epsilon              0.7
average Time         1.0
Name: 0, dtype: float64
