In [1]:
import tensorflow as tf #Tensorflow handles the Training and Testing
from tensorflow import keras #Keras handles the importing of Data
import numpy as np #NumPy does funny math good
import gym #imports OpenAI Gym which has a bunch of environments(games) to play with
import matplotlib.pyplot as plt
from statistics import mean, median 
from tqdm import tqdm

In [2]:
env = gym.make("CartPole-v1")

In [None]:
#Environment Notes:
#   -new_state is an array of 4 values 
#       Num	    Observation	            Min	        Max
#       0	    Cart Position	        -2.4	    2.4
#       1	    Cart Velocity	        -Inf	    Inf
#       2	    Pole Angle	            ~ -41.8°	~ 41.8°
#       3	    Pole Velocity At Tip	-Inf	    Inf
# 
#   - actions are left or right mapped as 0=left 1=right
# 
#   - Episodes are terminated if:
#       - Pole Angle is more than ±12°
#       - Cart Position is more than ±2.4 (center of the cart reaches the edge of the display)
#       - Episode length is greater than 200 (500 for v1).
# 
#   - Problem is considered solved if:
#       - Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials


In [None]:
#Random Action Test
# for e in range(10):
#     state = env.reset()
#     for s in range(100):

#         env.render()
#         action = env.action_space.sample()
#         new_state, reward, done, _ = env.step(action)

#         # if done:
#         #     break
# env.close()

In [3]:
#Hpyer Parameters
score_threshhold = 50
n_training_games = 275000
n_training_steps = 500

#Creating Training Data
training_data = [] #Each Piece of data will be in format (observations, moves)

loss = 0
progress = tqdm(total=n_training_games, position=0, leave=False)
progress.set_description("Data Generation Progress")
scores = [] #List of good data
for _ in range(n_training_games): #Generate 20000 training data
    score = 0 #Score counter over episode

    memory = [] #List of all moves
    prev_obs = [] #Last observation made

    progress.update(1)

    env.reset()
    for _ in range(n_training_steps): #Runs 250 steps per episode
        action = env.action_space.sample() #Chooses random action
        new_state, reward, done, _ = env.step(action) #Runs random action

        if(len(prev_obs)>0): #Except for the first time, add all sets of previous observations and current actions to memory
            memory.append([prev_obs, action])
        prev_obs = new_state #Update previous observation

        score+=reward #Update Score
        if done: break #Episode data is <= 250 observations 

    if score >= score_threshhold:  
        scores.append(score)
        
        #Converting actions(left,right) to one-hot(0,1) representation
        #NOTE: data[0] is the observation of size 4, data[1] is resulting left-right action
        for data in memory:
            if(data[1] == 0):
                output=[1,0]
            elif(data[1] == 1):
                output=[0,1]

        training_data.append([data[0], output]) #Compiles data for training set
    
    else: loss+=1

print()
print("Training data has", len(training_data), "episodes (Loss =", (loss/n_training_games)*100, "%)")
print('Average accepted score:', mean(scores))
print('Median score for accepted scores:', median(scores))

Data Generation Progress: 100%|█████████▉| 274654/275000 [01:11<00:00, 3634.32it/s]
Training data has 10252 episodes (Loss = 96.272 %)
Average accepted score: 61.47639484978541
Median score for accepted scores: 58.0


In [5]:
#Making the Model
model = tf.keras.Sequential([
    keras.layers.InputLayer(input_shape=(4,1), name = "input_layer"),

    keras.layers.Dense(128, activation='relu', name="dense_1"),
    keras.layers.Dropout(0.8),

    keras.layers.Dense(256, activation='relu', name="dense_2"),
    keras.layers.Dropout(0.8),

    keras.layers.Dense(512, activation='relu', name="dense_3"),
    keras.layers.Dropout(0.8),

    keras.layers.Dense(256, activation='relu', name="dense_4"),
    keras.layers.Dropout(0.8),

    keras.layers.Dense(128, activation='relu', name="dense_5"),
    keras.layers.Dropout(0.8),

    keras.layers.Dense(2, activation='softmax', name="output"),
])
model.compile(optimizer="adam", loss ="categorical_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4, 128)            256       
_________________________________________________________________
dropout (Dropout)            (None, 4, 128)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 4, 256)            33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 4, 256)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 4, 512)            131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 4, 512)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 4, 256)            1

In [8]:
#Training Model

#Splitting training_data in x and y
#converts states from [cart_pos, cart_vel, pole_pos, pole,vel] to [[cart_pos], [cart_vel], [pole_pos], [pole,vel]]
train_states = np.array([i[0] for i in training_data]).reshape(-1,len(training_data[0][0]),1)
train_moves = np.array([i[1] for i in training_data])

model.fit(train_states, train_moves, epochs=5)

ValueError: cannot reshape array of size 41008 into shape (4,1)

In [None]:
#Learning
EPISODES = 150000
MAX_STEPS = 1000
EPSILON = 0.9
from tqdm import tqdm
progress = tqdm(total=EPISODES, position=0, leave=False)
progress.set_description("Progress")

#Q-Table
ALPHA = 0.81 #Represents the Learning Rate. This determines how much the agent will explore. High LR means more exploration
GAMMA = 0.96 #Represents the Discount Factor. This determines how much the agent values the future reward. High DF means future rewards are more heavily considered

rewards = [] #Log of Rewards per Episode

#Training
for e in range(EPISODES):
    progress.update(1)
    
    state = env.reset() #Resets Environment

    for s in range(MAX_STEPS):
        
        #env.render() #Renders Environment. CAUTION: Rendering takes more time to train
        
        #Picks Action. Chooses a random action EPSILON% of the time. Otherwise chooses the max reward option
        if(np.random.uniform(0,1) < EPSILON):
            action = env.action_space.sample() #Takes a random action, samples from env.action_space which is a list of possible actions {0: Left, 1: Down, 2: Right, 3: Up}
        else: 
            action = np.argmax(Q[state, : ])
        new_state, reward, done, _ = env.step(action) #Takes the action

        #Updating Q-Table
        Q[state, action] = getQ(state, action)

        #changes states
        state = new_state

        #Handles if game finished
        if done:
            rewards.append(reward) #Adds 
            EPSILON -= 1/EPISODES #Steps down the random action rate to prioritize rewards over exploration
            break

In [None]:
#Plotting the learning
def get_average(values):
  return sum(values)/len(values)

avg_rewards = []
for i in range(0, len(rewards), 100):
  avg_rewards.append(get_average(rewards[i:i+100])) 

plt.plot(avg_rewards)
plt.ylabel('average reward')
plt.xlabel('episodes (100\'s)')
plt.show()

In [None]:
state = env.reset() #Resets Environment

for s in range(MAX_STEPS):
    
    env.render() #Renders Environment. CAUTION: Rendering takes more time to train
    
    #Picks Action based on max reward
    action = np.argmax(Q[state, : ])
    new_state, reward, done, _ = env.step(action) #Takes the action

    #changes states
    state = new_state

    #Handles if game finished
    if done:
        break
print("It took", s, "steps to finish")
env.render()