# CartPole Balancing using Simple Neural Network

In [1]:
import gym
env = gym.make('CartPole-v1')
env.reset()
for step_index in range(1000):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print("Step {}:".format(step_index))
    print("action: {}".format(action))
    print("observation: {}".format(observation))
    print("reward: {}".format(reward))
    print("done: {}".format(done))
    print("info: {}".format(info))
    if done:
        break

Step 0:
action: 0
observation: [-0.03299965 -0.21300666  0.04897273  0.34003126]
reward: 1.0
done: False
info: {}
Step 1:
action: 0
observation: [-0.03725979 -0.40878998  0.05577336  0.64774637]
reward: 1.0
done: False
info: {}
Step 2:
action: 1
observation: [-0.04543559 -0.21448761  0.06872828  0.37313486]
reward: 1.0
done: False
info: {}
Step 3:
action: 1
observation: [-0.04972534 -0.02040586  0.07619098  0.10288976]
reward: 1.0
done: False
info: {}
Step 4:
action: 0
observation: [-0.05013346 -0.21653235  0.07824878  0.41860449]
reward: 1.0
done: False
info: {}
Step 5:
action: 0
observation: [-0.0544641  -0.41267087  0.08662087  0.7348937 ]
reward: 1.0
done: False
info: {}
Step 6:
action: 1
observation: [-0.06271752 -0.21884555  0.10131874  0.47068046]
reward: 1.0
done: False
info: {}
Step 7:
action: 1
observation: [-0.06709443 -0.02528976  0.11073235  0.21157249]
reward: 1.0
done: False
info: {}
Step 8:
action: 1
observation: [-0.06760023  0.16808911  0.1149638  -0.04423046]
reward:

In [2]:
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

Using TensorFlow backend.


In [3]:
env = gym.make('CartPole-v1')
env.reset()
goal_steps = 500
score_requirement = 60
initial_games = 100000

In [4]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(initial_games):
        score = 0
        game_memory = []
        previous_observation = []
        for step_index in range(goal_steps):
            action = random.randrange(0, 2)
            observation, reward, done, info = env.step(action)
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action])
                
            previous_observation = observation
            score += reward
            if done:
                break
            
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 1:
                    output = [0, 1]
                elif data[1] == 0:
                    output = [1, 0]
                training_data.append([data[0], output])
        
        env.reset()

    print(accepted_scores)
    
    return training_data

In [5]:
training_data = model_data_preparation()

[73.0, 61.0, 66.0, 60.0, 72.0, 63.0, 60.0, 68.0, 66.0, 85.0, 88.0, 67.0, 69.0, 60.0, 60.0, 64.0, 85.0, 64.0, 65.0, 84.0, 69.0, 100.0, 74.0, 67.0, 76.0, 79.0, 62.0, 67.0, 60.0, 70.0, 69.0, 60.0, 61.0, 79.0, 60.0, 69.0, 104.0, 89.0, 61.0, 73.0, 82.0, 63.0, 62.0, 72.0, 61.0, 93.0, 92.0, 81.0, 74.0, 63.0, 81.0, 62.0, 64.0, 67.0, 72.0, 63.0, 69.0, 60.0, 65.0, 61.0, 82.0, 65.0, 62.0, 74.0, 63.0, 71.0, 71.0, 67.0, 62.0, 63.0, 68.0, 106.0, 92.0, 70.0, 89.0, 71.0, 101.0, 68.0, 62.0, 66.0, 74.0, 67.0, 73.0, 81.0, 60.0, 65.0, 80.0, 66.0, 67.0, 76.0, 72.0, 64.0, 88.0, 66.0, 62.0, 64.0, 76.0, 77.0, 67.0, 76.0, 88.0, 67.0, 76.0, 63.0, 73.0, 67.0, 74.0, 64.0, 94.0, 83.0, 73.0, 64.0, 60.0, 85.0, 64.0, 67.0, 63.0, 67.0, 75.0, 64.0, 86.0, 62.0, 61.0, 69.0, 64.0, 61.0, 61.0, 73.0, 64.0, 65.0, 61.0, 66.0, 111.0, 69.0, 60.0, 65.0, 66.0, 64.0, 63.0, 60.0, 83.0, 74.0, 61.0, 69.0, 68.0, 68.0, 61.0, 77.0, 76.0, 66.0, 68.0, 68.0, 64.0, 60.0, 87.0, 86.0, 61.0, 64.0, 89.0, 84.0, 63.0, 73.0, 73.0, 64.0, 68.0, 67.0

In [6]:
def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(128, input_dim=input_size, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(output_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())

    return model

In [7]:
def train_model(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    print(X)
    print(y)
    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    
    model.fit(X, y, epochs=10)
    return model

In [8]:
trained_model = train_model(training_data)

[[-0.03234883  0.2155806  -0.04692269 -0.27976511]
 [-0.02803722  0.02115832 -0.05251799 -0.00224289]
 [-0.02761405  0.21699258 -0.05256285 -0.31102247]
 ...
 [ 0.66598387  1.49340807 -0.14695212 -1.47828072]
 [ 0.69585203  1.3003541  -0.17651774 -1.23486947]
 [ 0.72185911  1.10788444 -0.20121512 -1.00227825]]
[[1 0]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [0 1]]
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
scores = []
choices = []
for each_game in range(100):
    score = 0
    prev_obs = []
    for step_index in range(goal_steps):
        env.render()
        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
        
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        score+=reward
        if done:
            break

    env.reset()
    scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))

[159.0, 142.0, 132.0, 500.0, 137.0, 453.0, 500.0, 135.0, 111.0, 500.0, 500.0, 500.0, 484.0, 500.0, 500.0, 500.0, 326.0, 500.0, 269.0, 156.0, 500.0, 315.0, 500.0, 261.0, 500.0, 123.0, 500.0, 500.0, 110.0, 135.0, 500.0, 500.0, 118.0, 107.0, 500.0, 103.0, 148.0, 242.0, 500.0, 115.0, 500.0, 500.0, 139.0, 87.0, 138.0, 401.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 125.0, 135.0, 500.0, 91.0, 500.0, 500.0, 500.0, 500.0, 500.0, 127.0, 87.0, 500.0, 155.0, 500.0, 500.0, 500.0, 279.0, 89.0, 128.0, 145.0, 120.0, 500.0, 113.0, 112.0, 446.0, 500.0, 500.0, 500.0, 428.0, 152.0, 89.0, 107.0, 137.0, 148.0, 138.0, 500.0, 500.0, 117.0, 500.0, 160.0, 87.0, 500.0, 500.0, 500.0, 91.0, 466.0, 168.0]
Average Score: 329.86
choice 1:0.5033347480749408  choice 0:0.4966652519250591
