In [0]:
import gym
import random
import numpy as np
import tflearn
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from statistics import median, mean
from collections import Counter

LR = 1e-3
env = gym.make("CartPole-v0")
env.reset()
goal_steps = 500
score_requirement = 50
initial_games = 50000

def some_random_games_first():
    # Each of these is its own game.
    for episode in range(5):
        env.reset()
        # this is each frame, up to 200...but we wont make it that far.
        for t in range(200):
            # This will display the environment
            # Only display if you really want to see it.
            # Takes much longer to display it.
            
            
            # This will just create a sample action in any environment.
            # In this environment, the action can be 0 or 1, which is left or right
            action = env.action_space.sample()
            
            # this executes the environment with an action, 
            # and returns the observation of the environment, 
            # the reward, if the env is over, and other info.
            observation, reward, done, info = env.step(action)
            if done:
              break
            return done
def initial_population():
    # [OBS, MOVES]
    training_data = []
    # all scores:
    scores = []
    # just the scores that met our threshold:
    accepted_scores = []
    # iterate through however many games we want:
    for _ in range(initial_games):
        score = 0
        # moves specifically from this environment:
        game_memory = []
        # previous observation that we saw
        prev_observation = []
        # for each frame in 200
        for _ in range(goal_steps):
            # choose random action (0 or 1)
            action = random.randrange(0,2)
            # do it!
            observation, reward, done, info = env.step(action)
            
            # notice that the observation is returned FROM the action
            # so we'll store the previous observation here, pairing
            # the prev observation to the action we'll take.
            if len(prev_observation) > 0 :
                game_memory.append([prev_observation, action])
            prev_observation = observation
            score+=reward
            if done: break

        # IF our score is higher than our threshold, we'd like to save
        # every move we made
        # NOTE the reinforcement methodology here. 
        # all we're doing is reinforcing the score, we're not trying 
        # to influence the machine in any way as to HOW that score is 
        # reached.
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                # convert to one-hot (this is the output layer for our neural network)
                if data[1] == 1:
                    output = [0,1]
                elif data[1] == 0:
                    output = [1,0]
                    
                # saving our training data
                training_data.append([data[0], output])

        # reset env to play again
        env.reset()
        # save overall scores
        scores.append(score)
    
    # just in case you wanted to reference later
    training_data_save = np.array(training_data)
    np.save('saved.npy',training_data_save)
    
    # some stats here, to further illustrate the neural network magic!
    print('Average accepted score:',mean(accepted_scores))
    print('Median score for accepted scores:',median(accepted_scores))
    print(Counter(accepted_scores))
    
    return training_data

  
 
           

(21972, 4)

In [0]:
def neural_network_model(input_size):
    model = Sequential()
    model.add(Dense(4,activation = 'relu',input_shape = (4,)))
    model.add(Dense(16,activation = 'relu'))
    model.add(Dense(4,activation = 'relu'))
    model.add(Dense(2,activation = 'softmax'))
    
    model.compile(optimizer = 'adam',loss = 'categorical_crossentropy')

    return model

In [0]:
def train_model(training_data, model=False):
    Xt = np.array([i[0] for i in training_data]).reshape(-1,len(training_data[0][0]))
    yt = np.array([i[1] for i in training_data])

    if not model:
        model = neural_network_model(input_size = (len(Xt[1])))
    
    model.fit(Xt, yt, epochs=10, batch_size=50)
    return model

In [0]:
training_data = initial_population()


Average accepted score: 61.45498915401301
Median score for accepted scores: 58.0
Counter({51.0: 139, 50.0: 137, 52.0: 130, 56.0: 108, 54.0: 106, 53.0: 98, 55.0: 97, 58.0: 89, 57.0: 80, 59.0: 79, 60.0: 66, 61.0: 62, 62.0: 51, 66.0: 42, 65.0: 41, 63.0: 37, 64.0: 35, 68.0: 34, 67.0: 34, 70.0: 30, 72.0: 29, 69.0: 28, 71.0: 25, 76.0: 22, 74.0: 18, 77.0: 16, 75.0: 16, 78.0: 16, 73.0: 14, 84.0: 14, 81.0: 12, 83.0: 10, 79.0: 10, 88.0: 9, 92.0: 9, 82.0: 9, 85.0: 9, 80.0: 8, 89.0: 8, 86.0: 7, 90.0: 7, 87.0: 7, 95.0: 5, 94.0: 4, 96.0: 4, 100.0: 3, 97.0: 3, 91.0: 3, 110.0: 2, 111.0: 2, 104.0: 2, 98.0: 2, 101.0: 1, 108.0: 1, 99.0: 1, 138.0: 1, 102.0: 1, 129.0: 1, 112.0: 1, 116.0: 1, 117.0: 1, 103.0: 1, 133.0: 1, 115.0: 1, 93.0: 1, 107.0: 1, 105.0: 1, 130.0: 1})


In [0]:
model = train_model(training_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
action = np.argmax(model.predict(prev_obs.reshape(-1,len(prev_obs)))[0])
print(action)

In [0]:
scores = []
choices = []
for each_game in range(10):
    score = 0
    game_memory = []
    prev_obs = []
    env.reset()
    for _ in range(goal_steps):
      
      env.render()
      if len(prev_obs)==0:
            action = random.randrange(0,2)
      else:
        
        action = np.argmax(model.predict(prev_obs.reshape(-1,len(prev_obs)))[0])

        choices.append(action)
                
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score+=reward
        if done: break

    scores.append(score)

print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))
print(score_requirement)
show_video()

NameError: ignored

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

--2019-08-23 21:30:04--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 3.217.182.206, 35.173.3.255, 52.200.233.201, ...
Connecting to bin.equinox.io (bin.equinox.io)|3.217.182.206|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13607069 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2019-08-23 21:30:05 (19.0 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13607069/13607069]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [0]:
LOG_DIR = './log'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

In [0]:
get_ipython().system_raw('./ngrok http 6006 &') 

In [0]:
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"


https://27e9c897.ngrok.io


In [0]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import math
import glob
import io
import base64
from IPython.display import HTML

In [0]:
#eg screen resolution 1400x900

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

W0823 21:41:42.208195 139998089205632 abstractdisplay.py:151] xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

In [0]:
def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [0]:
scores = []
choices = []
for each_game in range(10):
    score = 0
    game_memory = []
    prev_obs = []
    env.reset()
    for _ in range(goal_steps):
      
      env.render()
      if len(prev_obs)==0:
            action = random.randrange(0,2)
      else:
        
        action = np.argmax(model.predict(prev_obs.reshape(-1,len(prev_obs)))[0])

        choices.append(action)
                
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score+=reward
        if done: break

    scores.append(score)

print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))
print(score_requirement)

env = wrap_env(gym.make("CartPole-v0"))
show_video()


NameError: ignored