# Open AI Atari Games. Reinforcement Learning with PyTorch, deep Learning
## By Nasrudin Bin Salim
### Requirements: Python 2.7. Linux Environment/UNIX Environment
    Please Install Pytorch
    OpenAI Gym
    Open AI Universe
    cv2

### Imports

In [1]:
from __future__ import print_function, division
import numpy as np

import json
import logging
from cv2 import resize
from skimage.color import rgb2gray
import os
os.environ["OMP_NUM_THREADS"] = "1" #should be set to 1 to prevent conflicts
import time



from torch.autograd import Variable



### Import OpenAI Universe environment and gym
### Import Pytorch for model

In [2]:
import gym
from universe import vectorized
from universe.wrappers import Unvectorize, Vectorize

from gym.spaces.box import Box
from gym.configuration import undo_logger_setup

import torch
from torch.multiprocessing import Process
import torch.nn.functional as F


import torch.optim as optim



# Helper Functions Section

### Logging function

In [3]:
def setup_logger(logger_name, log_file, level=logging.INFO):
    """Configure a named logger that writes to a file and to the console.

    Handlers left on the logger by an earlier call are detached and closed
    before the new ones are attached, so calling this again for the same
    ``logger_name`` (e.g. by re-running a notebook cell) neither duplicates
    every message nor leaks the previous log-file handle.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_file: Path of the log file. It is opened with ``mode='w'``, so an
            existing file is truncated.
        level: Threshold set on the logger (default ``logging.INFO``).
    """
    logger = logging.getLogger(logger_name)

    formatter = logging.Formatter('%(asctime)s : %(message)s')

    # Build the new handlers first: if the log file cannot be opened the
    # logger keeps whatever handlers it already had.
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)

    # Iterate over a copy because removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.setLevel(level)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

### Read Json Object

In [4]:
def read_config(file_path):
    """Parse the JSON file at ``file_path`` and return the decoded object."""
    # The context manager closes the file as soon as its text has been read.
    with open(file_path, 'r') as config_file:
        return json.loads(config_file.read())

### Share grads between 2 models
#### More on this later

In [5]:
def ensure_shared_grads(model, shared_model):
    """Point the shared model's gradients at the local model's gradients.

    Parameters are paired in iteration order. As soon as a shared parameter
    that already has a gradient is met, the function returns and leaves that
    parameter and all following ones untouched.
    """
    paired_params = zip(model.parameters(), shared_model.parameters())
    for local_param, global_param in paired_params:
        if global_param.grad is None:
            global_param._grad = local_param.grad
        else:
            return

# Environment, setting up the openAI and Universe 

### Create the atari environment function

In [6]:
def atari_env(env_id, env_conf):
    """Create the gym environment ``env_id`` and wrap it for frame input.

    Environments whose observation space has a shape of more than one
    dimension are wrapped with AtariRescale (using ``env_conf``) and
    NormalizedEnv between a Vectorize/Unvectorize pair. Any other
    environment is returned exactly as ``gym.make`` produced it.
    """
    env = gym.make(env_id)
    if len(env.observation_space.shape) <= 1:
        return env

    vectorized_env = Vectorize(env)
    rescaled_env = AtariRescale(vectorized_env, env_conf)
    normalized_env = NormalizedEnv(rescaled_env)
    return Unvectorize(normalized_env)

### Create a frame for environment

In [7]:
def _process_frame(frame, conf):
    """Crop, grayscale and resize one frame to an array of shape [1, 80, 80].

    ``conf`` supplies "crop1" (first kept row), "crop2" (160 is added to it
    to give the row where the crop stops) and "dimension2" (second component
    of the intermediate resize target). Only the first 160 columns are kept.
    """
    first_row = conf["crop1"]
    stop_row = conf["crop2"] + 160
    cropped = frame[first_row:stop_row, :160]

    gray = rgb2gray(cropped)
    # Two-stage resize: first to (80, dimension2), then to the final 80x80.
    intermediate = resize(gray, (80, conf["dimension2"]))
    square = resize(intermediate, (80, 80))
    return np.reshape(square, [1, 80, 80])

### Atari rescale class

In [8]:
class AtariRescale(vectorized.ObservationWrapper):
    """Observation wrapper that runs every frame through ``_process_frame``.

    The advertised observation space is ``Box(0.0, 1.0, [1, 80, 80])``;
    ``env_conf`` is stored and handed to ``_process_frame`` for each frame.
    """

    def __init__(self, env, env_conf):
        super(AtariRescale, self).__init__(env)
        self.observation_space = Box(0.0, 1.0, [1, 80, 80])
        self.conf = env_conf

    def _observation(self, observation_n):
        """Return the list of processed frames, one per entry of ``observation_n``."""
        processed = []
        for frame in observation_n:
            processed.append(_process_frame(frame, self.conf))
        return processed

### Normalized environment class, where we can move from one state and observation to another

In [9]:
class NormalizedEnv(vectorized.ObservationWrapper):
    """Observation wrapper that standardises frames with running statistics.

    It keeps exponentially weighted running averages (decay ``alpha``) of
    each observation's mean and standard deviation. Both are bias-corrected
    by ``1 - alpha ** num_steps`` before being used to normalise.
    """

    def __init__(self, env=None):
        super(NormalizedEnv, self).__init__(env)
        self.state_mean = 0
        self.state_std = 0
        self.alpha = 0.9999
        self.num_steps = 0

    def _observation(self, observation_n):
        """Update the running statistics, then return the normalised frames."""
        decay = self.alpha
        fresh_weight = 1 - decay
        for frame in observation_n:
            self.num_steps += 1
            self.state_mean = self.state_mean * decay + \
                frame.mean() * fresh_weight
            self.state_std = self.state_std * decay + \
                frame.std() * fresh_weight

        # Both averages start at 0, so divide out the startup bias.
        correction = 1 - pow(decay, self.num_steps)
        unbiased_mean = self.state_mean / correction
        unbiased_std = self.state_std / correction

        normalized = []
        for frame in observation_n:
            # The 1e-8 keeps the division finite when the std estimate is 0.
            normalized.append((frame - unbiased_mean) / (unbiased_std + 1e-8))
        return normalized

# Model

Using Google DeepMind's Idea. 

    Research Paper: https://arxiv.org/pdf/1602.01783.pdf
    Asynchronous Advantage Actor-Critic (A3C)


The A3C algorithm was released by Google’s DeepMind group earlier this year, and it made a splash by… essentially obsoleting DQN. It was faster, simpler, more robust, and able to achieve much better scores on the standard battery of Deep RL tasks. On top of all that it could work in continuous as well as discrete action spaces. Given this, it has become the go-to Deep RL algorithm for new challenging problems with complex state and action spaces


    
<a href= "https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2" >Medium Article explaining A3c reinforcement learning </a>

## The Actor-Critic Structure
<img src = "img/A3CStructure.png">

## Many workers train and learn concurrently, then update the global network with their gradients
### Process Flow
<img src = "img/A3CProcessFlow.png">
    
### Long Short Term Memory Recurrent Neural Nets
    
## Implementing LSTM and A3C with Pytorch

Created as a module and then imported

In [10]:
from A3CModel import A3Clstm

# The player Agent
## (Reinforcement Learning agent to interact with the env)

In [11]:
class Agent(object):
    """One player: a model, an environment and the buffers of a rollout.

    ``action_train`` samples actions and records values, log-probabilities,
    rewards and entropies for the loss; ``action_test`` picks the most
    probable action and records nothing. Both keep the recurrent state
    (``hx``, ``cx``) between calls and reset it to zeros after ``done``.
    Every method returns ``self``.
    """

    def __init__(self, model, env, args, state):
        # Collaborators and configuration.
        self.model = model
        self.env = env
        self.args = args
        self.state = state

        # Recurrent state; created on the first action because done is True.
        self.hx = None
        self.cx = None

        # Episode bookkeeping.
        self.current_life = 0
        self.eps_len = 0
        self.done = True
        self.info = None
        self.reward = 0

        # Per-step records collected by action_train.
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []

    def action_train(self):
        """Sample an action, step the environment and record the step."""
        if self.done:
            cell, hidden = torch.zeros(1, 512), torch.zeros(1, 512)
        else:
            # Re-wrap the raw data so the graph of earlier steps is dropped.
            cell, hidden = self.cx.data, self.hx.data
        self.cx = Variable(cell)
        self.hx = Variable(hidden)

        observation = Variable(self.state.unsqueeze(0))
        value, logit, (self.hx, self.cx) = self.model(
            (observation, (self.hx, self.cx)))

        prob = F.softmax(logit)
        log_prob = F.log_softmax(logit)
        self.entropies.append(-(log_prob * prob).sum(1))

        action = prob.multinomial().data
        chosen_log_prob = log_prob.gather(1, Variable(action))

        next_state, self.reward, self.done, self.info = self.env.step(
            action.numpy())
        self.state = torch.from_numpy(next_state).float()
        self.eps_len += 1
        # An episode also ends once it reaches the configured length args['M'].
        self.done = self.done or self.eps_len >= self.args['M']
        # Clip the reward into [-1, 1].
        self.reward = max(min(self.reward, 1), -1)

        self.values.append(value)
        self.log_probs.append(chosen_log_prob)
        self.rewards.append(self.reward)
        return self

    def action_test(self):
        """Take the most probable action and step the environment."""
        if self.done:
            cell, hidden = torch.zeros(1, 512), torch.zeros(1, 512)
        else:
            cell, hidden = self.cx.data, self.hx.data
        self.cx = Variable(cell, volatile=True)
        self.hx = Variable(hidden, volatile=True)

        observation = Variable(self.state.unsqueeze(0), volatile=True)
        value, logit, (self.hx, self.cx) = self.model(
            (observation, (self.hx, self.cx)))

        prob = F.softmax(logit)
        best_action = prob.max(1)[1].data.numpy()

        next_state, self.reward, self.done, self.info = self.env.step(
            best_action[0])
        self.state = torch.from_numpy(next_state).float()
        self.eps_len += 1
        self.done = self.done or self.eps_len >= self.args['M']
        return self

    def check_state(self):
        """Mark the episode done when the 'ale.lives' count has dropped."""
        lives = self.info['ale.lives']
        if self.current_life > lives:
            self.done = True
        self.current_life = lives
        return self

    def clear_actions(self):
        """Replace the four rollout buffers with fresh empty lists."""
        self.values, self.log_probs = [], []
        self.rewards, self.entropies = [], []
        return self


# Shared Memory and optimization algorithms
## As Part of the A3C Network, multiple workers will be working together to update a global network

## RMSprop

RMSprop is an unpublished, adaptive learning rate method proposed by Geoff Hinton in Lecture 6e of his Coursera Class.

RMSprop and Adadelta have both been developed independently around the same time stemming from the need to resolve Adagrad's radically diminishing learning rates. RMSprop in fact is identical to the first update vector of Adadelta 

RMSprop as well divides the learning rate by an exponentially decaying average of squared gradients. Hinton suggests γ
to be set to 0.9, while a good default value for the learning rate η is 0.001.

In [12]:
from SharedOptimizers import SharedRMSprop

## Adaptive Moment Estimation (Adam) 
is another method that computes adaptive learning rates for each parameter. In addition to storing an exponentially decaying average of past squared gradients $v_t$ like Adadelta and RMSprop, Adam also keeps an exponentially decaying average of past gradients $m_t$, similar to momentum:

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2$$

Adam (short for Adaptive Moment Estimation) is an update to the RMSProp optimizer. In this optimization algorithm, running averages of both the gradients and the second moments of the gradients are used.

In [13]:
from SharedOptimizers import SharedAdam

## Adam but only with shared Lr

In [14]:
from SharedOptimizers import SharedLrSchedAdam

# Functions to run the model on the environment

# Test
## Function to test the model on a game/environment

In [15]:
def test(args, shared_model, env_conf, render=False):
    """Evaluate the shared model in an endless loop and log every episode.

    The loop never exits; stop it by interrupting the process. At the start
    of each episode the player's model is reloaded from ``shared_model``.
    After each episode the reward, length and running mean are logged, the
    weights are saved to ``'{SMD}{ENV}.dat'`` when the episode reward exceeds
    ``args['SSL']``, and the function sleeps for 60 seconds.

    Args:
        args: Settings dict; reads ENV, LG, seed, SSL, SMD (and M via Agent).
        shared_model: Model whose ``state_dict`` is copied into the player.
        env_conf: Frame-processing settings passed to ``atari_env``.
        render: When true, ``env.render()`` is called before every action.
    """
    log_name = '{}_log'.format(args['ENV'])
    setup_logger(log_name, r'{0}{1}_log'.format(args['LG'], args['ENV']))
    logger = logging.getLogger(log_name)
    # Record the full configuration at the top of the log.
    for key in args.keys():
        logger.info('{0}: {1}'.format(key, args[key]))

    torch.manual_seed(args['seed'])
    env = atari_env(args['ENV'], env_conf)
    start_time = time.time()
    reward_sum = 0
    reward_total_sum = 0
    num_tests = 0

    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = torch.from_numpy(player.env.reset()).float()
    player.model.eval()

    while True:
        if player.done:
            # New episode: pick up the latest shared weights.
            player.model.load_state_dict(shared_model.state_dict())
        if render:
            env.render()
        player.action_test()
        reward_sum += player.reward

        if not player.done:
            continue

        # Episode finished: update the statistics and report.
        num_tests += 1
        player.current_life = 0
        reward_total_sum += reward_sum
        reward_mean = reward_total_sum / num_tests
        elapsed = time.strftime(
            "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
        logger.info(
            "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
            format(elapsed, reward_sum, player.eps_len, reward_mean))

        if reward_sum > args['SSL']:
            # The shared weights are reloaded before saving, so the file
            # holds the shared model's state at this moment.
            player.model.load_state_dict(shared_model.state_dict())
            checkpoint_path = '{0}{1}.dat'.format(args['SMD'], args['ENV'])
            torch.save(player.model.state_dict(), checkpoint_path)

        reward_sum = 0
        player.eps_len = 0
        state = player.env.reset()
        time.sleep(60)
        player.state = torch.from_numpy(state).float()


# Train
## Function to train the model with an optimizer algorithm on an environment

In [16]:
def train(rank, args, shared_model, optimizer, env_conf):
    """Run one A3C training worker in an endless loop.

    Each iteration copies the shared weights into a local model, collects up
    to ``args['NS']`` steps with it, builds the policy and value losses from
    that rollout, and applies the resulting gradients to ``shared_model``
    through ``optimizer``. The loop never exits; stop the process to end it.

    Args:
        rank: Worker index; added to ``args['seed']`` for the torch and
            environment seeds.
        args: Settings dict; reads seed, ENV, OPT, LR, NS, CL, G, T
            (and M via Agent).
        shared_model: Model shared between workers; receives the gradients.
        optimizer: Optimizer over ``shared_model``'s parameters, or None to
            create a per-worker RMSprop or Adam according to ``args['OPT']``.
        env_conf: Frame-processing settings passed to ``atari_env``.

    Raises:
        ValueError: If ``optimizer`` is None and ``args['OPT']`` is neither
            'RMSprop' nor 'Adam'. Previously the optimizer stayed None and
            the worker only failed after its first rollout.
    """
    torch.manual_seed(args['seed'] + rank)
    env = atari_env(args['ENV'], env_conf)
    if optimizer is None:
        if args['OPT'] == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args['LR'])
        elif args['OPT'] == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args['LR'])
        else:
            raise ValueError(
                "args['OPT'] must be 'RMSprop' or 'Adam' when no shared "
                "optimizer is given, got {!r}".format(args['OPT']))

    env.seed(args['seed'] + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        # Start every rollout from the current shared weights.
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args['NS']):
            player.action_train()
            if args['CL']:
                # Optionally treat the loss of a life as the end of an episode.
                player.check_state()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        # Return beyond the rollout: zero after an episode end, otherwise
        # the model's value estimate for the current state.
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        # values gets one extra entry so values[i + 1] exists for the last step.
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        # Walk the rollout backwards to accumulate discounted quantities.
        for i in reversed(range(len(player.rewards))):
            R = args['G'] * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args['G'] * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args['G'] * args['T'] + delta_t

            # The 0.01 * entropy term is subtracted from the loss.
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        # Gradients were computed on the local model; hand them to the
        # shared model before stepping its optimizer.
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()


# Putting it all together

## List of Games, pick one here and then edit the environment accordingly
    Choose an Atari game and it has to be a 4D Tensor Game 
    Or if you don't know what that means, just guess and check

In [17]:
# List every environment spec registered with gym; the notebook displays the
# returned collection (truncated output follows).
gym.envs.registry.all()

[EnvSpec(flashgames.UrbanMicroRacers-v0),
 EnvSpec(flashgames.PlopPlopLite-v0),
 EnvSpec(DoubleDunk-ramDeterministic-v4),
 EnvSpec(flashgames.Sieger2LevelPack-v0),
 EnvSpec(DoubleDunk-ramDeterministic-v0),
 EnvSpec(gym-core.Krull-v0),
 EnvSpec(gym-core.Krull-v3),
 EnvSpec(Pooyan-ram-v4),
 EnvSpec(Pooyan-ram-v0),
 EnvSpec(flashgames.GonAndMon-v0),
 EnvSpec(flashgames.NeonRaceLvl4-v0),
 EnvSpec(flashgames.Hash-v0),
 EnvSpec(gym-core.JourneyEscapeSlow-v3),
 EnvSpec(gym-core.JourneyEscapeSlow-v0),
 EnvSpec(flashgames.FlashBombs-v0),
 EnvSpec(gym-core.JamesbondDeterministicSlow-v0),
 EnvSpec(VentureNoFrameskip-v0),
 EnvSpec(Centipede-v0),
 EnvSpec(Centipede-v4),
 EnvSpec(flashgames.Crumbs2-v0),
 EnvSpec(flashgames.CosmoGravity2-v0),
 EnvSpec(gym-core.Zaxxon30FPS-v0),
 EnvSpec(gym-core.Zaxxon30FPS-v3),
 EnvSpec(flashgames.ZombiesAndDonuts-v0),
 EnvSpec(Frostbite-ramNoFrameskip-v0),
 EnvSpec(Frostbite-ramNoFrameskip-v4),
 EnvSpec(IceHockey-ramNoFrameskip-v4),
 EnvSpec(flashgames.SpacePunkRace

### Function to load arguments into play

In [18]:
def loadarguments():
    """Build the environment, shared model and optimizer from global ``args``.

    Reads the module-level ``args`` dict (keys seed, EC, config, ENV, L, LMD,
    SO, OPT, LR) and publishes its results as module-level globals:
    ``setup_json``, ``env_conf``, ``env``, ``shared_model``, ``optimizer``
    and, when ``args['L']`` is true, ``saved_state``.

    Raises:
        ValueError: If ``args['SO']`` is true and ``args['OPT']`` is not one
            of 'RMSprop', 'Adam' or 'LrSchedAdam'. Previously ``optimizer``
            was left unassigned in that case, which raised NameError or
            silently reused an optimizer left over from an earlier call.
    """
    global env_conf
    global env
    global setup_json
    global shared_model
    global saved_state
    global optimizer

    undo_logger_setup()

    torch.set_default_tensor_type('torch.FloatTensor')
    torch.manual_seed(args['seed'])

    setup_json = read_config(args['EC'])

    # Start from the explicitly requested configuration ...
    env_conf = setup_json[args['config']]

    # ... but any configuration whose key is a substring of the environment
    # id overrides it (the last matching key in iteration order wins).
    for config_name in setup_json.keys():
        if config_name in args['ENV']:
            env_conf = setup_json[config_name]
    env = atari_env(args['ENV'], env_conf)

    shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
    if args['L']:
        # Resume from the weights saved for this environment.
        saved_state = torch.load(
            '{0}{1}.dat'.format(args['LMD'], args['ENV']))
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args['SO']:
        shared_optimizers = {
            'RMSprop': SharedRMSprop,
            'Adam': SharedAdam,
            'LrSchedAdam': SharedLrSchedAdam,
        }
        if args['OPT'] not in shared_optimizers:
            raise ValueError(
                "args['OPT'] must be one of 'RMSprop', 'Adam' or "
                "'LrSchedAdam', got {!r}".format(args['OPT']))
        optimizer = shared_optimizers[args['OPT']](
            shared_model.parameters(), lr=args['LR'])
        optimizer.share_memory()
    else:
        # train() creates a per-worker optimizer when it is given None.
        optimizer = None


## Input Description

##### Parameter: LR
    Type: float
    Description: Learning Rate
##### Parameter: G
    Type=float,
    Description: discount factor for rewards (default: 0.99)
##### Parameter: T
    Type=float,
    Description: parameter for GAE (default: 1.00)
##### Parameter:seed
    Type: int
    Description: random seed (default: 42)
##### Parameter:W
    Type=int,
    Description: how many training processes to use (default: 5)
##### Parameter: NS
    Type=int,
    Description: number of forward steps in A3C (default: 20)
##### Parameter: M
    Type=int,
    Description: maximum length of an episode (default: 10000)
##### Parameter: ENV
    Description: environment to train on (default: Pong-v0)
##### Parameter: EC
    Description: environment to crop and resize info (default: settings.json)
##### Parameter: SO
    Description: use an optimizer whose statistics are shared across all workers; if False, each worker creates its own optimizer (default: True)
##### Parameter: L
    Description: load a trained model, (default: False)
##### Parameter: SSL
    Type=int,
    Description: reward score test evaluation must get higher than to save model (default:20)
##### Parameter: OPT
    Description: optimizer choice: Adam, LrSchedAdam or RMSprop; LrSchedAdam is only available when SO is True (default: Adam)
##### Parameter: CL
    Description: end of life is end of training episode.(default: False)
##### Parameter: LMD
    Description: folder to load trained models from (default: '/modeldata/')
##### Parameter: SMD
    Description: folder to save trained models (default: '/modeldata/')
##### Parameter: LG
    Description: folder to save log (default: '/log/')

# Running an Environment, Training and simulating(more below)
    1. L (Load) should be False when there is no trained model data for that particular game.
    Once the model has been trained and its data saved, set L to True.
    2. Set SO to True so that learning is accumulated across all workers.
### Interrupt the kernel to stop training or testing.
    
## Note: Important to run all cells above but don't run everything below this
### The cells below are in sections, choose 1 section to run. E.g if you want to train, just run the Training section. or if you want to play pacman, just run the cells in the PacMan Section ( From input to render)
    
### Training Notes

*It is important to limit number of worker threads to number of cpu cores available
More than one thread per cpu core available is detrimental in training speed and effectiveness*

## Training Section

## Input Dictionary

In [19]:
# Settings for the training run. loadarguments() reads this module-level dict;
# train() and test() receive it as their args parameter.
args = {'LR': 0.0001, "G":0.99, "T":1.00,"W":8,"NS":100,"M":10000,"ENV":'MsPacman-v0',
         "EC":'./settings.json',"SO":True,"L":True,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./modeldata/',"SMD":"./modeldata/","LG":'./log/', "seed":42,"config":"Default"
        }

# Populate the env_conf, env, shared_model and optimizer globals from args.
loadarguments()

## Run This to Train
    Also logs it into a file

In [None]:
# Every spawned process is collected here so they can all be joined below.
processes = []

# Start the evaluation process first; test() reloads shared_model's weights
# at the start of each episode and logs the episode rewards.
p = Process(target=test, args=(args, shared_model, env_conf))
p.start()
processes.append(p)

time.sleep(0.1)
# Launch args['W'] training workers; train() adds each worker's rank to the seed.
for rank in range(0, args['W']):
    p = Process(
        target=train, args=(rank, args, shared_model, optimizer, env_conf))
    p.start()
    processes.append(p)
# train() and test() loop forever, so these joins block until interrupted.
for p in processes:
    p.join()

# Playing the Atari Games Section
## The best part

#### If it gives tensor errors, just run the cells again; it usually works the second time. This is because the full shared optimizer data has not been generated yet.
### Loading a model is disabled by default so that you can observe how it learns through iteration. Set L to True if you want to load the trained models and see how well they perform.

## Playing PacMan (10000 episodes)

### Input Parameters

In [20]:
# Settings for playing MsPacman-v0 with the "MsPacman" frame configuration;
# "L": True makes loadarguments() load saved weights from LMD.
args = {'LR': 0.0001, "G":0.99, "T":1.00,"W":8,"NS":20,"M":1000000,"ENV":'MsPacman-v0',
         "EC":'./settings.json',"SO":True,"L":True,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./modeldata/',"SMD":"./modeldata/","LG":'./log/', "seed":42,"config":"MsPacman"
        }


# Rebuild the env_conf, env, shared_model and optimizer globals for this game.
loadarguments()

### Run this to Start

In [21]:
# Play with rendering enabled; test() loops forever, so interrupt the kernel to stop.
test(args, shared_model, env_conf,render=True)

2017-11-29 23:00:51,534 : OPT: Adam
2017-11-29 23:00:51,535 : LG: ./log/
2017-11-29 23:00:51,536 : SMD: ./modeldata/
2017-11-29 23:00:51,537 : ENV: MsPacman-v0
2017-11-29 23:00:51,538 : G: 0.99
2017-11-29 23:00:51,539 : CL: False
2017-11-29 23:00:51,540 : config: MsPacman
2017-11-29 23:00:51,541 : M: 1000000
2017-11-29 23:00:51,542 : L: True
2017-11-29 23:00:51,543 : EC: ./settings.json
2017-11-29 23:00:51,543 : SSL: 20
2017-11-29 23:00:51,544 : seed: 42
2017-11-29 23:00:51,545 : LR: 0.0001
2017-11-29 23:00:51,545 : T: 1.0
2017-11-29 23:00:51,546 : W: 8
2017-11-29 23:00:51,547 : SO: True
2017-11-29 23:00:51,548 : NS: 20
2017-11-29 23:00:51,548 : LMD: ./modeldata/
2017-11-29 23:01:09,454 : Time 00h 00m 17s, episode reward 5040.0, episode length 2059, reward mean 5040.0000


KeyboardInterrupt: 

## Playing BeamRider (4000 episodes)

### Input

In [None]:
# Settings for playing BeamRider-v0 with the "BeamRider" frame configuration;
# "L": True makes loadarguments() load saved weights from LMD.
args = {'LR': 0.0001, "G":0.99, "T":1.00,"W":8,"NS":20,"M":4000,"ENV":'BeamRider-v0',
         "EC":'./settings.json',"SO":True,"L":True,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./modeldata/',"SMD":"./modeldata/","LG":'./log/', "seed":42,"config":"BeamRider"
        }


# Rebuild the env_conf, env, shared_model and optimizer globals for this game.
loadarguments()

### Run this to Start

In [None]:
# Play with rendering enabled; test() loops forever, so interrupt the kernel to stop.
test(args, shared_model, env_conf,render=True)

## Playing Breakout (3000 episodes)

### Input

In [40]:
# Settings for playing Breakout-v0 with the "Breakout" frame configuration.
# NOTE(review): the "S" key is not read by any function shown in this
# notebook — presumably a leftover; confirm before removing.
args = {'LR': 0.0001, "G":0.99, "T":1.00, "S":1,"W":8,"NS":20,"M":3000,"ENV":'Breakout-v0',
         "EC":'./settings.json',"SO":True,"L":True,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./modeldata/',"SMD":"./modeldata/","LG":'./log/', "seed":42,"config":"Breakout"
        }


# Rebuild the env_conf, env, shared_model and optimizer globals for this game.
loadarguments()

While copying the parameter named actor_linear.weight, whose dimensions in the model are torch.Size([4, 512]) and whose dimensions in the checkpoint are torch.Size([6, 512]), ...


RuntimeError: inconsistent tensor size, expected tensor [4 x 512] and src [6 x 512] to have the same number of elements, but got 2048 and 3072 elements respectively at /opt/conda/conda-bld/pytorch_1503966894950/work/torch/lib/TH/generic/THTensorCopy.c:86

### Run this to Start

In [41]:
# Play with rendering enabled; test() loops forever, so interrupt the kernel to stop.
test(args, shared_model, env_conf,render=True)

2017-11-24 14:52:44,553 : OPT: Adam
2017-11-24 14:52:44,555 : LG: ./log/
2017-11-24 14:52:44,556 : SMD: ./modeldata/
2017-11-24 14:52:44,557 : ENV: Breakout-v0
2017-11-24 14:52:44,559 : G: 0.99
2017-11-24 14:52:44,561 : CL: False
2017-11-24 14:52:44,563 : config: Breakout
2017-11-24 14:52:44,565 : M: 3000
2017-11-24 14:52:44,567 : L: True
2017-11-24 14:52:44,570 : EC: ./settings.json
2017-11-24 14:52:44,572 : SSL: 20
2017-11-24 14:52:44,574 : S: 1
2017-11-24 14:52:44,576 : seed: 42
2017-11-24 14:52:44,578 : LR: 0.0001
2017-11-24 14:52:44,580 : T: 1.0
2017-11-24 14:52:44,583 : W: 8
2017-11-24 14:52:44,585 : SO: True
2017-11-24 14:52:44,587 : NS: 20
2017-11-24 14:52:44,588 : LMD: ./modeldata/


KeyboardInterrupt: 

## Playing SpaceInvader (10000 episodes)

### Input

In [22]:
# Settings for playing SpaceInvaders-v0 with the "SpaceInvaders" frame
# configuration.
# NOTE(review): the "S" key is not read by any function shown in this
# notebook — presumably a leftover; confirm before removing.
args = {'LR': 0.001, "G":0.99, "T":1.00, "S":1,"W":8,"NS":20,"M":1000000,"ENV":'SpaceInvaders-v0',
         "EC":'./settings.json',"SO":True,"L":True,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./modeldata/',"SMD":"./modeldata/","LG":'./log/', "seed":42,"config":"SpaceInvaders"
        }


# Rebuild the env_conf, env, shared_model and optimizer globals for this game.
loadarguments()

### Run this to Start

In [23]:
# Play with rendering enabled; test() loops forever, so interrupt the kernel to stop.
test(args, shared_model, env_conf,render=True)

2017-11-24 16:45:17,533 : OPT: Adam
2017-11-24 16:45:17,534 : LG: ./log/
2017-11-24 16:45:17,535 : SMD: ./modeldata/
2017-11-24 16:45:17,536 : ENV: SpaceInvaders-v0
2017-11-24 16:45:17,537 : G: 0.99
2017-11-24 16:45:17,538 : CL: False
2017-11-24 16:45:17,539 : config: SpaceInvaders
2017-11-24 16:45:17,539 : M: 1000000
2017-11-24 16:45:17,541 : L: True
2017-11-24 16:45:17,542 : EC: ./settings.json
2017-11-24 16:45:17,543 : SSL: 20
2017-11-24 16:45:17,544 : S: 1
2017-11-24 16:45:17,545 : seed: 42
2017-11-24 16:45:17,546 : LR: 0.001
2017-11-24 16:45:17,547 : T: 1.0
2017-11-24 16:45:17,547 : W: 8
2017-11-24 16:45:17,548 : SO: True
2017-11-24 16:45:17,549 : NS: 20
2017-11-24 16:45:17,551 : LMD: ./modeldata/
2017-11-24 16:45:49,833 : Time 00h 00m 32s, episode reward 2895.0, episode length 3753, reward mean 2895.0000
2017-11-24 16:47:10,696 : Time 00h 01m 52s, episode reward 2340.0, episode length 2343, reward mean 2617.5000
2017-11-24 16:49:32,367 : Time 00h 04m 14s, episode reward 11355.0, 

Traceback (most recent call last):
  File "/home/nasdin/anaconda3/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/nasdin/anaconda3/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
  File "/home/nasdin/anaconda3/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/home/nasdin/anaconda3/envs/py27/lib/python2.7/inspect.py", line 1049, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/home/nasdin/anaconda3/envs/py27/lib/python2.7/inspect.py", line 1013, in getframeinfo
    lines, lnum = findsource(frame)
  File "/home/nasdin/anaconda3/envs/py27/lib/python2.7/site-packages/IPython/core/ultratb.py", line 170, in findsource

IndexError: string index out of range