# Using DFO to Optimise TensorFlow Neural Networks


### Imports

It's well worth noting your TensorFlow and Gym versions. The APIs change quickly, so if you are having issues with this code, check that you are on the same versions or that your code has been updated accordingly. As it stands I am using the following:
- TensorFlow version: **1.3.0**
- OpenAI Gym version: **0.9.3**

In [None]:
import tensorflow as tf
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import threading
import multiprocessing
import gym
import json
import os
from time import sleep

print('TensorFlow version:', tf.__version__)
print('OpenAI Gym version:', gym.__version__)

### Setup parameters

The network is set up with a rectified linear unit (ReLU) as the hidden activation function. The hidden layer dimensions are specified in the `hidden_sizes` list.  

In [None]:
# Gym environment to optimise against and the number of DFO iterations to run.
env_name = 'CartPole-v0'
number_iterations = 200

# MLP topology: width of the input placeholder, one entry per hidden layer,
# and the number of units in the softmax output layer.
input_size = 4
hidden_sizes = [6, 3]
output_size = 2

# Activation applied after every hidden dense layer.
hidden_non_linearity = tf.nn.relu

# When True, the session is configured with a GPU device count of zero.
cpu_only = True

### Setup TF graph

The ```tf.reset_default_graph()``` method is good for wiping the graph clean to try different dimensionalities. The network is then constructed using a for loop and there are special tensors created at the end of the cell to allow efficient insertion of weights concurrently into the graph.    

In [None]:
# Start from an empty default graph so the cell can be re-run with new sizes.
tf.reset_default_graph()

# Batch of observations: any number of rows, `input_size` columns.
model_input = tf.placeholder(dtype=tf.float32, shape=[None, input_size])

# Stack the hidden dense layers in the order given by `hidden_sizes`.
net = model_input
for layer_width in hidden_sizes:
    net = tf.layers.dense(inputs=net,
                          units=layer_width,
                          activation=hidden_non_linearity)

# Softmax output layer; `model_output` is what the reward function evaluates.
net = tf.layers.dense(inputs=net,
                      units=output_size,
                      activation=tf.nn.softmax)
model_output = net

graph = tf.get_default_graph()

# For every trainable variable create a same-shaped placeholder and an assign
# op, so that new weights can be pushed into the graph through a feed_dict.
# `restore_dict` maps variable name -> placeholder; `restore_ops` holds the
# assign ops in the same variable order.
restore_dict = {}
restore_ops = []
for variable in graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
    base_name = variable.name.split(':')[0]
    feed = tf.placeholder(dtype=tf.float32,
                          shape=variable.get_shape(),
                          name='ph%s' % base_name)
    restore_dict[variable.name] = feed
    restore_ops.append(tf.assign(variable, feed))

### Define multi-threaded DFO class

This class encapsulates the main DFO algorithm. It is used further down when the tensorflow session is created, and the main method worth checking out is the ```run()``` function.

In [None]:
class DFOStrategy(object):
    """Dispersive Flies Optimisation (DFO) over a flat vector of weights.

    A population of "flies" (candidate weight vectors) is scored with a
    user-supplied reward function. Fitness is the negated reward, so lower
    is better. Each iteration every fly is moved relative to its best ring
    neighbour and/or the swarm's best fly, with occasional random restarts.
    """

    def __init__(self,
                 weights,
                 get_reward_func,
                 population_size=100,
                 disturbance_threshold=0.01,
                 name='CartPole-v0',
                 sess=None,
                 meta=None,
                 mode=0):
        """Create the initial population and the evaluation environment.

        The population is drawn from a gaussian distribution with the same
        mean and standard deviation as the passed in weights.

        Args:
            weights (array-like): flat vector of the network's weights.
            get_reward_func (callable): called as
                ``get_reward_func(weights, sess, env, meta)`` and returns
                the reward (bigger is better).
            population_size (int): number of flies.
            disturbance_threshold (float): probability-like threshold below
                which a single weight of a fly is re-sampled.
            name (str): the name of the OpenAI gym environment to run.
            sess (tf.Session): a valid session; required.
            meta (tuple): passed through unchanged to ``get_reward_func``.
            mode (str): update rule, one of 'original', 'hybrid',
                'n_fittest', 'no_leader_with_random', 'no_leader',
                'random_gauss', 'random_uniform'. Any other value
                (including the default ``0``) leaves the flies unchanged.

        Raises:
            ValueError: if ``sess`` is None.
        """
        if sess is None:
            raise ValueError('A valid tf.Session must be passed as `sess`.')

        np.random.seed(0)
        self.weights = weights
        self.meta = meta
        self.get_reward = get_reward_func
        self.population_size = population_size
        # np.random.normal takes (loc, scale): mean first, then deviation.
        dev = np.std(weights)
        mean = np.mean(weights)
        self.population = np.array(
            [np.random.normal(mean, dev, len(weights))
             for _ in range(self.population_size)])
        self.disturbance_threshold = disturbance_threshold
        self.env_name = name
        self.env = gym.make(self.env_name)
        self.sess = sess
        self.swarms_best = None
        self.swarms_best_score = np.finfo(np.float32).max
        self.all_time_best = None
        self.all_time_best_score = np.finfo(np.float32).max
        self.mode = mode
        # Per-iteration history consumed by draw().
        self.best_reward_record = deque()
        self.reward_mean_record = deque()
        self.reward_sigma_record = deque()

    def get_weights(self):
        """Returns the best weights generated thus far.

        Returns:
            A copy of the best fly seen so far if run has found one, else
            None.
        """
        return self.all_time_best

    def draw(self):
        """Plot the recorded history in three stacked panels.

        Top: best fitness of each iteration. Middle and bottom: mean and
        standard deviation of all population weights at each iteration.
        """
        x = np.arange(len(self.best_reward_record))

        f, axarr = plt.subplots(3, sharex=True)
        axarr[0].plot(x, self.best_reward_record)
        axarr[0].set_title('Reward, Mean, Std Dev.')
        axarr[1].plot(x, self.reward_mean_record)
        axarr[2].plot(x, self.reward_sigma_record)
        plt.show()

    def print_out(self, iteration, print_step):
        """Print progress every ``print_step`` iterations once a best exists."""
        if iteration % print_step == 0 and self.all_time_best is not None:
            print('iter %d. reward: %f. dt: %f. best: %f.' % (iteration,
                                                              self.swarms_best_score,
                                                              self.disturbance_threshold,
                                                              self.all_time_best_score))

    def run(self, iteration_amount, elitism=0, print_step=10, decay=0.98):
        """The main optimisation method.

        One gym environment and one thread are created per CPU core; the
        flies are split across the threads for scoring. The DFO update is
        then applied to the population in place. Whenever a fly matches or
        beats the best score so far, the session is saved under
        ``<env_name>_dfo``.

        Args:
            iteration_amount (int): how many rounds of optimisation to run.
            elitism (int): number of fittest flies per iteration. In every
                mode except 'n_fittest' these flies are left untouched; in
                'n_fittest' mode they act as the leader and must be > 0.
            print_step (int): how often to print the reward/fitness.
            decay (float): factor the disturbance threshold is multiplied
                by on every iteration.

        Raises:
            ValueError: if mode is 'n_fittest' and ``elitism`` is not
                positive.
        """
        if self.mode == 'n_fittest' and elitism <= 0:
            raise ValueError("mode 'n_fittest' requires elitism > 0.")

        saver = tf.train.Saver()
        envs = [gym.make(self.env_name)
                for _ in range(multiprocessing.cpu_count())]

        for iteration in range(iteration_amount):
            self.print_out(iteration, print_step)

            fitnesses = self._evaluate_population(envs)

            if self._track_best(fitnesses):
                # Re-evaluate the new best before saving; presumably so the
                # reward function leaves these weights loaded in the session
                # (the notebook's get_reward does this) -- confirm for other
                # reward functions.
                self.get_reward(self.all_time_best, self.sess, self.env,
                                self.meta)
                saver.save(self.sess, self.env_name + '_dfo')

            self.lower = np.amin(self.population)
            self.upper = np.amax(self.population)
            dev = np.std(self.population)
            mean = np.mean(self.population)
            self.best_reward_record.append(self.swarms_best_score)
            self.reward_mean_record.append(mean)
            self.reward_sigma_record.append(dev)

            self.disturbance_threshold *= decay
            self._move_flies(fitnesses, elitism, mean, dev)

    def _evaluate_population(self, envs):
        """Score every fly, one thread per env; returns negated rewards."""
        base, extra = divmod(self.population_size, len(envs))
        fitnesses = np.zeros(len(self.population))

        # NOTE(review): all threads call the reward function with the same
        # session. If that function loads weights into shared graph
        # variables (as the notebook's get_reward does), concurrent
        # evaluations may overwrite each other's weights -- confirm.
        def score_slice(begin, size, env):
            for i in range(begin, begin + size):
                fitnesses[i] = -self.get_reward(self.population[i],
                                                self.sess,
                                                env,
                                                self.meta)

        threads = []
        begin = 0
        for worker, env in enumerate(envs):
            # The first `extra` workers take one additional fly each.
            size = base + 1 if worker < extra else base
            threads.append(threading.Thread(target=score_slice,
                                            args=(begin, size, env)))
            begin += size

        assert begin == len(self.population)

        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return fitnesses

    def _track_best(self, fitnesses):
        """Record the swarm's best fly; return True on a new all-time best."""
        best_index = np.argmin(fitnesses)
        self.swarms_best_score = fitnesses[best_index]
        self.swarms_best = self.population[best_index]

        if self.swarms_best_score <= self.all_time_best_score:
            self.all_time_best_score = self.swarms_best_score
            # Copy: population rows are updated in place later, which would
            # otherwise silently change the stored best weights.
            self.all_time_best = self.swarms_best.copy()
            return True
        return False

    def _move_flies(self, fitnesses, elitism, mean, dev):
        """Apply one in-place DFO update to every non-protected fly."""
        r = np.random.uniform(0.0, 1.0, self.population.shape)

        n_fittest = None
        if elitism > 0:
            n_fittest = np.argpartition(fitnesses, range(elitism))[:elitism]

        # Shared rate used by the two 'no_leader*' modes.
        leader_rate = np.random.uniform(0.0, 1.0)

        # Elite flies are kept as they are, except in 'n_fittest' mode where
        # they only serve as the leader.
        protected = set()
        if n_fittest is not None and self.mode != 'n_fittest':
            protected = set(n_fittest.tolist())

        mode = self.mode
        restart_modes = ('original', 'hybrid', 'n_fittest',
                         'no_leader_with_random')
        last = len(self.population) - 1

        for i, p in enumerate(self.population):
            if i in protected:
                continue

            # Ring topology: the neighbours of the first/last fly wrap.
            left = (i - 1) if i != 0 else last
            right = (i + 1) if i != last else 0
            if fitnesses[left] < fitnesses[right]:
                best_neighbour = self.population[left]
            else:
                best_neighbour = self.population[right]

            for x in range(len(p)):
                if mode in restart_modes and r[i][x] < self.disturbance_threshold:
                    # Disturbance: re-sample this weight from the
                    # population's current distribution.
                    p[x] = np.random.normal(mean, dev)

                elif mode == 'original':
                    rate = np.random.uniform(0.0, 1.0)
                    update = self.swarms_best[x] - best_neighbour[x]
                    p[x] = best_neighbour[x] + rate * update

                elif mode == 'hybrid':
                    rate = np.random.uniform(0.0, 1.0)
                    update = (best_neighbour[x] + self.swarms_best[x]) / 2.0 - p[x]
                    p[x] = p[x] + rate * update

                elif mode == 'n_fittest':
                    rate = np.random.uniform(0.0, 1.0)
                    # NOTE(review): this averages every weight of the
                    # fittest flies, not just dimension x -- confirm intent.
                    update = np.average(self.population[n_fittest]) - best_neighbour[x]
                    p[x] = best_neighbour[x] + rate * update

                elif mode in ('no_leader_with_random', 'no_leader'):
                    update = best_neighbour[x] - p[x]
                    p[x] = p[x] + leader_rate * update

                elif mode == 'random_gauss':
                    p[x] = np.random.normal(mean, dev)

                elif mode == 'random_uniform':
                    p[x] = np.random.sample()
                            

### Getters and setters for TF trainable variables

These functions are the nuts and bolts of getting and setting the weights and, more generally, of optimising them by means other than TensorFlow's built-in backpropagation optimisers.  

In [None]:
def get_weights(sess):
    """Read every trainable variable and flatten them into one vector.

    Args:
        sess (tf.Session): The tf session with the correct tensors in the graph.

    Returns:
        genotype (np.ndarray): all variable values flattened and concatenated
            in ``tf.trainable_variables()`` order.
        meta (tuple): ``(names, shapes, cutoffs)`` where ``cutoffs`` holds the
            element count of each variable; needed by set_weights to rebuild
            the individual variables.

    """
    names = [variable.name for variable in tf.trainable_variables()]
    # The variable names are passed as fetches to read the current values.
    values = sess.run(names)

    shapes = []
    cutoffs = []
    flat_values = []
    for value in values:
        shapes.append(value.shape)
        cutoffs.append(np.prod(value.shape))
        flat_values.append(value.flatten())

    genotype = np.concatenate(flat_values)
    return genotype, (names, shapes, cutoffs)


def set_weights(sess, new_genotype, meta):
    """Set the weights in the sess.

    The flat weight vector is cut into consecutive chunks, one per variable,
    each chunk is reshaped to its variable's shape and fed to the matching
    placeholder in ``restore_dict`` while running ``restore_ops``.

    The assign ops write into the graph's trainable variables, which are
    shared by every user of ``sess``.

    Args:
        sess (tf.Session): The tf session with the correct tensors in the graph.
        new_genotype (array-like): flat sequence of floats holding the new
            weights, in the same order get_weights produced them.
        meta (tuple): ``(names, shapes, cutoffs)`` as returned by get_weights.

    Raises:
        ValueError: if the genotype length does not equal the total number
            of elements described by ``meta``.
    """
    names, shapes, cutoffs = meta
    new_genotype = np.asarray(new_genotype)

    sizes = [int(size) for size in cutoffs]
    if new_genotype.size != sum(sizes):
        raise ValueError('genotype has %d values but the variables need %d'
                         % (new_genotype.size, sum(sizes)))

    # Split points are the running totals of the sizes, without the final
    # total; this also takes the last chunk from the tail of the vector.
    boundaries = np.cumsum(sizes, dtype=int)[:-1]
    chunks = np.split(new_genotype, boundaries)

    feed_dict = {}
    for name, shape, chunk in zip(names, shapes, chunks):
        feed_dict[restore_dict[name]] = chunk.reshape(shape)
    sess.run(restore_ops, feed_dict=feed_dict)

### Define our reward / fitness function

In [None]:
def get_reward(weights, sess, env, meta):
    """Play one episode with the given weights and return its total reward.

    The weights are first loaded into the graph with set_weights. The
    environment is then reset and stepped with the arg-max action of the
    network's output until the environment reports the episode as done.

    Args:
        weights (list): A list of floats which are the new weights for the mlp.
        sess (tf.Session): The tf session with the correct tensors in the graph.
        env (openai): the OpenAI gym environment to run.
        meta (tuple): important variables to rebuild the weights correctly.

    Returns:
        The sum of the rewards over the episode. Bigger is better.

    """
    # NOTE(review): set_weights assigns into variables shared by the whole
    # session; when this is called from several threads at once, confirm the
    # weights cannot be replaced mid-episode by another caller.
    set_weights(sess, weights, meta)

    observation = env.reset()
    total_reward = 0

    while True:
        # The network expects a batch, so present the observation as one row.
        batch = observation.reshape((1, -1))
        prediction = sess.run(model_output, feed_dict={model_input: batch})
        action = np.argmax(prediction[0])

        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break

    return total_reward

### Run and optimise the MLP

Now we can run the optimisation steps. A number of modifications have been made to the DFO algo which are worth playing with. Once a new best fly has been created, the tensorflow weights are saved. Parameters in particular to be aware of are population_size, disturbance_threshold, mode, decay and elitism. From the mode parameter the following options are available:   
```
'original'
'hybrid'
'n_fittest'
'no_leader_with_random'
'no_leader'
'random_gauss'
'random_uniform'
```

**Please note:** the CartPole environment is considered solved when a reward of 200 has been achieved. Note also that we negate the reward because DFO minimises and hasn't been modified to maximise, so we are trying to reach -200.0!   

In [None]:
# With cpu_only set, configure the session with a GPU device count of zero;
# otherwise fall back to TensorFlow's default configuration.
config = None
if cpu_only:
    config = tf.ConfigProto(device_count={'GPU': 0})

with tf.Session(config=config) as sess:

    sess.run(tf.global_variables_initializer())

    # Flatten the freshly initialised network into the starting weight vector.
    initial_weights, meta = get_weights(sess)

    es = DFOStrategy(weights=initial_weights,
                     get_reward_func=get_reward,
                     population_size=1000,
                     disturbance_threshold=0.1,
                     name=env_name,
                     sess=sess,
                     meta=meta,
                     mode='original')

    es.run(iteration_amount=number_iterations,
           elitism=20,
           print_step=1,
           decay=0.99)

    # Load the best weights found back into the graph.
    best_weights = es.get_weights()
    set_weights(sess, best_weights, meta)
    

### Graph of results

Note that the reward is negated here, as the stock DFO algorithm minimises an objective function while the OpenAI Gym environments return positive rewards for good performance.   

In [None]:
# Plot the history recorded by es.run(): the best fitness per iteration, and
# the mean and standard deviation of the population's weights.
es.draw()

### Preview the network

In [None]:
from gym import wrappers

with tf.Session(config=config) as sess:
    # Reload the checkpoint written by DFOStrategy.run for the best fly.
    saver = tf.train.Saver()
    saver.restore(sess, env_name + '_dfo')

    env = es.env
    env = wrappers.Monitor(env, ".", force=True)
    try:
        # Reset through the Monitor wrapper so the episode it records is the
        # one that is actually played below.
        observation = env.reset()
        # NOTE(review): presumably closes any stale render window before the
        # preview starts -- confirm this call is still needed.
        env.render(close=True)

        for i in range(200):

            env.render()

            feed_dict = {
                model_input: observation.reshape((1, -1))
            }
            prediction = sess.run(model_output,
                                  feed_dict=feed_dict)

            # Greedy policy: take the action with the highest output.
            action = np.argmax(prediction[0])

            observation, reward, done, info = env.step(action)
            if done:
                print("Done early at step", i)
                break
            # Slow the loop down to roughly 60 rendered frames per second.
            sleep(1.0/60.0)
    finally:
        # Always close the wrapper, even if stepping or rendering raised.
        env.close()