In [None]:
!pip install swig
!pip install gym[box2d]

In [None]:
import os #used for interacting with the operating system
import numpy as np #to make array related operations
import gym #to get the enviroment
from gym import wrappers #to record videos of the enviroment
# import pybullet_envs
# Best evaluation reward seen so far. Walker.train() reads and updates it through
# `global max_reward`; because it starts at 0, only an evaluation reward above 0
# is ever treated as a new best.
max_reward = 0

class Normalizer():
    """Keeps running per-dimension statistics and standardizes inputs with them.

    Attributes (all arrays with `nb_inputs` entries):
        n: how many observations have been folded in so far
        mean: running mean
        mean_diff: running sum of (x - old_mean) * (x - new_mean)
        var: mean_diff / n, floored at 1e-2
    """

    def __init__(self, nb_inputs):
        # every statistic starts as its own zero-filled array
        self.n, self.mean, self.mean_diff, self.var = (
            np.zeros(nb_inputs) for _ in range(4)
        )

    def observe(self, x):
        """Fold one observation `x` into the running statistics."""
        self.n += 1.0
        previous_mean = self.mean.copy() #snapshot taken before the mean moves
        self.mean += (x - previous_mean) / self.n
        # uses both the old and the freshly updated mean
        self.mean_diff += (x - previous_mean) * (x - self.mean)
        # the floor keeps normalize() from dividing by a (near-)zero deviation
        self.var = np.clip(self.mean_diff / self.n, 1e-2, None)

    def normalize(self, inputs):
        """Return `inputs` shifted by the running mean and scaled by the running std."""
        return (inputs - self.mean) / np.sqrt(self.var)

## Algorithm
class Walker(): #giving hyperparameters
    """Random-search trainer for a linear policy on a Gym environment.

    Every training iteration samples `num_deltas` random perturbations of the
    weight matrix `theta`, plays one episode with `theta + noise * delta` and
    one with `theta - noise * delta`, and then moves `theta` along the
    reward-weighted sum of the `num_best_deltas` best perturbations, scaled by
    the standard deviation of all collected rewards. Observations are
    standardized with running statistics before the policy sees them.
    """

    def __init__(self,nb_steps=10000, episode_length=2000, learning_rate=0.05, num_deltas=16, num_best_deltas=16, noise=0.04, seed=1, env_name='BipedalWalker-v3',record_every=25, monitor_dir = None):
        """Build the environment, the observation normalizer and zero weights.

        Args:
            nb_steps: number of training iterations.
            episode_length: step cap per episode; only used when the
                environment spec does not define `max_episode_steps`.
            learning_rate: step size of the weight update.
            num_deltas: perturbations sampled per iteration.
            num_best_deltas: how many of the best perturbations enter the
                update; must not exceed `num_deltas`.
            noise: scale applied to a perturbation before it is added to theta.
            seed: seed for NumPy's global random generator.
            env_name: id handed to `gym.make`.
            record_every: every this many iterations the weights are saved and
                the evaluation episode is flagged for recording.
            monitor_dir: folder for recorded videos; None disables recording.
        """
        self.nb_steps = nb_steps #number of training steps
        self.episode_length = episode_length #maximum number of steps in each episode
        self.learning_rate = learning_rate #the learning rate (alpha)
        self.num_deltas = num_deltas #number of perturbations (deltas)
        self.num_best_deltas = num_best_deltas #number of top deltas used in the update
        assert self.num_best_deltas <= self.num_deltas
        self.noise = noise #scale of the perturbation noise
        self.seed = seed #makes the sampled deltas reproducible
        self.record_every = record_every #save weights / record a video every this many iterations
        np.random.seed(self.seed) #same random numbers on every run so results can be compared
        self.record_video = False #recording is off unless train() switches it on; read by the trigger below
        self.env = gym.make(env_name) #making the environment
        if monitor_dir is not None:
            # The trigger is what ties recording to self.record_video; without
            # it the wrapper falls back to its own default schedule.
            self.env = wrappers.RecordVideo(
                self.env,
                monitor_dir,
                episode_trigger=lambda episode_id: self.record_video,
            )
        self.input_size = self.env.observation_space.shape[0] #number of policy inputs
        self.output_size = self.env.action_space.shape[0] #number of policy outputs
        self.normalizer = Normalizer(self.input_size) #running statistics for the observations
        # The environment's own step limit, when defined, wins over the argument.
        self.episode_length = self.env.spec.max_episode_steps or episode_length
        self.theta = np.zeros((self.output_size, self.input_size)) #initial weights (theta)

    def sample_deltas(self):
        """Return `num_deltas` standard-normal perturbations shaped like theta."""
        return [np.random.randn(*self.theta.shape) for _ in range(self.num_deltas)]

    def evaluate(self, input, delta = None, direction = None):
        """Return the action for `input` as a matrix-vector product.

        Args:
            input: observation vector.
            delta: perturbation shaped like theta; required for "+" and "-".
            direction: None for the unperturbed weights, "+" for
                `theta + noise * delta`, "-" for `theta - noise * delta`.

        Raises:
            ValueError: if `direction` is none of the three accepted values.
        """
        if direction is None:
            return self.theta.dot(input)
        if direction == "+":
            return (self.theta + self.noise * delta).dot(input)
        if direction == "-":
            return (self.theta - self.noise * delta).dot(input)
        # Previously fell through and returned None, which only failed later in env.step.
        raise ValueError("direction must be None, '+' or '-', got " + repr(direction))

    def play_episode(self, direction=None, delta=None, render=False):
        """Play one episode and return the sum of its clipped rewards.

        Each step reward is clipped to [-1, 1] before it is summed. Every
        observation is first folded into the running normalizer and then
        standardized. Both Gym API generations are accepted: `reset()` may
        return the observation or an `(observation, info)` tuple, and `step()`
        may return four values or five (`terminated` and `truncated`).

        Args:
            direction: passed to `evaluate` (None, "+" or "-").
            delta: perturbation passed to `evaluate`.
            render: call `env.render()` after every step when True.
        """
        reset_result = self.env.reset() #resetting the environment to its initial state
        # Newer Gym returns (observation, info); older Gym returns the observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False #the episode has not ended yet
        num_plays = 0.0
        sum_rewards = 0.0
        while not done and num_plays < self.episode_length:
            self.normalizer.observe(state)
            state = self.normalizer.normalize(state) #getting the z score
            action = self.evaluate(state,delta,direction)
            step_result = self.env.step(action)
            if len(step_result) == 5:
                # Newer Gym splits the end-of-episode signal in two flags.
                state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result
            reward = max(min(reward, 1), -1) #clip so a single step cannot dominate the sum
            sum_rewards += reward
            num_plays += 1
            if render:
                self.env.render()

        return sum_rewards #total clipped reward of the episode

    def _update_theta(self, deltas, positive_rewards, negative_rewards):
        """Apply one weight update from the rollouts of a single iteration."""
        # Standard deviation of all the rewards
        sigma_rewards = np.array(list(positive_rewards) + list(negative_rewards)).std()
        if not sigma_rewards > 0:
            # Every rollout scored the same, so the step below would be all
            # zeros; dividing by sigma would turn it into NaN and poison theta.
            return

        # Rank the directions by their better reward and keep the top num_best_deltas
        order = sorted(
            range(len(deltas)),
            key=lambda k: max(positive_rewards[k], negative_rewards[k]),
            reverse=True,
        )[:self.num_best_deltas]

        step = np.zeros(self.theta.shape)
        for k in order:
            step += (positive_rewards[k] - negative_rewards[k]) * deltas[k]

        self.theta += self.learning_rate/(self.num_best_deltas*sigma_rewards) * step

    def train(self):
        """Run `nb_steps` training iterations.

        Side effects: creates the folders `weights`, `max_weights` and `data`
        if they are missing, saves theta to `weights/` every `record_every`
        iterations and to `max_weights/` whenever the evaluation reward beats
        the module-level `max_reward`, appends every evaluation reward to
        `data/data.txt` and every new best to `data/max.txt`, and prints
        progress.
        """
        global max_reward
        # Saving must not fail halfway through training because a folder is missing.
        for folder in ("weights", "max_weights", "data"):
            os.makedirs(folder, exist_ok=True)

        for iteration in range(self.nb_steps):
            # Generate num_deltas deltas and evaluate positive and negative rewards
            deltas = self.sample_deltas()
            positive_rewards = [0] * self.num_deltas
            negative_rewards = [0] * self.num_deltas

            # The +/- episodes of one delta are played back to back
            for i, delta in enumerate(deltas):
                positive_rewards[i] = self.play_episode(direction="+",delta=delta)
                negative_rewards[i] = self.play_episode(direction="-",delta=delta)

            # Update the weights
            self._update_theta(deltas, positive_rewards, negative_rewards)

            # Only record video during evaluation, every n steps
            if iteration % self.record_every == 0:
                self.record_video = True
                np.save(os.path.join("weights", "weights_" + str(iteration)), self.theta)
                print("Saved weights for " + str(iteration) + "th iteration")

            # Play an episode with the new weights and see improvement
            final_reward = self.play_episode() ## We play without + or - noise
            print('Step: ', iteration, 'Reward: ', final_reward)
            with open(os.path.join("data","data.txt"), "a") as my_file:
                my_file.write(str(iteration) + ", " + str(final_reward) + "\n")

            if final_reward > max_reward:
                max_reward = final_reward
                np.save(os.path.join("max_weights","weights_" + str(iteration)), self.theta)
                print("Saved weights for " + str(iteration) + "th iteration")
                with open(os.path.join("data","max.txt"), "a") as my_file:
                    my_file.write(str(iteration) + ", " + str(final_reward) + "\n")
                # Replay the new best weights with recording switched on. This
                # used to run on every iteration with its result thrown away.
                self.record_video = True
                self.play_episode()

            self.record_video = False


def mkdir(base, name): #creating paths to store data
    """Make sure the directory `base/name` exists and return its path.

    Missing parent directories are created as well. Calling it again for an
    existing directory is a no-op.

    Args:
        base: parent directory.
        name: name of the directory to create inside `base`.

    Returns:
        The joined path `os.path.join(base, name)`.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = os.path.join(base, name)
    # exist_ok replaces the separate exists() check, which could race with
    # another process creating the same folder between the check and makedirs.
    os.makedirs(path, exist_ok=True)
    return path

# Main code
if __name__ == '__main__':
    ENV_NAME = "BipedalWalker-v3"
    # Recordings go to videos/<environment name>
    videos_dir = mkdir('.', 'videos')
    monitor_dir = mkdir(videos_dir, ENV_NAME)
    # Folders that Walker.train() saves weights and reward logs into
    mkdir('.', 'weights')
    mkdir('.', 'max_weights')
    mkdir('.', 'data')
    # Pass ENV_NAME explicitly so the trained environment always matches the
    # name of the video folder instead of relying on Walker's default.
    trainer = Walker(seed = 1000, env_name=ENV_NAME, monitor_dir=monitor_dir)
    trainer.train()
#     # ...

