In [None]:
import gym
import numpy as np
import torch

# Build the Pong environment; frameskip=4 is forwarded to the environment constructor.
env = gym.make('Pong-v0', frameskip=4)
# Show the size of the discrete action space and the shape of a raw observation.
print(env.action_space.n)
print(env.observation_space.shape)
# Module-level frame buffer: process() appends each preprocessed frame here and
# trims it so that it never holds more than two frames.
state = []

In [None]:
import torch
from torch import nn
from torch import optim

#class for the neural network
class neural_network(nn.Module):
    """Small convolutional policy network producing a probability per action.

    The convolutional stack takes a 2-channel image; for 84x84 inputs it yields
    4 x 9 x 9 = 324 features, which is the size the classifier head expects.
    The head ends in a softmax over ``n_actions`` (fixed to 3).

    Args:
        env: accepted for interface compatibility; not read by this class.
    """

    def __init__(self, env):
        super().__init__()
        # Both sizes are hard-coded; n_observations is only reported, not used by any layer.
        self.n_actions = 3
        self.n_observations = 900
        print("Number actions: " + str(self.n_actions))
        print("Number observations: " + str(self.n_observations))

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys (conv.0, conv.3, ...) are unchanged.
        feature_layers = [
            nn.Conv2d(2, 8, kernel_size=4, stride=2),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(),
            nn.Conv2d(8, 4, kernel_size=4, stride=2),
        ]
        self.conv = nn.Sequential(*feature_layers)

        head_layers = [
            nn.Linear(324, 160),
            nn.ReLU(),
            nn.Linear(160, self.n_actions),
            nn.Softmax(dim=-1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return action probabilities for ``x``.

        The conv output is flattened to rows of 324 features; a leading
        dimension of size 1 is squeezed away, so a single sample gives a 1-D
        result while a larger batch gives one row per sample.
        """
        features = self.conv(x)
        flat = features.view(-1, 324).squeeze(0)
        return self.classifier(flat)

    def predict(self, state):
        """Alias for :meth:`forward`; returns the action probabilities for ``state``."""
        return self.forward(state)

In [None]:
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import torch
from torch import nn
import math

def rgb2grey(rgb):
    """Crop rows 33..192 of ``rgb`` and reduce its first three channels to grey.

    The grey value is the weighted sum of the first three entries along the
    last axis, using the weights 0.2989, 0.5870 and 0.1140.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    cropped = rgb[33:193]
    return np.dot(cropped[..., :3], channel_weights)

from torchvision.transforms import functional

def inv(frame):
    """Return a tensor holding ``255 - value`` for every entry of a 2-D frame.

    ``frame`` is iterated row by row and item by item; the result is built
    with ``torch.Tensor`` from the nested list of inverted values.
    """
    inverted_rows = [[255 - pixel for pixel in row] for row in frame]
    return torch.Tensor(inverted_rows)

def process(rgb):
    """Preprocess one raw frame and return the current stack of recent frames.

    Steps: crop rows 34..193, move channels first, convert to greyscale,
    resize to 84x84 and zero every value that is not above 87.3 (background
    removal). The result is appended to the module-level ``state`` list, which
    is trimmed so it keeps at most two frames.

    Args:
        rgb: raw observation; assumed to be an (H, W, 3) array exposing
            ``.transpose`` (e.g. a numpy array) -- TODO confirm against the env.

    Returns:
        The frames currently in ``state`` stacked along a new first axis, with
        the size-1 channel axis squeezed out.
    """
    cropped = rgb[34:194]
    channels_first = torch.Tensor(cropped.transpose(2, 0, 1))
    grey = functional.rgb_to_grayscale(channels_first)
    small = functional.resize(grey, [84, 84])
    # Values <= 87.3 are replaced by 0; brighter values pass through unchanged.
    foreground = nn.Threshold(87.3, 0)(small)

    # NOTE: mutates the shared frame buffer; every call pushes one frame.
    state.append(foreground)
    if len(state) > 2:
        del state[0]
    return torch.stack(state).squeeze(1)

# Warm-up: reset the game and take 15 random steps so the frame buffer used by
# process() holds real game frames.
env.reset()
for n in range(15):
    observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
    img_pre = process(observation)

#taking a look at what the convolution actually looks like
network = neural_network(env)
for n in range(2):
    observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
    # process() appends to the shared frame buffer, so it must run exactly once
    # per environment step; calling it again for the same observation would
    # leave the same frame in both channels of the stack.
    stacked_frames = process(observation)
    imgplot = plt.imshow(stacked_frames[0], cmap="gray")
    plt.show()
    # Feature maps produced by the (untrained) convolutional stack.
    img_pre = network.conv(stacked_frames).detach().numpy()
    print(np.shape(img_pre))
    imgplot = plt.imshow(img_pre[0], cmap="gray")
    plt.show()
    imgplot = plt.imshow(img_pre[1], cmap="gray")
    plt.show()

In [None]:
#function for discounted return

def calculate_discounted_return(rewards, discount_factor, secondary_discount_factor):
    """Compute mean-centred discounted returns for one episode.

    Walking backwards from the last step, the return at step ``i`` is
    ``rewards[i] + f * return[i + 1]``, where ``f`` is ``discount_factor`` when
    ``rewards[i] == 0`` and ``secondary_discount_factor`` otherwise. The mean of
    all returns is then subtracted from each of them.

    Args:
        rewards: per-step rewards of the episode, in chronological order.
        discount_factor: factor applied to the future return after a zero reward.
        secondary_discount_factor: factor applied after a non-zero reward.

    Returns:
        A list with one centred return per reward; ``[]`` for empty ``rewards``.
    """
    if len(rewards) == 0:
        return []

    running_return = rewards[-1]
    discounted_returns = [running_return]
    for reward in reversed(rewards[:-1]):
        factor = discount_factor if reward == 0 else secondary_discount_factor
        running_return = reward + (factor * running_return)
        discounted_returns.append(running_return)
    discounted_returns.reverse()

    # Compute the baseline once instead of once per element.
    baseline = sum(discounted_returns) / len(discounted_returns)
    return [value - baseline for value in discounted_returns]


In [None]:
import random
import numpy
import time

# Needed for checkpointing below; imported here because no visible cell imports them.
import os
import pickle

random.seed(23)
#env.seed(23) seems to be deprecated
torch.manual_seed(23)

#initialise parameters
alpha = 0.001
discount_factor = 0.99
secondary_discount_factor = 0
n_network = neural_network(env)
network_optimiser = optim.Adam(n_network.parameters(), lr=alpha)
# Environment action ids the policy may choose from; the policy's outputs are
# matched to this list by position.
available_actions = [0,2,3]
print(available_actions)

num_episodes = 1000
# Episode numbering starts at 300; the number is used for logging and checkpoint names.
start_episode = 300
batch_size = 5
episode_rewards = []
total_rewards = [0]
env_states = []
total_loss = []
batch_log_probs = []
batch_actions = []
batch_rewards = []
batch_time = 0
batch_count = 0


start = time.time()

for episode in range(start_episode, num_episodes):
    tic = time.time()
    #initialising variables
    # One entry per episode played in this run; always updated through index -1
    # because `episode` does not start at 0 and so is not a valid list index.
    episode_rewards.append(0)
    observation = env.reset()
    # process() is called twice on purpose: it pushes one frame per call into a
    # two-frame buffer, so this fills both slots with the reset frame.
    cur_state = process(observation)
    cur_state = process(observation)
    done = False
    first = True
    turn = 0

    states = []
    rewards = []
    actions = []
    log_probs = []

    while not done:
        #generating episode trajectory
        action_probs = n_network.predict(cur_state)
        turn += 1
        # Sample the position in available_actions directly, so the index used
        # for the log-probability always matches the chosen action.
        action_index = random.choices(range(len(available_actions)), weights=action_probs.tolist())[0]
        action = available_actions[action_index]
        actions.append(action)
        #calculating log probability
        log_probs.append(torch.log(action_probs.squeeze(0))[action_index])
        observation, reward, done, info = env.step(action)

        cur_state = process(observation)
        rewards.append(reward)
        episode_rewards[-1] += reward

    #adding to batches (runs once per episode, after the episode has finished)
    discounted_returns = calculate_discounted_return(rewards, discount_factor, secondary_discount_factor)
    batch_rewards.extend(discounted_returns)
    batch_log_probs.extend(log_probs)
    batch_count += 1
    total_rewards[-1] += sum(rewards)
    print(f"Episode {episode}, {time.time() - tic:.2f} seconds, {episode_rewards[-1]} reward")
    batch_time += time.time() - tic

    if batch_count == batch_size:
        #processing batch
        tic = time.time()
        network_optimiser.zero_grad()

        reward_tensor = torch.Tensor(batch_rewards)
        chosen_log_probs = torch.stack(batch_log_probs)

        # Policy-gradient loss: negative mean of return-weighted log-probabilities.
        log_x_reward = reward_tensor * chosen_log_probs
        loss = - log_x_reward.mean()

        loss.backward()
        print("Loss: ", loss)
        # Store a detached copy so the history does not keep every batch's
        # autograd graph alive.
        total_loss.append(loss.detach())
        network_optimiser.step()

        if episode % 100 == 99:
            print("Saving network")
            # Make sure the output directory exists before writing into it.
            os.makedirs('reinforce-logs', exist_ok=True)
            torch.save(n_network.state_dict(), 'reinforce-logs/agent-final' + str(episode) + '.pt')
            with open("reinforce-logs/last-run.pickle", "wb") as f:
                pickle.dump(total_rewards, f)
            with open("reinforce-logs/last-run-all.pickle", "wb") as f:
                pickle.dump(episode_rewards, f)

        batch_rewards = []
        batch_log_probs = []
        batch_count = 0
        batch_time += time.time() - tic
        print(f"AVG REWARDS {episode}, {batch_time:.2f} SECONDS, {total_rewards[-1]/batch_size} AVG REWARD")
        batch_time = 0
        total_rewards.append(0)


In [None]:
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import torch
from torch import nn

#transposed = observation.transpose(2, 0, 1

# Scratch layers for visual experiments. `conv2` and `networ2k` are not
# referenced anywhere else in the visible notebook; `network` is the
# single-channel 3x3 convolution applied to a frame further down in this cell.
conv2 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=8, stride=4)
network = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, stride=1))
            #nn.ReLU())
            #nn.MaxPool2d(kernel_size=4, stride=2),
            #nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2),
            #nn.ReLU())
            #nn.MaxPool2d(kernel_size=2, stride=1))
            #nn.BatchNorm2d(16))

# NOTE(review): the trailing "(b x C x H x W)" shape comments below look carried
# over from a reference architecture; they do not appear to match the
# kernel/stride values used here -- confirm before relying on them.
networ2k = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=96, kernel_size=10, stride=3),  # (b x 96 x 55 x 55)
            nn.LeakyReLU(),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),  # section 3.3
            nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 96 x 27 x 27)
            nn.Conv2d(96, 256, 5, padding=2),  # (b x 256 x 27 x 27)
            #nn.LeakyReLU(),
            #nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),
            #nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 256 x 13 x 13)
            #nn.Conv2d(256, 384, 3, padding=1),  # (b x 384 x 13 x 13)
            #nn.LeakyReLU(),
            #nn.Conv2d(384, 384, 3, padding=1),  # (b x 384 x 13 x 13)
            #nn.LeakyReLU(),
            #nn.Conv2d(384, 256, 3, padding=1),  # (b x 256 x 13 x 13)
            #nn.LeakyReLU(),
            #nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 256 x 6 x 6)
        )
# Advance the game with random actions so the frame buffer holds mid-game frames.
for n in range(10):
    observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
    img_pre = process(observation)

last_space = [0,0]

for n in range(10):
    observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
    img_pre = process(observation)
# First frame of the stack returned by process().
grey_image = img_pre[0]
print("GREY IMAGE PROCESS")
#print(np.amax(grey_image))
imgplot = plt.imshow(grey_image, cmap="gray")
plt.show()
print(np.shape(grey_image))
print(grey_image)

# `network` is a Conv2d with in_channels=1, so it must be given a single frame
# shaped (batch=1, channels=1, H, W); passing the whole frame stack would hand
# it two channels and raise a channel-mismatch error.
fc = network(grey_image[None, None, ...].float())
#print(max(fc))
fc_numpy = fc.detach().numpy()
print(np.shape(fc_numpy))

#array = grey_image.detach().numpy()
#print(np.shape(array))
#result = numpy.where(array == numpy.amax(array))
#print('Tuple of arrays returned : ', result)
#print('Max value: ', numpy.amax(array))
#print('List of coordinates of maximum value in Numpy array : ')
# zip the 2 arrays to get the exact coordinates
#listOfCordinates = list(zip(result[0], result[1]))
# travese over the list of cordinates
#for cord in listOfCordinates:
 #   print(cord)
print("1")
# Squeeze away the batch and channel axes to get a 2-D image for plotting.
imgplot = plt.imshow(np.squeeze(fc_numpy), cmap="gray")
plt.show()
# Disabled plotting code kept as a bare string literal: the statements inside
# are never executed. They index output channels 1-7 of `fc`.
# NOTE(review): the `network` built earlier in this cell has out_channels=1, so
# these indices presumably date from an earlier multi-channel layer -- confirm.
'''
print("2")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[1][:][:], cmap="gray")
plt.show()
print("3")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[2][:][:], cmap="gray")
plt.show()
print("4")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[3][:][:], cmap="gray")
plt.show()

print("5")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[4][:][:], cmap="gray")
plt.show()
print("6")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[5][:][:], cmap="gray")
plt.show()
print("7")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[6][:][:], cmap="gray")
plt.show()
print("8")
imgplot = plt.imshow(numpy.squeeze(fc.detach().numpy())[7][:][:], cmap="gray")
plt.show()'''
# Three viewing passes over random play, described as
# (number of random steps, whether every frame is plotted during the pass).
# After each pass the first frame of the current stack is plotted once more.
for step_count, plot_each_step in ((50, True), (10, False), (10, False)):
    for n in range(step_count):
        observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
        img_pre = process(observation)
        if plot_each_step:
            imgplot = plt.imshow(img_pre[0], cmap="gray")
            plt.show()

    grey_image = img_pre[0]
    imgplot = plt.imshow(grey_image, cmap="gray")
    plt.show()

In [None]:
class neural_network(nn.Module):
    """Experimental policy network with a data-dependent threshold after the conv stack.

    The convolutional stack takes a 2-channel image; for 84x84 inputs it yields
    4 x 20 x 20 = 1600 features, the size the classifier head expects. Before
    flattening, every feature that is not above the most frequent value of
    feature channel 1 is set to 0.

    Args:
        env: environment whose ``action_space.n`` sets the number of outputs.
    """

    def __init__(self, env):
        super().__init__()
        # The number of actions comes from the environment; n_observations is hard-coded.
        self.n_actions = env.action_space.n
        self.n_observations = 900
        print("Number actions: " + str(self.n_actions))
        print("Number observations: " + str(self.n_observations))
        # Fully connected variant; built (and therefore part of state_dict) but
        # not used by forward().
        self.network = nn.Sequential(
            nn.Linear(self.n_observations, 32),
            #nn.ReLU(),
            nn.Linear(32, self.n_actions),
            nn.Softmax(dim=-1))

        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=2, out_channels=4, kernel_size=4, stride=2),
            nn.MaxPool2d(kernel_size=2, stride=2),
           # nn.Conv2d(in_channels=4, out_channels = 8, kernel_size=5, stride=2)
        )

        self.classifier = nn.Sequential(
            nn.Linear(1600, 800),
            nn.ReLU(),
            nn.Linear(800, 400),
            nn.Linear(400, self.n_actions),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return action probabilities for ``x``.

        Assumes a single sample (unbatched, or a batch of one): after squeezing,
        index 1 of the conv output is treated as feature channel 1 -- TODO
        confirm if larger batches are ever passed.
        """
        x = self.conv(x)
        # Most frequent value in channel 1 (np.unique sorts, so ties resolve to
        # the smallest value). Computed on a detached CPU copy, so it does not
        # take part in backpropagation.
        channel_one = np.squeeze(x.detach().cpu().numpy())[1]
        unique, counts = np.unique(channel_one, return_counts=True)
        # Plain Python float so the threshold is an ordinary scalar argument.
        common = float(unique[np.argmax(counts)])
        # Kept as an attribute so the last threshold used can be inspected.
        self.Thresh = nn.Threshold(common, 0)
        x = self.Thresh(x)
        x = x.view(-1, 1600)  # reduce the dimensions for linear layer input
        x = x.squeeze(0)
        return self.classifier(x)
# NOTE(review): this rebuilds the optimiser over `n_network` (created in the
# training cell), not over the `network` instantiated below -- confirm intent.
network_optimiser = optim.Adam(n_network.parameters(), lr=alpha)
# Take 20 random steps so the frame buffer used by process() holds recent frames.
for n in range(20):
    observation, reward, done, info = env.step(env.action_space.sample())
    img_pre = process(observation)

# Instantiate the neural_network class defined directly above in this cell.
network = neural_network(env)
#print(img_pre)

# Smoke test: print the (untrained) network's output for 10 random-play frames.
for n in range(10):
    observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
    img_pre = process(observation)
    print(network(img_pre))