In [1]:
import torch
from torch import nn
from torch import optim

#class for the neural network
class neural_network(nn.Module):
    """Convolutional policy network that outputs a probability for each of three actions.

    A two-convolution feature extractor (``self.conv``) feeds a two-layer
    fully connected head (``self.classifier``) that ends in a softmax.

    NOTE(review): the hard-coded flatten width of 324 equals 4 x 9 x 9, which is
    what a 2 x 84 x 84 input produces through the conv stack -- confirm that
    callers always supply frames of that size.
    """

    def __init__(self, env):
        """Create the layers.

        Args:
            env: Accepted for interface compatibility; this constructor does
                not read it.
        """
        super().__init__()
        # Fixed sizes; n_observations is only reported, no layer is sized from it.
        self.n_actions = 3
        self.n_observations = 900
        print(f"Number actions: {self.n_actions}")
        print(f"Number observations: {self.n_observations}")

        # Layer order is significant: saved checkpoints address the layers by
        # their position inside each Sequential (conv.0, conv.3, ...).
        feature_layers = [
            nn.Conv2d(in_channels=2, out_channels=8, kernel_size=4, stride=2),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=8, out_channels=4, kernel_size=4, stride=2),
        ]
        self.conv = nn.Sequential(*feature_layers)

        head_layers = [
            nn.Linear(324, 160),
            nn.ReLU(),
            nn.Linear(160, self.n_actions),
            nn.Softmax(dim=-1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map an input tensor to action probabilities.

        Args:
            x: Tensor accepted by the two-channel convolution stack.

        Returns:
            Softmax output of the classifier head. The convolution output is
            reshaped to rows of 324 values and a leading axis of length one is
            dropped, so a single observation yields a 1-D tensor of
            ``n_actions`` probabilities.
        """
        features = self.conv(x)
        # Rows of 324 features; squeeze(0) only has an effect when there is one row.
        flat = features.view(-1, 324).squeeze(0)
        return self.classifier(flat)

    def predict(self, state):
        """Return the action probabilities for ``state`` (thin wrapper over ``forward``)."""
        return self.forward(state)

In [2]:
import pickle
import gym
import numpy as np
import torch
from torch import nn
from torchvision.transforms import functional
import math
import random
import time 

# Environment action ids the agent may choose from; the network's three output
# probabilities are used as sampling weights over this list, in order.
# NOTE(review): presumably the no-op / up / down actions of ALE Pong -- confirm
# against the environment's action meanings.
available_actions = [0,2,3]

# Rolling buffer of preprocessed frames. process() appends to it and trims it
# to the two most recent entries.
state = []

def process(rgb):
    """Preprocess one raw frame and return the current stacked observation.

    The frame is cropped to rows 34..193, moved to channel-first layout,
    converted to greyscale, resized to 84 x 84 and thresholded. The result is
    pushed onto the module-level ``state`` buffer, which is then trimmed to its
    two newest entries.

    Args:
        rgb: Frame indexed as (row, column, channel) that supports slicing and
            ``transpose(2, 0, 1)`` (presumably the numpy array returned by the
            environment -- TODO confirm).

    Returns:
        The buffered frames stacked along a new leading axis, with axis 1
        squeezed away when it has length one. While ``state`` holds a single
        frame (e.g. the first call on an empty buffer) the leading axis has
        length 1 instead of 2.
    """
    cropped = rgb[34:194]
    channels_first = cropped.transpose(2, 0, 1)
    grey = functional.rgb_to_grayscale(torch.Tensor(channels_first))
    small = functional.resize(grey, [84, 84])
    # Every pixel at or below 87.3 is replaced by 0 (background removal).
    cleaned = nn.Threshold(87.3, 0)(small)

    state.append(cleaned)
    if len(state) > 2:
        del state[0]  # discard the oldest frame
    return torch.stack(state).squeeze(1)

# Pong environment shared by every evaluation run below.
env = gym.make('ALE/Pong-v5', frameskip=4)

# Number of episodes played per evaluated network (used for the loop and the average).
N_EPISODES = 50

# Checkpoint evaluated in each run: test index -> (printed label, weights file).
# Run 2 has no entry, so it plays with freshly initialised weights and prints
# no label, exactly as before.
# NOTE(review): confirm that an untrained baseline run is intended.
CHECKPOINTS = {
    0: ("ALPHA", 'agent-alpha.pt'),
    1: ("MODE 1", 'agent-mode1.pt'),
}


def _mean_or_na(values):
    """Return the mean of ``values`` as a string, or "n/a" when the list is empty."""
    return str(sum(values) / len(values)) if values else "n/a"


for test in range(3):
    saved_network = neural_network(env)
    if test in CHECKPOINTS:
        label, weights_path = CHECKPOINTS[test]
        print(label)
        # torch.load unpickles the file -- only point this at trusted checkpoints.
        saved_network.load_state_dict(torch.load(weights_path))

    # A sweep over difficulty / game mode was sketched here previously
    # (gym.make(..., difficulty=diff, mode=test)) but is not active.
    etic = time.time()

    # Priming call: process() keeps a two-frame buffer, so one frame is pushed
    # here to guarantee that the per-episode reset below yields two frames.
    observation = env.reset()
    cur_state = process(observation)

    # Number of zero-reward steps that preceded each point won / lost.
    pre_loss = []
    pre_win = []

    all_rewards = 0
    for episode in range(N_EPISODES):
        tic = time.time()
        cur_state = process(env.reset())
        done = False
        # Alternates [steps since last point, point reward, steps since last point, ...].
        rewards = [0]
        # Newest frame of every step; not read in this cell.
        memory = [cur_state]
        total_reward = 0
        frames = 0

        while not done:
            # Evaluation only: skip autograd bookkeeping for the forward pass.
            with torch.no_grad():
                action_probs = saved_network.predict(cur_state)
            # Sample an action id using the network output as weights.
            action = random.choices(available_actions, weights=action_probs.tolist())[0]
            observation, reward, done, info = env.step(action)
            frames += 1
            total_reward += reward
            if reward == 0:
                rewards[-1] += 1
            else:
                if reward == 1:
                    pre_win.append(rewards[-1])
                elif reward == -1:
                    pre_loss.append(rewards[-1])
                rewards.append(reward)
                rewards.append(0)
            cur_state = process(observation)
            memory.append(cur_state[1])
        all_rewards += total_reward
        print(f"Episode {episode}, {time.time() - tic:.2f} seconds, {frames} frames, {total_reward} reward")

    print("TEST " + str(test) + " AVG REWARDS " + str(all_rewards/N_EPISODES))
    print(f"Time: {time.time() - etic:.2f}")
    # Guarded: an agent that never wins (or never loses) a point leaves the list empty.
    print("Pre-Win Avg: " + _mean_or_na(pre_win))
    print("Pre-Loss Avg: " + _mean_or_na(pre_loss))
    print()

  deprecation(
  deprecation(


Number actions: 3
Number observations: 900
ALPHA
Episode 0, 5.43 seconds, 3344 frames, -13.0 reward
Episode 1, 5.00 seconds, 3037 frames, -15.0 reward
Episode 2, 4.13 seconds, 2500 frames, -20.0 reward
Episode 3, 5.39 seconds, 3079 frames, -17.0 reward
Episode 4, 5.06 seconds, 3136 frames, -12.0 reward
Episode 5, 5.17 seconds, 3220 frames, -13.0 reward
Episode 6, 4.59 seconds, 2818 frames, -16.0 reward
Episode 7, 5.08 seconds, 3152 frames, -15.0 reward
Episode 8, 4.93 seconds, 2975 frames, -18.0 reward
Episode 9, 4.92 seconds, 2921 frames, -11.0 reward
Episode 10, 6.20 seconds, 3799 frames, -12.0 reward
Episode 11, 5.40 seconds, 3228 frames, -13.0 reward
Episode 12, 4.42 seconds, 2717 frames, -17.0 reward
Episode 13, 4.74 seconds, 2936 frames, -16.0 reward
Episode 14, 5.21 seconds, 3244 frames, -14.0 reward
Episode 15, 5.17 seconds, 3223 frames, -13.0 reward
Episode 16, 5.92 seconds, 3618 frames, -12.0 reward
Episode 17, 4.96 seconds, 3071 frames, -19.0 reward
Episode 18, 4.73 seconds,

<class 'gym.wrappers.time_limit.TimeLimit'>


{'lives': 0, 'episode_frame_number': 8730, 'frame_number': 94950}
