Importiamo tutte le librerie che ci interessano

In [None]:
import gym
import random
import numpy as np
from keras.layers import Input, Conv2D, Dense, MaxPooling2D , Flatten
from keras.models import Model, Sequential, load_model
from collections import deque
from keras.optimizers import Adam
import keras
import matplotlib.pyplot as plt

Creo l'ambiente SpaceInvaders-v0

In [None]:
# Create the Gym environment registered under the id 'SpaceInvaders-v0'.
env = gym.make('SpaceInvaders-v0')

Creiamo la nostra rete neurale

In [None]:
class DQN:
    """Agent state: a compiled Keras Q-value network and a bounded deque.

    Args:
        input_shape: shape of one observation, passed to the first Conv2D layer.
        output_shape: number of units of the final Dense layer.
        discount: stored as ``self.discount``; not used inside this class.
        update_target_every: stored as ``self.update_target_every``; not used
            inside this class.
        memory_size: maximum length of the ``self.memory`` deque.
    """

    def __init__(self, input_shape, output_shape, discount=0.99, update_target_every=10, memory_size=2000):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.discount = discount
        self.update_target_every = update_target_every
        self.target_counter = 0
        self.memory = deque(maxlen=memory_size)
        # create_model reads input_shape and output_shape, so build it last.
        self.policy_net = self.create_model()

    def create_model(self):
        """Build and compile the network: two Conv2D layers, Flatten, two Dense layers."""
        layers = [
            Conv2D(
                input_shape=self.input_shape,
                filters=16,
                kernel_size=(8, 8),
                strides=(4, 4),
                padding="valid",
                activation="relu",
                use_bias=True,
            ),
            Conv2D(
                filters=16,
                kernel_size=(4, 4),
                strides=(2, 2),
                padding="valid",
                activation="relu",
                use_bias=True,
            ),
            Flatten(),
            Dense(128, activation="relu"),
            # No activation on the output layer (it used to be softmax).
            Dense(self.output_shape),
        ]
        network = Sequential(layers)
        optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
        network.compile(loss="mse", optimizer=optimizer, metrics=["accuracy"])
        return network

Definiamo tre funzioni per preprocessare la nostra immagine: la convertiamo in scala di grigi, la sottocampioniamo e la ritagliamo

In [None]:
def to_greyscale(img):
    """Average ``img`` over axis 2 and cast the result to uint8."""
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)

In [None]:
def downsample(img):
    """Keep every second element along the first two axes of ``img``."""
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

In [None]:
def crop(img):
    """Return rows 10 through 99 of ``img`` with all columns kept."""
    kept_rows = slice(10, 100)
    return img[kept_rows, :]

In [None]:
# Reset the environment and display its first frame after preprocessing.
# NOTE(review): preprocess is defined in a later cell of this notebook; unless
# that cell has already been run, this call raises NameError — TODO reorder.
img = env.reset()
plt.imshow(preprocess(img))
#print(preprocess(img).shape)

In [None]:
# Display the frame after downsampling and greyscale conversion only
# (no crop, no division by 255).
plt.imshow(to_greyscale(downsample(img)))

In [None]:
def preprocess(img):
    """Downsample, greyscale and crop a frame, then divide the result by 255."""
    halved = downsample(img)
    grey = to_greyscale(halved)
    return crop(grey) / 255

Definiamo un metodo per trasformare ogni ricompensa in (-1, 0, +1)

In [None]:
def transform_reward(reward):
    """Return the sign of ``reward``: -1 if negative, 0 if zero, +1 if positive."""
    sign = np.sign(reward)
    return sign

In [None]:
class Memory:
    """Holds a list of transitions together with an episode counter.

    Args:
        memory: optional list to use as the transition store. When omitted,
            each instance gets its own new empty list.
        maxsize: stored as ``self.maxsize``; this class does not enforce it.
    """

    def __init__(self, memory=None, maxsize=10000):
        self.iteration = 0
        # A literal ``[]`` default is created once and shared by every
        # instance built without an explicit list, so allocate one here.
        self.memory = [] if memory is None else memory
        self.maxsize = maxsize

In [None]:
def iteration(env, model, mem):
    """Play one epsilon-greedy episode and store its transitions in ``mem.memory``.

    ``mem.memory`` is replaced by a new list, so only the transitions of this
    episode are kept. Each entry is the tuple
    ``(state, action, next_state, reward, done)``, where both states are
    preprocessed frames reshaped to ``(1, *model.input_shape)``. The episode
    stops when the environment reports ``done`` or after ``mem.maxsize``
    steps. ``mem.iteration`` is incremented by one at the end.

    Args:
        env: environment exposing ``reset``, ``step``, ``render`` and
            ``action_space.sample`` (``step`` must return a 4-tuple).
        model: object with ``policy_net.predict`` and ``input_shape``.
        mem: object with ``iteration``, ``maxsize`` and ``memory`` attributes.
    """
    env.reset()
    # The exploration rate decays with the number of episodes already played.
    epsilon = 0.995 ** mem.iteration
    # One no-op step (action 0) supplies the first frame of the episode.
    first_frame, _, _, _ = env.step(0)
    env.render()
    batch_shape = (1, *model.input_shape)
    processed_state = preprocess(first_frame).reshape(batch_shape)
    mem.memory = []
    for _ in range(mem.maxsize):
        if random.random() < epsilon:
            ac = env.action_space.sample()
        else:
            ac = np.argmax(model.policy_net.predict(processed_state))
        next_state, reward, done, _ = env.step(ac)
        next_processed_state = preprocess(next_state).reshape(batch_shape)
        # NOTE(review): the raw reward is stored; transform_reward is not applied.
        mem.memory.append((processed_state, ac, next_processed_state, reward, done))
        if done:
            # The terminal transition is stored above; nothing left to render.
            break
        env.render()
        processed_state = next_processed_state
    mem.iteration = mem.iteration + 1

In [None]:
# Empty episode memory and a DQN taking (90, 80, 1) inputs with 6 outputs.
mem = Memory()
dqn = DQN((90,80,1),6)

In [None]:
# Print the layer-by-layer summary of the policy network.
dqn.policy_net.summary()

In [None]:
# Play one episode with the current network and store it in mem.memory.
iteration(env , dqn , mem)

In [None]:
# Close the environment after the episode.
env.close()

In [None]:
# Number of transitions stored by the last episode.
len(mem.memory)

In [None]:
# Play another episode; iteration() replaces mem.memory with the new transitions.
iteration(env , dqn , mem)

In [None]:
# Number of transitions stored by the last episode.
len(mem.memory)

In [None]:
# Number of episodes played so far with this Memory (incremented by iteration()).
mem.iteration

In [None]:
# Network output for the state of the first stored transition.
dqn.policy_net.predict(mem.memory[0][0])

In [None]:
# Print the network output for the state of every stored transition.
for index, transition in enumerate(mem.memory):
    stored_state = transition[0]
    print(index, dqn.policy_net.predict(stored_state))

In [None]:
# Print the reward (tuple position 3) of every stored transition.
for index, transition in enumerate(mem.memory):
    stored_reward = transition[3]
    print(index, stored_reward)

In [None]:
# Close the environment.
env.close()

In [None]:
def train(mem, model, gamma=0.99):
    """Fit the policy network on every transition in ``mem.memory``, one at a time.

    For each ``(state, action, next_state, reward, done)`` tuple the network's
    current prediction for ``state`` is used as the target, with only the
    entry of the taken action replaced:

    * non-terminal: ``reward + gamma * max(predict(next_state))``
    * terminal: ``reward`` (there is no future return to bootstrap from)

    Args:
        mem: object whose ``memory`` attribute is an iterable of transitions.
        model: object whose ``policy_net`` exposes ``predict`` and ``fit``.
        gamma: discount factor applied to the bootstrapped value.
    """
    for state, action, next_state, reward, done in mem.memory:
        target = model.policy_net.predict(state)[0]
        if done:
            target[action] = reward
        else:
            next_values = model.policy_net.predict(next_state)[0]
            target[action] = reward + gamma * np.max(next_values)
        # Restore the batch axis without hard-coding the number of actions.
        model.policy_net.fit(state, target.reshape(1, -1), verbose=0)

In [None]:
# Fit the network on every transition currently stored in mem.
train(mem , dqn)

In [None]:
# Play one more episode with the updated network, then close the environment.
iteration(env,dqn,mem)
env.close()

In [None]:
# Number of transitions stored by the last episode.
len(mem.memory)

In [None]:
# Start over with a new memory and a newly built network before the long run.
mem = Memory()
dqn = DQN((90,80,1),6)

In [None]:
# Main loop: fit on the previous episode's transitions, then play a new episode.
# On the first pass mem.memory is empty, so train() has nothing to fit.
for i in range(10000):
    train(mem , dqn)
    iteration(env,dqn,mem)
    env.close()
    # NOTE(review): iteration() computes epsilon as 0.995**mem.iteration before
    # incrementing the counter, so the exponent printed here (mem.iteration+1)
    # matches neither the episode just played nor the next one — TODO confirm.
    print(f"iterazione: {mem.iteration}\t memoria: { len(mem.memory)}\t esplorazione: {(0.995)**(mem.iteration+1)}")
    #print("memoria" , len(mem.memory))
    #print("esplorazione" ,(0.995)**(mem.iteration+1))

In [None]:
def play_network(env, model):
    """Play one greedy episode with ``model.policy_net``, rendering as it goes.

    At every step the action with the highest network output for the current
    preprocessed frame is taken. The episode ends when the environment
    reports ``done`` or after 10000 steps. Nothing is returned.

    Args:
        env: environment exposing ``reset``, ``step`` and ``render``
            (``step`` must return a 4-tuple).
        model: object with ``policy_net.predict`` and ``input_shape``.
    """
    env.reset()
    # One no-op step (action 0) supplies the first frame of the episode.
    first_frame, _, _, _ = env.step(0)
    env.render()
    batch_shape = (1, *model.input_shape)
    processed_state = preprocess(first_frame).reshape(batch_shape)
    for _ in range(10000):
        ac = np.argmax(model.policy_net.predict(processed_state))
        next_state, _, done, _ = env.step(ac)
        if done:
            break
        env.render()
        # Predict on the newest frame next time; without this update the
        # network would keep choosing its action from the very first frame.
        processed_state = preprocess(next_state).reshape(batch_shape)

In [None]:
# Let the network play one rendered episode.
play_network(env , dqn)

In [None]:
# Close the environment.
env.close()

In [None]:
# Save the network to the file "modello_prova.h5".
dqn.policy_net.save("modello_prova.h5")

In [None]:
# Display the stored state of transition 540 as a 90x80 image.
# NOTE(review): the index is hard-coded; this raises IndexError when the last
# episode stored fewer than 541 transitions.
plt.imshow(mem.memory[540][0].reshape((90,80)))