In [54]:
import gym
from uuid import uuid3
import time
import numpy as np
from pymongo import MongoClient
from collections import deque
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout,BatchNormalization,Conv1D,Flatten
from keras.optimizers import Adam


ENV_NAME = "BeamRider-ram-v0"

env = gym.make(ENV_NAME);
Number_of_Actions = env.action_space.n
Observation_Shape = env.observation_space.shape
memory = deque(maxlen=1024)
print("Number of Actions: {}\nObservation Space: {}".format(Number_of_Actions,Observation_Shape));

Number of Actions: 9
Observation Space: (128,)


In [130]:
OBS_INDEX = 0
ACT_INDEX = 1
XOBS_INDEX = 2
REWARD_INDEX = 3
DONE_INDEX = 4
INFO_INDEX = 5


DENSE_MODEL = 0
CONV_MODEL = 1

AtariDatabase = MongoClient("localhost")["AtariAgents"]

class Agent():
    
    def __init__(self,model=None,model_type=None,loss="mse",optimizer="adam",metrics=["mse","mae"],
                 batch_size =1024,number_of_memories_to_replay = 10,gamma=0.99,eps_init=0.9,eps_decay=0.99,
                 eps_min=0.05,max_timesteps=10000,render_game=True):
        
        """
            Eposide_Experience = An Array holding the memory of the eposide;
            L          =        Lenght of  Eposide_Experience 
            N          =        batch_size or self.BatchSize

            While playing, After N Timesteps the Agent will train from Eposide_Experience[L: N],
        """
        self.MemoryCollection = AtariDatabase["Memory_"+ENV_NAME]
        self.AgentsCollection = AtariDatabase["Agents_"+ENV_NAME]
        self.Id = uuid4()
        #Agent's Hyper-Parameters
        self.Loss = loss
        self.Optimizer = optimizer
        self.BatchSize = batch_size;
        self.Number_Of_Memories_To_Replay = number_of_memories_to_replay;
        self.MaxTimeSteps = max_timesteps;
        
        #Agent Bellman Paramenters
        self.Epilison = eps_init;
        self.Epilison_Decay = eps_decay;
        self.Epilison_MinLimit = eps_min;
        self.Gamma = gamma;
        
        
        # Agent's Settings;
        self.Experience = []
        self.HighScore = 0;
        self.RenderGame = render_game
        self.Model_Type = model_type
        self.NumberofActions = Number_of_Actions;
        
        # Agent's History
        self.ActionsHistory = []
        self.RewardsHistory = []
        self.Metrics = list(set(metrics))
        self.Metrics.remove(self.Loss);
        self.MetricsHistory = {metric:[] for metric in self.Metrics}
        self.MetricsHistory[self.Loss] = []
        
       # Agent's Model (The Brain);
        self.Model = model
        if self.Model == None:
            self.createDenseNet();
        elif self.Model == DENSE_MODEL:
            self.createDenseNet();
        elif self.Model == CONV_MODEL:
            self.createConvNet();
            
            
        self.formatObservation = lambda obs: np.expand_dims(np.array(obs,dtype="float").reshape(self.Input_Shape)/255.0,axis=0);
        self.obsToImg = lambda obs:  np.array(obs,dtype="float").reshape(16,8,1)/255.0
        self.updateAgent =  lambda: self.AgentsCollection.update_one(
            filter={"_id":str(self.Id)},
            update={"$push":{"weights":agent.Model.get_weights()},
                    "$set":{"metrics":self.MetricsHistory,"rewards":self.RewardsHistory}
                   })
        
        # String Parsers
        self.parseScore = lambda: "Score: "+str(self.getCurrentScore())+" |  HighScore: "+str(self.HighScore);
        self.parseAverageCost = lambda _index: "Cost: "+" | ".join(["%s:%.5f"%(metric[0].upper()+metric[1:],sum(self.MetricsHistory[metric][_index])/len(self.MetricsHistory[metric][_index])) for metric in self.MetricsHistory])
        self.parseStatus =  lambda: "%d# | %s | ~ | %s | ~ | %s | ~ | Eps: %.3f Lives Left: %d \t"%(self.TimeStep,self.parseAverageCost(-1),self.parseActions(),self.parseScore(),self.Epilison,self.Experience[-1][INFO_INDEX])
        pass;
    
    def createDenseNet(self):
        
        """
            Four Dense Layers - Input Shape 
        """
        self.Input_Shape = (1,128);self.Model_Type = DENSE_MODEL;
        
        self.Model = Sequential();
        self.Model.add(Dense(256,use_bias=True,input_shape=self.Input_Shape));
        self.Model.add(Activation("relu"));
        self.Model.add(Dropout(0.25));
        
        self.Model.add(Dense(1024,use_bias=True));
        self.Model.add(BatchNormalization())
        self.Model.add(Activation("relu"))
        self.Model.add(Dropout(0.25));
        
        self.finalizeModel("3 Layers Dense");
    
    def createConvNet(self,):
        """
            A Convoluational Neural Network
        """
        
        self.Input_Shape = (8,16);self.Model_Type = CONV_MODEL;
                
        self.Model = Sequential();
        self.Model.add(Conv1D(filters=8,kernel_size=(4),use_bias=True,input_shape=(self.Input_Shape)));
        self.Model.add(BatchNormalization())
        self.Model.add(Activation("relu"));
        self.Model.add(Dropout(0.01));
        
        self.Model.add(Conv1D(filters=24,kernel_size=(1),use_bias=True));
        self.Model.add(BatchNormalization())
        self.Model.add(Activation("relu"));
        self.Model.add(Dropout(0.01));
        self.Model.add(Flatten());
        self.Model.add(BatchNormalization())

        
        self.finalizeModel("2 Convoluational Layers and 1 Dense Layers");
    
    def finalizeModel(self,structure_info):
        
        self.Model.add(Dense(self.NumberofActions,use_bias=True,));
        self.Model.add(BatchNormalization())
        self.Model.add(Activation("linear"));
        self.Model.compile(loss=self.Loss,optimizer=self.Optimizer,metrics=self.Metrics);
        self.AgentsCollection.insert_one({"_id":str(self.Id),"config":self.Model.to_json(),"weights":[]});
        print("Succesfully Created a Neural Network with {} .It has  Input Shape of {}".format(structure_info,self.Input_Shape),end="\r")

        
    def predictAction(self,obs):        
        return np.argmax(self.Model.predict(obs))
        
    def getAction(self,obs):
        """ Choose Random Action if random number is smaller than epilision for exporation """
        if np.random.uniform(0,1) > self.Epilison :
            return self.predictAction(obs);
        return np.random.choice(self.NumberofActions); 
    
    def learn(self,experience):
        if len(experience) == 0 :
            return np.nan
        x = np.array([exp[0] for exp in experience]).reshape((len(experience),)+self.Input_Shape)
        y = self.Model.predict(x)
        for i in range(len(experience)):
            #Bellman Equation
            if experience[i][4]:
                y[i:experience[i][1]] = experience[i][3];
            elif len(experience)-1 == i:
                y[i:experience[i][1]] = experience[i][3] + self.Gamma * np.max(experience[len(experience)-1][2])
            else:
                y[i:experience[i][1]] = experience[i][3] + self.Gamma * np.max(y[i+1]);
            

        for metric,cost in zip(self.MetricsHistory,self.Model.train_on_batch(x,y)):
            self.MetricsHistory[metric][-1].append(cost)
        print(self.parseStatus(),end="\r");
    
    def restartGame(self,):
        self.TimeStep = 0;
        for metric in self.MetricsHistory:
            self.MetricsHistory[metric].append([])
        self.RewardsHistory.append([[]])
        self.ActionsHistory.append({act:0 for act in range(self.NumberofActions)});
        return self.formatObservation(env.reset())
    
    def getCurrentScore(self):
        score = sum(self.RewardsHistory[-1][-1]);
        if score > self.HighScore:
            self.HighScore = score;
        return score
    
    def parseActions(self,nth_experience=-1):
        total = sum([act for act in self.ActionsHistory[nth_experience].values()]);
        return "Actions: "+" | ".join(["%.1f%s"%(100*(value/total),"%") for value in self.ActionsHistory[nth_experience].values()]);
        
    def render(self,):
        if self.RenderGame: 
            env.render(); # Rendering Frame(s)
            
    def playGame(self):
        obs = self.restartGame();
        self.render()
        prev_score = 0
        while True:
            
            # Pushing the new Observation to for new State 
            self.Experience.append([obs])
            
            #Predicting then Sending Action to the enviroment
            self.Experience[-1].append(self.getAction(self.Experience[-1][OBS_INDEX]));
            for item in env.step(self.Experience[-1][ACT_INDEX]):
                self.Experience[-1].append(item);
            self.Experience[-1][INFO_INDEX] = self.Experience[-1][INFO_INDEX]["ale.lives"]
            self.Experience[-1][XOBS_INDEX] = self.formatObservation(self.Experience[-1][XOBS_INDEX]);
            self.RewardsHistory[-1][-1].append(self.Experience[-1][REWARD_INDEX]);
            self.ActionsHistory[-1][self.Experience[-1][ACT_INDEX]] += 1;
            self.TimeStep +=1;
            self.render();
            
            # If agent is killed OR timestep divide by self.BatchSize has no reminader
            if self.Experience[-1][DONE_INDEX] or self.TimeStep%self.BatchSize == 0:
                
                # Training on Experience from the Current (self.Experience) and Previous Game (memory).
                self.learn(self.getExperienciesFromMemory(self.Number_Of_Memories_To_Replay,memories=self.Experience[self.TimeStep-self.BatchSize:]));
                
                # STOP PLAYING : If the agent has no lives or not imporving it's score
                score = self.getCurrentScore();
                self.RewardsHistory[-1]
                if score == prev_score: 
                    break
                elif self.Experience[-1][INFO_INDEX] == 0:
                    break;
                elif self.Experience[-1][DONE_INDEX]:
                    self.RewardsHistory[-1].append([])
                    prev_score = 0;
                else:
                    prev_score = score;
                    

        # Training on Experience from the Current (self.Experience) and Previous Game (memory).
        self.learn(self.getExperienciesFromMemory(self.Number_Of_Memories_To_Replay,memories=self.Experience[self.TimeStep-self.BatchSize:]));
        #self.MemoryCollection.insert_one({"model_type":self.Model_Type,"game_experience":self.Experience,"agent_id":str(self.ID)})
        memory.append(self.Experience[:-1]);
    
        # Reducing The Epilison to give agent a bigger probability to choose action and action
        if self.Epilison > self.Epilison_MinLimit:
            self.Epilison = self.Epilison*self.Epilison_Decay;
        pass;
    
    def getBatchFromExperience(self,experience):
        startIndex = max(0,np.random.choice(len(experience))-self.BatchSize);
        return experience[startIndex:startIndex+self.BatchSize]
    
    def getExperienciesFromMemory(self,number_of_memories_to_fecth,memories = []):
        if len(memory) > 0:
            for _ in range(min(len(memory),number_of_memories_to_fecth)):
                memories.extend(self.getBatchFromExperience(memory[np.random.choice(len(memory))]))
        return memories
        
    def pratice(self,trials=1000,testing_interval=3):
        np.random.seed(0);
        for n_trial in range(0,trials):
            print("\nPracting... On Trial %d out of %d Trials"%(n_trial+1,trials))
            self.playGame();
            print()
            if n_trial%testing_interval == 0:
                self.play();
            
                
    
    
    def play(self,):
        eps = self.Epilison;
        self.Epilison = 0
        print("Playing Game Without Any Random Actions (Epilison == 0). No Training Wheels")
        self.playGame();
        self.Epilison = eps;
        print("\n")

In [137]:
agent = Agent(
    model=CONV_MODEL,
    batch_size =256,
    gamma=0.99,
    optimizer=Adam(decay=0.99999999),
    eps_init=0.59,
    number_of_memories_to_replay = 300,
    eps_decay=0.9557,
    eps_min=0.05,
    render_game=False,
    max_timesteps = 13000,
    metrics=["mse","mae"]
)
agent.pratice(trials=100000)

Succesfully Created a Neural Network with 2 Convoluational Layers and 1 Dense Layers .It has  Input Shape of (8, 16)
Practing... On Trail 1 out of 100000 Trials
512# | Cost: Mae:1.00001 | Mse:0.60621 | ~ | Actions: 47.1% | 7.4% | 7.0% | 8.8% | 5.1% | 6.8% | 4.1% | 6.4% | 7.2% | ~ | Score: 44.0 |  HighScore: 44.0 | ~ | Eps: 0.590 Lives Left: 3 	
Playing Game Without Any Random Actions (Epilison == 0). No Training Wheels
256# | Cost: Mae:0.98553 | Mse:0.59876 | ~ | Actions: 100.0% | 0.0% | 0.0% | 0.0% | 0.0% | 0.0% | 0.0% | 0.0% | 0.0% | ~ | Score: 0.0 |  HighScore: 44.0 | ~ | Eps: 0.000 Lives Left: 3 	


Practing... On Trail 2 out of 100000 Trials
256# | Cost: Mae:0.97044 | Mse:0.60674 | ~ | Actions: 46.5% | 6.6% | 5.5% | 10.5% | 5.1% | 6.2% | 7.8% | 6.2% | 5.5% | ~ | Score: 0.0 |  HighScore: 44.0 | ~ | Eps: 0.564 Lives Left: 3 	

Practing... On Trail 3 out of 100000 Trials
1280# | Cost: Mae:0.94843 | Mse:0.62083 | ~ | Actions: 50.9% | 5.6% | 4.9% | 6.1% | 6.6% | 7.3% | 5.9% | 7.0% | 5.

256# | Cost: Mae:0.94994 | Mse:0.74199 | ~ | Actions: 0.0% | 0.0% | 0.0% | 0.0% | 0.0% | 100.0% | 0.0% | 0.0% | 0.0% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.000 Lives Left: 3 	


Practing... On Trail 26 out of 100000 Trials
256# | Cost: Mae:0.93813 | Mse:0.73391 | ~ | Actions: 2.0% | 2.3% | 2.0% | 3.1% | 2.0% | 82.4% | 2.3% | 2.3% | 1.6% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.190 Lives Left: 3 	

Practing... On Trail 27 out of 100000 Trials
256# | Cost: Mae:0.95475 | Mse:0.75964 | ~ | Actions: 3.1% | 1.2% | 3.1% | 3.9% | 1.6% | 80.9% | 2.0% | 2.3% | 2.0% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.182 Lives Left: 3 	

Practing... On Trail 28 out of 100000 Trials
256# | Cost: Mae:0.95764 | Mse:0.75236 | ~ | Actions: 2.3% | 3.1% | 3.1% | 0.8% | 3.1% | 82.8% | 1.2% | 2.3% | 1.2% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.174 Lives Left: 3 	
Playing Game Without Any Random Actions (Epilison == 0). No Training Wheels
256# | Cost: Mae:0.95267 | Mse:0.75129 |

256# | Cost: Mae:0.99766 | Mse:0.79434 | ~ | Actions: 0.8% | 0.4% | 0.8% | 1.6% | 0.4% | 92.2% | 1.2% | 2.0% | 0.8% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.061 Lives Left: 3 	

Practing... On Trail 52 out of 100000 Trials
256# | Cost: Mae:0.98717 | Mse:0.79216 | ~ | Actions: 0.8% | 0.0% | 0.8% | 0.0% | 0.8% | 95.3% | 0.8% | 0.4% | 1.2% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.059 Lives Left: 3 	
Playing Game Without Any Random Actions (Epilison == 0). No Training Wheels
256# | Cost: Mae:1.00600 | Mse:0.80532 | ~ | Actions: 0.0% | 0.0% | 0.0% | 0.0% | 0.0% | 100.0% | 0.0% | 0.0% | 0.0% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.000 Lives Left: 3 	


Practing... On Trail 53 out of 100000 Trials
256# | Cost: Mae:0.99758 | Mse:0.80033 | ~ | Actions: 1.2% | 0.4% | 0.8% | 0.0% | 0.4% | 95.7% | 0.8% | 0.4% | 0.4% | ~ | Score: 0.0 |  HighScore: 308.0 | ~ | Eps: 0.056 Lives Left: 3 	

Practing... On Trail 54 out of 100000 Trials
768# | Cost: Mae:0.98043 | Mse:0.79407 |

KeyboardInterrupt: 

In [None]:
agent.Experience[0:3]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots();
for i in range(100):
    img = (memory[0][i][0][0]*255.0).reshape(16,8)
    
    fig.add_subplot(2,1,2)
    ax = fig.imshow(img)
    plt.pause(0.1)
    


In [None]:
from matplotlib.animation import ArtistAnimation,FFMpegWriter

fig = plt.figure();
images = [[plt.imshow((memory[0][i][0][0]*255.0).reshape(16,8), animated=True)] for i in range(100)]

animation = ArtistAnimation(fig,images,blit=True,interval=50,repeat_delay=1000)
plt.show()

In [None]:
animation.save("ram.html")

In [None]:
writer = FFMpegWriter(fps=15, metadata=dict(artist='Me'), bitrate=1800)
animation.save("ram.mp4", writer=writer,)

In [None]:
from IPython.core.display import display, HTML

html = None
with open("./ram.html") as file:
    html = HTML(file.read())

In [34]:
type(agent.Experience[-1][INFO_INDEX])

int

In [15]:
for data in MongoClient("localhost")["Machine_Learning"]["Memory"].find():
    print(data)

In [16]:
MongoClient("localhost").list_database_names()

['Football', 'admin', 'config', 'local']