In [None]:
# These are the relevant dependencies
'''
Keras==2.4.3
tensorflow==2.3.0
Pillow==7.0.0
gym==0.17.3
h5py==2.10.0
numpy==1.18.5
pandas==1.1.4
scikit-image==0.16.2
matplotlib==3.2.2
'''

In [3]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 100


#Custom Environment
class Env():
    """Toy environment in which the agent tunes a smoothing factor ``alpha``.

    Each step draws a random input value and a random ground-truth value,
    nudges ``alpha`` up or down according to the chosen action, writes a
    blend of the previous state entry and the input into ``state[i]`` and
    pays a reward of 100 when that entry exceeds the ground truth.
    """

    def __init__(self):
        # Make sure alpha exists even if step() is called before reset().
        self.alpha = 0

    #to get the variable_1 as described in problem
    def get_var(self):
        """Return a random integer in [0, 100] (both ends inclusive)."""
        return random.randint(0,100) # return random no in the given range

    #to get the ground truth as described in problem
    def get_ground_truth(self):
        """Return a random integer in [90, 95] (both ends inclusive)."""
        return random.randint(90,95) # return random no in the given range

    # each iteration step
    def step(self, action, i, state):
        """Apply ``action`` at position ``i`` and return the reward.

        Args:
            action: 0 to decrease alpha by 0.05, 1 to increase it by 0.05.
            i: index of the ``state`` entry to fill in on this step.
            state: mutable sequence of state values; ``state[i]`` is
                overwritten in place.

        Returns:
            100 if the new ``state[i]`` is greater than the ground truth
            drawn for this step, otherwise 0.

        Raises:
            ValueError: if ``action`` is neither 0 nor 1.
        """
        if action == 0:
            delta = -0.05        # if action is 0, decrease alpha
        elif action == 1:
            delta = 0.05         # if action is 1, increase alpha
        else:
            raise ValueError("action must be 0 or 1, got %r" % (action,))

        # Ground truth is drawn before the input value (same order as before).
        gndtrt = self.get_ground_truth()
        variable_1 = self.get_var()

        # Persist the adjusted alpha so that consecutive actions accumulate;
        # previously the change was made on a local copy and thrown away.
        self.alpha = self.alpha + delta
        alpha = self.alpha

        if i > 1:
            state[i] = alpha * state[i - 1] + (1 - alpha) * variable_1
        else:
            # For i <= 1 alpha itself is used in place of alpha*state[i-1].
            # NOTE(review): confirm this seeding of the first entry is intended.
            state[i] = alpha + (1 - alpha) * variable_1

        return 100 if state[i] > gndtrt else 0

    # to reset the variables after epochs
    def reset(self, state):
        """Set alpha back to 0 and zero the first entry of ``state``."""
        self.alpha = 0
        state[0] = 0






# Double DQN agent for the custom Env defined above
# it uses a neural network to approximate the Q function,
# together with a replay memory and a target Q network
class DoubleDQNAgent:
    """Double DQN agent with an epsilon-greedy policy and a replay memory.

    Two identically shaped networks are kept: ``model`` picks the best next
    action and is the one being trained, ``target_model`` supplies the value
    of that action when the training targets are built.
    """

    def __init__(self, state_size, action_size):
        """Build both networks and set up the replay memory.

        Args:
            state_size: length of the state vector fed to the network.
            action_size: number of discrete actions (network outputs).
        """
        self.load_model = False
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # these are the hyper parameters for the Double DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        # no training happens until the memory holds this many samples
        self.train_start = 1000
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()

        #if self.load_model:
          #  self.model.load_weights("./save_model/var1.h5")

    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        """Return a compiled network with two hidden layers of 24 units.

        The model summary is printed as a side effect.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        # `learning_rate` replaces the deprecated `lr` alias
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))
        return model

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        """Copy the weights of ``model`` into ``target_model``."""
        self.target_model.set_weights(self.model.get_weights())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        """Pick an action for ``state``.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest predicted Q value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # use the agent's own state size rather than a module-level global
        state = np.reshape(state, [1, self.state_size])
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon while it is above the minimum."""
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # pick samples randomly from replay memory (with batch_size)
    def train_model(self):
        """Fit ``model`` for one epoch on a random mini-batch from the memory.

        Does nothing until the memory holds at least ``train_start`` samples.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i, (s, a, r, s_next, d) in enumerate(mini_batch):
            update_input[i] = s
            action.append(a)
            reward.append(r)
            update_target[i] = s_next
            done.append(d)

        target = self.model.predict(update_input)
        target_next = self.model.predict(update_target)
        target_val = self.target_model.predict(update_target)

        # iterate over the rows actually sampled, which may be fewer than
        # self.batch_size when the memory is small
        for i in range(batch_size):
            if done[i]:
                # terminal transition: no bootstrapped future value
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN
                # selection of action is from model
                # update is from target model
                a = np.argmax(target_next[i])
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    target_val[i][a])

        # make minibatch which includes target q value and predicted q value
        # and do the model fit!
        self.model.fit(update_input, target, batch_size=batch_size,
                       epochs=1, verbose=0)


if __name__ == "__main__":
    # Train the Double DQN agent on the custom environment for EPISODES
    # episodes, printing the score of each episode as it finishes.

    env = Env()
    # size of the state vector and number of actions
    state_size = 20
    action_size = 2

    # creates state: one slot per iteration mentioned in the problem statement
    state = [0] * state_size

    agent = DoubleDQNAgent(state_size, action_size)

    # these lists store the final score of each episode
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0

        i = 0

        # start every episode from a fresh state buffer
        state = [0] * state_size
        env.reset(state)
        while not done:
            i = i + 1
            # the episode ends once the last slot of the state is filled
            if i == state_size - 1:
                done = True

            # get action for the current state and go one step in environment
            action = agent.get_action(state)

            # env.step fills state[i] in place, so snapshot the state before
            # and after the step; the replay memory must hold the same full
            # vectors that get_action feeds to the network
            prev_state = list(state)
            reward = env.step(action, i, state)
            next_state = list(state)

            # the reward earned on the final step of an episode is discarded
            reward = reward if not done else 0

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(prev_state, action, reward, next_state, done)

            # every time step do the training
            agent.train_model()
            score = score + reward

            if done:
                # every episode update the target model to be same with model
                agent.update_target_model()

                # record the score of the finished episode
                scores.append(score)
                episodes.append(e)

                # TO PLOT A GRAPH OF EPISODES VS SCORES
                #pylab.plot(episodes, scores, 'b')
                #pylab.savefig("episodes_vs_scores.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

        # save the weights every 50 episodes (e = 0, 50, ...)
        if e % 50 == 0:
            agent.model.save_weights("var1.h5")






Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 24)                504       
_________________________________________________________________
dense_7 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 50        
Total params: 1,154
Trainable params: 1,154
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 24)                504       
_________________________________________________________________
dense_10 (Dense)             (None, 24)                600       
______________________________

In [2]:
# The relevant dependencies are listed above; if you need anything else, the full environment is shown below
!pip freeze

absl-py==0.10.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.1.0
argon2-cffi==20.1.0
asgiref==3.3.0
astor==0.8.1
astropy==4.1
astunparse==1.6.3
async-generator==1.10
atari-py==0.2.6
atomicwrites==1.4.0
attrs==20.2.0
audioread==2.1.9
autograd==1.3
Babel==2.8.0
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==3.2.1
blis==0.4.1
bokeh==2.1.1
Bottleneck==1.3.2
branca==0.4.1
bs4==0.0.1
CacheControl==0.12.6
cachetools==4.1.1
catalogue==1.0.0
certifi==2020.6.20
cffi==1.14.3
chainer==7.4.0
chardet==3.0.4
click==7.1.2
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.9.5
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.2.2
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.2.5
cvxpy==1.0.31
cycler==0.10.0
cymem==2.0.4
Cython==0.29.21
daft==0.0.4
dask==2.12.0
dataclasses==0.7
datascience==0.10.6
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.6.0
descartes==1.1.0
dill==0.3.3
distributed==1.25.3
Django==3.1.3
dlib==19.18.0
dm-tree==0.1.5
docopt==0.6.2
docutil