In [9]:
import gym 
import torch
from collections import deque
import random 
import copy 
from torch.autograd import Variable
env = gym.envs.make("MountainCar-v0")
import numpy as np

In [10]:
class DQN():
    def __init__(self, n_state,n_action,  n_hidden=50, lr=0.05):
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

        self.model_target = copy.deepcopy(self.model)

    def target_predict(self, s):
        with torch.no_grad():
            return self.model_target(torch.Tensor(s))

    def copy_target(self):
        self.model_target.load_state_dict(self.model.state_dict())

    def replay(self, memory, replay_size, gamma ):
        if len(memory) >= replay_size:
            replay_data = random.sample(memory, replay_size)
            states= []
            td_targets =[]
            for state, action, next_state, reward, is_done in replay_data:
                states.append(state)
                q_values = self.predict(state).tolist()
                if is_done:
                    q_values[action]= reward
                else:
                    q_values_next = self.target_predict(next_state).detach()
                    q_values[action] = reward + gamma*torch.max(q_values_next).item()

                td_targets.append(q_values)
            
            states = np.array(states)
            self.update(states, td_targets)
    def update(self, s,y):
        """
        Update the weights of the DQN given a training sample
        @param s : state
        @param y: target value
        """

        y_pred = self.model(torch.Tensor(s))
        loss = self.criterion(y_pred, Variable(torch.Tensor(y)))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s ):
        with torch.no_grad():
            return self.model(torch.Tensor(s))

In [11]:
def gen_epsilon_greedy_policy(estimator, epsilon , n_action):
    def policy_function(state):
        if random.random() < epsilon:
            return random.randint(0, n_action -1 )

        else :
            q_values = estimator.predict(state)
            return torch.argmax(q_values).item()
 
    return policy_function


In [12]:
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden = 50 
lr = 0.01
dqn = DQN(n_state, n_action, n_hidden, lr)


In [13]:
memory = deque(maxlen=10000)

In [14]:
def q_learning(env, estimator, n_episode, replay_size, target_update=10, gamma=1.0 , epsilon=0.1, epsilon_decay=0.99):
    for episode in range(n_episode):
        if episode % target_update ==0:
            estimator.copy_target()
        policy = gen_epsilon_greedy_policy(
            estimator, epsilon , n_action
        )
        state,_ = env.reset()

        is_done = False
        while not is_done:
            action = policy(state)
            next_state , reward, is_done , _ , _ = env.step(action)
            total_reward_episode[episode] += reward
            modified_reward = next_state[0] + 0.5
            if next_state[0]  >= 0.5:
                modified_reward += 100
            elif next_state[0] >= 0.25:
                modified_reward +=20

            elif next_state[0] >= 0.1:
                modified_reward +=10

            elif next_state[0] >= 0: 
                modified_reward += 5

            memory.append((state,action,next_state, modified_reward,  is_done))

            if is_done :
                break
                
            estimator.replay(memory, replay_size, gamma)
            state = next_state
        print('episode: {}, total reward :{}, epsilon:{}'.format(episode, total_reward_episode[episode],
        epsilon))

        epsilon = max(epsilon * epsilon_decay, 0.01)



In [15]:
n_episode = 1000
replay_size = 20
target_update =10


In [16]:
total_reward_episode = [0] * n_episode
q_learning(env, dqn, n_episode, replay_size,target_update, gamma =0.9, epsilon=1)

episode: 0, total reward :-5924.0, epsilon:1
episode: 1, total reward :-30215.0, epsilon:0.99
episode: 2, total reward :-18757.0, epsilon:0.9801
episode: 3, total reward :-16222.0, epsilon:0.9702989999999999
episode: 4, total reward :-14152.0, epsilon:0.96059601
episode: 5, total reward :-15230.0, epsilon:0.9509900498999999
episode: 6, total reward :-10591.0, epsilon:0.9414801494009999
episode: 7, total reward :-4586.0, epsilon:0.9320653479069899
episode: 8, total reward :-2231.0, epsilon:0.92274469442792
episode: 9, total reward :-29053.0, epsilon:0.9135172474836407
episode: 10, total reward :-13215.0, epsilon:0.9043820750088043
episode: 11, total reward :-11415.0, epsilon:0.8953382542587163
episode: 12, total reward :-13694.0, epsilon:0.8863848717161291
episode: 13, total reward :-14292.0, epsilon:0.8775210229989678
episode: 14, total reward :-3031.0, epsilon:0.8687458127689781
episode: 15, total reward :-9912.0, epsilon:0.8600583546412883
episode: 16, total reward :-12116.0, epsilon