In [18]:
import gym 
import torch
from  torch.autograd import Variable
import random
 

In [19]:
class DQN():
    def __init__(self, n_state, n_action, n_hidden = 50, lr= 0.05):
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action)
        )

        self.optimizer = torch.optim.Adam(
            self.model.parameters(),lr
        )
    def update(self, s,y):
        """
        Update the weights of the DQN given a training sample
        @param s : state
        @param y: target value
        """

        y_pred = self.model(torch.Tensor(s))
        loss = self.criterion(y_pred, Variable(torch.Tensor(y)))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s ):
        with torch.no_grad():
            return self.model(torch.Tensor(s))
        print("going on")

    


In [20]:
env = gym.envs.make("MountainCar-v0")

In [21]:
def gen_epsilon_greedy_policy(estimator, epsilon , n_action):
    def policy_function(state):
        if random.random() < epsilon:
            return random.randint(0, n_action -1 )

        else :
            q_values = estimator.predict(state)
            return torch.argmax(q_values).item()

    return policy_function



In [22]:
def q_learning(env, estimator, n_episode, gamma = 1.0 , epsilon =0.1 , epsilon_decay = 0.99):
    for episode in range(n_episode):
        policy = gen_epsilon_greedy_policy(estimator, epsilon , n_action)
        state,_ = env.reset()
        #print("state",state)
        is_done = False
        while not is_done:
            action = policy(state)
            next_state , reward, is_done,_,_ = env.step(action)
            #print("next_state",next_state)
            total_reward_episode[episode] += reward 
            modified_reward = next_state[0] + 0.5
            if next_state[0] >= 0.5:
                modified_reward += 100

            elif next_state[0] >=0.25:
                modified_reward += 20

            elif next_state[0] >= 0.1:
                modified_reward += 10

            elif next_state[0] >= 0 :
                modified_reward += 5
            
            q_values = estimator.predict(state).tolist()

            if is_done:
                q_values[action] = modified_reward
                estimator.update(state,q_values)
                break
            
            q_values_next = estimator.predict(next_state)
            q_values[action] = modified_reward +  gamma * torch.max(q_values_next).item()
            estimator.update(state,q_values)
            state = next_state
        
        print(' episode :{} , total reward:{}, epsilon:{}'.format(episode, total_reward_episode[episode] , epsilon))

        epsilon = max(epsilon * epsilon_decay, 0.01)



        
            














            


In [23]:
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden = 50 
lr = 0.001
dqn = DQN (n_state, n_action, n_hidden, lr)
n_state

2

In [24]:
n_episode  =1000
total_reward_episode = [0] * n_episode
q_learning(env,dqn, n_episode, gamma=0.99, epsilon =0.3)

 episode :0 , total reward:-8969.0, epsilon:0.3
 episode :1 , total reward:-726.0, epsilon:0.297
 episode :2 , total reward:-823.0, epsilon:0.29402999999999996
 episode :3 , total reward:-1147.0, epsilon:0.29108969999999995
 episode :4 , total reward:-642.0, epsilon:0.28817880299999993
 episode :5 , total reward:-758.0, epsilon:0.28529701496999993
 episode :6 , total reward:-1576.0, epsilon:0.28244404482029994
 episode :7 , total reward:-791.0, epsilon:0.27961960437209693
 episode :8 , total reward:-1076.0, epsilon:0.276823408328376
 episode :9 , total reward:-567.0, epsilon:0.27405517424509224
 episode :10 , total reward:-1565.0, epsilon:0.2713146225026413
 episode :11 , total reward:-678.0, epsilon:0.2686014762776149
 episode :12 , total reward:-633.0, epsilon:0.26591546151483875
 episode :13 , total reward:-765.0, epsilon:0.2632563068996904
 episode :14 , total reward:-599.0, epsilon:0.2606237438306935
 episode :15 , total reward:-509.0, epsilon:0.2580175063923865
 episode :16 , tot