In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch import optim
from collections import deque
from queue import Queue 
import random
import time

In [2]:
env = gym.make('CartPole-v0')

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

In [7]:
class Policy(nn.Module):
    def __init__(self,learning_rate,input_dims,h1,n_actions):
        super(Policy,self).__init__()
        self.linear1 = nn.Linear(input_dims,h1)
        self.linear2 = nn.Linear(h1,n_actions)
        
        self.optimizer = optim.Adam(self.parameters(),lr=learning_rate)
    
    def forward(self, x):
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return F.softmax(x,dim=0)

In [8]:
class cart_agent():
    def __init__(self,gamma,l_r,input_dims,n_actions,save=False):
        self.gamma = gamma
        self.policy = Policy(learning_rate = l_r,input_dims=input_dims,h1=128,
                            n_actions=n_actions).to(device)
        self.states = np.array([[0,0,0,0]])
        self.actions = np.array([])
        self.rewards = np.array([])
        self.returns = 0
    def choose_action(self,obs):
        obs = torch.Tensor(obs).to(device)
        with torch.no_grad():
            prob = self.policy.forward(obs)
            action = torch.argmax(prob)
        return int(action)
    def store_trajectory(self,state,action,reward):
        self.states = np.append(self.states,state.reshape(1,4),axis=0)
        self.actions = np.append(self.actions,action)
        self.rewards = np.append(self.rewards,reward)
        self.returns += reward * np.power(self.gamma, self.states.size - 1) 
    def improve(self):
        g = self.returns
        for i in range(self.actions.size):
            s = torch.Tensor(self.states[i+1]).to(device)
            a = self.actions[i]
            r = self.rewards[i]
            self.policy.optimizer.zero_grad()
            loss = (np.power(self.gamma,i))*g*torch.log(self.policy.forward(s)[int(a)]).to(device)
            loss.backward()
            self.policy.optimizer.step()
            g = np.divide((g - r),self.gamma)
        self.states = np.array([[0,0,0,0]])
        self.actions = np.array([])
        self.rewards = np.array([])
        self.returns = 0

In [9]:
agent = cart_agent(gamma=0.99,l_r=0.003,input_dims=4,n_actions=2)

In [21]:
scores = []
n_games = 400
score = 0
best_score = 200

In [22]:
for i in range(n_games):
    score = 0
    done = False
    state = env.reset()
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        agent.store_trajectory(state,action,reward)
        score += reward
        state = next_state
    scores.append(score)
    agent.improve()
    print(np.mean(scores[-10:]))
    if np.mean(scores[-10:])>best_score and i>10:
        best_score = np.mean(scores[-10:])
        torch.save(agent.policy.state_dict(),'/home/raj/My_projects/REINFORCE/CartPole_lowstate(REINFORCE).pt')

98.0
107.5
138.33333333333334
153.75
153.8
161.5
153.57142857142858
159.375
163.88888888888889
167.5
168.9
169.5
169.5
168.6
163.6
163.5
172.9
172.9
172.9
172.9
172.0
179.7
179.7
178.3
187.9
178.7
178.7
178.7
178.7
178.7
188.4
188.4
188.4
182.5
182.5
191.8
191.8
191.8
191.8
191.8
191.8
191.8
191.8
196.0
189.0
189.0
189.0
189.0
189.0
189.0
189.0
181.0
181.0
185.0
192.0
182.7
182.7
180.8
172.6
166.8
166.8
174.8
166.4
162.3
162.3
168.5
168.5
169.7
168.9
174.7
174.7
170.8
179.2
183.3
174.3
177.4
172.6
163.3
172.3
172.3
172.3
176.2
175.0
175.0
184.0
175.5
180.3
190.3
190.3
180.5
180.5
180.5
181.7
172.1
171.5
170.8
170.8
163.8
153.3
157.6
153.7
149.4
149.4
159.0
148.9
158.1
158.1
155.6
166.1
171.1
168.7
166.5
166.5
166.5
167.6
167.6
157.3
158.7
156.9
147.6
153.9
148.7
138.8
128.9
128.1
121.7
121.4
129.5
127.9
131.5
130.4
141.3
143.0
147.1
149.6
144.9
155.5
150.1
153.5
151.6
152.7
153.5
151.8
147.9
154.5
165.6
161.8
167.2
164.3
169.0
157.8
157.8
167.7
177.4
178.3
169.0
172.8
172.8
174.9
167.8

In [34]:
n_games = 5
scores = []
agent.policy.load_state_dict(torch.load('/home/raj/My_projects/REINFORCE/CartPole_lowstate(REINFORCE).pt'))

for i in range(n_games):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(state)
        env.render()
        next_state, reward, done, _ = env.step(action)
        score += reward
        state = next_state
    print(score)
    print("\n")

    scores.append(score)
print(np.mean(scores))
env.close()

194.0


200.0


200.0


200.0


144.0


187.6


In [783]:
best_score

200.0