In [2]:
import gym
import random
#from torch import *
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy

In [31]:
env = gym.make('CartPole-v1')

In [34]:
#env.spec.reward_threshold

In [21]:
class RLModel(nn.Module):
    def __init__(self):
        super(RLModel, self).__init__()
        self.linear1 = nn.Linear(4, 128)
        self.drop1 = nn.Dropout(p=.4)
        self.policy_head = nn.Linear(128, 2)
        self.state_head = nn.Linear(128, 1)
        
        self.old_params = self.state_dict()
    
    def save(self):
        self.old_params = deepcopy(self.state_dict())
    
    def old_forward(self, x):
        current_params = deepcopy(self.state_dict())
        self.load_state_dict(self.old_params)
        #print('old', self.state_dict()['linear1.weight'][0])
        out = self.forward(x)
        self.load_state_dict(current_params)
        #print('current', self.state_dict()['linear1.weight'][0])
        return out
        
    def forward(self, x):
        #print('current', self.state_dict()['linear1.weight'][0])
        x = self.linear1(x)
        x = self.drop1(x)
        neck = F.relu(x)
        policy_out = F.softmax(self.policy_head(neck), dim=-1)
        state_out = F.sigmoid(self.state_head(neck)) 
        
        return policy_out, state_out
    The observation of a 3-tuple of: the players current sum,
    the dealer's one showing card (1-10 where 1 is ace),
    and whether or not the player holds a usable ace (0 or 1).

    The observation of a 3-tuple of: the players current sum,
    the dealer's one showing card (1-10 where 1 is ace),
    and whether or not the player holds a usable ace (0 or 1).


In [22]:
gamma = .99

In [24]:
def one_episode(my_model):
    observation = np.array(env.reset())

    loss = []
    old_loss = []
    rewards = []
    state_guesses = []

    for i in range(1000):
        probs, state_guess = my_model(torch.from_numpy(observation).float())
        probs_old, _ = my_model.old_forward(torch.from_numpy(observation).float())
        #env.render()
        #env.action_space.sample()
        
        np_probs = probs.detach().numpy()
        np_probs_old = probs_old.detach().numpy()
        
        action = np.random.choice([0, 1], 1, p=(np_probs/sum(np_probs)))[0]
        loss.append(torch.log(probs[action]))
        old_loss.append(torch.log(probs_old[action]))
        state_guesses.append(state_guess)
        
        observation, reward, done, _ = env.step(action)
        #print(observation, reward, done)
        observation = np.array(observation)
        rewards.append(reward)
        #print(observation, done, reward, info)

        if done:
            break
            
    for i in range(len(rewards)-1, -1, -1):
        rewards[i] = rewards[i] + (rewards[i+1] * gamma) if i != len(rewards)-1 else 1
    
    return rewards, loss, old_loss, state_guesses

In [29]:
def train_loop():
    
    my_model = RLModel()
    optimizer = torch.optim.Adam(my_model.parameters(), lr=0.001)
    
    running_mean = 0
    
    for i in range(100000):
        rewards, loss, old_loss, state_guesses = one_episode(my_model)
        
        #print(loss, old_loss)
        
        running_mean = running_mean * .99 + len(rewards) * .01
        
        if i%100 == 0:
            print(running_mean)
        
        loss = torch.stack(loss)
        old_loss = torch.stack(old_loss).detach()

        rewards = np.array(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 0.00001)
        
        rewards = torch.from_numpy(rewards).float()
        
        advantages = rewards - torch.stack(state_guesses)
        
        my_model.save()
        
        epsilon = .2
        
        optimizer.zero_grad()
        #ratio = torch.ones_like(loss)
        ratio = torch.exp(loss - old_loss)
        
        #print(ratio)
        #print(ratio)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages
        #print(surr1, surr2)
        
        policy_loss = -1 * torch.min(surr1, surr2).mean()
        state_loss = F.mse_loss(advantages, torch.zeros_like(advantages))
        
        final_loss = state_loss + policy_loss
        #policy_surr = -1 * surr1.mean()
        
        torch.nn.utils.clip_grad_norm_(my_model.parameters(), 30)
        
        final_loss.backward()
        #reduced_loss.backward()
        optimizer.step()

In [30]:
train_loop()

0.1
14.581927861594293
26.353360867476553
34.59849578361196
55.05122031435293
78.58662390820649
122.5300242584199
154.27544177908118
195.31898246695482
271.84679708988887
295.0624660816676
303.15978895888816
333.2195635556552
287.16826255859166
282.3462658855182
303.02769418263483
329.1238089232835
389.5035290354416
348.44584336083744
329.28486531817117


KeyboardInterrupt: 