In [1]:
import gymnasium as gym
import pygame
from gymnasium.utils.play import play
# from gynasium.utils.play import pl
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.utils.data import DataLoader
import matplotlib
import matplotlib.pyplot as plt
import random

%matplotlib inline

matplotlib.rcParams['figure.facecolor'] = '#ffffff'


In [2]:
env = gym.make("CartPole-v1",render_mode="human")

In [21]:
class Q_Network(nn.Module):
    def __init__(self):
        super(Q_Network, self).__init__()
        self.network = nn.Sequential(nn.Linear(4,6),
                                     nn.ReLU(),
                                     nn.Linear(6,4),
                                     nn.ReLU(),
                                     nn.Linear(4,2))
    def forward(self,state):
            return self.network(state)

class Policy_Network(nn.Module):
    def __init__(self):
        super(Policy_Network, self).__init__()
        self.network = nn.Sequential(nn.Linear(4,8),
                                     nn.ReLU(),
                                     nn.Linear(8,6),
                                     nn.ReLU(),
                                     nn.Linear(6,2),
                                     nn.Softmax(dim=0))
    def forward(self,state):
            return self.network(state)

In [26]:
# @torch.no_grad()
def train(epochs,lrQ,lrP,model_Q,model_P,max_steps,gamma,opt_func=torch.optim.SGD):
    epsilon =1e-5
    env = gym.make("CartPole-v1",render_mode="human")
    # opt_P = opt_func(model_P.parameters(),lrP)
    opt_Q = opt_func(model_Q.parameters(),lrQ)
    action_labels = torch.tensor([0,1])
    # replay_memory = list()
    # replay_memory = replay_memory
    for epoch in range(epochs):
        state , info = env.reset()
        state = torch.from_numpy(state)

        total_reward = 0
        for step in range(max_steps):
            # logits_actions = model_P.forward(state)
            # P_actions = F.softmax(logits_actions)
            P_actions = model_P.forward(state)
            # print(P_actions)
            prob = random.uniform(0, 1)
            P_right = P_actions[1].item()
            if prob > P_right:
                action = 0
            else:
                action = 1
            
            Q_value = model_Q.forward(state)[action]
            
            new_state, reward, done,truncated, info= env.step(action)
            new_state = torch.from_numpy(new_state)

            total_reward += reward
            
            if done or truncated:
                print(epoch,":",total_reward)
                break
            
            #Training Phase
            opt_P = opt_func(model_P.parameters(),abs(lrP*Q_value.item()))            
            P_actions = torch.clamp(P_actions,min=epsilon,max=1-epsilon)

            
            loss_P = torch.log(P_actions[action])
            # print("prob action:",P_actions[action].item(),"for Action:",action)
            print("P loss:" , loss_P.item())
            loss_P.backward()
            opt_P.step()
            opt_P.zero_grad()

            P_actions_new = model_P.forward(new_state)
            prob = random.uniform(0, 1)
            P_right = P_actions[1].item()
            if prob > P_right:
                new_action = 0
            else:
                new_action = 1
            lrQ_F = lrQ*(reward + gamma*model_Q.forward(new_state)[new_action] - Q_value)
            opt_Q = opt_func(model_Q.parameters(),abs(lrQ_F))
            Q_value.backward()
            opt_Q.step()
            opt_Q.zero_grad()
            
            
            #State-Change
            state = new_state
            

        # epsilon = 0.01
        
    env.close()

In [27]:
model_Q = Q_Network()
model_P = Policy_Network()

In [28]:
epochs=200
max_steps=300
gamma=0.7
lr = 0.003


Possible reasons:
1. Exploding value due to large value after log
2. Division by 0 somewhere
3. Exploding gradients
4. higher the learning rate faster the error arrives.
5. Maybe instead of using the optimiser iterating over the params and modifying them would be better.

In [29]:

train(epochs,1e-5,0.001,model_Q,model_P,max_steps,gamma)


P loss: -0.6689671277999878
P loss: -0.6990264058113098
P loss: -0.7162177562713623
P loss: -0.6610146164894104
P loss: -0.6724611520767212
P loss: -0.6958643794059753
P loss: -0.6740530729293823
P loss: -0.6920368671417236
P loss: -0.7034611701965332
P loss: -0.7088727951049805
P loss: -0.7107839584350586
P loss: -0.6961478590965271
P loss: -0.7074402570724487
0 : 14.0
P loss: -0.7114623785018921
P loss: -0.6669665575027466
P loss: -0.6745637059211731
P loss: -0.6975574493408203
P loss: -0.7129418849945068
P loss: -0.6668059825897217
P loss: -0.6732140183448792
P loss: -0.6875647306442261
P loss: -0.6845138669013977
P loss: -0.6992409229278564
P loss: -0.7147594094276428
P loss: -0.7211117148399353
P loss: -0.6690986156463623
P loss: -0.7201071381568909
P loss: -0.7164322733879089
P loss: -0.6867445111274719
P loss: -0.6725326180458069
P loss: -0.7175469994544983
P loss: -0.7132740616798401
P loss: -0.6955230236053467
P loss: -0.7087505459785461
P loss: -0.6919936537742615
P loss: -0.

KeyboardInterrupt: 

In [None]:
def test(model,episodes):
    env = gym.make("CartPole-v1",render_mode="human")
    state,info = env.reset()
    state = torch.from_numpy(state)
    total_reward=0
    for ep in range(episodes):
       
        action = torch.argmax(model.forward(state)).item()
        # print(action)
        new_state, reward, done,truncated, info= env.step(action)
        # print(env.step(action))
        total_reward+=reward
        # print(f"{ep} Total reward:",total_reward)
        if done or truncated:
            print(total_reward)
            break
        state = torch.from_numpy(new_state)    
    env.close()
        

In [None]:
for i in range(20):
    print(i)
    test(model,100)