# THIS MODEL USES REINFORCEMENT LEARNING

# Import modules...

In [105]:
import torch ## torch let's us create tensors and also provides helper functions
import torch.nn as nn ## torch.nn gives us nn.Module(), nn.Embedding() and nn.Linear()
import torch.nn.functional as F # This gives us sigmoid()???
from torch.optim import Adam # optim contains many optimizers. This time we're using Adam

import lightning as L ## Lightning makes it easier to write, optimize and scale our code
from torch.utils.data import TensorDataset, DataLoader ## We'll store our data in DataLoaders

# Create training data

In [106]:
## Reinforcement learning does not learn from a labelled dataset: the model
## explores on its own inside training_step(), and the reward it receives
## plays the role of the "label".
##
## training_step() never reads the batch, so the DataLoader below only
## controls how many training steps run per epoch (one per item).
## The inputs are also reused later to sanity-check forward().

## Hunger levels between 0 and 1.
training_inputs = torch.tensor([0.8, 0.1, 0.2, 0.9])

## 1.0 = go to Norm's, 0.0 = go to Squatch's (not read during training).
training_labels = torch.tensor([1.0, 0.0, 0.0, 1.0])

## Wrap the tensors in a Dataset and hand it to a DataLoader
## (default settings: one item per batch, no shuffling).
training_dataset = TensorDataset(training_inputs, training_labels)
dataloader = DataLoader(dataset=training_dataset)

# Create a Neural Network with a Trainable Bias

In [108]:
class simpleNN_with_RL(L.LightningModule):
    """A one-parameter policy network trained with a REINFORCE-style update.

    The network maps a hunger level in [0, 1) to the probability of going to
    Norm's (as opposed to Squatch's):

        p_norm = sigmoid(hunger * weight + bias)

    Only ``bias`` is trainable. No labelled data is used: every call to
    ``training_step()`` samples a hunger level, samples a restaurant from the
    policy, simulates the order size, and turns the outcome into a reward of
    +1 or -1.
    """

    ## Simulated environment. We are "hungry" above this hunger level...
    HUNGER_THRESHOLD = 0.5
    ## ...and each restaurant hands out a large order with this probability.
    NORM_LARGE_ORDER_PROB = 0.8
    SQUATCH_LARGE_ORDER_PROB = 0.2

    def __init__(self):
        """Seed the RNGs and create the fixed weight and the trainable bias."""
        super().__init__()

        ## NOTE: this reseeds the global RNGs every time a model is created.
        L.seed_everything(seed=42)

        ## The weight is a plain tensor, so it is fixed and never optimized.
        self.weight = torch.tensor(20)
        ## With weight = 20, the sigmoid crosses 0.5 at hunger = 0.5 when the
        ## bias is -10, so the ideal value for the bias is -10.
        self.bias = nn.Parameter(torch.tensor(0.0))

        ## Each episode is a single step, so gamma only scales the loss.
        self.gamma = torch.tensor(0.99)
        ## Overwritten with +1 or -1 (a Python int) on every training step.
        self.reward = torch.tensor(0)

    def forward(self, inputs):
        """Return the probability of going to Norm's for each hunger level.

        Args:
            inputs: tensor of hunger levels.

        Returns:
            Tensor with the same shape as ``inputs`` holding
            ``sigmoid(inputs * weight + bias)``.
        """
        return torch.sigmoid(inputs * self.weight + self.bias)

    def configure_optimizers(self):
        """Return the Adam optimizer (lr=0.1) used for backpropagation."""
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batch_idx):
        """Simulate one trip to a restaurant and return the policy-gradient loss.

        ``batch`` and ``batch_idx`` are ignored; the dataloader only decides
        how many times this method is called per epoch.

        Returns:
            A one-element tensor:
            ``-log(prob of the restaurant we picked) * gamma * reward``.
        """
        ## First, decide how hungry we are. The number is sampled on the CPU
        ## and moved to whatever device the model is on (instead of assuming
        ## "mps"), so this runs on any accelerator with the same RNG stream.
        how_hungry = torch.rand(1).to(self.device)

        ## Now use how_hungry to get a probability for going to Norm's
        ## from the policy network...
        p_norm = self.forward(how_hungry)

        ## ...and sample where we actually go.
        rand_num = torch.rand(1).to(self.device)
        if bool(rand_num < p_norm):
            ## go to Norm's
            action_prob = p_norm
            p_large_order = self.NORM_LARGE_ORDER_PROB
        else:
            ## go to Squatch's, so use the probability of visiting Squatch's
            action_prob = 1 - p_norm
            p_large_order = self.SQUATCH_LARGE_ORDER_PROB

        ## Now determine if we were given a large order or not.
        got_large_order = bool(torch.rand(1) < p_large_order)
        is_hungry = bool(how_hungry > self.HUNGER_THRESHOLD)

        ## We are happy (+1) when the order size matches our appetite:
        ## large order while hungry, or small order while not hungry.
        ## Otherwise we are sad (-1).
        self.reward = 1 if got_large_order == is_hungry else -1

        ## Rewarded choices become more likely, punished ones less likely.
        loss = -1 * torch.log(action_prob) * self.gamma * self.reward

        return loss

# Run the inputs through the neural network to make sure forward() works...

In [109]:
## Build an untrained model (the bias starts at 0.0) to try out forward().
model = simpleNN_with_RL()

Seed set to 42


In [110]:
## Show the hunger levels we are about to run through the untrained model.
training_inputs

tensor([0.8000, 0.1000, 0.2000, 0.9000])

In [111]:
## Probability of going to Norm's for each hunger level, before any training.
model(training_inputs)

tensor([1.0000, 0.8808, 0.9820, 1.0000], grad_fn=<SigmoidBackward0>)

# Now train the NN

In [112]:
## Start from a fresh, untrained model.
model = simpleNN_with_RL()

## training_step() ignores the batches, so the 4-item dataloader simply
## gives us 4 simulated restaurant trips per epoch.
trainer = L.Trainer(max_epochs=70)
trainer.fit(model=model, train_dataloaders=dataloader)

Seed set to 42
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 1      | n/a 
---------------------------------------------
1         Trainable params
0         Non-trainable params
1         Total params
0.000     Total estimated model params size (MB)


Training: |                                               | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=70` reached.


In [113]:
## Same inputs after training: hunger above 0.5 should now give a probability
## near 1 (Norm's) and hunger below 0.5 a probability near 0 (Squatch's).
model(training_inputs)

tensor([9.9654e-01, 2.3959e-04, 1.7676e-03, 9.9953e-01],
       grad_fn=<SigmoidBackward0>)

In [114]:
## Print every trainable parameter (here just the bias), rounded to 2 decimals.
for param_name, param_value in model.named_parameters():
    rounded_value = torch.round(param_value.data, decimals=2)
    print(param_name, rounded_value)

bias tensor(-10.3400)


In [115]:
## Probe the trained policy at the highest hunger level.
model(torch.tensor(1.0))

tensor(0.9999, grad_fn=<SigmoidBackward0>)

In [116]:
## Probe the trained policy right at the hungry / not-hungry threshold (0.5).
model(torch.tensor(0.5))

tensor(0.4167, grad_fn=<SigmoidBackward0>)

In [117]:
## Probe the trained policy at the lowest hunger level.
model(torch.tensor(0.0))

tensor(3.2431e-05, grad_fn=<SigmoidBackward0>)

In [12]:
## NOTE: This example was generated by Google.
## It uses the REINFORCE algorithm, which maximizes the expected reward.

import torch
import torch.nn as nn
import torch.optim as optim
import gym

# Define a simple neural network for our agent
class PolicyNetwork(nn.Module):
    """Two-layer policy network that maps a state to action probabilities.

    Args:
        input_size: number of features in a state vector.
        output_size: number of discrete actions.
        hidden_size: width of the hidden layer. Defaults to 128, the value
            that used to be hard-coded.
    """

    def __init__(self, input_size, output_size, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return a probability for each action; the last dimension sums to 1.

        Args:
            x: tensor whose last dimension has ``input_size`` features.

        Returns:
            Tensor whose last dimension has ``output_size`` probabilities.
        """
        hidden = torch.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # Softmax over the action dimension turns the logits into probabilities.
        return torch.softmax(logits, dim=-1)

# Create an environment (requires the `gym` package imported above)
env = gym.make("CartPole-v1")

# Define hyperparameters
learning_rate = 0.01
gamma = 0.99  # discount applied to future rewards when computing returns

# Initialize the policy network and optimizer
policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# Training loop: play one full episode, then do one REINFORCE update
for episode in range(1000):
    # Newer gym versions return (observation, info) from reset();
    # older versions return only the observation. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    total_reward = 0
    log_probs = []  # log-probability of each action we actually took
    rewards = []    # reward received after each action
    done = False

    while not done:
        # Choose an action based on the current policy
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        action_probs = policy_net(state_tensor)
        action = torch.multinomial(action_probs, 1).item()
        log_probs.append(torch.log(action_probs[0, action]))

        # Take the action in the environment. Newer gym versions return
        # (obs, reward, terminated, truncated, info); older versions return
        # (obs, reward, done, info). Accept both.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        rewards.append(reward)
        total_reward += reward
        state = next_state

    # Discounted return from each time step onward, computed back to front:
    # G_t = r_t + gamma * G_{t+1}
    returns = []
    running_return = 0.0
    for step_reward in reversed(rewards):
        running_return = step_reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()
    returns_tensor = torch.tensor(returns, dtype=torch.float32)

    # Update the policy network. Every action taken in the episode contributes
    # to the loss, weighted by the return that followed it. (Using only the
    # final action's log-probability would leave all earlier actions with no
    # gradient at all.)
    loss = -(torch.stack(log_probs) * returns_tensor).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if episode % 100 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

ModuleNotFoundError: No module named 'gym'