In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import gymnasium as gym
import pygame 
import matplotlib.pyplot
import numpy as np

In [None]:
# Environment definition: create the environment once, show the first
# observation and the info dict returned by reset(), then close it again.
env = gym.make("LunarLander-v2", render_mode="human")
try:
    observation, info = env.reset()
    print(observation)
    print(info)
finally:
    # Close the environment even if reset() or printing fails, so the
    # render_mode="human" environment is not left open.
    env.close()

In [None]:
# Loss function definition (TODO: nothing is defined here yet)

# Model Definition

class LunarLanderRL(tf.keras.Model):
    """Small feed-forward network with a softmax head.

    Three ReLU hidden layers (16, 32 and 64 units) feed a 4-unit softmax
    layer, so `call` returns one probability per output unit.
    """

    def __init__(self):
        super().__init__()
        hidden_activation = 'relu'
        # Hidden stack, created in the order it is applied in `call`.
        self.dense1 = tf.keras.layers.Dense(16, activation=hidden_activation)
        self.dense2 = tf.keras.layers.Dense(32, activation=hidden_activation)
        self.dense3 = tf.keras.layers.Dense(64, activation=hidden_activation)
        # Output head: 4 units normalised with softmax.
        self.classifier = tf.keras.layers.Dense(4, activation='softmax')

    def call(self, inputs):
        """Pass `inputs` through the hidden layers and return the softmax output."""
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return self.classifier(hidden)

In [None]:
# Defining one Episode

def Episode(model: tf.keras.Model):
    """Play one episode on the module-level `env` and return the total reward.

    At every step the current observation is reshaped to a batch of one,
    `model` is called on it, and an action is sampled from the returned
    probabilities. The episode ends when `env.step` reports either
    `terminated` or `truncated`.

    Args:
        model: Callable Keras model whose output for a single observation is
            a vector of action probabilities.

    Returns:
        Sum of the rewards collected during the episode, as a float.
    """
    observation, info = env.reset()
    total_reward = 0.0
    finished = False
    while not finished:
        state = np.reshape(observation, [1, -1])
        # Direct call rather than model.predict(): predict() needs an input
        # argument and is heavyweight for a single observation.
        probs = np.asarray(model(state, training=False), dtype=np.float64)[0]
        # Renormalise in float64 so np.random.choice accepts the vector.
        probs /= probs.sum()
        action = np.random.choice(len(probs), p=probs)
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += float(reward)
        finished = terminated or truncated
    return total_reward
        




In [None]:
# Training setup: environment, policy network, optimizer and loss object.
env = gym.make('LunarLander-v2', render_mode = 'human')
model = LunarLanderRL()

optimizer = tf.keras.optimizers.Adam()
# LunarLanderRL ends in a softmax layer, so its outputs are probabilities,
# not logits; from_logits must therefore be False.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

def train_step(initial_state, action, reward, next_state, done):
    """Apply one policy-gradient update to the module-level `model`.

    Args:
        initial_state: Batched observation passed to `model`; only row 0 of
            the output is used.
        action: Index of the action that was taken in `initial_state`.
        reward: Scalar reward received for that action.
        next_state: Unused; kept so existing callers keep working.
        done: Truthy when this transition ended the episode.

    Returns:
        The scalar loss tensor computed for this step.
    """
    with tf.GradientTape() as tape:
        # Predict action probabilities
        action_probs = model(initial_state)
        # Clip before the log so a probability that underflows to 0 cannot
        # produce -inf / NaN in the loss and gradients.
        chosen_prob = tf.clip_by_value(action_probs[0, action], 1e-8, 1.0)
        log_prob = tf.math.log(chosen_prob)
        # The advantage only weights the log-probability; stop_gradient keeps
        # the optimizer from also differentiating through the baseline term.
        # NOTE(review): the max action probability is not a value estimate --
        # confirm this is the intended baseline.
        baseline = tf.reduce_max(action_probs)
        advantage = tf.stop_gradient(tf.cast(reward, action_probs.dtype) - baseline)
        # Negative log probability of the chosen action, weighted by the advantage
        loss_value = -log_prob * advantage
        # NOTE(review): a constant offset has zero gradient, so this changes
        # only the reported loss, not the parameter update.
        if done:
            loss_value += 100  # Penalty value can be tuned
    # Compute gradient and apply it to the model
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss_value

# Training loop
NUM_EPISODES = 1000  # Number of episodes can be tuned

try:
    for episode in range(NUM_EPISODES):
        initial_state, _ = env.reset()
        episode_return = 0.0
        done = False
        while not done:
            # Reshape the state to have an extra (batch) dimension
            state = np.reshape(initial_state, [1, -1])
            # Call the model directly: predict() is heavyweight for a single
            # observation and is meant for batch inference.
            action_probs = np.asarray(model(state, training=False), dtype=np.float64).ravel()
            # Renormalise in float64 so np.random.choice accepts the vector.
            action_probs /= action_probs.sum()
            action = np.random.choice(range(env.action_space.n), p=action_probs)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # An episode ends on termination OR truncation (e.g. a step limit);
            # ignoring `truncated` would keep stepping a finished episode.
            done = terminated or truncated
            # Also reshape the next_state before passing it to train_step
            next_state = np.reshape(next_state, [1, -1])
            loss_value = train_step(state, action, reward, next_state, done)
            episode_return += float(reward)
            initial_state = next_state
        print(f"episode {episode}: return {episode_return:.2f}")
finally:
    # Close the environment even if training is interrupted.
    env.close()