<a href="https://colab.research.google.com/github/RizanSM/zero_shot_llms_in_HIL_RL/blob/main/01_highway_env/02_default_env/06_BIASED_HF_RSM_CON/01_Policy_Training_BIASED_HF_RSM_CON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries in your Google Colab environment
!pip install stable-baselines3 gymnasium highway-env -q
# Install necessary packages
!pip install torch numpy pandas matplotlib -q

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gymnasium as gym
import highway_env
import os

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from google.colab import data_table
from google.colab import drive

# Mount Google Drive at /content/drive. The dataset pickle, the log directory and
# the saved-model path used further down all live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Step 1: Load and Preprocess the Data
# Read the pickled DataFrame from the mounted Drive (per its file name: the biased
# human-feedback, conservative variant).
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
conservative_df =  pd.read_pickle('/content/drive/MyDrive/data_rp1/2_trajectories/1_human_feedback/3_Biased_Hf_D_Conservative_df.pkl')       # Update directory location 1

In [None]:
# Display the data frame
# Turn on Colab's DataFrame formatter, then render the loaded frame as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(conservative_df)

In [None]:
# Selecting relevant features and target
# `features` lists the input columns and `target` names the label column that
# process_features reads.
# NOTE(review): process_features hard-codes the same column names instead of reading
# `features`, so a change here alone does not change the model inputs.
features = ['state', 'action', 'collision_flag', 'lane_index']
target = 'Recalibrated_rewards'

In [None]:
# Convert features into tensor-compatible format
def process_features(df, target_col=None, extra_cols=('action', 'collision_flag', 'lane_index')):
    """Build the (inputs, labels) tensors used to train the reward model.

    Each row's `state` entry is flattened to 1-D, so both already-flat states and
    multi-dimensional observations (e.g. a vehicles x features matrix) yield one
    feature row per sample. The scalar columns in `extra_cols` are appended after
    the state features.

    Args:
        df: DataFrame with a 'state' column plus the columns named in
            `extra_cols` and `target_col`.
        target_col: Name of the label column. Defaults to the module-level
            `target` when omitted.
        extra_cols: Scalar feature columns appended to the flattened state.

    Returns:
        Tuple `(X, y)` of float32 tensors: `X` has shape
        (n_samples, state_size + len(extra_cols)), `y` has shape (n_samples,).
    """
    if target_col is None:
        target_col = target  # fall back to the notebook-level setting

    # Flatten every state so the row width does not depend on the observation's original shape.
    state_matrix = np.stack([np.asarray(s, dtype=np.float32).ravel() for s in df['state']])
    extra_matrix = df[list(extra_cols)].astype(np.float32).values

    X = np.hstack([state_matrix, extra_matrix])
    y = df[target_col].values
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Build the input / label tensors from the loaded frame (swap in another dataframe here if needed)
X, y = process_features(conservative_df)
# Report both tensor shapes, one per line
print(X.shape, y.shape, sep="\n")

In [None]:
# Split data into train and validation sets (80 % / 20 %)
train_size = int(0.8 * len(X))
val_size = len(X) - train_size
# Seed the split so the same samples end up in the validation set on every run;
# otherwise validation losses are not comparable across runs.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(
    TensorDataset(X, y), [train_size, val_size], generator=split_generator
)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

In [None]:
# Step 2: Define the Reward Model (Neural Network) with Dropout and L2 Regularization
class RewardModel(nn.Module):
    """Three-layer MLP (input_dim -> 64 -> 32 -> 1) that predicts a scalar reward.

    Both hidden layers are followed by ReLU and a shared dropout layer (p=0.2).
    """

    def __init__(self, input_dim):
        """Create the layers; `input_dim` is the width of one feature row."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the predicted reward with a trailing dimension of size 1."""
        # Hidden layers share the same activation -> dropout pattern.
        for hidden_layer in (self.fc1, self.fc2):
            x = self.dropout(self.relu(hidden_layer(x)))
        return self.fc3(x)

# Initialize model: one input unit per feature column of X
input_dim = X.shape[1]
print(input_dim)
model = RewardModel(input_dim)

# Define loss function and optimizer
criterion = nn.SmoothL1Loss()  # Huber-style loss instead of MSE
# AdamW applies decoupled weight decay (not a classic L2 penalty added to the loss).
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
# Halve the learning rate after 3 epochs without validation improvement.
# `verbose` is intentionally not passed: it is deprecated since PyTorch 2.2 and
# passing it can fail on newer releases. Read the current rate from
# `optimizer.param_groups[0]['lr']` or `scheduler.get_last_lr()` instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Early stopping parameters
patience = 10       # epochs without improvement before training stops
min_delta = 0.0001  # minimum decrease in validation loss that counts as an improvement
best_loss = float('inf')
patience_counter = 0
train_losses = []
val_losses = []

In [None]:
# Step 3: Train the Reward Model with Early Stopping
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        # squeeze(-1) drops only the trailing output dimension, so a final batch
        # of size 1 keeps shape (1,) and still matches batch_y.
        outputs = model(batch_X).squeeze(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    train_loss = total_loss / len(train_loader)
    train_losses.append(train_loss)

    # Validation loss (dropout disabled, no gradients tracked)
    model.eval()
    with torch.no_grad():
        val_loss = sum(
            criterion(model(batch_X).squeeze(-1), batch_y).item()
            for batch_X, batch_y in val_loader
        ) / len(val_loader)

    val_losses.append(val_loss)

    # Step the plateau scheduler and report any reduction explicitly, so the
    # message does not depend on the scheduler's deprecated `verbose` flag.
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after < lr_before:
        print(f"Learning rate reduced from {lr_before:.6g} to {lr_after:.6g}")

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Early stopping criteria: only an improvement larger than min_delta resets the counter
    if val_loss < best_loss - min_delta:
        best_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered!")
        break

# Plot the per-epoch training and validation loss curves on one set of axes
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(train_losses, label='Train Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training & Validation Losses')
ax.legend()
plt.show()

In [None]:
# Step 4: Train PPO Using the Trained Reward Model

# Custom reward function using the trained reward model
def custom_reward(env, state):
    """Score an observation with the trained reward model.

    NOTE: a later cell of this notebook defines `custom_reward` again; after
    "Run All" that later definition is the one in effect.

    Args:
        env: Unused; kept so the signature matches the wrapper's call.
        state: Observation (array-like of any shape). It is flattened into a
            single feature row.

    Returns:
        float: The model's scalar output for the observation.
    """
    # Flatten to (1, n_features); adding a leading axis alone would leave a
    # 2-D observation as a 3-D tensor that cannot match the model's input width.
    feature_row = torch.as_tensor(np.asarray(state), dtype=torch.float32).reshape(1, -1)

    # Right-pad with zeros up to the width the first linear layer expects.
    missing = model.fc1.in_features - feature_row.shape[1]
    if missing > 0:
        feature_row = torch.nn.functional.pad(feature_row, (0, missing))

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        return model(feature_row).item()

# Custom environment wrapper to replace environment reward with learned reward
class CustomHighwayEnv(gym.Wrapper):
    """Gym wrapper whose `step` reports the learned reward instead of the env's own.

    NOTE: a later cell of this notebook defines this class again.
    """

    def __init__(self, env):
        super().__init__(env)

    def step(self, action):
        """Step the wrapped env; the native reward is discarded and replaced by `custom_reward`."""
        observation, _native_reward, terminated, truncated, info = self.env.step(action)
        learned_reward = custom_reward(self.env, observation)
        return observation, learned_reward, terminated, truncated, info


In [None]:
# Step 4: Train PPO Using the Trained Reward Model

# Custom reward function using the trained reward model
def custom_reward(env, state):
    """Score an observation with the trained reward model.

    Args:
        env: Unused; kept so the signature matches the wrapper's call.
        state: Observation (array-like of any shape). It is flattened into a
            single feature row.

    Returns:
        float: The model's scalar output for the (zero-padded) observation.
    """
    # Reshape the state to (1, num_features) regardless of its original shape.
    state_tensor = torch.tensor(state, dtype=torch.float32).reshape(1, -1)

    # Right-pad with zeros when the observation is narrower than the model input.
    # A wider observation is passed through unchanged and will be rejected by the
    # first linear layer.
    # NOTE(review): the reward model is trained on state + action / collision_flag /
    # lane_index columns, while only the observation is supplied here, so those
    # inputs are presumably always zero at inference time - confirm this is intended.
    input_dim = model.fc1.in_features  # Expected input width of the model
    if state_tensor.shape[1] < input_dim:
        padding = torch.zeros((1, input_dim - state_tensor.shape[1]))
        state_tensor = torch.cat([state_tensor, padding], dim=1)

    # Pure inference: skip autograd bookkeeping on every environment step.
    with torch.no_grad():
        return model(state_tensor).item()

# Custom environment wrapper to replace environment reward with learned reward
class CustomHighwayEnv(gym.Wrapper):
    """Gym wrapper that substitutes the reward-model output for the env's reward."""

    def __init__(self, env):
        super().__init__(env)

    def step(self, action):
        """Forward `action` to the wrapped env and return the transition with the learned reward."""
        transition = self.env.step(action)
        # The second element (the env's own reward) is intentionally ignored.
        obs, _, terminated, truncated, info = transition
        return obs, custom_reward(self.env, obs), terminated, truncated, info

PPO training and training logs

In [None]:
# env = CustomHighwayEnv(gym.make('highway-v0'))
# env = DummyVecEnv([lambda: env])

In [None]:
# Directory on the mounted Drive; used as the default `log_dir` of train_ppo_with_custom_rewards.
drive_log_dir = "/content/drive/MyDrive/data_rp1/0_log_dir/6_ppo_highway_biased_hf_lrs_conservative"              # Update directory location 2

In [None]:
# Train PPO with Custom Rewards
def train_ppo_with_custom_rewards(
    log_dir=drive_log_dir,
    total_timesteps=10000,
    save_path='/content/drive/MyDrive/data_rp1/1_trained_models/6_ppo_highway_biased_hf_lrs_conservative',         # Update directory location 3
):
    """Train a PPO agent on highway-v0 using the learned reward and save it.

    Args:
        log_dir: Directory for the Monitor output and the TensorBoard logs;
            created if it does not exist.
        total_timesteps: Number of environment steps to train for.
        save_path: Where the trained agent is saved. Defaults to the original
            hard-coded Drive location.

    Returns:
        Tuple `(ppo_model, log_dir)`: the trained PPO agent and the log directory used.
    """
    os.makedirs(log_dir, exist_ok=True)

    # Wrap the environment so every step reports the learned reward, then
    # record episode statistics with Monitor.
    env = CustomHighwayEnv(gym.make("highway-v0"))
    env = Monitor(env, log_dir)

    # Named `ppo_model` locally to keep it distinct from the global reward
    # network `model` that `custom_reward` reads.
    ppo_model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_dir)
    ppo_model.learn(total_timesteps=total_timesteps)
    ppo_model.save(save_path)
    return ppo_model, log_dir

In [None]:
# log_path = os.path.join(drive_log_dir, "monitor.csv")
# df = pd.read_csv(log_path, skiprows=1)
## Ensure episodes are logged correctly
# df.reset_index(inplace=True)
# df.rename(columns={"index": "episode", "r": "reward", "l": "length", "t": "time_step"}, inplace=True)

In [None]:
# data_table.enable_dataframe_formatter()
# data_table.DataTable(df)

In [None]:
# Execute Training and Convergence Tracking
# The PPO agent gets its own name: `custom_reward` looks up the global `model`
# (the reward network) on every environment step, so rebinding `model` to the
# PPO agent would break any further stepping of the wrapped environment and any
# re-run of this cell.
ppo_model, log_dir = train_ppo_with_custom_rewards(total_timesteps=10000)