<a href="https://colab.research.google.com/github/RizanSM/zero_shot_llms_in_HIL_RL/blob/main/01_highway_env/02_default_env/07_LLM_DIRECT/01_Policy_Training_LLM_DIRECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries in your Google Colab environment
!pip install stable-baselines3 gymnasium highway-env ollama -q

In [None]:
# Import the necessary libraries
import gymnasium as gym
import highway_env
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
import ollama
import matplotlib.animation as animation
import re

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.interpolate import interp1d
from tqdm import tqdm
from sklearn.metrics import silhouette_score

from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from google.colab import data_table
from google.colab import drive

# Mount Google Drive at /content/drive; the pickle, log and model paths used below all live under it
drive.mount('/content/drive')

In [None]:
# Load the dataframe back from the pickle file
# NOTE: unpickling can execute arbitrary code - only load pickle files that you created or trust
trajectory_df = pd.read_pickle('/content/drive/MyDrive/data_rp1/2_trajectories/0_initial_training/0_initial_trajectory_df.pkl')       # Update directory location 1

In [None]:
# Display the first rows of the loaded trajectories with Colab's interactive table formatter
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df.head())

In [None]:
# Report the Python type stored under index label 0 of every trajectory column
for column_name in ('episode', 'time_step', 'state', 'action', 'reward',
                    'next_state', 'collision_flag', 'lane_index'):
    print(type(trajectory_df[column_name][0]))

In [None]:
def apply_pca(trajectory_df):
    """
    Apply PCA to reduce trajectory features to 3 principal components.

    The input frame is NOT modified: all work happens on a copy, so this
    function can be re-run safely and the raw 'state' / 'next_state'
    observations stay available to later cells.

    Parameters
    ----------
    trajectory_df : pandas.DataFrame
        Must contain the columns 'state', 'next_state', 'action', 'reward',
        'collision_flag' and 'lane_index'. Every 'state' / 'next_state' entry
        must be indexable, because element [1] of it is used as the feature.

    Returns
    -------
    trajectory_df_pca : pandas.DataFrame
        Copy of the input in which 'state' and 'next_state' are replaced by
        their element [1], plus the new columns 'PC1', 'PC2' and 'PC3'.
    pca : sklearn.decomposition.PCA
        The fitted PCA object.
    """
    trajectory_df_pca = trajectory_df.copy()

    # Collapse each observation to the single value stored at index 1.
    # NOTE(review): presumably this is one scalar feature of the ego vehicle -
    # confirm against the code that recorded the trajectories.
    trajectory_df_pca['state'] = trajectory_df_pca['state'].apply(lambda x: x[1])
    trajectory_df_pca['next_state'] = trajectory_df_pca['next_state'].apply(lambda x: x[1])

    # NOTE(review): the features are fed to PCA unscaled, so columns with a
    # larger numeric range dominate the components - confirm this is intended.
    features = ["state", "action", "reward", "collision_flag", "lane_index"]
    pca = PCA(n_components=3)
    pca_features = pca.fit_transform(trajectory_df_pca[features])

    trajectory_df_pca["PC1"] = pca_features[:, 0]
    trajectory_df_pca["PC2"] = pca_features[:, 1]
    trajectory_df_pca["PC3"] = pca_features[:, 2]

    return trajectory_df_pca, pca

In [None]:
def find_optimal_clusters(data, max_clusters=10):
    """
    Determine the optimal number of clusters using the Silhouette Score.

    KMeans is fitted for every k from 2 up to ``max_clusters`` (capped at
    ``len(data) - 1``, the largest label count the silhouette score accepts)
    and the k with the highest score is returned.

    Parameters
    ----------
    data : array-like or DataFrame of shape (n_samples, n_features)
        Points to cluster.
    max_clusters : int, default 10
        Largest number of clusters to try.

    Returns
    -------
    int
        The best k found; 2 when no candidate could be scored.

    Raises
    ------
    ValueError
        If ``data`` has fewer than 3 samples (no valid silhouette score exists).
    """
    n_samples = len(data)
    if n_samples < 3:
        raise ValueError(
            f"Need at least 3 samples to choose a cluster count, got {n_samples}"
        )

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
    largest_k = min(max_clusters, n_samples - 1)

    best_score = -1
    best_k = 2
    for k in range(2, largest_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(data)
        # Duplicate points can make KMeans return fewer clusters than requested;
        # a single label cannot be scored, so skip that candidate.
        if len(np.unique(cluster_labels)) < 2:
            continue
        score = silhouette_score(data, cluster_labels)
        if score > best_score:
            best_score = score
            best_k = k
    return best_k

In [None]:
def apply_clustering(trajectory_df_pca):
    """
    Apply KMeans clustering to select representative timesteps for LLM evaluation.

    The number of clusters is chosen with ``find_optimal_clusters`` on the
    'PC1', 'PC2', 'PC3' columns. The input frame is not modified.

    Parameters
    ----------
    trajectory_df_pca : pandas.DataFrame
        Frame containing the columns 'PC1', 'PC2' and 'PC3'.

    Returns
    -------
    clustered_df : pandas.DataFrame
        Copy of the input with an added 'Cluster' column (KMeans label per row).
    cluster_representatives : pandas.DataFrame
        One row per cluster - the first row of ``clustered_df`` that carries
        that label - sorted by cluster id, with 'Cluster' as the first column.
    kmeans : sklearn.cluster.KMeans
        The fitted estimator.
    """
    features = ["PC1", "PC2", "PC3"]
    clustered_df = trajectory_df_pca.copy()

    optimal_clusters = find_optimal_clusters(clustered_df[features])
    print(f"Optimal number of clusters: {optimal_clusters}")
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
    clustered_df["Cluster"] = kmeans.fit_predict(clustered_df[features])

    # Select one representative per cluster: the first complete row of each
    # cluster. (groupby().first() would take the first non-null value of every
    # column separately and could stitch a "row" together from different rows.)
    cluster_representatives = (
        clustered_df
        .drop_duplicates(subset="Cluster", keep="first")
        .sort_values("Cluster")
        .reset_index(drop=True)
    )
    # Keep 'Cluster' as the leading column, as groupby().first().reset_index() did
    leading = ["Cluster"] + [c for c in cluster_representatives.columns if c != "Cluster"]
    cluster_representatives = cluster_representatives[leading]

    return clustered_df, cluster_representatives, kmeans

In [None]:
def visualize_clusters(trajectory_df_pca):
    """
    Show a 3D scatter plot of the PC1/PC2/PC3 coordinates, coloured by the
    value in the 'Cluster' column.
    """
    figure = plt.figure(figsize=(10, 7))
    axes3d = figure.add_subplot(111, projection='3d')

    pc1, pc2, pc3 = (trajectory_df_pca[name] for name in ("PC1", "PC2", "PC3"))
    points = axes3d.scatter(
        pc1, pc2, pc3,
        c=trajectory_df_pca["Cluster"], cmap='viridis', alpha=0.6
    )
    plt.colorbar(points, label="Cluster ID")

    # Label each axis with the principal component it shows
    for set_label, text in (
        (axes3d.set_xlabel, "PC1"),
        (axes3d.set_ylabel, "PC2"),
        (axes3d.set_zlabel, "PC3"),
    ):
        set_label(text)

    plt.title("3D PCA-Clustering Visualization")
    plt.show()


In [None]:
# Reduce the trajectory features to three principal components (PC1-PC3) and keep the fitted PCA object
trajectory_df_pca, pca = apply_pca(trajectory_df)

In [None]:
# Cluster the PCA-reduced timesteps and pick one representative row per cluster
trajectory_df_pca, cluster_representatives, kmeans = apply_clustering(trajectory_df_pca)

In [None]:
# Inspect the per-cluster representative rows and the fitted KMeans estimator
print(cluster_representatives)
print(kmeans)

In [None]:
# Plot the clustered timesteps in PC1/PC2/PC3 space
visualize_clusters(trajectory_df_pca)

B: LLM FEEDBACK IMPLEMENTATION <br>
SECTION B.0: LOADING THE LLM
*   Step B.0.1: Install Required Libraries
*   Step B.0.2: Setting the environment variable  
*   Step B.0.3: Setup and Load the Pretrained LLM

In [None]:
!pip install colab-xterm

In [None]:
# Load the colab-xterm IPython extension installed above; the next cell uses its %xterm magic
%load_ext colabxterm

In [None]:
# Open a terminal inside the notebook; it is used to install Ollama, start its server and pull the model
%xterm
# Execute the following commands sequentially in Xterm
# curl -fsSL https://ollama.com/install.sh | sh
# ollama serve & ollama pull mistral
# ollama list
# ollama show mistral

In [None]:
def visualize_feedback_progress(progress):
    """
    Draw the current LLM feedback collection progress as a horizontal bar.

    Parameters
    ----------
    progress : sequence of numbers, or a single number
        Completion percentages (0-100) recorded so far; the LAST entry is the
        value that gets drawn. An empty sequence is drawn as 0 %. A bare
        number is drawn as-is.

    Notes
    -----
    The bar is rendered as a plain static figure. A ``FuncAnimation`` is not
    played by static (inline) notebook backends, so an animated bar cannot
    be relied on to show the most recent value there. The figure is closed
    after being shown because this function is called once per loop
    iteration and open figures would otherwise accumulate.
    """
    if isinstance(progress, (int, float)):
        latest = progress
    else:
        values = list(progress)
        latest = values[-1] if values else 0

    fig, ax = plt.subplots()
    ax.barh(["LLM Feedback"], [latest], color='blue')
    ax.set_xlim(0, 100)
    ax.set_xlabel("Progress (%)")
    ax.set_title("Live LLM Feedback Collection Progress")
    plt.show()
    plt.close(fig)

In [None]:
def get_llm_feedback(state, action, reward, next_state, episode_num, time_step, collision_flag, lane_index, pc1, pc2, pc3):
    """
    Ask the 'mistral' model served by Ollama to judge a single trajectory row.

    All arguments are interpolated verbatim into the prompt, which asks the
    model for a justification, an action-effectiveness score (llm_score_1:
    +2 / -2 / 0) and a reward-appropriateness score (llm_score_2: +1 / -1).

    Returns
    -------
    str
        The model's 'response' text with surrounding whitespace stripped.
    """
    prompt = f"""
    You are an expert in analyzing reinforcement learning trajectories in a highway environment.
    You will process a structured dataset containing agent trajectories and analyze each row by assessing whether the action taken is effective or not,
    and whether the reward awarded is appropriate or not.

    You are provided with a trajectory data frame containing the following details:
      - Episode: {episode_num}
      - Time Step: {time_step}
      - State: {state}
      - Action taken by agent: {action}
      - Reward: {reward}
      - Next state: {next_state}
      - Collision Flag: {collision_flag}
      - Lane Index: {lane_index}
      - Principal Components: PC1 = {pc1}, PC2 = {pc2}, PC3 = {pc3}

    Your task is to analyze this data and provide feedback based on:
    1. Action Effectiveness Evaluation:
       a. If the action taken is effective, respond with:
          llm_score_1 = +2
       b. If the action taken is ineffective, respond with:
          llm_score_1 = -2
       c. If the action taken has no effect (neither effective nor ineffective), respond with:
          llm_score_1 = 0

    2. Reward Appropriateness Evaluation:
       a. If the reward awarded is appropriate, respond with:
          llm_score_2 = +1
       b. If the reward awarded is inappropriate, respond with:
          llm_score_2 = -1

    For each row, your output feedback should strictly follow this format:
       - "Justification for the decision"
       - Action Effectiveness Evaluation: <llm_score_1>
       - Reward Appropriateness Evaluation: <llm_score_2>
    """

    # Fixed sampling options so repeated queries use the same temperature and seed
    generation_options = {"temperature": 0.6, "seed": 4}
    llm_reply = ollama.generate(model="mistral", prompt=prompt, options=generation_options)
    return llm_reply['response'].strip()

In [None]:
def collect_llm_feedback_cluster(cluster_representatives):
    """
    Query the LLM once for every cluster representative.

    Parameters
    ----------
    cluster_representatives : pandas.DataFrame
        One row per cluster with the columns 'state', 'action', 'reward',
        'next_state', 'episode', 'time_step', 'collision_flag', 'lane_index',
        'PC1', 'PC2', 'PC3' and 'Cluster'.

    Returns
    -------
    dict
        Maps the integer cluster id to the feedback text returned by
        ``get_llm_feedback`` for that cluster's representative row.
    """
    llm_feedback_data = {}
    progress = []
    total = len(cluster_representatives)
    for i, (_, row) in tqdm(enumerate(cluster_representatives.iterrows()), total=total):
        feedback = get_llm_feedback(
            row["state"], row["action"], row["reward"], row["next_state"],
            row["episode"], row["time_step"], row["collision_flag"], row["lane_index"],
            row["PC1"], row["PC2"], row["PC3"]
        )
        # iterrows() can upcast the row to float, turning cluster id 3 into 3.0;
        # store plain ints so the keys match the integer 'Cluster' column.
        llm_feedback_data[int(row["Cluster"])] = feedback
        progress.append((i + 1) / total * 100)
        visualize_feedback_progress(progress)
    return llm_feedback_data

In [None]:
def interpolate_llm_scores(trajectory_df_pca, llm_feedback_data):
    """
    Spread the per-cluster LLM feedback across all timesteps.

    Every row receives the feedback stored for its 'Cluster' value. Only when
    the mapped feedback is numeric are remaining gaps (clusters without an
    entry) filled by linear interpolation; text feedback cannot be
    interpolated and is left exactly as mapped. The input frame is not
    modified.

    Parameters
    ----------
    trajectory_df_pca : pandas.DataFrame
        Frame with a 'Cluster' column.
    llm_feedback_data : dict
        Maps cluster id to that cluster's feedback (text or number).

    Returns
    -------
    pandas.DataFrame
        Copy of the input with an added 'LLM_Adjusted_Score' column.
    """
    scored_df = trajectory_df_pca.copy()
    mapped = scored_df["Cluster"].map(llm_feedback_data)

    # Interpolating an object/string column is meaningless and is rejected by
    # recent pandas versions, so restrict it to genuinely numeric feedback.
    is_numeric = pd.api.types.is_numeric_dtype(mapped) and not pd.api.types.is_bool_dtype(mapped)
    if is_numeric:
        mapped = mapped.interpolate()

    scored_df["LLM_Adjusted_Score"] = mapped
    return scored_df

In [None]:
# Query the LLM once per cluster representative (calls ollama, so the Ollama server set up above must be running)
llm_feedback_data = collect_llm_feedback_cluster(cluster_representatives)


In [None]:
# Attach each cluster's LLM feedback to every timestep that belongs to that cluster
trajectory_df_pca = interpolate_llm_scores(trajectory_df_pca, llm_feedback_data)

In [None]:
# Browse the per-timestep cluster assignment and LLM feedback in Colab's interactive table
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_pca)

In [None]:
# Group the data by 'episode'
episode_data = trajectory_df_pca.groupby('episode')
# Loop through each episode and print the cluster ids assigned to its timesteps
for episode, data in episode_data:

    # Cluster id of every timestep of this episode, in row order
    cluster_list = data['Cluster'].tolist()

    # Count the total number of time steps in the episode
    total_timesteps = data['time_step'].max() + 1  # Assuming time_step starts from 0

    print(f"Episode {episode}:Total timesteps {total_timesteps}: {cluster_list}")

Section B.2: REWARD MODELLING (LLM FEEDBACK)

*   Step B.2.1: Recalibrate Reward Based on LLM Feedback
*   Step B.2.2: Displaying the recalibrated rewards based on LLM feedback
*   Step B.2.3: Access the reward for a specific step

In [None]:
def add_llm_scores_to_dataframe(trajectory_df, trajectory_df_pca):
    """
    Parse the two LLM scores out of the feedback text and attach them to a
    copy of ``trajectory_df``.

    Accepted spellings of a score line (case-insensitive), shown for score 1:
        Action Effectiveness Evaluation: llm_score_1 = +2
        Action Effectiveness Evaluation: <llm_score_1> = +2
        Action Effectiveness Evaluation: +2
    The last form is what a model produces when it follows the prompt's
    output format literally; the first is the original strict form.

    Parameters
    ----------
    trajectory_df : pandas.DataFrame
        Frame to extend; it is not modified.
    trajectory_df_pca : pandas.DataFrame
        Frame whose 'LLM_Adjusted_Score' column holds the feedback text. Rows
        are matched to ``trajectory_df`` by index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``trajectory_df`` with two new columns:
        'LLM_feedback_score' (text "llm_score_1=<a>, llm_score_2=<b>") and
        'LLM_score' (integer a + b). A score that cannot be found in the
        text, or feedback that is not a string (e.g. NaN), counts as 0.
    """
    # Make a copy of the original dataframe
    updated_df = trajectory_df.copy()

    def build_pattern(label, score_name):
        """Regex for '<label>: [<score_name> =] <signed int>' with tolerant spacing/markup."""
        return re.compile(
            rf"{label}\s*:[\s*]*(?:<?\s*{score_name}\s*>?\s*=\s*)?<?\s*([+-]?\d+)",
            re.IGNORECASE,
        )

    score_1_pattern = build_pattern("Action Effectiveness Evaluation", "llm_score_1")
    score_2_pattern = build_pattern("Reward Appropriateness Evaluation", "llm_score_2")

    def extract_llm_scores(summary_text):
        """Extract llm_score_1 and llm_score_2 from the given text."""
        # Missing feedback shows up as NaN (a float); treat it as "no score"
        if not isinstance(summary_text, str):
            return 0, 0

        score_1_match = score_1_pattern.search(summary_text)
        score_2_match = score_2_pattern.search(summary_text)

        llm_score_1 = int(score_1_match.group(1)) if score_1_match else 0
        llm_score_2 = int(score_2_match.group(1)) if score_2_match else 0

        return llm_score_1, llm_score_2

    # Extract LLM scores from trajectory_df_pca
    llm_scores = trajectory_df_pca['LLM_Adjusted_Score'].apply(extract_llm_scores)

    # Create new columns
    updated_df['LLM_feedback_score'] = llm_scores.apply(lambda scores: f"llm_score_1={scores[0]}, llm_score_2={scores[1]}")
    updated_df['LLM_score'] = llm_scores.apply(lambda scores: scores[0] + scores[1])

    return updated_df

In [None]:
# Parse the numeric LLM scores out of the feedback text and attach them to the trajectory frame
llm_feedback_df = add_llm_scores_to_dataframe(trajectory_df, trajectory_df_pca)

In [None]:
# Browse the trajectories together with the parsed LLM scores
data_table.enable_dataframe_formatter()
data_table.DataTable(llm_feedback_df)

In [None]:
# Group the data by 'episode'
episode_data = llm_feedback_df.groupby('episode')
# Loop through each episode and print the LLM score of each of its timesteps
for episode, data in episode_data:
    # Combined LLM score (llm_score_1 + llm_score_2) of every timestep of this episode, in row order
    llm_score_list = data['LLM_score'].tolist()

    # Count the total number of time steps in the episode
    total_timesteps = data['time_step'].max() + 1  # Assuming time_step starts from 0

    print(f"Episode {episode}:Total timesteps {total_timesteps}: {llm_score_list}")

In [None]:
# Step B.2.3: Recalibrate Reward Based on LLM Feedback
# Function to recalibrate the rewards
def recalibrate_rewards(df):
    """
    Add the LLM score to the environment reward of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'reward' and 'LLM_score'; it is left unchanged.

    Returns
    -------
    (pandas.DataFrame, list)
        A copy of ``df`` extended by 'Recalibrated_rewards'
        (= reward + LLM_score), and that column as a plain list.
    """
    summed_rewards = df['reward'] + df['LLM_score']

    # assign() works on a copy, so the caller's frame keeps its columns
    df_with_rewards = df.assign(Recalibrated_rewards=summed_rewards)

    return df_with_rewards, df_with_rewards['Recalibrated_rewards'].tolist()

In [None]:
# Apply the function to recalibrate rewards; keep both the extended frame and the flat
# reward list (the list is what custom_reward reads during training)
recalibrated_df, recalibrated_rewards_list = recalibrate_rewards(llm_feedback_df)

In [None]:
# Browse the trajectories with their recalibrated rewards
data_table.enable_dataframe_formatter()
data_table.DataTable(recalibrated_df)

In [None]:
# Persist the recalibrated trajectories to Google Drive
recalibrated_df.to_pickle('/content/drive/MyDrive/data_rp1/2_trajectories/2_llm_d/1_llm_feedback_ideal_df_with_sil_score.pkl')         # Update directory location 2

In [None]:
# Print every recalibrated reward together with its position in the flat list
for i, feedback in enumerate(recalibrated_rewards_list):
    # enumerate already yields the list element stored at position i
    human_recalibrated_reward_for_step = feedback
    print(f"Recalibrated reward for step {i}: {human_recalibrated_reward_for_step}")

SECTION A.5: MODEL TRAINING (LLM FEEDBACK DIRECT - IDEAL CASE SCENARIO)
*   Step A.5.1: CUSTOM REWARD FUNCTION
*   Step A.5.2: LOAD THE SAVED INITIALLY TRAINED PPO MODEL FROM GOOGLE DRIVE
*   Step A.5.3: TRAIN/UPDATE PPO MODEL WITH RECALIBRATED REWARD
*   Step A.5.4: SAVE THE TRAINED MODEL (LLMF_DIRECT_IDEAL) FOR TESTING

In [None]:
# Step A.5.1: CUSTOM REWARD FUNCTION
def custom_reward(self, env, state, action, next_state, reward, done):
    """
    Return the recalibrated (LLM-adjusted) reward for the current step.

    The reward is read from the module-level ``recalibrated_rewards_list`` at
    the position given by the module-level ``step_counter``, which is then
    advanced by one. ``self``, ``env``, ``state``, ``action``, ``next_state``
    and ``done`` are accepted for interface compatibility but are not used.

    When the counter is past the end of the recorded list, the reward passed
    in (the environment's own reward) is returned unchanged instead of
    raising IndexError in the middle of training.

    NOTE(review): CustomRewardWrapper.reset() sets ``step_counter`` back to 0,
    so every episode reads the list from its beginning again - confirm that
    this replay of the first recorded rewards is intended.
    """
    global step_counter
    try:
        step_counter
    except NameError:
        # First call before any reset(): start at the beginning of the list
        step_counter = 0

    if 0 <= step_counter < len(recalibrated_rewards_list):
        reward = recalibrated_rewards_list[step_counter]
    # else: no recorded reward for this step -> keep the environment reward

    step_counter += 1
    return reward

# Create a new environment class that wraps your original environment and overrides the default reward function with your custom function
class CustomRewardWrapper(gym.Wrapper):
    """
    Environment wrapper that replaces the reward returned by ``step`` with
    the value computed by the module-level ``custom_reward`` function.

    The wrapper remembers the latest observation in ``self.last_obs`` and
    passes it to ``custom_reward`` as the state the action was taken from.
    """

    def __init__(self, env):
        super().__init__(env)
        # Defined up-front so step() never hits a missing attribute
        self.last_obs = None

    def step(self, action):
        """Step the wrapped env and substitute the reward; all other outputs pass through."""
        next_state, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        reward = custom_reward(self, self.env, self.last_obs, action, next_state, reward, done)
        self.last_obs = next_state
        return next_state, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env, restart the global reward counter and return (obs, info)."""
        global step_counter
        step_counter = 0
        # Forward the environment's own info dict instead of replacing it with {}
        self.last_obs, info = self.env.reset(**kwargs)
        return self.last_obs, info
# Create and wrap the environment with your custom reward wrapper
# env_human = CustomRewardWrapper(gym.make('highway-v0'))

PPO training and Training logs

In [None]:
# Directory on Google Drive that receives the Monitor and TensorBoard logs of the PPO run
drive_log_dir = "/content/drive/MyDrive/data_rp1/0_log_dir/7_ppo_highway_llmf_direct_ideal_1"             # Update directory location 3

In [None]:
# Train PPO with Custom Rewards
def train_ppo_with_custom_rewards(
    log_dir=drive_log_dir,
    total_timesteps=10000,
    model_path='/content/drive/MyDrive/data_rp1/1_trained_models/7_ppo_highway_llmf_direct_ideal_1',      # Update directory location 4
    seed=None,
):
    """
    Train a PPO agent on 'highway-v0' with the recalibrated rewards and save it.

    Parameters
    ----------
    log_dir : str
        Directory for the Monitor log and the TensorBoard log; created if missing.
    total_timesteps : int, default 10000
        Number of environment steps to train for.
    model_path : str
        Where the trained model is saved.
    seed : int or None, default None
        Seed handed to PPO; pass an integer for a repeatable run. ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    (model, log_dir)
        The trained PPO model and the log directory that was used.

    NOTE(review): a new PPO model is created here; no previously trained
    model is loaded - confirm this matches the intended workflow.
    """
    os.makedirs(log_dir, exist_ok=True)
    env = CustomRewardWrapper(gym.make("highway-v0"))
    env = Monitor(env, log_dir)
    model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_dir, seed=seed)
    model.learn(total_timesteps=total_timesteps)
    model.save(model_path)
    return model, log_dir

In [None]:
# log_path = os.path.join(drive_log_dir, "monitor.csv")
# df = pd.read_csv(log_path, skiprows=1)
## Ensure episodes are logged correctly
# df.reset_index(inplace=True)
# df.rename(columns={"index": "episode", "r": "reward", "l": "length", "t": "time_step"}, inplace=True)

In [None]:
# data_table.enable_dataframe_formatter()
# data_table.DataTable(df)

In [None]:
# Execute Training and Convergence Tracking:
# train PPO for 10,000 steps on the reward-wrapped highway env (logs go to drive_log_dir by default)
model, log_dir = train_ppo_with_custom_rewards(total_timesteps=10000)