The following code is part of "SymbXRL: Symbolic Explainable Deep Reinforcement Learning for Mobile Networks" 

Copyright - RESILIENT AI NETWORK LAB, IMDEA NETWORKS

DISCLAIMER: THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.

# IMPORTS

In [2]:
import sys
sys.path.insert(0, '../../')
import numpy as np
import gymnasium as gym
import h5py
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import matplotlib.pyplot as plt

# SAC Imports
from constants import PROJ_ADDR
from SAC.sac import SAC
from SAC.replay_memory import ReplayMemory
from SAC.smartfunc import sel_ue
import torch
from custom_mimo_env import MimoEnv, reverse_sel_ue


# Action Steering Imports
from Action_Steering.action_steering_utils import process_buffer, transform_action, do_action_steering_this_timestep, extract_decision_from_suggested 
from Action_Steering.symbolic_representation import QuantileManager, Symbolizer
from Action_Steering.experiment_constants import KPI_LIST, USERS
from Action_Steering.decision_graph import DecisionGraph

In [None]:
# Environment diagnostics: report the PyTorch build and whether a CUDA device
# is usable. Both values are gathered first, then printed in a fixed order.
cuda_version = torch.version.cuda
cuda_available = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {cuda_version}")
print(f"Is CUDA available: {cuda_available}")

# The device name can only be queried when a CUDA device is actually usable.
if cuda_available:
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

# ENVIRONMENT SETUP

In [None]:
# Load the Dataset
# The HDF5 file is opened in a context manager so the handle is closed as soon
# as both datasets have been copied into in-memory NumPy arrays. `H_file`
# stays bound afterwards, but refers to a closed file.
with h5py.File(f'{PROJ_ADDR}/A2-MIMOResourceScheduler/Datasets/LOS_highspeed2_64_7.hdf5', 'r') as H_file:
    # Item access raises KeyError right here if a dataset is missing, instead
    # of `.get()` returning None and np.array(None) producing a 0-d object
    # array that only fails later on.
    H = np.array(H_file['H'])
    se_max_ur = np.array(H_file['se_max'])

print("H shape is:", H.shape)
print("se_max_ur shape is:", se_max_ur.shape)

In [None]:
# Build the scheduling environment from the two arrays loaded from the dataset
# (`H` and `se_max_ur`).
env = MimoEnv(H, se_max_ur)

class SACArgs:
    """Attribute bag with the fixed SAC settings used by this notebook.

    The constructor takes no arguments; every field is set to a constant,
    except ``max_episode_steps`` which is ``len(H)`` of the module-level
    dataset array at the time the object is created.

    The per-field notes restate the original notebook's descriptions; how the
    SAC implementation consumes each field is not visible here.
    """

    def __init__(self):
        # --- Policy and run mode ---
        self.policy = "Gaussian"   # policy name: "Gaussian" or "Deterministic"
        self.eval = False          # False: train the agent, True: evaluate it

        # --- Core SAC coefficients ---
        self.gamma = 0.99          # discount factor
        self.tau = 5e-3            # target smoothing coefficient (critic -> critic target)
        self.lr = 3e-4             # described originally as the critic learning rate - TODO confirm
        self.alpha_lr = 3e-4       # described originally as the actor learning rate - TODO confirm
        self.alpha = 0.95          # entropy coefficient
        self.automatic_entropy_tuning = True

        # --- Reproducibility and batching ---
        self.seed = 1              # random seed
        self.batch_size = 256      # replay-memory batch size

        # --- Episode schedule ---
        self.max_episode_steps = len(H)  # one step per entry along H's first axis
        self.max_episode = 85            # maximum number of episodes

        # --- Network size and update cadence ---
        self.hidden_size = 512           # hidden size for the networks
        self.updates_per_step = 1        # number of updates per step
        # self.save_per_epochs = 15
        self.start_steps = 3000          # uniform-random action steps before the real policy runs
        self.target_update_interval = 1  # update interval for the critic target networks
        self.replay_size = 1_000_000     # replay buffer capacity

        # --- Hardware ---
        self.cuda = 1              # CUDA id to use
        self.gpu_nums = 1          # number of GPUs to use

# Instantiate the fixed hyper-parameters and seed both torch and NumPy with the
# same value before anything else is built.
args = SACArgs()
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Dimensions handed to the SAC constructor.
num_states = env.observation_space.shape[0]
# One sampled action is wrapped in a single-element list, so this is always 1;
# the sample() call itself still draws from the action space.
num_actions = len([env.action_space.sample()])
# `.n` of the action space - presumably the number of discrete actions
# (gymnasium Discrete convention); confirm against MimoEnv.
max_actions = env.action_space.n

# Build the agent and restore its weights from the checkpoint file.
agent = SAC(num_states, num_actions, max_actions, args, args.lr, args.alpha_lr)
print('SAC build finished')
# NOTE: the checkpoint filename really does end in ".pth_" (trailing underscore).
ckpt_path = f"{PROJ_ADDR}/A2-MIMOResourceScheduler/models/SACG_884.53_551_dtLOS_HS2_checkpointed.pth_"
agent.load_checkpoint(ckpt_path)
# Replay memory is created here but not referenced again in the visible cells.
memory = ReplayMemory(args.replay_size, args.seed)


# IMPLEMENTATION WITH AS

In [None]:
# ---------------------------------------------------------------------------
# State for the roll-out with action steering (AS)
# ---------------------------------------------------------------------------

# Action-steering switches: steering is attempted only when enabled and only
# once the environment step counter exceeds `start_action_steering_from`.
do_action_steering = True
start_action_steering_from = 5

# Counter of timesteps that produced a non-empty symbolic form.
buffer_addpated_timestep = 0

# Running total of the executed rewards.
score = 0

# Per-step logs filled in by the roll-out loop.
step_rewards, mean_rew = [], []
acn_str, grp_str, fin_act = [], [], []
state_data, action_reward_data, test_buffer = [], [], []
action_steering_info_df = []  # list of dicts here; converted to a DataFrame by the plotting cell

# KPIs tracked by the symbolic tools.
kpis = KPI_LIST

# Quantile manager over the KPIs plus the scheduled-user feature. After the
# reset, "scheduled_user" is fitted on the values 0 and 7 - presumably the
# bounds of that feature; confirm against the experiment constants.
qunatile_manager = QuantileManager(kpis + ['scheduled_user'])
qunatile_manager.reset()
qunatile_manager.partial_fit("scheduled_user", [0])
qunatile_manager.partial_fit("scheduled_user", [7])

# Accumulates the symbolic rows of every processed timestep.
symbolic_df = pd.DataFrame()

# Turns (state, decision) frames into their symbolic form.
symbolizer = Symbolizer(quantile_manager=qunatile_manager, kpi_list=kpis, users=USERS)

# One decision graph per group id 0, 1 and 2, all keyed on the "decision" column.
rt_decision_graph = {
    group_id: DecisionGraph(column_name="decision")
    for group_id in range(3)
}

# Start the episode and log the first observation's trailing slice (column 14 onwards).
observation, info = env.reset()
done = False
grp_str.append(observation[0, 14:])

# Roll the loaded agent through one episode. At every step the agent's choice
# is turned into a symbolic form; once action steering is active, a suggested
# decision replaces the agent's action when its expected reward is higher than
# the reward of the agent's own proposal.
while not done:
    # Agent proposal for the current observation.
    action, final_action = agent.select_action(observation)
    fin_act.append(final_action[0])
    ue_select, idx = sel_ue(final_action[0])
    acn_str.append(ue_select)
    final_reward = 0

    # Single-entry buffer so process_buffer can turn this step into DataFrames.
    buff_ac = [(observation, action)]

    curr_states_df, curr_actions_rewards_df = process_buffer(buff_ac, transform_action, sel_ue, mode=None, timestep=info['current_step'])

    # Symbolic Representation Section
    state_t_df = curr_states_df[curr_states_df['timestep'] == info['current_step']]
    # Explicit copy: this frame is edited in place further down when a steered
    # decision is written into it, so it must not be a slice of the parent.
    decision_t_df = curr_actions_rewards_df[curr_actions_rewards_df['timestep'] == info['current_step']].copy()

    symbolic_form = symbolizer.create_symbolic_form(state_t_df, decision_t_df)
    has_symbolic_form = not symbolic_form.empty

    if has_symbolic_form:
        # Add updated timestep to the symbolic_form
        buffer_addpated_timestep += 1
        symbolic_form['timestep'] = [buffer_addpated_timestep] * symbolic_form.shape[0]
        symbolic_form['reward'] = [0] * symbolic_form.shape[0]

        # Reward of the agent's own proposal, used as the baseline for steering.
        agent_proposed_reward = env.get_reward(final_action[0])
        # Per-step record; the steering fields start out equal to the agent's
        # values and are overwritten only if steering is applied.
        action_steering_info = {
            "timestep": buffer_addpated_timestep,
            "agent_decsion": ue_select,
            "agent_reward": agent_proposed_reward,
            "used_action_steering": False,
            "action_steering_decision": ue_select,
            "action_steering_reward": agent_proposed_reward
        }

    action_steered = False

    # Steering needs this step's symbolic form, baseline reward and info
    # record. Without the `has_symbolic_form` guard an empty form would hit a
    # NameError (first steps) or compare against, and mutate, the previous
    # step's stale values.
    if has_symbolic_form and do_action_steering and (info['current_step'] > start_action_steering_from):
        suggested_decision, action_steered_reward = do_action_steering_this_timestep(symbolic_form, symbolic_df, rt_decision_graph)

        # A bool in place of a decision appears to mean "no suggestion" -
        # TODO confirm against do_action_steering_this_timestep.
        if not isinstance(suggested_decision, bool) and action_steered_reward > agent_proposed_reward:
            extracted_decisoin = extract_decision_from_suggested(suggested_decision)
            # presumably the inverse mapping of sel_ue - verify in custom_mimo_env
            action_steered_final = reverse_sel_ue(extracted_decisoin)
            ue_select = extracted_decisoin

            # Update the symbolizer with action_steering data
            decision_t_df.at[0, 'action'] = extracted_decisoin

            symbolic_form_as = symbolizer.create_symbolic_form(state_t_df, decision_t_df)

            # Execute the steered action instead of the agent's one.
            next_obs, reward, done, _, info = env.step(action_steered_final)
            action_steering_info['action_steering_decision'] = extracted_decisoin
            action_steering_info['action_steering_reward'] = reward
            action_steering_info['used_action_steering'] = True
            symbolic_form['decision'] = symbolic_form_as['decision']
            symbolic_form['sched_members'] = symbolic_form_as['sched_members']

            final_reward = reward
            action_steered = True

    if not action_steered:
        # No override: execute the agent's own action.
        next_obs, reward, done, _, info = env.step(final_action[0])
        final_reward = reward

    if has_symbolic_form:
        # Record the executed reward and archive this step's symbolic rows.
        symbolic_form['reward'] = [final_reward] * symbolic_form.shape[0]
        action_steering_info_df.append(action_steering_info)
        symbolic_df = pd.concat([symbolic_df, symbolic_form], ignore_index=True)

        # Add timestep decisions to graph
        timestep_groups = symbolic_form['group'].unique()
        for group in timestep_groups:
            rt_decision_graph[group].update_graph(symbolic_form[symbolic_form['group'] == group])

    symbolizer.step()

    # Running statistics and progress line (overwritten in place via "\r").
    grp_str.append(next_obs[0, 14:])
    score += final_reward
    step_rewards.append(final_reward)
    mean_reward = np.mean(step_rewards)
    mean_rew.append(mean_reward)

    log_print = f'Step: {info["current_step"]} / {env.total_steps - 1} | Agent Action: {ue_select} | Step Reward: {final_reward} | Mean Reward: {mean_reward:.3f} | Score: {score:.3f}'
    print(log_print, end="\r")

    # Raw transition logs. NOTE(review): `action` is the agent's original
    # action even when a steered action was executed.
    state_data.append(observation.flatten())
    action_reward_data.append([ue_select, reward])
    test_buffer.append((observation, action, reward, next_obs, done))
    observation = next_obs

symbolic_df = symbolic_df.reset_index(drop=True)


# RESULTS AND PLOTS

In [None]:
# Reward over time: the agent's own proposal ("No AS") against the reward
# recorded for the action-steering decision ("With AS").
action_steering_info_df = pd.DataFrame(action_steering_info_df)

# Re-base the timesteps so that the first recorded one becomes 1.
timestep_offset = action_steering_info_df['timestep'].min() - 1
action_steering_info_df['timestep'] = action_steering_info_df['timestep'] - timestep_offset

# One trace per reward series, sharing the same x axis.
timesteps = action_steering_info_df['timestep']
trace1 = go.Scatter(
    x=timesteps,
    y=action_steering_info_df['agent_reward'],
    mode='lines',
    name='No AS Reward',
)
trace2 = go.Scatter(
    x=timesteps,
    y=action_steering_info_df['action_steering_reward'],
    mode='markers',
    name='With AS Reward',
)

# Layout with the y axis pinned to [0, 1].
layout = go.Layout(
    title='Reward Over Time',
    xaxis={'title': 'Timestep'},
    yaxis={'title': 'Reward', 'range': [0, 1]},
    height=1200,
)

# Assemble the figure, enlarge the font and display it.
fig = go.Figure(layout=layout)
fig.add_traces([trace1, trace2])
fig.update_layout(font={'size': 25})
fig.show()


In [None]:
# Per-timestep gap between the action-steering reward and the agent's reward
# (positive where the action-steering reward is the larger one).
steered_reward = action_steering_info_df['action_steering_reward']
baseline_reward = action_steering_info_df['agent_reward']
action_steering_info_df['difference'] = steered_reward - baseline_reward

trace1 = go.Scatter(
    x=action_steering_info_df['timestep'],
    y=action_steering_info_df['difference'],
    mode='lines',
    name='Reward Difference',
)

# Layout with the y axis pinned to [-1, 1].
layout = go.Layout(
    title='Difference of Reward Over Time',
    xaxis={'title': 'Timestep'},
    yaxis={'title': 'Reward', 'range': [-1, 1]},
    height=1200,
)

# Assemble the figure, enlarge the font and display it.
fig = go.Figure(layout=layout)
fig.add_trace(trace1)
fig.update_layout(font={'size': 25})
fig.show()
