# Test f(s,a,s')

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from DeepAIRL import DeepAIRL

In [10]:
# Feature matrix loaded from disk; it is sliced by the generated-trajectory cell.
fm = np.load('static/fm.npy')

# Sizes of the synthetic expert batch. Every field below is built from these,
# so the trajectory and the length reported for it cannot drift apart.
EXPERT_SAMPLES = 5
EXPERT_STATE_DIM = 12
EXPERT_N_ACTIONS = 3

# Fix the key order up front; every placeholder is overwritten below.
expert_traj = dict.fromkeys(
    ('state', 'action', 'reward', 'next_state', 'done', 'labels'))

expert_traj['state'] = np.random.rand(EXPERT_SAMPLES, EXPERT_STATE_DIM).tolist()

# Integer actions as a column vector, kept both raw and one-hot encoded.
action = np.random.randint(0, EXPERT_N_ACTIONS, EXPERT_SAMPLES).reshape(EXPERT_SAMPLES, 1)
expert_traj['encoded_action'] = tf.keras.utils.to_categorical(
    action, num_classes=EXPERT_N_ACTIONS)
expert_traj['action'] = action.tolist()

expert_traj['reward'] = np.random.rand(EXPERT_SAMPLES)
expert_traj['next_state'] = np.random.rand(EXPERT_SAMPLES, EXPERT_STATE_DIM).tolist()
expert_traj['done'] = np.zeros(EXPERT_SAMPLES)
# Every expert transition carries label 1.
expert_traj['labels'] = np.ones(EXPERT_SAMPLES)

# The length handed to DeepAIRL is the number of transitions actually present
# (it used to be a literal 10 while the batch held 5 rows).
traj_length = EXPERT_SAMPLES

In [8]:
# Generated-trajectory container, built in one literal.
# NOTE(review): 'state' reuses the expert states as-is, while the other filled
# fields have 10 rows, and the label key is 'label' here but 'labels' in
# expert_traj -- confirm both are intended before feeding this to a model.
gen_traj = {
    'state': expert_traj['state'],
    'action': np.random.randint(0, 3, 10),
    'reward': [],                          # left empty
    'next_state': np.flipud(fm[0:10]),     # first 10 rows of fm, row order reversed
    'done': np.zeros(10),
    'label': [],                           # left empty
}

In [3]:
# Build the learner on the synthetic expert batch.
# NOTE(review): the meaning of n_states=1296 is not visible in this notebook.
alg = DeepAIRL(
    traj=expert_traj, traj_length=traj_length,
    env_flag='PF',
    user_num=3, rbg_num=1, feature_dim=4,
    n_states=1296, n_actions=3,
)

In [13]:
# Evaluate alg.DDPGAgent.actor_eval_net on the expert states; the bare
# expression on the last line displays the result in the notebook.
# NOTE(review): the recorded output has one row per state and three columns
# that look like per-action probabilities -- confirm against the actor's output layer.
policy = alg.DDPGAgent.actor_eval_net.predict(expert_traj['state'])
policy

array([[0.21906003, 0.4625738 , 0.31836614],
       [0.20018406, 0.45232546, 0.34749043],
       [0.17363249, 0.47224858, 0.35411894],
       [0.214531  , 0.4788761 , 0.30659288],
       [0.16707443, 0.48981553, 0.34311008]], dtype=float32)

In [15]:
# Largest entry of each policy row (one value per state).
policy.max(axis=1)

array([0.4625738 , 0.45232546, 0.47224858, 0.4788761 , 0.48981553],
      dtype=float32)

In [19]:
# Discriminator input: state features followed column-wise by the one-hot action.
sa_pairs = np.hstack((expert_traj['state'], expert_traj['encoded_action']))
sa_pairs.shape

(10, 15)

In [17]:
# One manual discriminator update:
#   f(s, a, s') = r(s, a) + gamma * h(s') - h(s)
#   D(s, a, s') = exp(f) / (exp(f) + part_policy)
optimizer = tf.keras.optimizers.Adam(0.01)

# Only the forward pass belongs inside the tape context. Calling
# tape.gradient() inside the context of a persistent tape records the backward
# ops as well, which TensorFlow warns is significantly less efficient.
with tf.GradientTape(persistent=True) as tape:
    alg.reward = alg.reward_model(sa_pairs.astype('float32'))
    shaping_term_s = alg.shaping_term(expert_traj['state'])
    shaping_term_ns = alg.shaping_term(expert_traj['next_state'])
    f_sas = alg.reward + alg.gamma * shaping_term_ns - shaping_term_s

    # NOTE(review): get_part_policy is not defined in any visible cell, so this
    # line raises NameError on a fresh kernel. Presumably it should return the
    # policy term of the discriminator for each transition -- define it (or
    # import it) before running this cell.
    part_policy = get_part_policy()

    exp_f = tf.exp(f_sas)
    D_sas = exp_f / (exp_f + part_policy)

    alg.loss = tf.keras.losses.binary_crossentropy(y_true=expert_traj['labels'], y_pred=D_sas)

# Backward pass and parameter update, outside the recording context.
gradients = tape.gradient(alg.loss, alg.reward_model.trainable_variables)
optimizer.apply_gradients(zip(gradients, alg.reward_model.trainable_variables))

# A persistent tape holds its resources until the reference is dropped.
del tape



# Test f(s,a)

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from DeepAIRL import DeepAIRL
from simulator.ProportionalFairness import load_av_ue_info, PROPORTIONALFAIRNESS

def user_info_threshold(user_info, threshold_min, threshold_max):
    """Keep the records whose 'buffer' value lies strictly between the thresholds.

    Args:
        user_info: sequence of mappings, each with a 'buffer' entry.
        threshold_min: exclusive lower bound on 'buffer'.
        threshold_max: exclusive upper bound on 'buffer'.

    Returns:
        A new list holding the matching records (the same objects, in their
        original order).
    """
    return [
        record
        for record in user_info
        if record['buffer'] > threshold_min and record['buffer'] < threshold_max
    ]

class Replayer:
    """Fixed-capacity ring buffer of (state, action, reward, label) rows.

    Rows are kept in a pandas DataFrame pre-allocated with ``capacity`` empty
    rows. Once ``capacity`` rows have been written, new rows overwrite the
    oldest ones.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer holding at most ``capacity`` rows."""
        self.memory = pd.DataFrame(index=range(capacity),
                                   columns=['state', 'action', 'reward', 'label'])
        self.i = 0  # index of the next row to write
        self.count = 0  # rows written so far, capped at capacity
        self.capacity = capacity

    def store(self, *args):
        """Write one row; ``args`` supplies one value per column, in column order."""
        self.memory.loc[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        """Draw ``size`` stored rows uniformly at random, with replacement.

        Sampling is read-only: the buffer keeps all of its rows, so it can
        still be written to and inspected afterwards.

        Returns:
            A generator yielding one stacked numpy array per column, in
            column order (state, action, reward, label).

        Raises:
            ValueError: if nothing has been stored yet.
        """
        if self.count == 0:
            raise ValueError('cannot sample from an empty Replayer')
        # Row labels 0..count-1 are the ones store() has filled.
        indices = np.random.choice(self.count, size=size)
        # Snapshot the chosen rows now so later store() calls cannot change
        # what the returned generator yields.
        batch = self.memory.loc[indices]
        return (np.stack(batch[field]) for field in batch.columns)
# Load the user records and keep those whose buffer lies strictly inside (1e6, 1e7).
av_ues_info = user_info_threshold(
    load_av_ue_info(), threshold_min=1e+6, threshold_max=1e+7)

pf_env = PROPORTIONALFAIRNESS(lambda_avg=None, lambda_fairness=None, reward_flag=None)

# Users taking part in the rollout: USER_NUM consecutive indices from INITIAL_USER_START.
INITIAL_USER_START = 0
USER_NUM = 3
av_ues_idx = [INITIAL_USER_START + offset for offset in range(USER_NUM)]

state = pf_env.reset(av_ues_info, av_ues_idx)
replayer = Replayer(capacity=500)

# Roll the environment forward for 500 TTIs, storing every transition with label 1.0.
# The returned `done` flag is not acted upon.
for tti in range(1, 501):
    action = pf_env.action()[0]
    next_state, reward, done, info = pf_env.step(None, 0, 0)
    replayer.store(state.reshape(-1), action, reward, 1.0)
    state = next_state

In [2]:
# Drop rows with missing values and renumber the remaining rows from 0.
replayer.memory = replayer.memory.dropna().reset_index(drop=True)

# Train on the cleaned replay memory; its row count is the trajectory length.
alg = DeepAIRL(
    traj=replayer.memory, traj_length=len(replayer.memory),
    env_flag='PF',
    user_num=3, rbg_num=1, feature_dim=4,
    n_states=1296, n_actions=3,
)

In [3]:
# One manual pass through the discriminator pipeline.
# NOTE(review): these calls presumably read and write state on `alg`, so keep
# the statement order as-is unless DeepAIRL says otherwise.
alg.execute_policy()
alg.traj_batch = alg.get_batch()
# One-hot encode the batch actions with alg.user_num classes.
encoded_actions = tf.keras.utils.to_categorical(alg.traj_batch['action'], num_classes=alg.user_num)
# Discriminator input: batch states followed column-wise by the one-hot actions.
sa_pairs = np.concatenate((alg.traj_batch['state'], encoded_actions), axis=1)
alg.D_output = alg.discriminator.predict(sa_pairs)
# NOTE(review): update_reward() and logistic_loss() take no arguments here, so
# they presumably consume alg.D_output / alg.traj_batch set above -- confirm in DeepAIRL.
alg.reward = alg.update_reward()
alg.loss = alg.logistic_loss()

# Normalize state

In [1]:
import numpy as np
from DeepAIRL import DeepAIRL
from collect import COLLECT
import logging
import os

# Suppress every log record at WARNING severity or lower.
logging.disable(logging.WARNING)
# Set the CUDA device mask to device 0.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

if __name__ == '__main__':
    # Problem dimensions.
    USER_NUM, RBG_NUM, FEATURE_DIM = 3, 1, 4
    N_STATES = None
    N_ACTIONS = USER_NUM

    # Expert-data volume and training budget.
    TRAJ_NUMBER, TRAJ_LENGTH = 5, 50
    EPOCH = 100

    # Environments to process. `collect`, `expert_traj` and `alg` are rebound
    # on every pass, so only the last flag's objects remain afterwards.
    ENV_FLAG = ['PF']

    for flag in ENV_FLAG:
        # The collector's replay capacity is sized to hold every step of every trajectory.
        collect = COLLECT(
            env_flag=flag,
            user_num=USER_NUM, rbg_num=RBG_NUM, feature_dim=FEATURE_DIM,
            traj_number=TRAJ_NUMBER, traj_length=TRAJ_LENGTH,
            replayer_capacity=TRAJ_NUMBER * TRAJ_LENGTH,
        )
        expert_traj = collect.generate()

        # The learner is told the number of rows actually collected.
        alg = DeepAIRL(
            env_flag=flag,
            user_num=USER_NUM, rbg_num=RBG_NUM, feature_dim=FEATURE_DIM,
            n_states=N_STATES, n_actions=N_ACTIONS,
            traj=expert_traj, traj_length=expert_traj.shape[0],
            epochs=EPOCH, batch_size=32,
        )

INFO: Generating trajectories!
INFO: Trajectories generation accomplished, time cost=0.5716619999999999s
