# pip install

In [None]:
import os
from datetime import datetime
import pandas as pd


In [None]:
# Root directory for TensorBoard logs, checkpoints and saved models.
# The path must end with a separator: later cells build file names by plain
# string concatenation (e.g. tensorboard_log + 'running/').
tensorboard_log = '/xx/' # replace with your directory

# Create the directory up front; exist_ok avoids an error on re-runs.
os.makedirs(tensorboard_log, exist_ok=True)


In [None]:

# Accumulator for the evaluation results of every dataset / target recall.
df_all_targets = pd.DataFrame()

# Default model label; it is re-assigned in the "run experiments" cell.
model_name = 'PPO'


In [None]:
pip install gymnasium


In [None]:
pip install "stable-baselines3[extra]>=2.0.0a4"

In [None]:
import numpy as np

import gymnasium as gym
from gymnasium import spaces

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env, SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback


## set reproducibility

In [None]:
import torch
import numpy as np
import random
import os
# NOTE: the variable is PYTHONHASHSEED (the original key was misspelt and had
# no effect). Python reads it at interpreter start-up only, so setting it here
# influences child processes rather than the already-running kernel.
os.environ['PYTHONHASHSEED'] = '0'
# Workspace configuration cuBLAS asks for when deterministic GPU results are wanted.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Seed every random number generator used in this notebook with the same value.
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# utils functions

In [None]:
    def load_topic_target_location(topic_id,target_recall):
        """Find the first window at which a topic reaches ``target_recall``.

        The ranking of ``topic_id`` is cut into windows by ``make_windows`` and
        the relevant documents per window are counted by ``get_rel_cnt_rate``.
        The windows are then scanned in order until the share of relevant
        documents seen so far is at least ``target_recall``.

        Reads the module-level ``doc_rank_dic`` and ``rank_rel_dic`` and the
        helper ``make_windows`` (defined outside this cell).

        Args:
            topic_id: key of the topic in ``doc_rank_dic`` / ``rank_rel_dic``.
            target_recall: recall level to reach, as a fraction (e.g. 0.9).

        Returns:
            Tuple ``(topic_id, n_docs, n_rel, prev, target_location)`` where
            ``n_rel`` is the number of relevant documents, ``prev`` is
            ``n_rel / n_docs`` and ``target_location`` is the 0-based window
            index, or -1 when the target is never reached.
        """
        vector_size = 100  # number of windows requested from make_windows

        n_docs = len(doc_rank_dic[topic_id])  # total n. docs in topic
        rel_list = rank_rel_dic[topic_id]  # list binary rel of ranked docs

        # get batches; the end index of the first window is used as window size
        windows = make_windows(vector_size, n_docs)
        window_size = windows[0][1]

        # only the per-window relevant counts are needed here
        rel_cnt, _rel_rate, _n_docs_wins = get_rel_cnt_rate(windows, window_size, rel_list)

        n_rel = sum(rel_cnt)
        prev = n_rel / n_docs

        # Keep a running total instead of re-summing the prefix for every window,
        # and stop at the first window that satisfies the target.
        target_location = -1
        seen_rel = 0
        for i in range(vector_size):
            if i < len(rel_cnt):
                seen_rel += rel_cnt[i]
            if seen_rel / n_rel >= target_recall:
                target_location = i
                break

        return topic_id, n_docs, n_rel, prev, target_location

In [None]:
def get_rel_cnt_rate(windows, window_size, rel_list):
    """Summarise the relevance labels of a ranking window by window.

    Args:
        windows: iterable of ``(start, end)`` index pairs; ``end`` is exclusive.
        window_size: divisor used to turn a window's count into a rate.
        rel_list: sequence of relevance labels in ranking order.

    Returns:
        Tuple ``(x, y, z)`` of numpy arrays with one entry per window:
        ``x`` the sum of the labels in the window, ``y`` that sum divided by
        ``window_size`` and ``z`` the number of documents actually in the
        window (a window may be shorter than ``window_size``).
    """
    # x-values: count of relevant documents in each window (summed once and
    # reused for the rate instead of slicing and summing a second time)
    x = np.array([np.sum(rel_list[w_s:w_e]) for (w_s, w_e) in windows])

    # y-values: rate of relevant documents, relative to the nominal window size
    y = x / window_size

    # z-values: number of documents in each window
    z = np.array([len(rel_list[w_s:w_e]) for (w_s, w_e) in windows])

    return (x, y, z)

In [None]:
%matplotlib inline

# IMPORT LIBRARIES
import sys
import os
import numpy as np
import pandas as pd
import math
from scipy.optimize import curve_fit
import random
import glob
import subprocess
import matplotlib.pyplot as plt
from scipy.integrate import simps
from scipy.stats import norm
import os

import scipy

import seaborn as sns


In [None]:
# Home directory of the project: it holds the `utils` package imported below
# and is the base that load_rel_data / load_run_data join data paths onto.
DIR = '/xxhome/' # replace with utils home directory


# Make the `utils` package importable from this notebook.
sys.path.append(DIR)

# import utils fns
from utils.read_data_fns import *
from utils.eval_fns import *

In [None]:
# LOAD TOPIC RELEVANCE DATA
def load_rel_data(qrels):
  """Read a qrels file located under ``DIR`` and index it by query.

  Args:
      qrels: path of the qrels file, joined onto ``DIR`` with ``os.path.join``.

  Returns:
      Tuple ``(qrel_fname, query_rel_dic)``: the joined file name and the
      dictionary built from the file's lines by ``make_rel_dic``.
  """
  qrel_fname = os.path.join(DIR, qrels)
  with open(qrel_fname, 'r') as qrel_file:
    qrel_lines = qrel_file.readlines()

  # make_rel_dic comes from the utils package (relevant docids per queryid)
  return qrel_fname, make_rel_dic(qrel_lines)

In [None]:
# LOAD RUN DATA
def load_run_data(run):
  """Read a ranking (run) file located under ``DIR`` and index it by query.

  Uses the module-level ``query_rel_dic`` produced by ``load_rel_data``.

  Args:
      run: path of the run file, joined onto ``DIR`` with ``os.path.join``.

  Returns:
      Tuple ``(doc_rank_dic, rank_rel_dic)``: the ranked docids per query
      (``make_rank_dic``) and the relevance of those ranked documents per
      query (``make_rank_rel_dic``).
  """
  run_fname = os.path.join(DIR, run)
  with open(run_fname, 'r') as run_file:
    run_lines = run_file.readlines()

  ranked_docs = make_rank_dic(run_lines)
  ranked_rels = make_rank_rel_dic(query_rel_dic, ranked_docs)
  return ranked_docs, ranked_rels

# clf functions

In [None]:
# import
import pandas as pd
import numpy as np

#modeling
from sklearn.linear_model import LogisticRegression

#evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# import NLP libraries
### import libraries
from sklearn.feature_extraction.text import TfidfVectorizer



import pickle
from scipy.sparse import csr_matrix
from scipy.sparse import vstack


def run_classification_model_tfidf(query_id, n_samp_docs, n_docs, labels, features, clf_name, imbalance_handle):
    """Train a logistic regression on the examined documents and label the rest.

    The first ``n_samp_docs`` rows are the training set; rows
    ``n_samp_docs:n_docs`` are the documents to be predicted.

    Args:
        query_id: topic identifier, forwarded to ``train_model_save_threshold``.
        n_samp_docs: number of leading documents used for training.
        n_docs: total number of documents of the topic.
        labels: relevance labels in ranking order.
        features: sequence of sparse feature rows, stacked with ``vstack``.
        clf_name: classifier label, forwarded to ``train_model_save_threshold``.
        imbalance_handle: class-imbalance strategy; only
            ``'cost_sensitive_manual'`` is implemented.

    Returns:
        Tuple ``(accuracy, f1, predictions)`` with ``predictions`` cast to int.

    Raises:
        ValueError: if ``imbalance_handle`` is not supported. (Previously this
            surfaced later as an unbound ``clf`` variable.)
    """
    if imbalance_handle != 'cost_sensitive_manual':
        raise ValueError(f"unsupported imbalance_handle: {imbalance_handle!r}")

    features = vstack(features) # combine sparse rows into single sparse matrix

    # split train & test sets
    train_x = features[0:n_samp_docs]
    train_y = labels[0:n_samp_docs]
    valid_x = features[n_samp_docs:n_docs]
    valid_y = labels[n_samp_docs:n_docs]

    # calculate relv, non-relv
    relv_cnt = sum(train_y)
    non_relv_cnt = len(train_y) - relv_cnt

    # The majority class is down-weighted by the imbalance ratio
    # (minority count / majority count); the minority class keeps weight 1.
    if relv_cnt >= non_relv_cnt:
        majority_class, minority_class = 1, 0
        IR = non_relv_cnt/relv_cnt
    else:
        majority_class, minority_class = 0, 1
        IR = relv_cnt/non_relv_cnt
    class_weight = {majority_class: IR, minority_class: 1}

    # NOTE(review): `solver` is not defined in this cell; presumably it is a
    # module-level name (e.g. from the `utils` star-imports) - confirm.
    clf = LogisticRegression(solver=solver, random_state=0, C=1.0, max_iter=10000, class_weight = class_weight)

    accuracy, f1, predictions = train_model_save_threshold(query_id,clf_name, clf, train_x, train_y, valid_x, valid_y)

    predictions = predictions.astype(int)

    return accuracy, f1, predictions



def train_model_save_threshold(topic_id, clf_name, classifier, feature_vector_train, label, feature_vector_valid, valid_y, is_neural_net=False):
    """Fit ``classifier`` and score its thresholded predictions.

    Args:
        topic_id: unused here; kept for the callers' signature.
        clf_name: unused here; kept for the callers' signature.
        classifier: estimator exposing ``fit`` and ``predict_proba``.
        feature_vector_train: training features.
        label: training labels.
        feature_vector_valid: features of the documents to predict.
        valid_y: true labels of the predicted documents.
        is_neural_net: unused here; kept for the callers' signature.

    Returns:
        Tuple ``(accuracy, macro_f1, predictions)``; both scores are
        percentages rounded to 3 decimals, ``predictions`` is a boolean array.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # probability cut-off for the positive class (the models' default)
    model_threshold = 0.5

    # column 1 of predict_proba is the probability of class 1
    predictions = (classifier.predict_proba(feature_vector_valid)[:,1] >= model_threshold).astype(bool)

    # Use the scorers imported at the top of this cell; the previous
    # `metrics.` prefix referred to a module this notebook never imports.
    acc = accuracy_score(valid_y, predictions) * 100
    f1 = f1_score(valid_y, predictions, average='macro') * 100

    return round(acc,3), round(f1,3), predictions


# Callback functions

In [None]:
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np

class EarlyStoppingCallback(BaseCallback):
    """Stop training once the episode reward has stopped improving.

    After every step the callback looks at the info dict of the LAST
    environment only; when that dict carries an ``'episode'`` entry its reward
    is compared with the best reward seen so far.

    NOTE(review): episodes finishing in the other environments are not
    inspected, and ``patience`` is counted in such finished episodes, not in
    timesteps - confirm this is intended.

    Args:
        patience: number of non-improving episodes tolerated before stopping.
        min_delta: minimum gain over the best reward that counts as improvement.
        verbose: a value above 0 prints a message when training is stopped.
    """

    def __init__(self, patience: int, min_delta: float = 0.001, verbose: int = 0):
        super().__init__(verbose)
        self.patience = patience
        self.min_delta = min_delta
        self.best_mean_reward = -np.inf
        self.no_improvement_steps = 0

    def _on_step(self) -> bool:
        """Return False to stop training, True to carry on."""
        last_info = self.locals['infos'][-1]

        # nothing to evaluate unless an episode has just ended
        if 'episode' not in last_info:
            return True

        mean_reward = np.mean(last_info['episode']['r'])

        if mean_reward > self.best_mean_reward + self.min_delta:
            # improvement: remember it and restart the patience counter
            self.best_mean_reward = mean_reward
            self.no_improvement_steps = 0
        else:
            self.no_improvement_steps += 1

        if self.no_improvement_steps <= self.patience:
            return True  # continue training

        if self.verbose > 0:
            print("Early stopping triggered: Training stopped at step {}".format(self.num_timesteps))
        return False  # stop the training




# TAREnv class

In [None]:
# reward functions to handle different powers m, and n
def cumulative_reward(x, T, B, m, n):
    """Total reward collected after reaching position ``x``.

    Rises as ``(x / T) ** m`` up to the target ``T`` and falls as
    ``((B - x) / (B - T)) ** n`` beyond it, ``B`` being the last position.
    """
    return (x / T) ** m if x <= T else ((B - x) / (B - T)) ** n

def stepwise_reward(x, T, B, m, n):
    """Reward earned by the single move from position ``x - 1`` to ``x``.

    This is the increment of the cumulative reward curve: positive up to the
    target ``T`` and negative once the target has been passed.
    """
    if x <= T:
        scale = 1 / T ** m
        increment = x ** m - (x - 1) ** m
        return scale * increment

    scale = 1 / (B - T) ** n
    increment = (B - x) ** n - (B - (x - 1)) ** n
    return scale * increment

In [None]:

import numpy as np
import gymnasium as gym
from gymnasium import spaces


# Cache of the state vectors built by TAREnv.load_data, keyed by topic id.
ALL_VECTORS_PREDICTIONS_DIC = {}
# Topic ids whose vectors are already stored in ALL_VECTORS_PREDICTIONS_DIC.
SELECTED_TOPICS_WITHOUT_TARGET = []
# Set to True by the cache-loading cells when the cache file was found on disk.
ALL_VECTORS_PREDICTIONS_DIC_EXIST = False

SELECTED_TOPICS = [] # keep track of all randomly selected topics
SELECTED_TOPICS_TARGET = []
class TAREnv(gym.Env):
    """Gymnasium environment in which an agent decides when to stop a review.

    The ranking of one topic is split into ``size`` windows. The agent starts
    at window 0 and, at every step, either moves on to the next window
    (``NEXT`` = 0) or stops (``STOP`` = 1).

    An observation is a float32 vector of ``size + 2`` values: one value per
    window, then the index of the current window, then the target recall.

    The class relies on module-level names defined elsewhere in the notebook:
    ``doc_rank_dic``, ``rank_rel_dic``, ``rank_text_dic``, ``rank_tfidf_dic``,
    ``make_windows``, ``TRAINING``, ``SELECTED_TOPICS_ORDERERD``,
    ``SELECTED_TOPICS_ORDERERD_INDEX`` and the reward exponents ``m`` / ``n``.

    NOTE(review): the observation space is declared as ``Box(-1, 1)`` although
    the window index stored in the observation grows up to ``size - 1``.
    NOTE(review): ``first_step_flag`` and ``done`` are set once and are not
    restored by ``reset``, so only the first episode of an instance keeps the
    agent on window 0 for its first ``NEXT`` - confirm this is intended.
    """

    def __init__(self, target_recall = None, topics_list = None, topic_id= None, size=100 , render_mode=None):
        """Pick a topic, load its data and reset the environment.

        Args:
            target_recall: recall level to reach; ignored while ``TRAINING``
                is set, where it is parsed from the topic entry instead.
            topics_list: topics to draw from when ``topic_id`` is None. While
                ``TRAINING`` is set the entries are ``"<topic>_<recall>"``.
            topic_id: fixed topic for a single environment.
            size: number of windows the ranking is split into.
            render_mode: accepted for API compatibility; not used.

        Raises:
            RuntimeError: if ``topic_id`` is None and every topic of
                ``topics_list`` has already been handed out.
        """
        global SELECTED_TOPICS_ORDERERD_INDEX

        self.size = size  # The size of the ranking relv vector

        # observation: the relv vector plus current index and target recall
        self.observation_space = spaces.Box(-1,  1, shape=(size+2,), dtype=np.float32) #with clf

        # 2 actions, corresponding to "next", "stop"
        self.action_space = spaces.Discrete(2)

        # Set up some properties
        self.done = False
        self.reward = 0
        self.hit, self.miss = 0, 0

        # Set up the TAR
        self.vector_size = size

        # current position and stop position
        self._agent_location = 0
        self._target_location = -1 # -1 until load_data finds the target window

        # keep predicted recall so far
        self.recall = 0
        self.target_recall = target_recall

        # topic data
        self.topics_list = topics_list
        self.topic_id = topic_id # for single env

        self.windows = 0
        self.window_size = 0

        # for vec env: every instance draws a topic that was not handed out yet
        if topic_id is None:
            # Rejection sampling below can only end when an unused topic is
            # left; fail loudly instead of looping forever when none is.
            if (topics_list
                    and len(SELECTED_TOPICS) <= len(topics_list)
                    and all(t in SELECTED_TOPICS for t in topics_list)):
                raise RuntimeError(
                    "every topic in topics_list is already in SELECTED_TOPICS; "
                    "reset SELECTED_TOPICS before creating more environments")

            while len(SELECTED_TOPICS) <= len(topics_list):
                t = random.choice(topics_list)
                if t in SELECTED_TOPICS:
                    continue
                SELECTED_TOPICS.append(t)
                if TRAINING:
                    # training entries carry their own target: "<topic>_<recall>"
                    self.topic_id = t.split('_',1)[0]
                    self.target_recall = float(t.split('_',1)[1])
                else:
                    # testing: plain topic id, target recall from the argument
                    self.topic_id = t
                    self.target_recall = target_recall
                break

            # While training, the random draw above is overridden so that the
            # same ordered list of topics is used across different runs.
            if TRAINING:
                t = SELECTED_TOPICS_ORDERERD[SELECTED_TOPICS_ORDERERD_INDEX]
                self.topic_id = t.split('_',1)[0]
                self.target_recall = float(t.split('_',1)[1])
                SELECTED_TOPICS_ORDERERD_INDEX += 1

        self.n_docs = 0
        self.rel_cnt = []
        self.rel_rate = []
        self.n_samp_docs = 0
        self.n_samp_docs_after_target =  0
        self.n_docs_wins = []
        self.rel_list = 0
        self.text_list = ''
        self.tfidf_list = []
        self.all_vectors = []
        self.all_vectors_target = []
        self.all_vectors_prediction = []

        # Define constants for clearer code
        self.NEXT = 0
        self.STOP = 1

        self.load_data_flag = True
        self.load_data(self.topic_id)

        self.first_step_flag = True

        # initialize environment each time for each topic
        self.reset()

    def load_data(self, topic_id):
        """Build every possible observation vector of the current topic.

        Does nothing unless the agent is at window 0. The ``topic_id``
        argument is ignored; ``self.topic_id`` is always used.

        Row ``i`` of ``self.all_vectors_target`` is the observation shown at
        window ``i``. Its per-window part comes from ``get_clf_predictions``,
        or from the module-level cache when the topic was processed before.
        """
        # load data only while the agent is at the start position
        if self._agent_location != 0:
            return

        all_vectors = [[-1]*self.vector_size for i in range(self.vector_size)]
        # +2 columns for current index & target; rows stay at vector_size
        all_vectors_target = [[-1]*(self.vector_size+2) for i in range(self.vector_size)]

        topic_id = self.topic_id

        n_docs = len(doc_rank_dic[topic_id])  # total n. docs in topic
        rel_list = rank_rel_dic[topic_id]  # list binary rel of ranked docs
        text_list = rank_text_dic[topic_id]  # list text feature of ranked docs
        tfidf_list = rank_tfidf_dic[topic_id]  # list tfidf feature of ranked docs

        # get batches; the end index of the first window is used as window size
        windows = make_windows(self.vector_size, n_docs)
        window_size = windows[0][1]

        # calculate batches
        rel_cnt,rel_rate, n_docs_wins = get_rel_cnt_rate(windows, window_size, rel_list)

        self.n_docs = n_docs
        self.rel_cnt = rel_cnt
        self.rel_rate = rel_rate
        self.n_docs_wins = n_docs_wins
        self.rel_list = rel_list
        self.text_list = text_list
        self.tfidf_list = tfidf_list
        self.windows = windows
        self.window_size = window_size

        total_rel = sum(self.rel_cnt)  # invariant of the loop below
        topic_is_cached = self.topic_id in SELECTED_TOPICS_WITHOUT_TARGET

        # update all vectors with all possible examined states
        for i in range(self.vector_size):
            all_vectors[i][0:i+1] = rel_rate[0:i+1] # update examined part

            all_vectors_target[i][-1] = self.target_recall # target recall is the last element
            all_vectors_target[i][-2] = i # mark current examined index

            # examined part only (what the vector would hold without the clf)
            all_vectors_target[i][0:i+1] = rel_rate[0:i+1]

            if not topic_is_cached:
                # examined part from true labels, non-examined part from clf predictions
                all_vectors_target[i][0:-2] = self.get_clf_predictions(i)
            else:
                # the classifier already ran for this topic: reuse its output
                saved_all_vectors_target = ALL_VECTORS_PREDICTIONS_DIC[self.topic_id]
                all_vectors_target[i][0:-2] = saved_all_vectors_target[i][0:-2]

            # mark only the 1st window at which the target recall is achieved
            if self._target_location == -1 and (sum(self.rel_cnt[0:i+1]) / total_rel) >= self.target_recall:
                self._target_location = i

        # update after passing through all batches
        SELECTED_TOPICS_WITHOUT_TARGET.append(self.topic_id)
        ALL_VECTORS_PREDICTIONS_DIC[self.topic_id] = all_vectors_target

        self.all_vectors = all_vectors
        self.all_vectors_target = all_vectors_target

    def get_clf_predictions(self,i):
        """Per-window relevance rates after examining windows ``0..i``.

        Documents of the examined windows keep their true labels; the labels
        of all remaining documents are predicted by
        ``run_classification_model_tfidf``.
        """
        # count of documents in the examined windows
        tmp_n_samp_docs = int(np.sum(self.n_docs_wins[0:i+1]))

        clf_name = 'LR-TFIDF'
        dataset_imbalance_handle = 'cost_sensitive_manual'
        acc, f1, predictions = run_classification_model_tfidf(self.topic_id, tmp_n_samp_docs, self.n_docs, self.rel_list, self.tfidf_list, clf_name, dataset_imbalance_handle)

        # The predictions start at document `tmp_n_samp_docs`, so exactly that
        # many true labels go in front of them. (The former `+1` leaked one
        # unexamined label and shifted every prediction by one document;
        # caches written before this fix should be regenerated.)
        rel_pred_list = list(self.rel_list[0:tmp_n_samp_docs])+list(predictions)
        rel_cnt_wins, rel_pred_wins, n_docs_wins  = get_rel_cnt_rate(self.windows, self.window_size, rel_pred_list)

        return rel_pred_wins # examined & non-examined part with true labels & clf predictions

    def _get_obs(self):
        """Observation vector of the window the agent is currently at."""
        return  np.array(self.all_vectors_target[self._agent_location], dtype=np.float32)

    def _get_info(self):
        """Diagnostics of the current state (recall, cost, distance to target)."""
        return {
                "topic_id": self.topic_id,
                "recall": round((self.recall),3),
                "cost": round(((self._agent_location +1 )/100),3), # each vec pos == 1% of collection +1 bc 1st loc [0] is 1% cost
                "e_cost": (round((((self._agent_location)-(self._target_location))/100),3)), # CostDiff
                "distance": (self._agent_location - self._target_location),
                "agent": (self._agent_location),
                "target": (self._target_location),
                "agent_vector": np.array(self.all_vectors_target[self._agent_location]),
                "terminal_observation": np.array(self.all_vectors_target[self._target_location])} # target_vector named terminal_observation needed for SB3 vec_env

    def reset(self, seed=0, options=None):
        """Move the agent back to window 0.

        Args:
            seed: accepted for API compatibility; not used.
            options: accepted for API compatibility; not used.

        Returns:
            Tuple ``(observation, info)``.
        """
        # re-load data 1st time for vec_env
        if self.load_data_flag:
            self.load_data(self.topic_id)
            self.load_data_flag = False

        self._agent_location = 0
        self.n_samp_docs =  sum(self.n_docs_wins[0:self._agent_location+1])
        self.n_samp_docs_after_target =  sum(self.n_docs_wins[self._target_location:self._agent_location+1])
        self.recall = sum(self.rel_cnt[0:self._agent_location+1]) / sum(self.rel_cnt)

        observation = self._get_obs()
        info = self._get_info()

        self.reward = 0

        return observation, info

    def step(self, action):
        """Apply ``action`` and return the Gymnasium 5-tuple.

        ``STOP`` terminates the episode. The episode is truncated when the
        agent is on the last window, or takes ``NEXT`` from the second-to-last
        one. The reward is ``stepwise_reward`` evaluated at the (1-based)
        position of the agent after the move.
        """
        global m,n # reward exponents, reusable for linear vs nonlinear

        truncated = False
        terminated = False

        if self._agent_location >= self.vector_size-1:
            self.done = True
            truncated = True

        if self._agent_location >= self.vector_size-2 and action == self.NEXT:
            self.done = True
            truncated = True

        if action == self.STOP:
            terminated = True

        if action == self.NEXT:
            if self.first_step_flag:
                # dont move next, examine 1st portion at pos [0]
                self.first_step_flag = False
            else:
                self._agent_location += 1 # move to next portion (examined)

            self.n_samp_docs =  sum(self.n_docs_wins[0:self._agent_location+1])
            self.n_samp_docs_after_target =  sum(self.n_docs_wins[self._target_location:self._agent_location+1])
            self.recall = sum(self.rel_cnt[0:self._agent_location+1]) / sum(self.rel_cnt)

        observation = self._get_obs()
        info = self._get_info()

        # 1-based positions give a more easily readable formula
        reward_target_loc = self._target_location+1
        reward_agent_loc = self._agent_location+1
        reward_vector_size = self.vector_size

        self.reward = stepwise_reward(reward_agent_loc, reward_target_loc, reward_vector_size, m, n)

        return observation, self.reward, terminated, truncated, info

    def close(self):
        """Nothing to release."""
        return




# run experiments

In [None]:
TRAINING = True

# Reward exponents read by TAREnv.step (m before the target, n after it).
# NOTE: the three configurations below are assigned one after the other, so
# only the LAST one (balanced) is in effect; comment out or reorder the
# others to select a different reward shape.

# penalise undershooting, tolerate overshooting
m = 4
n = 1/4
model_name = 'non_lin'

# penalise overshooting, tolerate undershooting
m = 1/4
n = 4
model_name = 'non_lin_cost_obj'


# balanced
m = 1
n = 1
model_name = 'lin_clf_lr'




total_runs = 5 # only relevant when predict() is called with deterministic=False (stochastic)


# target recall levels the training topics are paired with
target_recalls = [1.0, 0.99, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7]




In [None]:
# Accumulator for the evaluation results of every dataset / target recall.
df_all_targets = pd.DataFrame()

# Working directory; must end with a separator because later cells build
# paths by string concatenation (DRL_DIR + 'data/' + ...).
DRL_DIR = '/xx/' # replace with working directory


## Training

In [None]:
# TRAINING is read by TAREnv.__init__ to decide how topics are selected.
TRAINING = True
# Passed to load_run_data_with_text_tfidf and used in the cache file name.
topic_set = 'training'




In [None]:
# Dataset label that becomes part of the TensorBoard run name.
training_dataset = 'CLEF'


#### sort topics by target location

In [None]:
dataset_name = 'CLEF'

# Data paths are relative to DIR. A leading "/" would make
# os.path.join(DIR, qrels) inside load_rel_data discard DIR altogether;
# the testing cells use the same relative layout for this qrels file.
qrels = "data/qrels/CLEF2017_qrels.txt"

qrel_fname, query_rel_dic = load_rel_data(qrels)

run = "data/rankings/clef2017_training_ranking.txt"

# rankings plus, per topic, the relevance / text / tf-idf lists of the ranked docs
doc_rank_dic, rank_rel_dic, rank_text_dic, rank_tfidf_dic = load_run_data_with_text_tfidf(run, dataset_name, topic_set)

topics_list = make_topics_list(doc_rank_dic,1)

In [None]:
# Remove topics with fewer documents than the 100-element state vector
# (CD008760 contains 64 items only). Filtering by size instead of slicing off
# the last element keeps this cell idempotent: re-running it no longer drops
# one more (valid) topic each time.
topics_list = [t for t in topics_list if len(doc_rank_dic[t]) >= 100]


In [None]:
# One row per (target recall, topic) pair: topic statistics plus the window
# at which the target recall is first reached.
topics_info = []

for target_recall in target_recalls:
  for t in topics_list:
    topic_stats = load_topic_target_location(t,target_recall)
    topic_id, n_docs, n_rel, prev, target_location = topic_stats
    # combined key, later split again by TAREnv while training
    topic_id_target_recall = topic_id + "_" + str(target_recall)
    print(topic_id, n_docs, n_rel, round(prev,3), target_location)
    row = [topic_id, n_docs, n_rel, prev, target_recall, target_location, topic_id_target_recall]
    topics_info.append(row)

topics_info

In [None]:

# Tabulate the topic statistics and order them by target recall, then by the
# window at which that recall is reached.
topic_columns = ['topic_id', 'n_docs', 'n_rel', 'prev', 'target_recall', 'target_location', 'topic_id_target_recall']
sort_keys = ['target_recall', 'target_location']
df = pd.DataFrame(topics_info, columns=topic_columns).sort_values(by=sort_keys)


In [None]:
# "<topic>_<recall>" keys in the sorted order of df; one training env is created per key.
sorted_target_loc_topics = list(df['topic_id_target_recall'])


#### load clf rel dic if it exists

In [None]:
dataset_name = 'CLEF'
topic_set = 'training'

# Cache file with the state vectors of every topic of this dataset / topic set.
ALL_VECTORS_PREDICTIONS_DIC_file_name = DRL_DIR+'data/'+dataset_name +'_'+ topic_set +'_'+ 'ALL_VECTORS_PREDICTIONS_DIC.pkl'

cache_found = os.path.exists(ALL_VECTORS_PREDICTIONS_DIC_file_name)
if cache_found:
    # NOTE: pickle must only be used on files this notebook wrote itself.
    with open(ALL_VECTORS_PREDICTIONS_DIC_file_name, "rb") as f:
        ALL_VECTORS_PREDICTIONS_DIC = pickle.load(f)
else:
    # no cache yet: start empty, the environments will fill it
    ALL_VECTORS_PREDICTIONS_DIC = {}

# topics that already have cached vectors (none when starting empty)
SELECTED_TOPICS_WITHOUT_TARGET = list(ALL_VECTORS_PREDICTIONS_DIC.keys())
ALL_VECTORS_PREDICTIONS_DIC_EXIST = cache_found


#### ordered topics

In [None]:
TRAINING = True

# While TRAINING is set, TAREnv.__init__ hands out these entries in order,
# one per environment, advancing SELECTED_TOPICS_ORDERERD_INDEX each time.
SELECTED_TOPICS_ORDERERD = sorted_target_loc_topics
SELECTED_TOPICS_ORDERERD_INDEX = 0

SELECTED_TOPICS_TARGET_INDEX = 0

# Instantiate the vec env

# reset before/after each call, keep track of all randomly selected topics
SELECTED_TOPICS = []
SELECTED_TOPICS_TARGET = []

# one environment per "<topic>_<recall>" entry
n_envs = len(sorted_target_loc_topics)
vec_env = make_vec_env(TAREnv, n_envs=n_envs, env_kwargs=dict(target_recall=None, topics_list = sorted_target_loc_topics, topic_id=None, size=100, render_mode='human'))

SELECTED_TOPICS = []
SELECTED_TOPICS_TARGET = []

train_size = len(topics_list)*len(target_recalls)


vec_env_train = vec_env

# save dic for first time
if not ALL_VECTORS_PREDICTIONS_DIC_EXIST:
    # Make sure the target directory exists so the freshly computed cache is
    # not lost to a FileNotFoundError.
    os.makedirs(os.path.dirname(ALL_VECTORS_PREDICTIONS_DIC_file_name), exist_ok=True)
    with open(ALL_VECTORS_PREDICTIONS_DIC_file_name, 'wb') as f:
        pickle.dump(ALL_VECTORS_PREDICTIONS_DIC, f)

#### Hyperparameter Settings

In [None]:
def nearest_factor_of(target, total):
    """Return the divisor of ``total`` that lies closest to ``target``.

    Divisors are examined in ascending order, so when two of them are equally
    close the smaller one is returned.
    """
    def gap(candidate):
        return abs(candidate - target)

    divisors = (d for d in range(1, total + 1) if total % d == 0)
    return min(divisors, key=gap)



In [None]:


# Train the agent

# one environment per "<topic>_<recall>" entry
n_envs=len(sorted_target_loc_topics)


learning_rate_initial = 0.0003

# static learning rate; the suffix becomes part of the run name
learning_rate = learning_rate_initial
learning_rate_type = '_lr_static'


ent_coef = 0.1

gamma = 0.99
gae_lambda = 0.98
clip_range=0.1


n_epochs =10

n_steps = 10  # steps collected per environment before each update


total_rollout = n_steps * n_envs  # Total rollout buffer size
target_mini_batch = int(1/4 * total_rollout)  # Target value (1/4 of total)


# The mini-batch size should divide the rollout buffer, so take the factor of
# `total_rollout` nearest to the target. (The arguments used to be swapped,
# which returned `target_mini_batch` itself whether or not it divides the
# buffer.)
batch_size = nearest_factor_of(target_mini_batch, total_rollout)


nn_nodes = 64

policy_kwargs = dict(
    net_arch=[nn_nodes, nn_nodes]  # Two hidden layers with nn_nodes
)


total_timesteps = 20_000_000


####  PPO

In [None]:


# Run name encoding the hyperparameters of this training run.
# NOTE(review): `target_recall` is whatever value an earlier loop left behind,
# although training covers every target recall - confirm the suffix is wanted.
tb_log_name = (
    f"{model_name}_{training_dataset}"
    f"_nstps{n_steps}_btch{batch_size}_ts{total_timesteps}"
    f"_ent{ent_coef}_epch{n_epochs}{learning_rate_type}{learning_rate_initial}"
    f"_clip{clip_range}_nn{nn_nodes}_target{target_recall}"
)

# Checkpoint callback to save the model periodically
callback_log = f"{tensorboard_log}running/"
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path=callback_log, name_prefix=tb_log_name)

# as a general early stopping rule, allow for 10 rollouts w/o improvement
patience = 10*n_steps*n_envs
early_stopping_callback = EarlyStoppingCallback(patience=patience, min_delta=0.001, verbose=1)

# Combine the callbacks
callbacks = [checkpoint_callback, early_stopping_callback]

ppo_settings = dict(
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    clip_range=clip_range,
    verbose=0,
    learning_rate=learning_rate,
    policy_kwargs=policy_kwargs,
    seed=0,
    tensorboard_log=tensorboard_log,
)
model = PPO(policy='MlpPolicy', env=vec_env_train, **ppo_settings)

model.learn(total_timesteps=total_timesteps, tb_log_name=tb_log_name, callback=callbacks)

# final model, stored next to the TensorBoard logs
model.save(f"{tensorboard_log}model_{tb_log_name}")



In [None]:
%load_ext tensorboard
%tensorboard --logdir "$tensorboard_log"


## TESTING

### Load Trained Model

In [None]:
# From here on TAREnv treats topics_list entries as plain topic ids.
TRAINING = False


# Same path that model.save() wrote to at the end of training.
model_load_dir = tensorboard_log + 'model_'+tb_log_name


model = PPO.load(model_load_dir, env=vec_env_train)

### all targets & datasets

In [None]:


# Test collections; each name is a key of the two path tables below.
dataset_names = ["CLEF2017", "CLEF2018", "CLEF2019"]

# qrels file per dataset (joined onto DIR by load_rel_data)
qrels_dic = {
    "CLEF2017" : "data/qrels/CLEF2017_qrels.txt",
    "CLEF2018" : "data/qrels/CLEF2018_qrels.txt",
    "CLEF2019" : "data/qrels/CLEF2019_qrels.txt",
}
# ranking (run) file per dataset
rankings_dic = {
    "CLEF2017" : "data/rankings/clef2017_ranking.txt",
    "CLEF2018" : "data/rankings/clef2018_ranking.txt",
    "CLEF2019" : "data/rankings/clef2019_ranking.txt",
}


# target recall levels evaluated at test time (replaces the training list)
target_recalls = [1.0, 0.9, 0.8, 0.7]


### results

In [None]:
TRAINING = False

total_runs = 1
topic_set = 'test'

# column layout of the per-topic result rows collected below
result_columns = ['Dataset', 'Topic', 'Run', 'Recall', 'Reliability', 'Cost', 'e-Cost', 'Reward', 'Difference', 'Distance', 'Target']

### all targets & datasets
for dataset_name in dataset_names:

  # cache file with the state vectors of this dataset / topic set
  ALL_VECTORS_PREDICTIONS_DIC_file_name = dataset_name +'_'+ topic_set +'_'+ 'ALL_VECTORS_PREDICTIONS_DIC.pkl'
  ALL_VECTORS_PREDICTIONS_DIC_file_name = DRL_DIR+'data/'+ALL_VECTORS_PREDICTIONS_DIC_file_name

  # check if the file exists
  if not os.path.exists(ALL_VECTORS_PREDICTIONS_DIC_file_name):
    # if doesn't exist, create an empty dictionary
    ALL_VECTORS_PREDICTIONS_DIC = {}
    SELECTED_TOPICS_WITHOUT_TARGET = []
    ALL_VECTORS_PREDICTIONS_DIC_EXIST = False
  else:
    # if exists, load the dictionary from the file
    # NOTE: pickle must only be used on files this notebook wrote itself.
    with open(ALL_VECTORS_PREDICTIONS_DIC_file_name, "rb") as f:
      ALL_VECTORS_PREDICTIONS_DIC = pickle.load(f)
      SELECTED_TOPICS_WITHOUT_TARGET = list(ALL_VECTORS_PREDICTIONS_DIC.keys())
      ALL_VECTORS_PREDICTIONS_DIC_EXIST = True

  qrels = qrels_dic[dataset_name]

  qrel_fname, query_rel_dic = load_rel_data(qrels)

  run = rankings_dic[dataset_name]
  doc_rank_dic, rank_rel_dic, rank_text_dic, rank_tfidf_dic = load_run_data_with_text_tfidf(run, dataset_name, topic_set)

  topics_list = make_topics_list(doc_rank_dic,1)  # sort topics by no docs

  if dataset_name == 'CLEF2019':
      #remove topic CD012164 last element, contains 61 items only, < 100 vector size
      topics_list= topics_list[:-1]

  for target_recall in target_recalls:

    # Instantiate the vec env: one environment per topic

    # reset before/after each call, keep track of all randomly selected topics
    SELECTED_TOPICS = []

    vec_env = make_vec_env(TAREnv, n_envs=len(topics_list), env_kwargs=dict(target_recall=target_recall, topics_list = topics_list, topic_id=None, size=100, render_mode='human'))

    SELECTED_TOPICS = []

    test_size = len(topics_list)
    vec_env_test = vec_env

    df = pd.DataFrame()
    df_all_runs = pd.DataFrame()

    for run in range(total_runs):

      # Test the trained agent
      # using the vecenv
      vec_env_test = vec_env
      obs = vec_env_test.reset()
      test_steps = 100
      vector_size = 100


      n_env = test_size
      agent=0
      target=0
      agent_vector=[]
      terminal_observation=[]

      topics = []
      recalls = []
      costs=[]
      e_costs = []
      reliabilities = []
      rewards = []
      distances = []
      differences = []
      targets = []
      run_cnts = []

      # each topic's environment is stepped on its own until the episode ends
      for eID in range(test_size):
        print(f"=================== env {eID}")
        env = vec_env_test.envs[eID]
        obs, info = env.reset()

        for step in range(test_steps):

          action, _ = model.predict(obs, deterministic=True) # predict all next steps

          obs, reward, done, trun,info = env.step(action)

          if done or trun:
            topic_id = info['topic_id']
            recall = info['recall']
            cost = info['cost']
            e_cost =  info['e_cost']

            distance = info['distance']

            agent = info['agent']
            target = info['target']
            agent_vector = info['agent_vector']
            terminal_observation = info['terminal_observation']

            difference = target_recall - recall

            # 1 when the stopping point reached the target recall
            reliability = 1 if recall >= target_recall else 0
            topics.append(topic_id)
            recalls.append(recall)
            costs.append(cost)
            e_costs.append(e_cost)
            reliabilities.append(reliability)
            rewards.append(reward)
            distances.append(distance)
            targets.append(target)
            run_cnts.append(run)
            differences.append(difference)

            break

      # Build the result table once per run, after every environment has
      # finished, instead of rebuilding it after each environment.
      df = pd.DataFrame(
          list(zip([dataset_name]*len(topics), topics, run_cnts, recalls, reliabilities, costs, e_costs, rewards, differences, distances, targets)),
          columns=result_columns)

      df_all_runs = pd.concat([df_all_runs, df])



    df_all_runs['Model'] = model_name
    df_all_runs['Model_settings'] = tb_log_name
    df_all_runs['Target_Recall'] = target_recall



    df_all_targets = pd.concat([df_all_targets, df_all_runs], ignore_index = True)

    #save dic for first time
    if not ALL_VECTORS_PREDICTIONS_DIC_EXIST:
        # Make sure the target directory exists so the freshly computed cache
        # is not lost to a FileNotFoundError.
        os.makedirs(os.path.dirname(ALL_VECTORS_PREDICTIONS_DIC_file_name), exist_ok=True)
        with open(ALL_VECTORS_PREDICTIONS_DIC_file_name, 'wb') as f:
            pickle.dump(ALL_VECTORS_PREDICTIONS_DIC, f)


display(df_all_targets)

display(df_all_targets.describe())

# mean and standard deviation of every metric per target recall and dataset
display(df_all_targets.groupby(['Target_Recall','Dataset']).mean(numeric_only=True).round(3))
display(df_all_targets.groupby(['Target_Recall','Dataset']).std(numeric_only=True).round(3))


