#### Notebook for running ReAct experiments

In [None]:
# Notebook setup: extend the import search path with the parent directory
# and define the base directory used when writing result logs.
import os
import sys

sys.path.append('..')

root = '../root/'

In [None]:
import joblib
from util import summarize_react_trial, log_react_trial, save_agents
from rouge_score import rouge_scorer
import numpy as np
from ctrleval import CTRLEval
import re


### For a single expert, run this block

In [None]:
from agents_general_single import ReactReflectAgent, ReactAgent, ReflexionStrategy

### For multiple experts, run this block


In [None]:
from agents_general_double import ReactReflectAgent, ReactAgent, ReflexionStrategy

### Load the HotpotQA Sample

In [None]:
# Load the HotpotQA question sample and renumber its rows from 0, discarding
# the stored index (assumes the file holds a pandas DataFrame — TODO confirm).
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
dataset = joblib.load('../data/hotpot-qa-100-questions.joblib').reset_index(drop = True)

### Load the SimpleQA Sample

In [None]:
# Load the SimpleQA question sample and renumber its rows from 0, discarding
# the stored index (assumes the file holds a pandas DataFrame — TODO confirm).
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
dataset = joblib.load('../data/simple-qa-100-questions.joblib').reset_index(drop = True)

#### Define the Reflexion Strategy

In [None]:
# Print the docstring of ReflexionStrategy (presumably it lists the available
# strategies — verify against the class definition).
print(ReflexionStrategy.__doc__)

In [None]:
# Reflexion strategy used by the cells below. Choosing ReflexionStrategy.NONE
# makes the notebook build plain ReactAgent instances and call run() without
# a reflect_strategy argument.
strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION

#### Initialize a ReAct agent for each question

In [None]:
# Pick the agent class that matches the chosen strategy: the plain ReactAgent
# when no reflection is requested, the reflecting variant otherwise.
if strategy == ReflexionStrategy.NONE:
    agent_cls = ReactAgent
else:
    agent_cls = ReactReflectAgent

# Build one agent per dataset row from its 'question' and 'answer' columns.
agents = [
    agent_cls(record['question'], record['answer'])
    for _idx, record in dataset.iterrows()
]

#### Run `n` trials

In [None]:
# Trial bookkeeping:
#   n     - how many trials the trial-loop cell performs each time it is run
#   trial - running count of completed trials
#   log   - accumulated text log of all trials so far
n, trial, log = 5, 0, ''

In [None]:
# Run `n` trials. In each trial only the agents that are not yet correct are
# re-run; afterwards the trial counter and the text log are updated and a
# summary line is printed.
for _round in range(n):
    # Snapshot the unsolved agents before any of them is run in this trial.
    unsolved = [candidate for candidate in agents if not candidate.is_correct()]
    for agent in unsolved:
        # Only pass reflect_strategy when reflection is actually enabled.
        run_kwargs = {'reflect_strategy': strategy} if strategy != ReflexionStrategy.NONE else {}
        agent.run(**run_kwargs)
        print(f'Answer: {agent.key}')

    trial += 1
    log += log_react_trial(agents, trial)

    correct, incorrect, halted = summarize_react_trial(agents)
    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}')

In [None]:
# Compute the Rouge-L F-measure between each agent's key and its answer, then
# report the mean over all agents.
# NOTE(review): agent.key is passed first and agent.answer second; presumably
# key is the reference and answer the generated text — confirm against the
# agent classes.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer = True)
scores = []
for agent in agents:
    scores.append(scorer.score(agent.key, agent.answer))

rouge_l_f_measures = [entry["rougeL"].fmeasure for entry in scores]
rouge_l_average = sum(rouge_l_f_measures) / len(scores)

print(f'Average Rouge-L: {rouge_l_average}')

In [None]:
# Score every agent's generated answer with CTRLEval's 'cons' aspect, using the
# question as the prefix, and report the mean raw score and the mean of the
# exponentiated scores.
ctrleval_scorer = CTRLEval()
cons_results, exp_cons_results = [], []
for agent in agents:
    question = agent.question
    # Replace zero-width spaces (U+200B) with ordinary spaces before scoring.
    generated_answer = agent.answer.replace("\u200b", " ")
    if generated_answer.strip():
        prefix = [question]
        data = [question + '\n' + generated_answer]
        try:
            cons_result = ctrleval_scorer.score(aspect = 'cons', data = data, prefix = prefix, batch_size = 1)
            exp_cons_result = np.exp(cons_result[0])
        except Exception:
            # Show the offending input, then re-raise so the real traceback is
            # visible. A bare `except:` followed by exit() would also trap
            # KeyboardInterrupt and could shut the kernel down before the
            # results are saved.
            print('Error in scoring')
            print(data)
            raise
    else:
        # Blank answers cannot be scored: NaN keeps them out of the raw-score
        # mean (np.nanmean skips NaN), while the 0 still counts in the
        # exponentiated mean.
        cons_result = [np.nan]
        exp_cons_result = 0

    cons_results.append(cons_result[0])
    exp_cons_results.append(exp_cons_result)

print(f'Average CTRLEval: {np.nanmean(cons_results)}')
print(f'Average CTRLEval Exp : {np.nanmean(exp_cons_results)}')

#### Save the result log

In [None]:
# Write the accumulated trial log to <root>/ReAct/<strategy>/ and persist the
# agents.
log_dir = os.path.join(root, 'ReAct', strategy.value)
# Create the directory first; open() does not create missing parent folders.
os.makedirs(log_dir, exist_ok=True)
# Explicit UTF-8 so non-ASCII characters in the log do not depend on the
# platform's default encoding.
with open(os.path.join(log_dir, f'{len(agents)}_questions_{trial}_trials.txt'), 'w', encoding='utf-8') as f:
    f.write(log)
# NOTE(review): unlike the log file, this path is not prefixed with `root`, so
# it resolves against the current working directory — confirm this is intended.
save_agents(agents, os.path.join('ReAct', strategy.value, 'agents'))