#### Notebook for running Chain-of-Thought with supporting context experiments 

In [None]:
import sys, os
# Add the parent directory to the module search path (presumably where the
# project modules imported below live — confirm against the repo layout).
sys.path.append('..')
# Base directory under which the result log and the saved agents are written.
root = '../root/'

In [None]:
import joblib
import numpy as np
from util import summarize_trial, log_trial, save_agents, summarize_react_trial, log_react_trial
from rouge_score import rouge_scorer
from ctrleval import CTRLEval
import re


### For a single expert, run this block (and skip the multiple-experts block below)


In [None]:
from agents_med_single import CoTAgent, ReflexionStrategy

### For multiple experts, run this block instead of the single-expert block above


In [None]:
from agents_med_double import CoTAgent, ReflexionStrategy

#### Load the PubMedQA Sample

In [None]:
# Load the serialized question sample and replace its index with a fresh
# 0..n-1 range (the old index is dropped).
# NOTE: joblib.load unpickles arbitrary Python objects — only load files from
# a trusted source.
dataset = joblib.load('../data/pubmed-qa-100-questions.joblib').reset_index(drop = True)

# Disabled preprocessing: when enabled, it fills a 'supporting_paragraphs'
# column by joining, for each row, the sentences of the articles listed in
# row['supporting_facts']['title'].
# NOTE(review): the agent construction further down reads
# row['supporting_paragraphs'], so the loaded file presumably already contains
# that column — confirm before re-enabling or deleting this block.
# dataset['supporting_paragraphs'] = None
# for ind, row in dataset.iterrows():
#     supporting_articles = row['supporting_facts']['title']
#     articles = row['context']['title']
#     sentences = row['context']['sentences']
#     supporting_paragraphs = []
#     for article in supporting_articles:
#         supporting_paragraph = ''.join(sentences[np.where(articles == article)][0])
#         supporting_paragraphs.append(supporting_paragraph)
#     supporting_paragraphs = '\n\n'.join(supporting_paragraphs)
#     dataset.at[ind, 'supporting_paragraphs'] = supporting_paragraphs

#### Define the Reflexion Strategy

In [None]:
# Display the ReflexionStrategy docstring (presumably a description of the
# available strategy values — see the agents module).
print(ReflexionStrategy.__doc__)

In [None]:
# Strategy passed to every agent.run() call below; it also decides which agent
# prompt is used and, through strategy.value, the output directory name.
strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION

#### Initialize a CoTAgent for each question

In [None]:
from prompts import cot_agent_prompt, cot_reflect_agent_prompt, cot_reflect_prompt
from fewshots import COT, COT_REFLECT
# The strategy is fixed for the whole run, so pick the agent prompt once:
# the plain CoT prompt when no reflexion is used, the reflect-aware one otherwise.
if strategy == ReflexionStrategy.NONE:
    selected_agent_prompt = cot_agent_prompt
else:
    selected_agent_prompt = cot_reflect_agent_prompt

# One agent per dataset row, built from its question, supporting context and
# reference answer.
agents = [
    CoTAgent(
        record['question'],
        record['supporting_paragraphs'],
        record['answer'],
        agent_prompt=selected_agent_prompt,
        cot_examples=COT,
        reflect_prompt=cot_reflect_prompt,
        reflect_examples=COT_REFLECT,
    )
    for _, record in dataset.iterrows()
]

#### Run `n` trials

In [None]:
# Number of rounds executed by the trial loop.
n = 5
# Completed-round counter (incremented once per round) and the text log that
# each round appends to.
trial, log = 0, ''

In [None]:
# Run n rounds; in each round only the agents that are not yet correct are run again.
for round_index in range(n):
    # Snapshot the unsolved agents up front so correctness is evaluated for
    # every agent before any of them runs in this round.
    pending = [candidate for candidate in agents if not candidate.is_correct()]
    for agent in pending:
        agent.run(reflexion_strategy=strategy)
        print(f'Answer: {agent.key}')

    trial += 1
    log += log_trial(agents, trial)

    correct, incorrect = summarize_trial(agents)
    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')

    # NOTE(review): both log_trial and log_react_trial are appended for every
    # round — confirm that logging the round twice is intended.
    log += log_react_trial(agents, trial)

In [None]:
# Score every agent with ROUGE-L (stemming enabled); agent.key and agent.answer
# are passed in the scorer's (target, prediction) argument order.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
scores = [scorer.score(agent.key, agent.answer) for agent in agents]

# Report the mean ROUGE-L F-measure over all agents.
rouge_l_fmeasures = [result["rougeL"].fmeasure for result in scores]
average_rouge_l = sum(rouge_l_fmeasures) / len(scores)
print(f'Average Rouge-L: {average_rouge_l}')

In [None]:
# Score each agent's answer with CTRLEval's 'cons' aspect and report the mean
# raw score and the mean exponentiated score.
ctrleval_scorer = CTRLEval()
cons_results, exp_cons_results = [], []
for agent in agents:
    question = agent.question
    # Replace zero-width spaces (U+200B) with regular spaces before scoring.
    generated_answer = agent.answer.replace("\u200b", " ")

    if generated_answer.strip():
        prefix = [question]
        data = [question + '\n' + generated_answer]
        try:
            cons_result = ctrleval_scorer.score(aspect='cons', data=data, prefix=prefix, batch_size=1)
            exp_cons_result = np.exp(cons_result[0])
        except Exception:
            # Show the offending input, then re-raise: this stops the cell with
            # the real traceback instead of calling the interactive exit()
            # helper, and it no longer traps KeyboardInterrupt/SystemExit the
            # way a bare `except:` does.
            print('Error in scoring')
            print(data)
            raise
    else:
        # Blank answers are excluded from the raw mean (NaN is skipped by
        # nanmean) but count as 0 in the exponentiated mean.
        cons_result = [np.nan]
        exp_cons_result = 0

    cons_results.append(cons_result[0])
    exp_cons_results.append(exp_cons_result)

print(f'Average CTRLEval: {np.nanmean(cons_results)}')
print(f'Average CTRLEval Exp : {np.nanmean(exp_cons_results)}')

#### Save the result log

In [None]:
# Everything for this run is written under <root>/CoT/context/<strategy.value>/.
output_dir = os.path.join(root, 'CoT', 'context', strategy.value)
# open(..., 'w') does not create missing parent directories, so make sure the
# target directory exists before writing the log.
os.makedirs(output_dir, exist_ok=True)

log_path = os.path.join(output_dir, f'{len(agents)}_questions_{trial}_trials.txt')
# Write as UTF-8 explicitly: the logged text can contain non-ASCII characters
# (e.g. zero-width spaces in answers) that a platform-default codec may reject.
with open(log_path, 'w', encoding='utf-8') as f:
    f.write(log)

# Persist the agents next to the log, in the 'agents' subdirectory.
save_agents(agents, os.path.join(output_dir, 'agents'))