In [None]:
import json
import os

import h5py
import pandas as pd

import openai_utils
import warmup

#### Initialize OpenAI API

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be saved inside the notebook; the placeholder below is only a fallback.
api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api")  # or replace the placeholder with your OpenAI API key
openai_utils.initialize_openai(api_key=api_key)

In [None]:
# Build the parameter object that is later handed, unchanged, to warmup.warmup
# and to inference.
# NOTE(review): presumably these are the OpenAI request settings — confirm in
# openai_utils.set_open_params.
params = openai_utils.set_open_params()

#### Warm-up phase

In [None]:
# Load the demo training set and split it into feature columns and the label.
training_data = pd.read_excel("/content/demo_data/demo_training_data.xlsx")
feature_columns = ['Drug A', 'Drug B', 'Animal Model']
X_train = training_data[feature_columns]
y_train = training_data['Efficacy']

# Run the warm-up phase; it returns the paths of the two files it produced.
warmup_outputs = warmup.warmup(
    X_train,
    y_train,
    params,
    openai_utils.client,
    output_hdf5_path="embeddings.hdf5",
    pathway_dir="/content/demo_data/pathway",
    output_json_path="warm_up_CoT.json",
)
output_hdf5_path, output_json_path = warmup_outputs

# Report where the outputs were written.
print(f"HDF5 Output: {output_hdf5_path}")
print(f"JSON Output: {output_json_path}")

HDF5 Output: embeddings.hdf5
JSON Output: warm_up_CoT.json


`entry_1` and `entry_3` are wrong predictions → delete them from the training examples.

In [None]:
# Remove the wrongly predicted warm-up entries from the JSON file.
with open('/content/warm_up_CoT.json', 'r') as file:
    data = json.load(file)

# IDs whose predicted answer differs from the real answer. The HDF5 clean-up
# cell reuses this list, so it must stay available under this name.
mismatched_entries = [
    entry_id
    for entry_id, entry in data.items()
    if entry['predicted_answer'] != entry['real_answer']
]

# Drop every mismatched record from the loaded dictionary.
for record in mismatched_entries:
    data.pop(record, None)

final_training_data = json.dumps(data)

# Save the surviving entries next to the original file.
new_file_path = '/content/warm_up_CoT_final.json'

with open(new_file_path, 'w') as new_file:
    json.dump(data, new_file, indent=4)

In [None]:
# Write a copy of the embeddings file that leaves out the wrongly predicted
# entries collected in `mismatched_entries`.
original_file_path = '/content/embeddings.hdf5'
new_file_path = '/content/embeddings_final.hdf5'

with h5py.File(original_file_path, 'r') as hdf_original, \
        h5py.File(new_file_path, 'w') as hdf_new:
    # Decide which top-level members to keep, then copy them one by one.
    kept_entries = [name for name in hdf_original if name not in mismatched_entries]
    for entry in kept_entries:
        hdf_original.copy(entry, hdf_new)

#### Inference phase

Dynamic few-shot learning examples + self-consistency

In [None]:
import pandas as pd
import numpy as np
from inference import inference

In [None]:
# Inputs for the inference phase: the filtered embeddings and chain-of-thought
# files written by the clean-up cells, the pathway data and the test set.
embedding_hdf5_path = "/content/embeddings_final.hdf5"
json_path = "/content/warm_up_CoT_final.json"
pathway_dir = "/content/demo_data/pathway"
test_data_path = "/content/demo_data/demo_testing_data.xlsx"
output_dir = "/content/test_results"

# Create the results folder here so a fresh "Run All" does not depend on a
# manual step; exist_ok makes re-running the cell harmless.
os.makedirs(output_dir, exist_ok=True)

X_test = pd.read_excel(test_data_path)

# Read every dataset of the embeddings file into memory.
with h5py.File(embedding_hdf5_path, 'r') as hdf5_file:
    embeddings = [hdf5_file[name][:] for name in hdf5_file.keys()]

# np.stack would also raise ValueError on an empty list; fail with a message
# that points at the actual cause instead.
if not embeddings:
    raise ValueError(
        f"No embeddings found in {embedding_hdf5_path}; "
        "every warm-up entry may have been filtered out."
    )
embedding_train = np.stack(embeddings)

In [None]:
# perform self-consistency
def run_multiple_inferences(X_test, iterations, n_neighbors, embedding_train, params, output_dir):
    """Call ``inference`` once per iteration and collect the returned paths.

    Runs are numbered from 1 to ``iterations`` inclusive. Besides its
    arguments, each call also uses the notebook-level ``openai_utils.client``,
    ``embedding_hdf5_path``, ``json_path`` and ``pathway_dir``.

    Args:
        X_test: Test data forwarded to ``inference`` as ``X_test``.
        iterations: Number of inference runs to perform.
        n_neighbors: Forwarded to ``inference`` as ``n``.
        embedding_train: Forwarded to ``inference`` as ``embedding_train``.
        params: Forwarded to ``inference`` as ``params``.
        output_dir: Forwarded to ``inference`` as ``output_dir``.

    Returns:
        list: The value returned by ``inference`` for each run, in run order.
    """
    result_files = []
    for run_index in range(iterations):
        iteration = run_index + 1  # runs are reported 1-based
        # Notebook-level names are looked up per run, inside the loop.
        inference_kwargs = dict(
            X_test=X_test,
            n=n_neighbors,
            params=params,
            embedding_train=embedding_train,
            client=openai_utils.client,
            iteration=iteration,
            hdf5_path=embedding_hdf5_path,
            json_path=json_path,
            pathway_dir=pathway_dir,
            output_dir=output_dir,
        )
        result_file_path = inference(**inference_kwargs)
        result_files.append(result_file_path)
        print(f"Iteration {iteration} results saved to: {result_file_path}")
    return result_files


In [None]:
# Self-consistency settings, then one inference pass per iteration.
iterations = 3  # Number of iterations (iterations = 5 is better)
n_neighbors = 2  # Number of neighbors for k-NN (n_neighbors = 5 is better)
result_files = run_multiple_inferences(
    X_test=X_test,
    iterations=iterations,
    n_neighbors=n_neighbors,
    embedding_train=embedding_train,
    params=params,
    output_dir=output_dir,
)

Iteration 1 results saved to: /content/test_results/test_result_1.json
Iteration 2 results saved to: /content/test_results/test_result_2.json
Iteration 3 results saved to: /content/test_results/test_result_3.json


#### Revision phase

In [None]:
# Select CoT for reviewers: for every entry keep the longest chain of thoughts
# among the runs whose prediction equals the majority vote.
excel_path = '/content/test_results/test_final.xlsx' # merge all previous test results and calculate majority vote
df = pd.read_excel(excel_path)

# Entry ID -> majority-vote answer.
majority_answers = df.set_index('Entry ID')['Majority Vote'].to_dict()


# Per-iteration result files to scan.
# NOTE(review): this list is hard-coded for three runs; keep it in sync with
# the number of inference iterations actually executed.
json_files = [
    '/content/test_results/test_result_1.json',
    '/content/test_results/test_result_2.json',
    '/content/test_results/test_result_3.json'
]

select_chains = {}

for json_file in json_files:
    with open(json_file, 'r') as file:
        data = json.load(file)

    for entry_id, entry_data in data.items():
        # Build the candidate record first so a missing field fails loudly
        # for every entry, not only for the ones that end up selected.
        candidate = {
            'question': entry_data['question'],
            'chain_of_thoughts': entry_data['chain_of_thoughts'],
            'predicted_answer': entry_data['predicted_answer'],
            'source_file': json_file,  # Track the source file
        }

        # Ignore entries without a majority vote or that disagree with it.
        if entry_id not in majority_answers:
            continue
        if not (candidate['predicted_answer'] == majority_answers[entry_id]):
            continue

        # Keep the current selection unless this chain is strictly longer.
        current = select_chains.get(entry_id)
        if current is not None and not (
            len(candidate['chain_of_thoughts']) > len(current['chain_of_thoughts'])
        ):
            continue

        select_chains[entry_id] = candidate

# Save the results to a JSON file
output_path = '/content/test_results/select_chains.json'
with open(output_path, 'w') as outfile:
    json.dump(select_chains, outfile, indent=4)


In [None]:
import anthropic
from pathlib import Path
from reviewer import reviewer, process_reviewer_multiple_runs

In [28]:
# Prefer the ANTHROPIC_API_KEY environment variable so the real key never has
# to be saved inside the notebook; the placeholder is only a fallback.
anthropic_client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY", "your_api")
)  # or replace the placeholder with your api key

# Define the input CoT JSON file and results directory
CoT_json_file = "/content/test_results/select_chains.json"  # JSON file with chains of thought
results_path = "/content/feedback_results"  # Directory to save the feedback results

# Create the feedback directory here so the run does not depend on a manual
# step; exist_ok makes re-running the cell harmless.
os.makedirs(results_path, exist_ok=True)

# ToT Reviewer
process_reviewer_multiple_runs(CoT_json_file, anthropic_client, results_path=results_path)

Run 1 completed and saved to /content/feedback_results/feedbacks_1.json.
All runs processed and saved.


#### Moderator phase

In [29]:
from moderator import moderator, process_final_multiple_runs

# Inputs for the moderator phase: the selected chains of thought and the
# feedback file from the first reviewer run.
CoT_json_file = "/content/test_results/select_chains.json"
feedback_path = "/content/feedback_results/feedbacks_1.json"
results_path = "/content/final_results" # create folder to save final results

# NOTE(review): `client` is not defined anywhere in the visible notebook, so
# this call raises NameError on a fresh kernel. Earlier cells only create
# `openai_utils.client` and `anthropic_client` — confirm which one the
# moderator expects and pass it explicitly.
process_final_multiple_runs(CoT_json_file, feedback_path, client, runs=3, results_path=results_path)

Run 1 completed and saved to /content/final_results/final_answer_with_feedback_1.json.
Run 2 completed and saved to /content/final_results/final_answer_with_feedback_2.json.
Run 3 completed and saved to /content/final_results/final_answer_with_feedback_3.json.
All runs processed and saved.
