In [2]:
%reload_ext autoreload
%autoreload 2

import asyncio
import json
import nest_asyncio
import os
import sys
from dotenv import load_dotenv
import pandas as pd

sys.path.append('../')
from lattereview.providers.openai_provider import OpenAIProvider
from lattereview.providers.ollama_provider import OllamaProvider
from lattereview.agents.scoring_reviewer import ScoringReviewer
from lattereview.review_workflow import ReviewWorkflow

Failed to update token costs. Using static costs.
  logger.error("Failed to update token costs. Using static costs.")
  from .autonotebook import tqdm as notebook_tqdm


## Setting up the notebook

Loading environment variables:

In [3]:
# Load environment variables from .env file
load_dotenv('../.env')
print(os.getenv('OPENAI_API_KEY'))

# Enable asyncio in Jupyter
nest_asyncio.apply()

sk-cq_M0pNgHhCFnlDOCMnagYA1l2X7Yea5CL0ci5pZMNT3BlbkFJ0m0x9wm5M_EstX5SjLu_kdwGMDYjkUdviNPs4pe9cA


Loading a dummy dataset:

In [4]:
data = pd.read_excel('data.xlsx')
data.head()

Unnamed: 0,ID,Title,1st author,repo,year,abstract
0,1,Segmentized quarantine policy for managing a t...,"Kim, J.",arXiv,2024,"By the end of 2021, COVID-19 had spread to ove..."
1,2,AutoProteinEngine: A Large Language Model Driv...,"Liu, Y.",arXiv,2024,Protein engineering is important for biomedica...
2,3,Integration of Large Vision Language Models fo...,"Chen, Z.",arXiv,2024,Traditional natural disaster response involves...
3,4,Choice between Partial Trajectories,"Marklund, H.",arXiv,2024,As AI agents generate increasingly sophisticat...
4,5,Building Altruistic and Moral AI Agent with Br...,"Zhao, F.",arXiv,2024,"As AI closely interacts with human society, it..."


## Testing the base functionalities

Testing the OpenAI provider:

In [5]:
openanai_provider = OpenAIProvider(model="gpt-4o-mini")
question = "What is the capital of France?"
asyncio.run(openanai_provider.get_response(question, temperature=0.9))



('The capital of France is Paris.',
 {'input_cost': 1.05e-06, 'output_cost': 4.2e-06, 'total_cost': 5.25e-06})

Testing the Ollama provider:

In [6]:
ollama_provider = OllamaProvider(model="llama3.2-vision:latest", host="http://localhost:11434")
question = "What is the capital of France?"
asyncio.run(ollama_provider.get_response(question))

('The capital of France is Paris!',
 {'input_cost': 0, 'output_cost': 0, 'total_cost': 0})

Testing the ScoringReviewer agent:

In [None]:
agent = ScoringReviewer(
    # provider=OpenAIProvider(model="gpt-4o-mini"),
    provider=OllamaProvider(model="llama3.2-vision:latest", host="http://localhost:11434"),
    name="Pouria",
    backstory="an expert reviewer and researcher!",
    input_description = "article title",
    temperature=0.1, # ineffective for ollama
    reasoning = "brief",
    max_tokens=100, # ineffective for ollama
    review_criteria="Look for articles that certainly do not employ any AI or machine learning agents",
    score_set=[1, 2],
    scoring_rules='Score 1 if the paper does not meet the criteria, and 2 if the paper meets the criteria.',
)


# Dummy input
text_list = data.Title.str.lower().tolist()
print("Inputs:\n\n", '\n'.join(text_list[:3]))

# Dummy review
results, total_cost = asyncio.run(agent.review_items(text_list[:3]))
print("\n\n Outputs:")
for result in results:
    print(result)

# Dummy costs
print("\nCosts:\n")
for item in agent.memory:
    print(item['cost'])

print("\nTotal cost:\n")
print(total_cost)

Inputs:

 segmentized quarantine policy for managing a tradeoff between containment of infectious disease and social cost of quarantine
autoproteinengine: a large language model driven agent framework for multimodal automl in protein engineering
integration of large vision language models for efficient post-disaster damage assessment and reporting


 Outputs:
{'reasoning': 'The input item does not clearly outline specific strategies or measures for segmentized quarantine policy, making it difficult to assess its effectiveness in managing the tradeoff between containment of infectious disease and social cost of quarantine.', 'score': 1}
{'reasoning': "The input item 'autoproteinengine: a large language model driven agent framework for multimodal automl in protein engineering' does not meet the specified criteria, as it is an article title and not a specific task or problem to be evaluated.", 'score': 1}
{'reasoning': 'The input item clearly describes an application of vision language mo

## Testing the main Functionalities

#### A multiagent review workflow for doing title/abstract analysis

Setting up the agents:

In [None]:
pouria = ScoringReviewer(
    # provider=OpenAIProvider(model="gpt-4o-mini"),
    provider=OllamaProvider(model="llama3.2-vision:latest", host="http://localhost:11434"),
    name="Pouria",
    backstory="a radiologist with many years of background in statistcis and data science, who are famous among your colleagues for your systematic thinking, organizaton of thoughts, and being conservative",
    input_description = "tilte and abstract of scientific articles",
    temperature=0.1, # ineffective for ollama
    reasoning = "cot",
    max_tokens=100, # ineffective for ollama
    scoring_task="Look for articles that disucss large languange models-based AI agents applied to medical imaging data",
    score_set=[1, 2],
    scoring_rules='Score 1 if the paper meets the criteria, and 2 if the paper does not meet the criteria.',
)

bardia = ScoringReviewer(
    provider=OpenAIProvider(model="gpt-4o-mini"),
    name="Bardia",
    backstory="an expert in data science with a background in developing ML models for healthcare, who are famous among your colleagues for your creativity and out of the box thinking",
    input_description = "tilte and abstract of scientific articles",
    temperature=0.7,
    reasoning = "brief",
    max_tokens=100,
    scoring_task="Look for articles that disucss large languange models-based AI agents applied to medical imaging data",
    score_set=[1, 2],
    scoring_rules='Score 1 if the paper meets the criteria, and 2 if the paper does not meet the criteria.',
)

brad = ScoringReviewer(
    provider=OpenAIProvider(model="gpt-4o"),
    name="Brad",
    backstory="a senior radiologist with a PhD in computer science and years of experience as the director of a DL lab focused on developing ML models for radiology and healthcare",
    input_description = "tilte and abstract of scientific articles",
    temperature=0.4,
    reasoning = "cot",
    max_tokens=100,
    scoring_task="""Pouria and Bardia have Looked for articles that disucss large languange models-based AI agents applied to medical imaging data. 
                       They scored an article 1 if they thought it does not meet this criteria, 2 if they thought it meets the criteria, 0 if they were uncertain of scoring.
                       You will receive an article they have had different opinions about, as well as each of their scores and their reasoning for that score. Read their reviews and determine who you agree with. 
                    """,
    score_set=[1, 2],
    scoring_rules="""Score 1 if you agree with Pouria, and score 2 if you agree with Bardia.""",
)


Setting up the review workflow:

In [9]:
title_abs_review = ReviewWorkflow(
    workflow_schema=[
        {
            "round": 'A',
            "reviewers": [pouria, bardia],
            "inputs": ["Title", "abstract"]
        },
        {
            "round": 'B',
            "reviewers": [brad],
            "inputs": ["Title", "abstract", "round-A_Pouria_output", "round-A_Bardia_output"],
            "filter": lambda row: row["round-A_Pouria_output"]["score"] != row["round-A_Bardia_output"]["score"]
        }
    ]
)

Applying the review workflow to a number of sample articles:

In [10]:
# Reload the data if needed.
sample_data = pd.read_excel('data.xlsx').sample(10).reset_index(drop=True)
updated_data = asyncio.run(title_abs_review(sample_data))

print("Total cost: ")
print(title_abs_review.get_total_cost())

print("\nDetailed cost:")
print(title_abs_review.reviewer_costs)

updated_data


Starting review round A (1/2)...
Reviewers: ['Pouria', 'Bardia']
Input data: ['Title', 'abstract']


                                                       

Number of eligible rows for review: 10


                                                         


Starting review round B (2/2)...
Reviewers: ['Brad']
Input data: ['Title', 'abstract', 'round-A_Pouria_output', 'round-A_Bardia_output']


                                                       

Number of eligible rows for review: 1


                                                        

Total cost: 
0.00374935

Detailed cost:
{('A', 'Pouria'): 0, ('A', 'Bardia'): 0.00014685, ('B', 'Brad'): 0.0036025}




Unnamed: 0,ID,Title,1st author,repo,year,abstract,round-A_Pouria_output,round-A_Bardia_output,round-B_Brad_output
0,129,Agent Hospital: A Simulacrum of Hospital with ...,"Li, J.",arXiv,2024,"In this paper, we introduce a simulacrum of ho...",{'reasoning': 'The input item discusses Agent ...,{'reasoning': 'The article discusses the appli...,{'reasoning': 'The title and abstract of the a...
1,540,Multi-level adaptation of distributed decision...,"Blanco-Fernández, D.",arXiv,2021,"To solve complex tasks, individuals often auto...",{'reasoning': 'This article discusses team ada...,{'reasoning': 'The article discusses decision-...,
2,232,Complexity Synchronization in Emergent Intelli...,"Mahmoodi, K.",arXiv,2023,"In this work, we use a simple multi-agent-base...",{'reasoning': 'The input item discusses comple...,{'reasoning': 'The paper discusses complexity ...,
3,274,Perimeter Control with Heterogeneous Metering ...,"Yu, J.",arXiv,2023,Perimeter Control (PC) strategies have been pr...,{'reasoning': 'The input item discusses Multi-...,{'reasoning': 'The article discusses traffic s...,
4,370,Generating synthetic data with a mechanism-bas...,"Cockrell, C.",bioRxiv,2022,Machine learning (ML) and Artificial Intellige...,{'reasoning': 'The input item discusses genera...,{'reasoning': 'The article discusses synthetic...,
5,250,"Towards Autonomous Supply Chains: Definition, ...","Xu, L.",arXiv,2023,"Recent global disruptions, such as the COVID-1...",{'reasoning': 'The input item discusses Autono...,{'reasoning': 'The article discusses autonomou...,
6,665,Split Q learning: Reinforcement learning with ...,"Lin, B.",arXiv,2019,Drawing an inspiration from behavioral studies...,{'reasoning': 'The input item discusses reinfo...,{'reasoning': 'The article discusses a reinfor...,
7,649,Range expansion shifts clonal interference pat...,"Krishnan, N.",bioRxiv,2019,"Increasingly, predicting and even controlling ...",{'reasoning': 'The input item discusses evolut...,{'reasoning': 'The article does not discuss la...,
8,272,Using a library of chemical reactions to fit s...,"Burrage, P.M.",arXiv,2023,In this paper we introduce a new method based ...,{'reasoning': 'The input item abstract discuss...,{'reasoning': 'The article focuses on agent-ba...,
9,409,Not cheating on the Turing Test: towards groun...,"Alberts, L.",arXiv,2022,"In this thesis, I carry out a novel and interd...",{'reasoning': 'The input item discusses the de...,{'reasoning': 'The article discusses grounded ...,


In [11]:
for i, row in updated_data.iterrows():
    print(
        f"""
        Title: {row.Title}
        Abstract: {row.abstract}
        Pouria's review: {row["round-A_Pouria_output"]}
        Bardia's review: {row["round-A_Bardia_output"]}
        """
    )
    if "round-B_Brad_output" in row:
        print(row["round-B_Brad_output"])


        Title: Agent Hospital: A Simulacrum of Hospital with Evolvable Medical Agents
        Abstract: In this paper, we introduce a simulacrum of hospital called Agent Hospital that simulates the entire process of treating illness. All patients, nurses, and doctors are autonomous agents powered by large language models (LLMs). Our central goal is to enable a doctor agent to learn how to treat illness within the simulacrum. To do so, we propose a method called MedAgent-Zero. As the simulacrum can simulate disease onset and progression based on knowledge bases and LLMs, doctor agents can keep accumulating experience from both successful and unsuccessful cases. Simulation experiments show that the treatment performance of doctor agents consistently improves on various tasks. More interestingly, the knowledge the doctor agents have acquired in Agent Hospital is applicable to real-world medicare benchmarks. After treating around ten thousand patients (real-world doctors may take over two

In [12]:
brad.memory

[{'identity': {'system_prompt': "Your name is <<Brad>> and you are <<a senior radiologist with a PhD in computer science and years of experience as the director of a DL lab focused on developing ML models for radiology and healthcare>>. Your task is to review input itmes with the following description: <<tilte and abstract of scientific articles>>. Your final output should have the following keys: reasoning (<class 'str'>), score (<class 'int'>).",
   'item_prompt': 'Review the input item below and evaluate it against the following criteria: Scoring task: <<Pouria and Bardia have Looked for articles that disucss large languange models-based AI agents applied to medical imaging data. They scored an article 1 if they thought it does not meet this criteria, 2 if they thought it meets the criteria, 0 if they were uncertain of scoring. You will receive an article they have had different opinions about, as well as each of their scores and their reasoning for that score. Read their reviews an