In [1]:
%reload_ext autoreload
%autoreload 2

import asyncio
import json
import nest_asyncio
import os
import sys
from dotenv import load_dotenv
import numpy as np
import pandas as pd

sys.path.append('../')
from lattereview.providers.openai_provider import OpenAIProvider
from lattereview.providers.ollama_provider import OllamaProvider
from lattereview.providers.litellm_provider import LiteLLMProvider
from lattereview.agents.scoring_reviewer import ScoringReviewer
from lattereview.review_workflow import ReviewWorkflow

Failed to update token costs. Using static costs.
  logger.error("Failed to update token costs. Using static costs.")


## Setting up the notebook

Loading environment variables:

In [2]:
# Load environment variables from .env file
load_dotenv('../.env')
print(os.getenv('OPENAI_API_KEY'))

# Enable asyncio in Jupyter
nest_asyncio.apply()

sk-cq_M0pNgHhCFnlDOCMnagYA1l2X7Yea5CL0ci5pZMNT3BlbkFJ0m0x9wm5M_EstX5SjLu_kdwGMDYjkUdviNPs4pe9cA


Loading a dummy dataset:

In [3]:
data = pd.read_excel('data.xlsx')
data.head()

Unnamed: 0,ID,Title,1st author,repo,year,abstract
0,1,Segmentized quarantine policy for managing a t...,"Kim, J.",arXiv,2024,"By the end of 2021, COVID-19 had spread to ove..."
1,2,AutoProteinEngine: A Large Language Model Driv...,"Liu, Y.",arXiv,2024,Protein engineering is important for biomedica...
2,3,Integration of Large Vision Language Models fo...,"Chen, Z.",arXiv,2024,Traditional natural disaster response involves...
3,4,Choice between Partial Trajectories,"Marklund, H.",arXiv,2024,As AI agents generate increasingly sophisticat...
4,5,Building Altruistic and Moral AI Agent with Br...,"Zhao, F.",arXiv,2024,"As AI closely interacts with human society, it..."


## Testing the base functionalities

Testing the OpenAI provider (with OpenAI and Gemini models):

In [7]:
# openanai_provider = OpenAIProvider(model="gpt-4o-mini")
openanai_provider = OpenAIProvider(model="gemini-1.5-flash")
question = "What is the capital of France?"
asyncio.run(openanai_provider.get_response(question))



('The capital of France is Paris.\n',
 {'input_cost': 5.25e-07, 'output_cost': 2.1e-06, 'total_cost': 2.625e-06})

Testing the Ollama provider:

In [5]:
ollama_provider = OllamaProvider(model="llama3.2-vision:latest", host="http://localhost:11434")
question = "What is the capital of France?"
asyncio.run(ollama_provider.get_response(question))

('The capital of France is Paris!',
 {'input_cost': 0, 'output_cost': 0, 'total_cost': 0})

Testing the LiteLLM provider:

In [9]:
# litellm_provider = LiteLLMProvider(model="gpt-4o-mini")
# litellm_provider = LiteLLMProvider(model="claude-3-5-sonnet-20240620")
# litellm_provider = LiteLLMProvider(model="groq/llama-3.3-70b-versatile")
# litellm_provider = LiteLLMProvider(model="ollama/llama3.2-vision:latest")
# litellm_provider = LiteLLMProvider(model="groq/llama-3.3-70b-versatile")
litellm_provider = LiteLLMProvider(model="gemini/gemini-1.5-flash")

question = "What is the capital of France?"
asyncio.run(litellm_provider.get_response(question))

('The capital of France is Paris.\n', 3.45e-06)

Testing the ScoringReviewer agent:

In [7]:
agent = ScoringReviewer(
    # provider=OpenAIProvider(model="gpt-4o-mini"),
    # provider=OpenAIProvider(model="gemini-1.5-flash"),
    # provider=OllamaProvider(model="llama3.2-vision:latest", host="http://localhost:11434"),
    # provider=LiteLLMProvider(model="gpt-4o-mini"),
    provider=LiteLLMProvider(model="claude-3-5-sonnet-20241022"),
    # provider=LiteLLMProvider(model="groq/llama-3.3-70b-versatile"),
    # provider=LiteLLMProvider(model="gemini/gemini-1.5-flash"),
    name="Pouria",
    max_concurrent_requests=1, 
    backstory="an expert reviewer and researcher!",
    input_description = "article title",
    model_args={"max_tokens": 100, "temperature": 0.1},
    reasoning = "brief",
    review_criteria="Look for articles that certainly do not employ any AI or machine learning agents",
    score_set=[1, 2],
    scoring_rules='Score 1 if the paper does not meet the criteria, and 2 if the paper meets the criteria.',
)


# Dummy input
text_list = data.Title.str.lower().tolist()
print("Inputs:\n\n", '\n'.join(text_list[:3]), "\n\n")

# Dummy review
results, total_cost = asyncio.run(agent.review_items(text_list[:3]))
print("Outputs:")
for result in results:
    print(result)

# Dummy costs
print("\nCosts:\n")
for item in agent.memory:
    print(item['cost'])

print("\nTotal cost:\n")
print(total_cost)

Inputs:

 segmentized quarantine policy for managing a tradeoff between containment of infectious disease and social cost of quarantine
autoproteinengine: a large language model driven agent framework for multimodal automl in protein engineering
integration of large vision language models for efficient post-disaster damage assessment and reporting 




Reviewing 3 items - 2024-12-14 21:22:53: 100%|██████████| 3/3 [00:09<00:00,  3.05s/it]

Outputs:
{'reasoning': 'The title clearly indicates a policy-focused study that addresses the balance between disease containment and social impacts through quarantine segmentation, which is a relevant and well-defined research topic.', 'score': 2}
{'reasoning': 'The title clearly indicates a framework for protein engineering that leverages large language models and multimodal AutoML, which demonstrates a clear focus on protein engineering applications.', 'score': 2}
{'reasoning': 'The title effectively indicates the use of vision-language models for post-disaster assessment, which is a clear and relevant application of AI technology for disaster management.', 'score': 2}

Costs:

0.003057
0.0029760000000000003
0.00294

Total cost:

0.00294





## Testing the main Functionalities

#### A multiagent review workflow for doing title/abstract analysis

Setting up the agents:

In [8]:
pouria = ScoringReviewer(
    # provider=OpenAIProvider(model="gemini-1.5-flash"),
    # provider=OllamaProvider(model="llama3.2-vision:latest", host="http://localhost:11434"),
    # provider=LiteLLMProvider(model="groq/llama-3.3-70b-versatile"),
    # provider=LiteLLMProvider(model="groq/llama-3.3-70b-versatile"),
    provider=LiteLLMProvider(model="gemini/gemini-1.5-flash"),
    name="Pouria",
    max_concurrent_requests=20, 
    backstory="a radiologist with many years of background in statistcis and data science, who are famous among your colleagues for your systematic thinking, organizaton of thoughts, and being conservative",
    model_args={"max_tokens": 100, "temperature": 0.1},
    input_description = "tilte and abstract of scientific articles",
    reasoning = "cot",
    scoring_task="Look for articles that disucss large languange models-based AI agents applied to medical imaging data",
    score_set=[1, 2],
    scoring_rules='Score 1 if the paper meets the criteria, and 2 if the paper does not meet the criteria.',
)

bardia = ScoringReviewer(
    provider=OpenAIProvider(model="gpt-4o-mini"),
    name="Bardia",
    max_concurrent_requests=20, 
    backstory="an expert in data science with a background in developing ML models for healthcare, who are famous among your colleagues for your creativity and out of the box thinking",
    model_args={"max_tokens": 100, "temperature": 0.8},
    input_description = "tilte and abstract of scientific articles",
    reasoning = "brief",
    scoring_task="Look for articles that disucss large languange models-based AI agents applied to medical imaging data",
    score_set=[1, 2],
    scoring_rules='Score 1 if the paper meets the criteria, and 2 if the paper does not meet the criteria.',
)

brad = ScoringReviewer(
    provider=OpenAIProvider(model="gpt-4o"),
    name="Brad",
    max_concurrent_requests=20, 
    backstory="a senior radiologist with a PhD in computer science and years of experience as the director of a DL lab focused on developing ML models for radiology and healthcare",
    input_description = "tilte and abstract of scientific articles",
    temperature=0.4,
    reasoning = "cot",
    max_tokens=100,
    scoring_task="""Pouria and Bardia have Looked for articles that disucss large languange models-based AI agents applied to medical imaging data. 
                       They scored an article 1 if they thought it does not meet this criteria, 2 if they thought it meets the criteria, 0 if they were uncertain of scoring.
                       You will receive an article they have had different opinions about, as well as each of their scores and their reasoning for that score. Read their reviews and determine who you agree with. 
                    """,
    score_set=[1, 2],
    scoring_rules="""Score 1 if you agree with Pouria, and score 2 if you agree with Bardia.""",
)


Setting up the review workflow:

In [9]:
title_abs_review = ReviewWorkflow(
    workflow_schema=[
        {
            "round": 'A',
            "reviewers": [pouria, bardia],
            "inputs": ["Title", "abstract"]
        },
        {
            "round": 'B',
            "reviewers": [brad],
            "inputs": ["Title", "abstract", "round-A_Pouria_output", "round-A_Bardia_output"],
            "filter": lambda row: row["round-A_Pouria_output"]["score"] != row["round-A_Bardia_output"]["score"]
        }
    ]
)

Applying the review workflow to a number of sample articles:

In [10]:
# Reload the data if needed.
sample_data = pd.read_excel('data.xlsx').sample(10).reset_index(drop=True)
updated_data = asyncio.run(title_abs_review(sample_data))

print("Total cost: ")
print(title_abs_review.get_total_cost())

print("\nDetailed cost:")
print(title_abs_review.reviewer_costs)

updated_data


Starting review round A (1/2)...
Processing 10 eligible rows


['round: A', 'reviewer_name: Pouria'] - 2024-12-14 21:23:02: 100%|██████████| 10/10 [00:01<00:00,  8.94it/s]
['round: A', 'reviewer_name: Bardia'] - 2024-12-14 21:23:03: 100%|██████████| 10/10 [00:09<00:00,  1.04it/s]


Starting review round B (2/2)...
Skipping review round B - no eligible rows
Total cost: 
0.0001284

Detailed cost:
{('A', 'Pouria'): 4.785e-05, ('A', 'Bardia'): 8.055e-05}





Unnamed: 0,ID,Title,1st author,repo,year,abstract,round-A_Pouria_output,round-A_Pouria_score,round-A_Pouria_reasoning,round-A_Bardia_output,round-A_Bardia_score,round-A_Bardia_reasoning
0,229,Towards a Surgeon-in-the-Loop Ophthalmic Robot...,"Gomaa, A.",arXiv,2023,Robot-assisted surgical systems have demonstra...,{'reasoning': 'The abstract does not mention t...,2,The abstract does not mention the use of large...,{'reasoning': 'The article focuses on a surgeo...,2,The article focuses on a surgeon-in-the-loop a...
1,430,Computational model for tumor response to adop...,"Luque, L.M.",arXiv,2022,One of the barriers to the development of effe...,{'reasoning': 'The abstract does not mention l...,2,The abstract does not mention large language m...,{'reasoning': 'The article does not discuss la...,2,The article does not discuss large language mo...
2,272,Using a library of chemical reactions to fit s...,"Burrage, P.M.",arXiv,2023,In this paper we introduce a new method based ...,{'reasoning': 'The article does not discuss la...,2,The article does not discuss large language mo...,{'reasoning': 'This paper discusses methods re...,2,This paper discusses methods related to chemic...
3,38,"Talk, Listen, Connect: Navigating Empathy in H...","Roshanaei, M.",arXiv,2024,"Social interactions promote well-being, yet ch...",{'reasoning': 'The article focuses on AI agent...,2,The article focuses on AI agents applied to me...,{'reasoning': 'The article focuses on empathy ...,2,The article focuses on empathy in human-AI int...
4,185,From multiplicity of infection to force of inf...,"Zhan, Q.",medRxiv,2024,"High multiplicity of infection or MOI, the num...",{'reasoning': 'The article focuses on applying...,2,The article focuses on applying queuing theory...,{'reasoning': 'The article discusses the appli...,2,The article discusses the application of queui...
5,123,Can large language models understand uncommon ...,"Wu, J.",arXiv,2024,Large language models (LLMs) like ChatGPT have...,{'reasoning': 'The abstract mentions large lan...,2,The abstract mentions large language models (L...,{'reasoning': 'The article discusses large lan...,2,The article discusses large language models (L...
6,352,Intrinsic Motivation in Model-based Reinforcem...,"Latyshev, A.",arXiv,2023,The reinforcement learning research area conta...,{'reasoning': 'The abstract does not mention l...,2,The abstract does not mention large language m...,{'reasoning': 'The article focuses on reinforc...,2,The article focuses on reinforcement learning ...
7,349,The impact of vaccination frequency on COVID-1...,"Stoddard, M.",medRxiv,2023,While the rapid deployment of SARS-CoV-2 vacci...,{'reasoning': 'The article does not discuss la...,2,The article does not discuss large language mo...,{'reasoning': 'The article discusses vaccinati...,2,The article discusses vaccination frequency's ...
8,585,Biomechanic posture stabilisation via iterativ...,"Hossny, M.",arXiv,2020,It is not until we become senior citizens do w...,{'reasoning': 'The article does not discuss la...,2,The article does not discuss large language mo...,{'reasoning': 'The article does not discuss la...,2,The article does not discuss large language mo...
9,715,Integrating episodic memory into a reinforceme...,"Young, K.J.",arXiv,2018,Episodic memory is a psychology term which ref...,{'reasoning': 'The abstract does not mention l...,2,The abstract does not mention large language m...,{'reasoning': 'The article discusses a reinfor...,2,The article discusses a reinforcement learning...


In [11]:
for i, row in updated_data.iterrows():
    print(
        f"""
        Title: {row.Title}
        Abstract: {row.abstract}
        Pouria's score: {row["round-A_Pouria_score"]}
        Pouria's reasoning: {row["round-A_Pouria_reasoning"]}
        Bardia's score: {row["round-A_Bardia_score"]}
        Bardia's reasoning: {row["round-A_Bardia_reasoning"]}
        Brad's score: {None if "round-B_Brad_score" not in row else row["round-B_Brad_score"]}
        Brad's reasoning: {None if "round-B_Brad_reasoning" not in row else row["round-B_Brad_reasoning"]}
        """
    )


        Title: Towards a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using Reinforcement and Imitation Learning
        Abstract: Robot-assisted surgical systems have demonstrated significant potential in enhancing surgical precision and minimizing human errors. However, existing systems cannot accommodate individual surgeons’ unique preferences and requirements. Additionally, they primarily focus on general surgeries (e.g., laparoscopy) and are unsuitable for highly precise microsurgeries, such as ophthalmic procedures. Thus, we propose an image-guided approach for surgeon-centered autonomous agents that can adapt to the individual surgeon’s skill level and preferred surgical techniques during ophthalmic cataract surgery. Our approach trains reinforcement and imitation learning agents simultaneously using curriculum learning approaches guided by image data to perform all tasks of the incision phase of cataract surgery. By integrating the surgeon’s actions and preferences into t