In [1]:
!pip install -q --upgrade langchain langchain-google-genai
!pip install -q --upgrade langchain-community
!pip install -q --upgrade langchain-core

[0m

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import json
from typing import List
import time
from pydantic import BaseModel, Field

In [3]:
# Never store the key in the notebook. Reuse a key that is already exported in
# the environment and only prompt (without echoing) when none is present.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass

    os.environ["GOOGLE_API_KEY"] = getpass("GOOGLE_API_KEY: ")

In [5]:
# Instruction text that is substituted into the `{query}` slot of the prompt for
# every paragraph. It enumerates the seven mention categories (with the allowed
# values for the closed-vocabulary ones) and asks the model to report, for each
# mention found, its type, the answer, and the source sentence.
# NOTE: this is runtime text sent to the model; editing the wording changes the
# prompt and may change the annotations produced.
query = """
Please help me analysis if there are mentions in the provided text. There are 7 different types of mentions. 

catalyst: The catalyst used in the experiment, often is a chemical compound
co-catalyst: The co-catalyst used in the experiment, often is a chemical compound
light source: The type of light source used in the experiment. Possible choices are UV, Solar, UV-Vis, Monochromatic, Solar Simulator
lamp: The type of lamp used in the experiment. Possible choices are Fluorescent, Mercury, 'Halogen', 'Mercury-Xenon', 'LED', 'Tungsten', 'Xenon', 'Tungsten-Halide', 'Solar Simulator'
reaction medium: The type of the medium of the reactor used in the experiment. Possible choices are 'Liquid', 'Gas'
reactor type: The type of the reactor, possible choices are 'Slurry', 'Fixed-bed', 'Optical Fiber', 'Monolithic', 'Membrane', 'Fluidised-bed'
operation mode: The mode of operation in the experiment. Possible choices are 'Batch', 'Continuous', 'Batch/Continuous'

Please help me analysis if there are mentions in the provided text. If so, please indicate the answer for each mention in the provided text. And please indicate which sentence is the source of the mention as well. 

If there are any of these types are mentioned, please answer the question as type, answer and source from the context.
"""

In [28]:
# One extracted mention: which category it belongs to, the value chosen or
# inferred for it, and the text it was taken from. Instances are converted with
# dict(...) and written to the annotation JSON by `execute`.
# NOTE(review): documented with `#` comments rather than a docstring on purpose;
# pydantic may copy a class docstring into the generated schema, which would
# alter the format instructions sent to the model - confirm before adding one.
class Answer(BaseModel):
    # Name of the mention category (e.g. the types listed in the query text).
    category: str = Field(description="The category of this answer")
    # The value the model selected or inferred for that category.
    answer: str = Field(description="The actual selection or inference of the answer")
    # The passage from the context that supports the answer.
    source: str = Field(description="The original source of the answer")

# Top-level structure the model is asked to return for one paragraph: a free-text
# reasoning field plus the list of extracted mentions. Passed to
# PydanticOutputParser as the target schema.
# NOTE(review): kept as `#` comments rather than a docstring because pydantic may
# include a class docstring in the generated schema, which would change the
# prompt - confirm before adding one.
class Answers(BaseModel):
    # Free-text reasoning produced by the model; not written to the output file by `execute`.
    analysis: str = Field(description="The thinking process")
    # The individual mentions found in the paragraph.
    answers: List[Answer] = Field(description="list of answers which contains the category, answer and source context of the answer")

def run_llm_annotation(llm, parser, query, context):
    """Send one annotation request to the model and return its raw response.

    The prompt is assembled from a fixed template whose format instructions
    come from ``parser``; ``query`` and ``context`` are filled in at call time.
    The response is returned unparsed, so the caller is responsible for
    running it through the parser.

    Args:
        llm: Chat model (or any object that can be piped after a prompt).
        parser: Output parser providing ``get_format_instructions()``.
        query: Instruction text describing what to extract.
        context: The paragraph to be annotated.

    Returns:
        Whatever ``(prompt | llm).invoke(...)`` returns.
    """
    template_text = "Answer the user query.\n{format_instructions}\n{query}\nContext: {context}"
    format_instructions = parser.get_format_instructions()
    annotation_prompt = PromptTemplate(
        template=template_text,
        input_variables=["query", "context"],
        partial_variables={"format_instructions": format_instructions},
    )
    payload = {"query": query, "context": context}
    return (annotation_prompt | llm).invoke(payload)

def _load_paper_sections(path):
    """Read one extracted-paper JSON file and return the parsed section list."""
    with open(path, "rb") as handle:
        return json.load(handle)


def _annotate_paragraph(llm, parser, query, text):
    """Annotate a single paragraph and return its ``{paragraph_text, annotations}`` record."""
    record = {"paragraph_text": text, "annotations": []}
    # The model call is deliberately outside the try block: an API failure should
    # abort the run instead of being saved as an (empty) annotation file that
    # later runs would skip.
    response = run_llm_annotation(llm, parser, query, text)
    try:
        parsed = parser.parse(response.content)
        print(parsed.answers)
        for answer in parsed.answers:
            annotation = dict(answer)
            annotation["annotator"] = "hybrid annotation"
            record["annotations"].append(annotation)
        print("Auto-Annotation Succeed!")
    except Exception as exc:  # not a bare except: Ctrl-C must still stop the run
        print(f"Auto-Annotation Failed :(")
        print(f"  reason: {type(exc).__name__}: {exc}")
        # Keep the original placeholder so the output layout is unchanged.
        record["annotations"].append({})
    return record


def _build_paper_record(llm, parser, query, sections):
    """Assemble the output record (DOI, title, annotated paragraphs) for one paper."""
    record = {}
    for item in sections:
        if item["title"] == "Doi":
            record["paper_doi"] = item["content"]
        elif item["title"] == "Article_Title":
            record["paper_title"] = item["content"]
    record["paragraphs"] = []
    for item in sections:
        if item["title"] in ["Abstract", "Experimental", "Results_and_Discussion", "Conclusions"]:
            print(item["title"])
            record["paragraphs"].append(
                _annotate_paragraph(llm, parser, query, item["content"])
            )
    return record


def execute(files_dir, model_id, query, Answers,
            output_dir="../SolarAnno/annotation", sleep_seconds=30):
    """Annotate every paper file with the LLM and save one JSON result per paper.

    For each path in ``files_dir`` the output file is
    ``<output_dir>/new_annotation_<suffix>``, where ``<suffix>`` is the part of
    the file *name* after its last underscore. Papers whose output file already
    exists are skipped, so an interrupted run can be resumed.

    Args:
        files_dir: Iterable of paths to extracted-paper JSON files. Each file is
            expected to hold a list of ``{"title": ..., "content": ...}`` items.
        model_id: Model name passed to ``ChatGoogleGenerativeAI``.
        query: Instruction text forwarded to ``run_llm_annotation``.
        Answers: Pydantic model class used as the parser's target schema.
        output_dir: Directory for the result files; created if missing.
        sleep_seconds: Pause after each processed paper (falsy disables it).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    parser = PydanticOutputParser(pydantic_object=Answers)
    llm = ChatGoogleGenerativeAI(
        model=model_id,
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        # other params...
    )
    for file in files_dir:
        # Derive the suffix from the file name only, so underscores in the
        # directory part of the path cannot leak into the output name.
        index_file = os.path.basename(file).split("_")[-1]
        saved_path = os.path.join(output_dir, f"new_annotation_{index_file}")
        if os.path.exists(saved_path):
            continue
        print(file)
        if not file.endswith("json"):
            continue
        record = _build_paper_record(llm, parser, query, _load_paper_sections(file))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # The context manager guarantees the JSON is flushed and the handle closed.
        with open(saved_path, "w") as handle:
            json.dump(record, handle)
        if sleep_seconds:
            time.sleep(sleep_seconds)

In [29]:
# Directory holding the extracted paper text. The trailing slash is required:
# file names are appended to this string by plain concatenation when the list
# of JSON paths is built.
root_dir = "../SolarAnno/extracted_text/"
# All entries in that directory (bare names, not full paths).
files = os.listdir(root_dir)

In [30]:
# Full paths of the directory entries whose name ends in "json".
json_files = [root_dir + name for name in files if name[-4:] == "json"]

In [31]:
# Run the annotation over every collected JSON file with the Gemini 2.0 Flash
# model; papers that already have an output file are skipped by `execute`.
execute(json_files, 'gemini-2.0-flash', query, Answers)

/home/jovyan/Solar/CLI/new_paper/new_result/new_paper_14.json
Abstract
[Answer(category='catalyst', answer='C1', source='A novel binuclear cobalt complex C1 of rigidly linking two Co(TPA)ClCl moieties at meta positions of a benzene was synthesized and characterized as a catalyst for homogeneous photocatalytic CO 2 reduction.')]
Auto-Annotation Succeed!
Experimental
[Answer(category='catalyst', answer='Pd(PPh 3 ) 4', source='Pd(PPh 3 ) 4 was obtained from Energy chemical.'), Answer(category='catalyst', answer='CoCl 2 6H 2 O', source='CoCl 2 6H 2 O was purchased from Acros.'), Answer(category='co-catalyst', answer='Ir(ppy) 3', source='Cobalt complex and Ir(ppy) 3 were put into the tube with certain concentration and the total volume of photocatalytic solution was 5.0 mL (the volume ratio of DMF/TEA was 4:1).'), Answer(category='reaction medium', answer='Liquid', source='Liquid products, such as formic acid, were not detected by ion chromatography (Metrohm IC 861).'), Answer(category='lam