In [8]:
from typing import Literal

import pandas as pd
from langchain.agents import initialize_agent, load_tools, AgentType
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain_experimental.plan_and_execute import (
    load_chat_planner, load_agent_executor, PlanAndExecute
)
from langchain_openai import ChatOpenAI
from tqdm import tqdm

In [9]:
# Load the input CSV into a DataFrame. `csv_name` is kept separately (without
# the extension) because later cells reuse it to build output file names.
csv_dir = "../target_csv/"
csv_name = "train_data_smi_unique_target_name"
csv_path = csv_dir + csv_name + ".csv"
df = pd.read_csv(csv_path)

In [10]:
# Names of the reasoning strategies that `load_agent` understands.
ReasoningStrategies = Literal["zero-shot-react", "plan-and-solve"]


def load_agent(
    tool_names: list[str],
    strategy: ReasoningStrategies = "zero-shot-react"
) -> Chain:
    """Build a tool-using agent backed by a `gpt-4o-mini` chat model.

    Args:
        tool_names: Tool identifiers forwarded to `load_tools`
            (e.g. "wikipedia", "pubmed").
        strategy: "plan-and-solve" builds a `PlanAndExecute` chain (a chat
            planner plus an agent executor). Any other value, including the
            default "zero-shot-react", builds a zero-shot ReAct agent via
            `initialize_agent`.

    Returns:
        The constructed chain. Neither branch returns an `LLMChain`; both
        returned objects derive from LangChain's `Chain` base class, which is
        why the return type is annotated as `Chain`.
    """
    # temperature=0 keeps the model output as deterministic as the API allows.
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    tools = load_tools(tool_names=tool_names, llm=llm)

    if strategy == "plan-and-solve":
        planner = load_chat_planner(llm)
        executor = load_agent_executor(llm, tools, verbose=True)
        return PlanAndExecute(planner=planner, executor=executor, verbose=True)

    return initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
    )

In [11]:
# Load the agent: Wikipedia and PubMed tools, plan-and-solve strategy.
tool_names = ["wikipedia", "pubmed"]
agent_chain = load_agent(tool_names=tool_names, strategy="plan-and-solve")

In [12]:
# Create the "description" column (initially empty strings) if the loaded CSV
# does not already have one; the fetch loop fills it in and saves it.
missing_description_column = "description" not in df.columns
if missing_description_column:
    df["description"] = ""

In [13]:
save_threshold: int = 10  # write a checkpoint file every ten new descriptions
save_count: int = 0  # new descriptions added since the last checkpoint

In [14]:
# Counter embedded in each checkpoint file name. It is rendered as uppercase
# hex, so the names run A, B, ..., F, 10, ... It is initialised once, outside
# the loop, so that it actually advances from one checkpoint to the next.
hex_count = 0xA

# A "description" column read back from a CSV with empty cells can arrive as a
# float column of NaN; store it as object so text can be assigned into it.
df["description"] = df["description"].astype(object)

# Iterate over (index label, value) pairs so that `df.at[idx, ...]` addresses
# the right row even when the index is not the default 0..n-1 range.
for idx, protein_name in tqdm(
    df["Target Name"].items(), desc="Fetching descriptions", total=len(df)
):
    existing = df.at[idx, "description"]
    # Skip rows that already have a description. NaN is truthy in Python, so
    # missing values must be tested explicitly or they would be skipped too.
    already_filled = not pd.isna(existing) and bool(existing)
    if already_filled:
        continue

    query = f"Please provide a brief description of {protein_name}."
    try:
        # NOTE(review): handle_parsing_errors is normally an option of the
        # agent executor itself; confirm that passing it to invoke() has any
        # effect, otherwise set it where the agent is constructed.
        result = agent_chain.invoke(query, handle_parsing_errors=True)
    except ValueError as e:
        # Record the failure in place of a description. Note that such rows
        # then count as filled and are not retried on a later run.
        description = f"Error: {str(e)}"
    else:
        # invoke() returns a mapping of the chain's inputs and outputs; keep
        # only the generated text rather than storing the whole dict.
        if isinstance(result, dict):
            result = result.get("output", result)
        description = str(result)

    df.at[idx, "description"] = description
    save_count += 1

    # Write a checkpoint once `save_threshold` new descriptions have been added.
    if save_count >= save_threshold:
        df.to_csv(f"{csv_name} cnt,{hex_count:X}{save_count}.csv", index=False)
        save_count = 0  # reset the counter after saving
        hex_count += 1

KeyError: 'protein_name'

In [None]:
# Final save after the loop has finished. This is written unconditionally:
# each checkpoint resets `save_count` to 0, so guarding on `save_count > 0`
# would skip the final output file whenever the number of new descriptions
# happens to be an exact multiple of `save_threshold`.
df.to_csv(csv_name + "_descriptions.csv", index=False)

In [None]:
# # Example list of tool names
# tool_names = ["wikipedia"]

# # Load the agent (using the zero-shot model)
# agent_chain = load_agent(tool_names, strategy="zero-shot-react")

# # Run the agent
# result = agent_chain.run("Explain about T1-Faker")

# print("Result:\n", result)