In [14]:
import warnings
from typing import Literal, get_args

import pandas as pd
from langchain.agents import initialize_agent, load_tools, AgentType
from langchain.agents import load_tools, initialize_agent
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.tools import Tool
# from langchain_community.chat_models import ChatOllama
from langchain_community.utilities import SerpAPIWrapper
from langchain_experimental.plan_and_execute import (
    load_chat_planner, load_agent_executor, PlanAndExecute
)
# if use GPT
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer

In [15]:
# Read the source table; csv_name and csv_dir are reused by later cells to
# build the output file names.
csv_name = "train_data_smi_unique_target_name"
csv_dir = "../target_csv/"
df = pd.read_csv(f"{csv_dir}{csv_name}.csv")

In [16]:
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=FutureWarning)

# Tokenizer setup (currently disabled).
# # Author's note on clean_up_tokenization_spaces: True => spaces kept, False => spaces removed,
# # default = True; use True for prose and False for code or formatted text.
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Load the sentence-embedding model used later to encode the descriptions.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
ReasoningStrategies = Literal["zero-shot-react", "plan-and-solve"]


def load_agent(
    tool_names: list[str],
    strategy: ReasoningStrategies = "zero-shot-react"
) -> "Chain":
    """Build a tool-using agent driven by the ``gpt-4o-mini`` chat model.

    Args:
        tool_names: Tool identifiers forwarded to ``load_tools``.
        strategy: ``"plan-and-solve"`` builds a ``PlanAndExecute`` chain from a
            chat planner and an agent executor; ``"zero-shot-react"`` (the
            default) builds a zero-shot ReAct agent via ``initialize_agent``.

    Returns:
        The ``PlanAndExecute`` chain or the executor returned by
        ``initialize_agent``, depending on ``strategy``.

    Raises:
        ValueError: If ``strategy`` is not one of the supported values.
    """
    # Validate first so a misspelled strategy fails fast instead of silently
    # falling back to the zero-shot agent.
    if strategy not in get_args(ReasoningStrategies):
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {get_args(ReasoningStrategies)}"
        )

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, streaming=True)
    tools = load_tools(
        tool_names=tool_names,
        llm=llm
    )

    if strategy == "plan-and-solve":
        planner = load_chat_planner(llm)
        executor = load_agent_executor(llm, tools, verbose=True)
        return PlanAndExecute(planner=planner, executor=executor, verbose=True)

    return initialize_agent(
        tools=tools, llm=llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True
    )

In [None]:
# tools_llm = ChatOllama(model="llama3.1:70b", temperature=0, streaming=True)

In [None]:
# Names of the tools handed to the agent: PubMed alongside Wikipedia and SerpAPI.
tools = [
    "wikipedia",
    "serpapi",
    "pubmed",
]

In [None]:
# Build the agent with the plan-and-solve strategy and the tool names chosen above.
agent_chain = load_agent(tool_names=tools, strategy="plan-and-solve")

In [None]:
# Make sure a text "description" column exists; it holds the description
# fetched for each target name.
if "description" not in df.columns:
    df["description"] = ""
else:
    # Blank cells in a CSV load as NaN, and NaN is truthy, so rows without a
    # description would look "already filled" to the fetch loop. Normalise
    # them to empty strings (object dtype so text can be written later).
    df["description"] = df["description"].astype(object).fillna("")

In [None]:
# Checkpoint bookkeeping: write a checkpoint after every ten new descriptions;
# save_count tracks how many have been added since the last write.
save_threshold, save_count = 10, 0

In [None]:
# Fetch a description for every target that does not have one yet, writing a
# checkpoint CSV each time `save_threshold` new rows have been processed.

# The column must be object dtype so string results can be written even when
# it was loaded from a CSV as an all-NaN float column.
df["description"] = df["description"].astype(object)

# Iterate over (index label, value) pairs: df.at is label-based, so positional
# counters from enumerate() would address the wrong rows on a non-default index.
for idx, protein_name in tqdm(df["Target Name"].items(), desc="Fetching descriptions", total=len(df)):
    current = df.at[idx, "description"]
    # Skip rows that already hold a description. NaN (a blank CSV cell) is
    # truthy, so it has to be tested explicitly to count as "missing".
    if not pd.isna(current) and current:
        continue

    query = f"Please provide a brief description of {protein_name}."
    try:
        description = agent_chain.invoke(query)

        # Keep only the 'output' entry of the result dict; anything else is
        # recorded as an error string.
        if isinstance(description, dict) and 'output' in description:
            description = description['output']
        else:
            description = f"Error: Unexpected format - {description}"

    except ValueError as e:
        # Store the failure text in place of a description so the loop can
        # continue with the next target.
        description = f"Error: {str(e)}"

    df.at[idx, "description"] = description
    save_count += 1

    # Write a checkpoint once enough new descriptions have accumulated.
    if save_count >= save_threshold:
        df.to_csv(csv_name + " cnt," + str(save_count) + ".csv", index=False)
        save_count = 0  # restart the count after each checkpoint

In [None]:
# Final save once the loop has finished. This is written unconditionally:
# when the last batch landed exactly on the checkpoint threshold, save_count
# is 0 and that batch was only written to the checkpoint file, so guarding on
# save_count > 0 would leave the "_descriptions" file missing or stale.
df.to_csv(csv_dir + csv_name + "_descriptions.csv", index=False)

In [None]:
# Drop the row labelled 0. errors="ignore" keeps the cell re-runnable: without
# it, a second execution raises KeyError because the label is already gone.
# NOTE(review): the reason for discarding this row is not evident from the
# notebook - confirm it is intentional.
df = df.drop(index=0, errors="ignore")

In [None]:
# Display the DataFrame to inspect its current contents.
df

In [None]:
# # JSON results: keep only the 'output' field
# df['description'] = df['description'].apply(lambda x: x['output'])

In [None]:
# Encode each description with the embedding model and store the result in a
# new "embedding" column, aligned row-by-row with "Target Name".
# Missing values in pandas are NaN rather than None, so an `is None` guard
# never fires; anything that is not a string gets None instead of being
# passed to the encoder.
# NOTE(review): "Error: ..." placeholder descriptions are strings and are
# embedded like any other text - confirm that is acceptable downstream.
df['embedding'] = df['description'].apply(
    lambda text: embedding_model.encode(text) if isinstance(text, str) else None
)

In [None]:
# Write the frame, including descriptions and embeddings, to the
# "_descriptions" CSV next to the input file.
# NOTE(review): the "embedding" column holds encode() outputs, which CSV stores
# as text - confirm the consumer of this file can parse them back.
df.to_csv(f"{csv_dir}{csv_name}_descriptions.csv", index=False)

In [None]:
# # 예시 도구 이름 리스트
# tool_names = ["wikipedia"]

# # 에이전트 로드 (제로샷 모델 사용)
# agent_chain = load_agent(tool_names, strategy="zero-shot-react")

# # 에이전트 실행
# result = agent_chain.run("Explain about T1-Faker")

# print("Result:\n", result)

In [None]:
# from langchain_experimental.plan_and_execute import PlanAndExecute
# from langchain.llms import OpenAI
# from langchain.tools import Tool
# from langchain_community.utilities import SerpAPIWrapper

# # LLM 초기화
# llm = OpenAI(model="gpt-4")

# # PubMed 검색 함수 정의
# def search_pubmed(query: str) -> str:
#     serpapi = SerpAPIWrapper()
#     results = serpapi.run(query)
#     return "\n".join(results)

# # PubMed 도구 생성
# pubmed_tool = Tool(
#     name="pubmed_search",
#     func=search_pubmed,
#     description="Search PubMed for scientific articles related to the query."
# )

# # Plan-and-Execute 에이전트 초기화
# agent = PlanAndExecute(
#     planner=llm,
#     executor=llm,
#     tools=[pubmed_tool],
#     verbose=True
# )

# # 에이전트를 사용한 쿼리 실행
# query = "Tyrosine-protein kinase receptor RET"
# response = agent.run(query)

# print(response)
