In [1]:
import requests
import pandas as pd

from bs4 import BeautifulSoup
from langchain import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
csv_dir = './target_csv/'
csv_name = 'target_protein_wiki'
df = pd.read_csv(csv_dir + csv_name + '.csv')

In [3]:
# 임베딩 모델 설정 (예: Sentence-BERT 사용)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')



In [4]:
def get_wiki_content(url):
    """ 주어진 Wikipedia URL에서 내용을 크롤링하여 텍스트를 반환합니다. """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join([para.text for para in paragraphs])
        return content.strip()
    except Exception as e:
        return f"Error: {str(e)}"

In [5]:
def summarize_text(text, llm_chain):
    """주어진 텍스트를 LLM을 통해 요약합니다."""
    try:
        return llm_chain.invoke(text).content
    except Exception as e:
        return f"Error during summarization: {str(e)}"

In [6]:
# LLM 설정
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

template = """
Please remove any unnecessary phrases (e.g., 'citation needed', '[who?]', '[when?]' etc.) from the text below and retain only the core content. Clean up any parts that disrupt the flow of the text. However, do not remove any identifiers (e.g., gene or protein IDs).
Text: {text}
"""

# Prompt 설정
prompt = PromptTemplate(
    input_variables=["text"],  # LLM이 처리할 입력 변수 이름
    template=template
)# LLM에게 줄 실제 프롬프트

# LLMChain 생성
llm_chain = prompt | llm

In [7]:
df_filtered = df[df['Wiki'] != 'FALSE']

In [13]:
# "Wiki_documents" 컬럼 생성 및 요약 처리
df_filtered.loc[:, 'Wiki_documents'] = df_filtered['Wiki'].apply(
    lambda x: summarize_text(get_wiki_content(x), llm_chain)
)


In [14]:
# 임베딩 생성 및 "embedding" 컬럼 추가
df_filtered.loc[:, 'embedding'] = df_filtered['Wiki_documents'].apply(
    lambda text: None if text is None else embedding_model.encode(text)
)

In [16]:
print(df_filtered.head)

<bound method NDFrame.head of                                           Target Name  \
1             Macrophage-stimulating protein receptor   
2                           Cyclin-dependent kinase 7   
4                                   Plasma kallikrein   
5                       PI3-kinase p110-alpha subunit   
6   Signal transducer and activator of transcripti...   
7                     Rho-associated protein kinase 2   
8                        Tyrosine-protein kinase JAK2   
10              Serine/threonine-protein kinase B-raf   
11                        Tyrosine-protein kinase TXK   
12                      PI3-kinase p110-delta subunit   
14               Serine/threonine-protein kinase mTOR   
15                Dual specificity protein kinase TTK   
16                    Tyrosine-protein kinase ITK/TSK   
17                               Choline kinase alpha   
18                        Tyrosine-protein kinase TEC   
19              Tyrosine-protein kinase receptor FLT3   
2

In [17]:
print(df_filtered.columns)

Index(['Target Name', 'Wiki', 'Wiki_documents', 'embedding'], dtype='object')


In [18]:
# 결과를 새 CSV 파일로 저장
output_file_path = './target_csv/protein_embedding_with_LLM.csv'
df_filtered.to_csv(output_file_path, index=False , index = False)
print(f"새 파일이 저장되었습니다: {output_file_path}")

새 파일이 저장되었습니다: ./target_csv/protein_embedding_with_LLM.csv


In [12]:
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd

# csv_name = 'target_protein_wiki'
# df = pd.read_csv(csv_name + '.csv')

# def get_wiki_content(url):
#     """ 주어진 Wikipedia URL에서 내용을 크롤링하여 텍스트를 반환합니다. """
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#         paragraphs = soup.find_all('p')
#         content = ' '.join([para.text for para in paragraphs])
#         return content.strip()
#     except Exception as e:
#         return f"Error: {str(e)}"

# # 새로운 컬럼 "Wiki_documents" 생성
# df['Wiki_documents'] = df['Wiki'].apply(lambda x: None if x == "FALSE" else get_wiki_content(x))

# # 결과를 새 CSV 파일로 저장
# output_file_path = csv_name + '_wiki_documents.csv'
# df.to_csv(output_file_path, index=False)

# print(f"새 파일이 저장되었습니다: {output_file_path}")
# None GPT