In [1]:
import requests
import pandas as pd

from bs4 import BeautifulSoup
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Base name of the input CSV; the output file name is derived from it as well.
csv_name = 'target_protein_wiki'
df = pd.read_csv(f"{csv_name}.csv")

In [3]:
# Embedding model setup (a Sentence-BERT style model loaded through
# sentence_transformers).
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)



In [4]:
def get_wiki_content(url, timeout=10):
    """Download a Wikipedia page and return the text of its paragraphs.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get`` so that a stalled connection cannot block the
            caller indefinitely.

    Returns:
        The text of every ``<p>`` element joined with single spaces, with
        surrounding whitespace stripped. On any failure (network error,
        timeout, non-success HTTP status, parsing problem) a string of the
        form ``"Error: <message>"`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx replies into exceptions so the body of an error page
        # is not mistaken for article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = ' '.join(para.text for para in soup.find_all('p'))
        return content.strip()
    except Exception as e:
        # Best-effort contract: failures are reported through the return
        # value rather than by raising.
        return f"Error: {str(e)}"

In [5]:
def summarize_text(text, llm_chain):
    """Summarize `text` by invoking `llm_chain` and returning the reply's content.

    Args:
        text: The text handed to ``llm_chain.invoke``.
        llm_chain: Object exposing ``invoke``; its result must carry a
            ``content`` attribute.

    Returns:
        The ``content`` of the chain's reply, or the string
        ``"Error during summarization: <message>"`` if anything raised.
    """
    try:
        reply = llm_chain.invoke(text)
        summary = reply.content
    except Exception as exc:
        return "Error during summarization: " + str(exc)
    return summary

In [6]:
# Chat model used for summarization.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Prompt with a single placeholder, {text}, which receives the text to summarize.
SUMMARY_TEMPLATE = "Please summarize the following text: {text}"
prompt = PromptTemplate.from_template(SUMMARY_TEMPLATE)

# Pipe the prompt into the model. Despite its name, `llm_chain` is the
# pipeline produced by `prompt | llm`, not an instance of the LLMChain class.
llm_chain = prompt | llm

In [7]:
# "Wiki_documents" 컬럼 생성 및 요약 처리
def _summarize_wiki_page(url):
    """Return an LLM summary of the page at `url`, or None if there is nothing to summarize.

    None is returned when the cell holds the "FALSE" marker or a non-string
    value (for example NaN from an empty CSV cell), when the crawl yields no
    text, or when `get_wiki_content` reports a failure through its
    "Error: ..." return value. Without these guards the error message itself
    would be sent to the LLM and stored as though it summarized the article.
    """
    if not isinstance(url, str) or url == "FALSE":
        return None
    content = get_wiki_content(url)
    # get_wiki_content signals failure in-band with an "Error: ..." string.
    if not content or content.startswith("Error:"):
        return None
    return summarize_text(content, llm_chain)


df['Wiki_documents'] = df['Wiki'].apply(_summarize_wiki_page)

In [8]:
# Create embeddings and add them as the "embedding" column.
def _encode_or_none(document):
    """Encode `document` with `embedding_model`; pass None through unchanged."""
    if document is None:
        return None
    return embedding_model.encode(document)


df['embedding'] = df['Wiki_documents'].apply(_encode_or_none)

In [11]:
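# df.head()  # note: print(df.head) without the call parentheses prints the bound method (see the output below), not the rows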

<bound method NDFrame.head of                                 Target Name  \
0      Tyrosine-protein kinase receptor RET   
1   Macrophage-stimulating protein receptor   
2                 Cyclin-dependent kinase 7   
3                 Serine-protein kinase ATM   
4                         Plasma kallikrein   
..                                      ...   
56          Thymidine kinase, mitochondrial   
57         Stem cell growth factor receptor   
58              Tyrosine-protein kinase BRK   
59     Serine/threonine-protein kinase AKT3   
60            PI3-kinase p110-gamma subunit   

                                                 Wiki  \
0                                               FALSE   
1                 https://en.wikipedia.org/wiki/MST1R   
2   https://en.wikipedia.org/wiki/Cyclin-dependent...   
3                                               FALSE   
4     https://en.wikipedia.org/wiki/Plasma_kallikrein   
..                                                ...   
56    

In [9]:
# Save the result to a new CSV file (no index column) and print its name.
# NOTE(review): the 'embedding' cells hold whatever embedding_model.encode
# returned; to_csv presumably writes their text form, which may be awkward
# to parse back. Confirm against whatever reads this file.
output_file_path = f"{csv_name}_wiki_documents_summarized.csv"
df.to_csv(output_file_path, index=False)
print(f"새 파일이 저장되었습니다: {output_file_path}")

새 파일이 저장되었습니다: target_protein_wiki_wiki_documents_summarized.csv


In [10]:
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd

# csv_name = 'target_protein_wiki'
# df = pd.read_csv(csv_name + '.csv')

# def get_wiki_content(url):
#     """ 주어진 Wikipedia URL에서 내용을 크롤링하여 텍스트를 반환합니다. """
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#         paragraphs = soup.find_all('p')
#         content = ' '.join([para.text for para in paragraphs])
#         return content.strip()
#     except Exception as e:
#         return f"Error: {str(e)}"

# # 새로운 컬럼 "Wiki_documents" 생성
# df['Wiki_documents'] = df['Wiki'].apply(lambda x: None if x == "FALSE" else get_wiki_content(x))

# # 결과를 새 CSV 파일로 저장
# output_file_path = csv_name + '_wiki_documents.csv'
# df.to_csv(output_file_path, index=False)

# print(f"새 파일이 저장되었습니다: {output_file_path}")
# Variant without GPT: stores the raw crawled text (no LLM summarization).