In [None]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): versions are not pinned, so a fresh run may pull different releases — consider pinning.
%pip install tiktoken
%pip install openai
%pip install textract
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install plotly
%pip install scipy
%pip install scikit-learn



# 라이브러리 임포트

In [None]:
import os
import numpy as np
import pandas as pd
from typing import Iterator
import tiktoken
import textract

# 논문 파일 리스트 구하기

In [None]:
# Download the sample paper archive and save it as paper.zip in the working directory.
!wget https://raw.githubusercontent.com/dhrim/2024_ai_workshop/main/material/data/paper.zip -O paper.zip


--2024-06-28 21:20:54--  https://raw.githubusercontent.com/dhrim/2024_ai_workshop/main/material/data/paper.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9912147 (9.5M) [application/zip]
Saving to: ‘paper.zip’


2024-06-28 21:20:55 (175 MB/s) - ‘paper.zip’ saved [9912147/9912147]



In [None]:
# Remove any previously extracted "paper" directory, then extract the archive again.
!rm -rf paper
!unzip paper.zip

Archive:  paper.zip
   creating: paper/
  inflating: paper/jkma-2023-66-3-200.pdf  
  inflating: paper/jkma-2023-66-1-50.pdf  
  inflating: paper/jkma-2023-66-3-160.pdf  
  inflating: paper/jkma-2023-66-3-166.pdf  
  inflating: paper/jkma-2023-66-3-173.pdf  
  inflating: paper/jkma-2023-66-1-41.pdf  
  inflating: paper/jkma-2023-66-2-105.pdf  
  inflating: paper/jkma-2023-66-1-31.pdf  
  inflating: paper/jkma-2023-66-1-19.pdf  
  inflating: paper/jkma-2023-66-2-112.pdf  
  inflating: paper/jkma-2023-66-2-116.pdf  
  inflating: paper/jkma-2023-66-2-132.pdf  
  inflating: paper/jkma-2023-66-1-11.pdf  
  inflating: paper/jkma-2023-66-2-92.pdf  
  inflating: paper/jkma-2023-66-2-123.pdf  
  inflating: paper/jkma-2023-66-3-209.pdf  
  inflating: paper/jkma-2023-66-3-180.pdf  
  inflating: paper/jkma-2023-66-3-191.pdf  
  inflating: paper/jkma-2023-66-2-143.pdf  
  inflating: paper/jkma-2023-66-1-60.pdf  


In [None]:
import os

# Directory that holds the extracted papers.
data_path = "paper"

# Keep only PDF files (anything else in the directory would break text extraction)
# and sort the names, because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(
    name for name in os.listdir(data_path)
    if name.lower().endswith(".pdf")
)

print(len(pdf_files))
print(pdf_files[:3])

20
['jkma-2023-66-1-41.pdf', 'jkma-2023-66-2-112.pdf', 'jkma-2023-66-1-31.pdf']


# 파일을 조각(chunk)로 쪼개고 임베딩해 두기

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Never hard-code the API key in the notebook: read it from the OPENAI_API_KEY
# environment variable and fall back to an interactive, non-echoing prompt.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))

In [None]:
import tiktoken

# Target chunk length in tokens; chunks() uses it as the default for chunk_size.
TEXT_EMBEDDING_CHUNK_SIZE: int = 2000
# tiktoken encoding used to tokenize and detokenize text in this notebook.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Module-level list; handle_file_string() appends one dict per embedded chunk to it.
embedding_storage: list = []

# Split a long string into token chunks of roughly chunk_size tokens and yield each chunk's tokens.
def chunks(text, chunk_size=TEXT_EMBEDDING_CHUNK_SIZE):
    """Yield successive token chunks of about ``chunk_size`` tokens from ``text``.

    Each chunk preferably ends at a sentence boundary: starting from
    ``1.5 * chunk_size`` tokens, the end is moved backwards until the chunk
    ends with "." or a newline. If no such boundary is found above
    ``0.5 * chunk_size`` tokens, the chunk is cut at exactly ``chunk_size``
    tokens.

    Args:
        text: The string to tokenize and split.
        chunk_size: Target number of tokens per chunk; must be positive.

    Yields:
        Lists of token ids, in order; concatenated they equal the encoding of ``text``.

    Raises:
        ValueError: When iteration starts with a non-positive ``chunk_size``
            (the loop could never advance and would yield empty chunks forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")

    tokens = tokenizer.encode(text)
    lower_offset = int(0.5 * chunk_size)
    upper_offset = int(1.5 * chunk_size)

    i = 0
    while i < len(tokens):
        # Look for a sentence end between 0.5x and 1.5x of the chunk size.
        j = min(i + upper_offset, len(tokens))
        while j > i + lower_offset:
            # Only the final token decides how the chunk ends, so inspect its raw
            # bytes instead of re-decoding the whole candidate chunk on every step.
            last_token_bytes = tokenizer.decode_single_token_bytes(tokens[j - 1])
            if last_token_bytes.endswith((b".", b"\n")):
                break
            j -= 1
        # No sentence end found: fall back to exactly chunk_size tokens.
        if j == i + lower_offset:
            j = min(i + chunk_size, len(tokens))
        yield tokens[i:j]
        i = j

def get_embeddings(texts):
    """Return one embedding per input using the "text-embedding-3-small" model.

    Args:
        texts: A single string or a list of strings to embed.

    Returns:
        A list with the ``embedding`` of every item in the API response, in
        response order. An empty list/tuple of inputs returns ``[]`` without
        calling the API.
    """
    # An empty batch has nothing to embed; skip the request instead of sending
    # an empty input array (which the embeddings endpoint rejects).
    if isinstance(texts, (list, tuple)) and not texts:
        return []
    response = client.embeddings.create(input=texts, model="text-embedding-3-small")
    return [item.embedding for item in response.data]

def create_embeddings_for_text(text):
    """Chunk ``text``, decode every chunk back to a string and embed all chunks.

    Returns:
        A ``(text_chunks, embedding_chunks)`` pair: the decoded chunk strings
        and whatever ``get_embeddings`` returns for them.
    """
    # chunks() yields token lists; turn each one back into its text form.
    pieces = [tokenizer.decode(token_ids) for token_ids in chunks(text)]
    # Embed all pieces with a single get_embeddings() call.
    vectors = get_embeddings(pieces)
    return pieces, vectors


def handle_file_string(filename, file_content_string):
    """Clean a file's text, embed it chunk by chunk and store the results.

    One dict per chunk is appended to the module-level ``embedding_storage``
    with the keys "embedding", "filename", "chunk_index" and "text".

    Args:
        filename: Name recorded with every stored chunk.
        file_content_string: Raw text content of the file.

    Returns:
        None. If creating the embeddings fails, the error is printed and
        nothing is stored for this file.
    """
    # Semicolons become spaces; split()/join then collapses every whitespace
    # run (line breaks included) into a single space.
    clean_file_content_string = " ".join(file_content_string.replace(";", " ").split())

    try:
        text_chunks, embedding_chunks = create_embeddings_for_text(clean_file_content_string)
    except Exception as e:
        print("[handle_file_string] Error creating embedding: {}".format(e))
        # Stop here: the chunk variables are unbound after a failure, so
        # continuing to the loop below would raise UnboundLocalError.
        return
    print(f"[INFO] Embedded into {len(text_chunks)} chunks")

    for i, (text_chunk, embedding_chunk) in enumerate(zip(text_chunks, embedding_chunks)):
        embedding_storage.append({"embedding":embedding_chunk, "filename": filename, "chunk_index":i, "text":text_chunk })


In [None]:

# Process each PDF file: extract its text, then chunk and embed it.
# Start from an empty store so that re-running this cell does not append
# a second copy of every chunk.
embedding_storage.clear()

for i, pdf_file in enumerate(pdf_files):

    pdf_path = os.path.join(data_path, pdf_file)
    print(f"[INFO] {i}/{len(pdf_files)}. processing paper : {pdf_path}\n")

    # textract returns bytes; decode once and reuse the string below.
    text = textract.process(pdf_path, method='pdfminer')
    decoded_text = text.decode("utf-8")
    print(decoded_text[0:200])
    print()

    handle_file_string(pdf_file, decoded_text)
    print()
    print("============================================================")


[INFO] 0/20. processing paper : paper/jkma-2023-66-1-41.pdf

CONTINUING EDUCATION COLUMN 
J Korean Med Assoc 2023 January; 66(1):41-47

pISSN 1975-8456 / eISSN 2093-5951

https://doi.org/10.5124/jkma.2023.66.1.41

약물에 의한 피부유해반응
약물에 의한 피부유해반응

이 은 혜·장 용 현
경북대학교 

[INFO] Embedded into 5 chunks

[INFO] 1/20. processing paper : paper/jkma-2023-66-2-112.pdf

FOCUSED ISSUE OF THIS MONTH 
J Korean Med Assoc 2023 February; 66(2):112-115

pISSN 1975-8456 / eISSN 2093-5951

https://doi.org/10.5124/jkma.2023.66.2.112

면역항암치료의 부작용 관리: 간과 위장관계를  
면역항암치료의 부작용 관리: 

[INFO] Embedded into 3 chunks

[INFO] 2/20. processing paper : paper/jkma-2023-66-1-31.pdf

 특집3

FOCUSED ISSUE OF THIS MONTH 
J Korean Med Assoc 2023 January; 66(1):31-40

pISSN 1975-8456 / eISSN 2093-5951

https://doi.org/10.5124/jkma.2023.66.1.31

후두신경통의 진단과 치료: 큰뒤통수신경 포착증후
후두신경통의 진단과 치료

[INFO] Embedded into 7 chunks

[INFO] 3/20. processing paper : paper/jkma-2023-66-3-173.pdf

 특집3

FOCUSED ISSUE OF THIS MONTH 
J Korean Med Assoc 

In [None]:
from pprint import pprint

# Inspect the store: total number of chunks, then the fields of the first entry.
print(len(embedding_storage))
first_entry = embedding_storage[0]
first_vector = first_entry["embedding"]
print(first_vector[:10])
print(len(first_vector))
for field in ("filename", "chunk_index", "text"):
    print(first_entry[field])

110
[-0.05653475224971771, 0.03928762301802635, 0.01875905506312847, 0.06679343432188034, -0.015511218458414078, 0.012576966546475887, 0.013484121300280094, 0.021245330572128296, -0.00978830736130476, -0.005123741924762726]
1536
jkma-2023-66-1-41.pdf
0
CONTINUING EDUCATION COLUMN J Korean Med Assoc 2023 January 66(1):41-47 pISSN 1975-8456 / eISSN 2093-5951 https://doi.org/10.5124/jkma.2023.66.1.41 약물에 의한 피부유해반응 약물에 의한 피부유해반응 이 은 혜·장 용 현 경북대학교 의과대학 피부과학교실 Cutaneous adverse drug reactions Eun Hye Lee, MD · Yong Hyun Jang, MD Department of Dermatology, School of Medicine, Kyungpook National University, Daegu, Korea Background: Cutaneous adverse drug reactions are common and produce easily identifiable clinical symptoms. These may range from mild maculopapular rashes to severe reactions associated with systemic disease. Current Concepts: The most common presentation of a drug eruption is in the form of a maculopapular rash or exanthematous skin eruption, followed by fixed drug eruptions an

# 가장 가까운 조각(chunk) 구하기

In [None]:
from scipy.spatial.distance import cosine
import pandas as pd

def find_closest_n_index(embedding_storage, embedded_query, n=3):
    """Return the positions of the ``n`` stored embeddings closest to the query.

    Closeness is the cosine distance between ``embedded_query`` and each
    entry's "embedding" value; positions are returned in ascending distance.
    """
    distances = pd.Series(
        [cosine(embedded_query, entry["embedding"]) for entry in embedding_storage],
        dtype="float64",
    )
    ranked = distances.sort_values()
    return ranked.index.to_list()[:n]

def find_closest_n_chunk(embedding_storage, query, n=3):
    """Embed ``query`` and return the ``n`` stored entries closest to it.

    The entries come back in the order given by ``find_closest_n_index``.
    """
    query_vector = get_embeddings(query)[0]
    hits = []
    for position in find_closest_n_index(embedding_storage, query_vector, n):
        hits.append(embedding_storage[position])
    return hits

In [None]:
# The question to answer, and the single stored chunk closest to it.
question = "코로나19 증후군의 증상은?"
closest_chunks = find_closest_n_chunk(embedding_storage, question, n=1)
queryed = closest_chunks[0]

In [None]:
# Show where the best-matching chunk came from, followed by its text.
for field in ("filename", "chunk_index", "text"):
    print(queryed[field])

jkma-2023-66-1-50.pdf
0
SPECIAL CONTRIBUTION J Korean Med Assoc 2023 January 66(1):50-59 pISSN 1975-8456 / eISSN 2093-5951 https://doi.org/10.5124/jkma.2023.66.1.50 만성 코로나19 증후군 시대를 위한 만성 코로나19 증후군 시대를 위한 보건의료 대응 방안 보건의료 대응 방안 김 혜 준1·송 지 훈1·박 상 민1,2 ¹서울대학교 대학원 의과학과 헬스시스템 데이터 사이언스 연구실 ²서울대학교병원 가정의학과 Healthcare response strategies for the long- COVID era Hye Jun Kim, MS1 · Jihun Song, MS1 · Sang Min Park, MD, PhD, MPH1,2 ¹Health System Data Science Laboratory, Department of Biomedical Sciences, Seoul National University College of Medicine, Seoul, Korea ²Department of Family Medicine, Seoul National University Hospital, Seoul, Korea Background: Coronavirus disease (COVID-19), first reported at the end of 2019, is characterized by a broad spectrum of clinical manifestations ranging from asymptomatic to multi-organ dysfunction. These symptoms may persist even after the acute phase has passed. Post-acute COVID-19 syndrome (long-COVID) is a condition characterized by COVID-19 symptoms that p

# 구한 조각 내용으로 문의하기

In [None]:
# System prompt. It names the delimiter actually used in the user message below
# (triple backticks), and is a plain string because it has no placeholders.
prompt = """
You are helpful QnA engine.
You will be provided with text delimited by triple backticks and question.
Step 1. find the text part from the given text which is used to reason the answer of question
Step 2. answer the question in Korean and show the text part with quotation mark.
If not found just say 'could not answer'.
"""

# User message: the retrieved chunk and the question, each wrapped in triple backticks.
text_and_question = f"""
Text:
```{queryed["text"]}```

Question:
```{question}```
"""



MODEL = "gpt-3.5-turbo"
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": text_and_question},
    ],
)


# Print the question, the supporting chunk and the model's answer.
print("질문 : ", question)
print("근거 문서 파일 이름 : ", queryed["filename"])
print("근거 문서 조각 인덱스 : ", queryed["chunk_index"])
print("근거 문서 내용 : ", queryed["text"])
print("답변 : ", response.choices[0].message.content)

질문 :  코로나19 증후군의 증상은?
근거 문서 파일 이름 :  jkma-2023-66-1-50.pdf
근거 문서 조각 인덱스 :  0
근거 문서 내용 :  SPECIAL CONTRIBUTION J Korean Med Assoc 2023 January 66(1):50-59 pISSN 1975-8456 / eISSN 2093-5951 https://doi.org/10.5124/jkma.2023.66.1.50 만성 코로나19 증후군 시대를 위한 만성 코로나19 증후군 시대를 위한 보건의료 대응 방안 보건의료 대응 방안 김 혜 준1·송 지 훈1·박 상 민1,2 ¹서울대학교 대학원 의과학과 헬스시스템 데이터 사이언스 연구실 ²서울대학교병원 가정의학과 Healthcare response strategies for the long- COVID era Hye Jun Kim, MS1 · Jihun Song, MS1 · Sang Min Park, MD, PhD, MPH1,2 ¹Health System Data Science Laboratory, Department of Biomedical Sciences, Seoul National University College of Medicine, Seoul, Korea ²Department of Family Medicine, Seoul National University Hospital, Seoul, Korea Background: Coronavirus disease (COVID-19), first reported at the end of 2019, is characterized by a broad spectrum of clinical manifestations ranging from asymptomatic to multi-organ dysfunction. These symptoms may persist even after the acute phase has passed. Post-acute COVID-19 syndrome (lon