<a href="https://colab.research.google.com/github/RyuMinHo/GAI_project/blob/main/process_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 라이브러리 install

In [None]:
!pip install faiss-cpu
!pip install spacy
!pip install PyPDF2
!pip install gradio
!pip install tiktoken
!python -m spacy download en_core_web_md
!pip install spacy[ko]
!python -m spacy download ko_core_news_md

# 2. 라이브러리 import

In [None]:
import gradio as gr
from PyPDF2 import PdfReader
import spacy
from transformers import AutoTokenizer
import tiktoken
from sentence_transformers import SentenceTransformer
import faiss
import time

# 3. 멀티 모달

In [None]:
def print_like_dislike(x: gr.LikeData):
    """Print a like/dislike event to stdout.

    Writes the event's ``index``, ``value`` and ``liked`` attributes on one
    line, separated by spaces.
    """
    event_fields = (x.index, x.value, x.liked)
    print(*event_fields)

def add_message(history, message):
    """Add the submitted files/text to the chat and answer for each PDF.

    Args:
        history: Chat history as a list of ``{"role", "content"}`` dicts.
            It is mutated in place and also returned.
        message: Textbox payload, read as a mapping with a ``"files"`` list
            and a ``"text"`` string.

    Returns:
        A tuple of the updated history and a ``gr.MultimodalTextbox`` with
        ``value=None`` and ``interactive=False`` (clears and locks the input).
    """
    files = message.get("files") or []
    text = message.get("text") or None

    for file in files:
        history.append({"role": "user", "content": {"path": file}})

    # Record the typed text once, not once per uploaded file.
    if text:
        history.append({"role": "user", "content": text})

    for file in files:
        try:
            # process_pdf takes only the path; passing the query as a second
            # argument raised TypeError for every upload.
            chunks = process_pdf(file)
            if text and chunks:
                # Rank the chunks against the typed query; never ask for more
                # neighbours than there are chunks.
                hits = index_chuncks(chunks, text, k=min(5, len(chunks)))
                selected = [chunks[i] for i, _ in hits if 0 <= i < len(chunks)]
            else:
                selected = chunks
            # Chat content must be displayable text, not a Python list.
            result = "\n\n".join(selected) or "No text could be extracted from this PDF."
        except Exception as e:
            # Boundary handler: surface the failure in the chat instead of
            # crashing the UI callback.
            history.append({"role": "assistant", "content": f"Invalid PDF file: {str(e)}"})
        else:
            history.append({"role": "assistant", "content": result})

    return history, gr.MultimodalTextbox(value=None, interactive=False)


def bot(history: list):
    """Stream a fixed assistant reply into the chat one character at a time.

    Appends an empty assistant message to ``history`` and then grows its
    content by one character per step, pausing 0.05 s and yielding the
    (same, mutated) history list after each character.
    """
    reply = "**That's cool!**"
    history.append({"role": "assistant", "content": ""})
    for ch in reply:
        latest = history[-1]
        latest["content"] = latest["content"] + ch
        time.sleep(0.05)
        yield history

# 4. pdf 처리

In [None]:
# Extract the text of a PDF file and split it into chunks.
def process_pdf(file_path):
    """Read every page of a PDF and return its text split into chunks.

    Args:
        file_path: Path (or file object) handed to ``PdfReader``.

    Returns:
        Whatever ``chunck_text`` returns for the concatenated page text.
    """
    reader = PdfReader(file_path)
    # Defensive: treat a page that yields no text as an empty string.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join pages with a newline so the last word of one page does not fuse
    # with the first word of the next before sentence splitting.
    return chunck_text("\n".join(page_texts))

# Cache of loaded spaCy pipelines, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name):
    """Load the spaCy pipeline *name* once and reuse it on later calls."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def _is_hangul(char):
    """Return True if *char* is in a Hangul jamo or syllable block."""
    return (
        "\u1100" <= char <= "\u11FF"  # Hangul Jamo
        or "\u3130" <= char <= "\u318F"  # Hangul Compatibility Jamo
        or "\uAC00" <= char <= "\uD7A3"  # Hangul Syllables
    )


# Split text into sentence-level chunks.
def chunck_text(text):
    """Split *text* into sentence chunks.

    Sentences are segmented with the English pipeline. Each sentence is then
    re-processed with the Korean pipeline when it contains more Hangul
    characters than other alphabetic characters, otherwise with the English
    pipeline, and rebuilt from the resulting tokens.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-stripped sentence strings.
    """
    # The notebook's install cell downloads en_core_web_md, so load that model.
    nlp = _get_spacy_pipeline("en_core_web_md")
    nlp_kor = _get_spacy_pipeline("ko_core_news_md")
    doc = nlp(text)
    chunks = []

    for sent in doc.sents:
        sentence = sent.text
        # Choose the pipeline by character counts: Hangul vs. other letters.
        count_kor = sum(1 for ch in sentence if _is_hangul(ch))
        count_eng = sum(
            1 for ch in sentence if ch.isalpha() and not _is_hangul(ch)
        )
        pipeline = nlp_kor if count_kor > count_eng else nlp

        # NOTE(review): only token text/whitespace is used from this pass;
        # presumably the tokenizer alone would suffice - confirm before
        # trimming the pipeline.
        processed_doc = pipeline(sentence)
        chunk = "".join(token.text_with_ws for token in processed_doc).strip()
        # Whitespace-only sentences carry nothing worth embedding.
        if chunk:
            chunks.append(chunk)

    return chunks

# Cache of loaded sentence-embedding models, keyed by model name.
_EMBEDDING_MODELS = {}


def _get_embedding_model(name):
    """Load the SentenceTransformer *name* once and reuse it on later calls."""
    if name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[name] = SentenceTransformer(name)
    return _EMBEDDING_MODELS[name]


# Embed the chunks and return the top-k matches for a query.
def index_chuncks(chunks, query, k=5):
    """Return the chunks nearest to *query* by L2 distance between embeddings.

    Args:
        chunks: List of text chunks to index.
        query: Query text to search for.
        k: Maximum number of neighbours to return (default 5).

    Returns:
        A list of ``(chunk_index, distance)`` pairs in the order returned by
        the index search. Empty when there are no chunks or ``k`` is not
        positive.
    """
    # Nothing to index or nothing requested: skip model loading entirely.
    if not chunks or k <= 0:
        return []

    model = _get_embedding_model('paraphrase-multilingual-mpnet-base-v2')
    embedding_chunks = model.encode(chunks)

    dim = embedding_chunks.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embedding_chunks)

    embedding_query = model.encode([query])
    # Asking for more neighbours than stored vectors pads the result with
    # index -1, so cap k at the number of chunks.
    top_k = min(k, len(chunks))
    distances, indices = index.search(embedding_query, top_k)

    return list(zip(indices[0], distances[0]))


# 5. gui

In [None]:
# Build the chat UI: a chatbot display plus a multimodal (text + files) input,
# wired to the handlers defined above.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        placeholder="Enter message or upload file...",
        show_label=False,
    )
    # On submit, add_message receives (chatbot, chat_input) and its two return
    # values are written back to the same two components.
    chat_msg = chat_input.submit(
        add_message, [chatbot, chat_input], [chatbot, chat_input]
    )
    # Chained after add_message: bot streams its reply into the chatbot.
    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
    # Chained after bot: swap in an interactive textbox (add_message returned a
    # non-interactive one).
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
    # Print like/dislike events; like_user_message=True is passed so user
    # messages are included.
    chatbot.like(print_like_dislike, None, None, like_user_message=True)

    # NOTE(review): launch() is called inside the Blocks context here; Gradio
    # examples usually call it after the `with` block - confirm this is intended.
    demo.launch()