<a href="https://colab.research.google.com/github/RyuMinHo/GAI_project/blob/main/process_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF 사전 처리

In [34]:
!pip install faiss-cpu
!pip install semchunk
!pip install PyPDF2
!pip install gradio
!pip install tiktoken



# 1. 라이브러리 import

In [41]:
import gradio as gr
from PyPDF2 import PdfReader
import semchunk
from transformers import AutoTokenizer
import tiktoken
from sentence_transformers import SentenceTransformer
import faiss
import time

# 2. 멀티 모달

In [42]:
def print_like_dislike(x: gr.LikeData):
    """Print the index, value and liked flag of a like/dislike event."""
    event_fields = (x.index, x.value, x.liked)
    print(*event_fields)

def add_message(history, message):
    """Record a submitted chat message and answer any uploaded PDFs.

    Each uploaded file is added to the history as a user file message and
    passed to ``process_pdf`` together with the typed text. The value it
    returns (or an error notice if processing raised) is added as an
    assistant message. The typed text itself is added to the history once.

    Args:
        history: Chat history, a list of ``{"role": ..., "content": ...}``
            dicts. It is extended in place.
        message: Submitted value with a ``"text"`` entry and a ``"files"``
            entry (presumably a list of file paths; each entry is handed
            to ``process_pdf`` unchanged).

    Returns:
        A tuple of the updated history and a ``gr.MultimodalTextbox`` with
        ``value=None`` and ``interactive=False``.
    """
    text = message.get("text")
    files = message.get("files") or []

    for position, file in enumerate(files):
        history.append({"role": "user", "content": {"path": file}})
        # Show the typed text only once, right after the first file, instead
        # of repeating it for every uploaded file.
        if text and position == 0:
            history.append({"role": "user", "content": text})

        try:
            result = process_pdf(file, text)
        except Exception as e:
            # UI boundary: report the failure in the chat rather than letting
            # the event handler crash.
            history.append({"role": "assistant", "content": f"Invalid PDF file: {str(e)}"})
        else:
            # process_pdf may return None (e.g. when no query was supplied);
            # there is nothing to show in that case.
            if result is not None:
                history.append({"role": "assistant", "content": result})

    if not files and text:
        history.append({"role": "user", "content": text})

    return history, gr.MultimodalTextbox(value=None, interactive=False)


def bot(history: list):
    """Stream a fixed assistant reply into the chat history.

    An empty assistant message is appended to ``history`` and then grown by
    one character per step. After each step the function sleeps for 0.05
    seconds and yields the (mutated) history list.
    """
    reply = "**That's cool!**"
    history.append({"role": "assistant", "content": ""})
    position = 0
    while position < len(reply):
        latest = history[-1]
        latest["content"] = latest["content"] + reply[position]
        position += 1
        time.sleep(0.05)
        yield history


# 3. PDF 사전 처리

In [43]:
def process_pdf(file_path, query):
    """Return the chunk of a PDF that is closest to *query* in embedding space.

    The PDF text is extracted with ``PdfReader``, split by a semchunk
    chunker built with a chunk size of 200, embedded with a
    ``SentenceTransformer`` model and stored in a flat L2 FAISS index. The
    query is embedded with the same model and the single nearest chunk is
    returned.

    Args:
        file_path: The PDF to read; passed straight to ``PdfReader``.
        query: Search text. When ``None`` no search is run.

    Returns:
        The best-matching chunk as a string, ``"No relevant content found."``
        when the PDF yields no chunks or the search returns no valid hit,
        or ``None`` when *query* is ``None``.

    Raises:
        Whatever ``PdfReader`` or the chunking/embedding libraries raise;
        nothing is caught here.
    """
    # Extract the text of every page. A page with no extractable text
    # contributes an empty string.
    reader = PdfReader(file_path)
    extracted_text = "".join(page.extract_text() or "" for page in reader.pages)

    # Without a query there is nothing to search for, so skip the expensive
    # chunking/embedding work. The PDF has still been parsed above, so an
    # unreadable file raises as before.
    if query is None:
        return None

    # Chunking. Only one chunker is needed: chunkerify either returns a
    # usable chunker or raises, so an `or` chain of alternatives never runs.
    chunk_size = 200
    chunker = semchunk.chunkerify('umarbutler/emubert', chunk_size)
    chunks = chunker(extracted_text)
    if not chunks:
        # e.g. a scanned PDF without a text layer: nothing to embed or index.
        return "No relevant content found."

    # Embedding.
    model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    chunks_embeddings = model.encode(chunks)

    # Indexing: exact L2 search over the chunk embeddings.
    index = faiss.IndexFlatL2(chunks_embeddings.shape[1])
    index.add(chunks_embeddings)

    # Query embedding and top-k search.
    query_embedding = model.encode([query])
    top_k = 1
    _, indices = index.search(query_embedding, top_k)

    # FAISS pads missing results with -1, which must not be used as a
    # (negative) list index.
    results = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]
    return results[0] if results else "No relevant content found."

# 4. Gradio GUI

In [44]:
# Build the chat UI: a message-style Chatbot plus a multimodal input box that
# accepts text and multiple files, then wire the event chain and launch.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        placeholder="Enter message or upload file...",
        show_label=False,
    )
    # Step 1: on submit, add_message receives the chat history and the input
    # value; its two return values update the chatbot and the input box.
    chat_msg = chat_input.submit(
        add_message, [chatbot, chat_input], [chatbot, chat_input]
    )
    # Step 2: afterwards bot() streams its reply into the chatbot; the step is
    # registered under the API name "bot_response".
    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
    # Step 3: make the input box interactive again (add_message returned it
    # with interactive=False).
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
    # Like/dislike events on chat messages are printed by print_like_dislike.
    chatbot.like(print_like_dislike, None, None, like_user_message=True)

    # Start the app (called while still inside the Blocks context).
    demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2a13ea4c916eb25d93.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
