### 이 파일은 vllm 을 통해 llava-hf/llava-v1.6-mistral-7b-hf 를 실행하기 위한 코드입니다.
### Google Colab에서 vllm을 구동하기 위해 설치가 필요한 모듈들입니다. 

!pip install vllm transformers triton PyMuPDF Pillow sentence_transformers numpy typing faiss-gpu

관련 Modules import

In [None]:
import fitz
import spacy
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from vllm import LLM, SamplingParams
from typing import List, Tuple

In [None]:
# 모델 및 토크나이저 초기화
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", dtype='half', max_model_len=8192)
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def chunk_text(text: str, chunk_size: int = 200) -> List[str]:
    """Group the sentences of *text* into chunks of bounded size.

    Sentences are produced by the module-level spaCy pipeline ``nlp``.
    Whole sentences are accumulated until adding the next one would push
    the running size past ``chunk_size``; the accumulated sentences are
    then joined with single spaces and emitted as one chunk.

    Args:
        text: Raw text to split.
        chunk_size: Upper bound on the accumulated size of one chunk,
            measured with ``len(sentence)``.
            NOTE(review): for a spaCy span ``len`` is presumably the token
            count rather than the character count -- confirm.

    Returns:
        The chunks in document order. A single sentence larger than
        ``chunk_size`` is kept intact as its own chunk.
    """
    pieces: List[str] = []
    pending: List[str] = []
    pending_size = 0

    for sentence in nlp(text).sents:
        sentence_size = len(sentence)
        # Flush before appending so a sentence is never split across chunks;
        # never flush an empty buffer, so no empty chunk is produced.
        if pending and pending_size + sentence_size > chunk_size:
            pieces.append(" ".join(pending))
            pending, pending_size = [], 0
        pending.append(sentence.text)
        pending_size += sentence_size

    # Emit whatever is left over after the last sentence.
    if pending:
        pieces.append(" ".join(pending))
    return pieces

def process_pdf(file_path: str) -> List[str]:
    """Extract the text of a PDF and split it into chunks.

    Args:
        file_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        The result of ``chunk_text`` applied to the text of all pages,
        joined with single spaces.
    """
    # Read every page while the document is still open.
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    full_text = " ".join(page_texts)
    return chunk_text(full_text)

def index_chunks(chunks: List[str]):
    """Embed *chunks* and add the vectors to the module-level FAISS ``index``.

    The global ``index`` is created on first use (a ``faiss.IndexFlatL2``
    sized to the embedding dimension), because nothing else in this file
    initialises it; an already existing ``index`` is reused and extended.

    Args:
        chunks: Text chunks to embed with the module-level ``embedder``.

    Returns:
        A float32 NumPy array holding one embedding row per chunk. For an
        empty ``chunks`` an empty array is returned and the index is left
        untouched.
    """
    global index

    # Nothing to embed: avoid building or extending an index with no rows.
    if len(chunks) == 0:
        return np.empty((0, 0), dtype=np.float32)

    embeddings = embedder.encode(chunks)
    # FAISS requires float32 input, so convert explicitly.
    embeddings_array = np.array(embeddings, dtype=np.float32)

    # Lazily create the index; without this the first call raises NameError
    # because ``index`` is never assigned anywhere else.
    if globals().get("index") is None:
        index = faiss.IndexFlatL2(embeddings_array.shape[1])

    index.add(embeddings_array)
    return embeddings_array

def retrieve_relevant_chunks(query: str, k: int = 5) -> List[Tuple[int, float]]:
    """Return the chunks nearest to *query* in the module-level FAISS ``index``.

    Args:
        query: Question text, embedded with the module-level ``embedder``.
        k: Maximum number of neighbours to request from the index.

    Returns:
        ``(chunk_id, distance)`` pairs in the order returned by
        ``index.search``. Fewer than ``k`` pairs are returned when the index
        holds fewer than ``k`` vectors.
    """
    query_vector = embedder.encode([query])
    # FAISS requires float32 input, so convert explicitly.
    query_vector = np.array(query_vector, dtype=np.float32)
    distances, ids = index.search(query_vector, k)

    # FAISS fills missing results with id -1; drop those placeholders so a
    # caller never uses -1 as a list index (which would select the last chunk).
    # Convert NumPy scalars to native types to match the annotated return type.
    return [
        (int(chunk_id), float(distance))
        for chunk_id, distance in zip(ids[0], distances[0])
        if chunk_id >= 0
    ]

def generate_answer(question: str, context: str = "") -> str:
    """Generate an answer to *question* with the module-level ``llm``.

    Args:
        question: The user's question.
        context: Optional supporting text. When non-empty, the prompt tells
            the model to answer from this context and not to invent
            information; when empty, the question is sent on its own.

    Returns:
        The text of the first completion of the first request output.
    """
    if not context:
        prompt = f"[INST] You are a helpful AI assistant. {question} [/INST]"
    else:
        prompt = (
            "[INST] You are a helpful AI assistant. Answer the following question "
            "based on the given context. If the context doesn't contain relevant "
            "information, say so. Do not make up information.\n\n"
            f"Context: {context}\n\n"
            f"Question: {question}\n\n"
            "Answer: [/INST]"
        )

    params = SamplingParams(temperature=0.2, max_tokens=256)
    results = llm.generate({"prompt": prompt}, sampling_params=params)
    first_completion = results[0].outputs[0]
    return first_completion.text

# Process the PDF and index its chunks
pdf_chunks = process_pdf("/content/pdf_file.pdf")

# for debugging
print(f"Number of chunks: {len(pdf_chunks)}")
# Guard the preview: a PDF with no extractable text yields no chunks, and
# indexing pdf_chunks[0] would raise IndexError.
if pdf_chunks:
    print(f"First chunk: {pdf_chunks[0][:100]}...")  # print only the first 100 characters
# end debugging

chunk_embeddings = index_chunks(pdf_chunks)

# for debugging
print(f"Number of embeddings: {len(chunk_embeddings)}")
# end debugging

# Conversation loop: type "exit" to quit, prefix a line with "PDF:" to ask
# a question that is answered from the indexed PDF chunks.
while True:
    user_input = input("User: ")
    if user_input.lower() == 'exit':
        break

    if user_input.startswith("PDF:"):
        # PDF-related question: retrieve context, then answer from it
        query = user_input[4:].strip()
        relevant_chunks = retrieve_relevant_chunks(query)
        # for debugging
        print(f"Number of relevant chunks: {len(relevant_chunks)}")
        # end debugging
        # Accept only ids that are valid chunk positions. The lower bound
        # matters: a negative id (e.g. a -1 "no result" placeholder from the
        # index) would otherwise silently select chunks from the end.
        context = " ".join(
            [pdf_chunks[i] for i, _ in relevant_chunks if 0 <= i < len(pdf_chunks)]
        )
        # for debugging
        print(f"Context length: {len(context)}")
        # end debugging
        answer = generate_answer(query, context)
    else:
        # General conversation: no retrieved context
        answer = generate_answer(user_input)

    print("AI:", answer)