In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from PyPDF2 import PdfReader


In [None]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        str: The extracted text of all pages in page order, with a newline
        appended after each page's text.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against extract_text() yielding None (which would raise
    # TypeError on concatenation); joining once avoids rebuilding the string
    # on every page.
    page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    return "".join(page_texts)

def split_text_into_chunks(text, chunk_size=300):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces, so original whitespace and
    line breaks are not preserved.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        list[str]: The chunks in order; the last one may be shorter. An empty
        or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() fail with an unrelated-looking message, and a
    # negative step would silently return no chunks at all -- reject both early.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def embed_text_chunks(chunks, embedding_model_name="all-MiniLM-L6-v2"):
    """Encode text chunks into embeddings with a SentenceTransformer model.

    Args:
        chunks: The texts to encode; handed to ``encode`` unchanged.
        embedding_model_name: Model identifier given to ``SentenceTransformer``.

    Returns:
        Whatever ``encode`` returns when called with ``convert_to_numpy=True``
        (a progress bar is shown while encoding).
    """
    # A new model instance is constructed on every call.
    encoder = SentenceTransformer(embedding_model_name)
    return encoder.encode(
        chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
    )


def build_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index containing the given vectors.

    Args:
        embeddings: 2-D array-like of vectors, one row per item. It is
            converted to a C-contiguous float32 array before indexing.

    Returns:
        faiss.IndexFlatL2: Index whose dimension is the number of columns of
        ``embeddings`` and to which all rows have been added in order.
    """
    # FAISS's Python bindings require C-contiguous float32 input; this is a
    # no-op when the array already has that dtype and layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

In [None]:
# Location of the PDF to index (relative path).
file_path = "documents/LLM.pdf"  # Replace with your PDF file path
# Text of all pages as a single string; later cells chunk and embed it.
pdf_content = read_pdf(file_path)

In [None]:
# Split the extracted text into chunks of up to 64 words (the helper's default is 300).
chunks = split_text_into_chunks(pdf_content, chunk_size=64)
# Used instead of embed_text_chunks' default model; the query cell reuses this
# variable so the query and the chunks are encoded by the same model.
embedding_model_name = "all-mpnet-base-v2"
embeddings = embed_text_chunks(chunks, embedding_model_name)
# Bundle the chunks with their embeddings; not read again in the cells shown here.
chunk_data = {"chunks": chunks, "embeddings": embeddings}

In [None]:
# Index every chunk embedding; ids returned by a search on this index are later
# used as positions into `chunks`.
faiss_index = build_faiss_index(embeddings)

In [None]:
query = "Evolution of Large Language Models"  # Your search query
# Encode the query with the same model name that was used for the chunks.
query_embedding = SentenceTransformer(embedding_model_name).encode([query], convert_to_numpy=True)
distances, indices = faiss_index.search(query_embedding, k=3)  # Retrieve top-3 closest chunks
# When the index holds fewer than k vectors FAISS pads the result with -1;
# chunks[-1] would silently pick the last chunk, so skip those placeholder ids.
response_chunks = '\n'.join(chunks[i] for i in indices[0] if i >= 0)

In [None]:
# NOTE(review): `summarize_model` is neither defined nor imported in the cells
# shown here -- presumably it came from a removed cell or another part of the
# notebook. On a fresh kernel this call raises NameError unless it is defined
# first. TODO confirm, then define it above or delete this cell.
summarize_model(response_chunks)

In [None]:
# https://ollama.com/download
# https://github.com/ollama/ollama-python

from ollama import chat
from ollama import ChatResponse


# Rebuild the retrieved context from the search result. FAISS pads missing
# neighbours with -1 when the index holds fewer vectors than requested, and
# chunks[-1] would silently return the last chunk, so negative ids are skipped.
response_chunks = '\n'.join(chunks[i] for i in indices[0] if i >= 0)
# Send a single user message containing the query and the retrieved chunks to
# the local `llama3.2` model via ollama.
response: ChatResponse = chat(model='llama3.2', messages=[
  {
    'role': 'user',
    'content': f"Answer {query} from(do not hallucinate) in 100 words  {response_chunks}"
  },
])
print(response['message']['content'])
