In [4]:
import os

def load_documents(directory_path):
    """Read every regular file directly inside a directory as text.

    Args:
        directory_path: Path of the directory to scan. The scan is not
            recursive; sub-directories are skipped.

    Returns:
        list[str]: The full contents of each file, ordered by filename so
        that repeated runs yield the documents in the same order.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        # Directories (e.g. .ipynb_checkpoints) cannot be opened as text files.
        if not os.path.isfile(file_path):
            continue
        # Explicit encoding: the platform default differs between machines.
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents

In [5]:
def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()``, so any run of whitespace is a
    separator and each chunk is re-joined with single spaces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        list[str]: The chunks in order. The last chunk may hold fewer than
        ``chunk_size`` words. Text with no words yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop all text.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

def generate_embeddings(texts, model, tokenizer):
    """Embed each text by averaging the model's last hidden state.

    Every text is tokenized and passed through the model on its own, one
    forward pass per text, with gradient tracking disabled.

    Args:
        texts: Iterable of strings to embed.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            truncation=True, padding=True)`` and returning a mapping.

    Returns:
        list: One tensor per input text, equal to ``last_hidden_state``
        averaged over dimension 1.
        NOTE(review): presumably dimension 1 is the token axis, giving a
        (1, hidden_size) tensor per text — confirm against the model used.
    """
    def _embed_one(text):
        encoded = tokenizer(text, return_tensors='pt',
                            truncation=True, padding=True)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            model_output = model(**encoded)
        return model_output.last_hidden_state.mean(dim=1)

    return [_embed_one(text) for text in texts]

In [8]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Both inputs are flattened first, so a (1, n) row vector is treated the
    same as an (n,) vector. Without this, ``np.dot`` raises a shape-alignment
    error when both arguments are (1, n) with n > 1.

    Args:
        vec1: Array-like holding the first vector.
        vec2: Array-like holding the second vector, with the same number of
            elements as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the ratio is undefined).

    Raises:
        ValueError: If the flattened vectors differ in length (from ``np.dot``).
    """
    flat1 = np.asarray(vec1).ravel()
    flat2 = np.asarray(vec2).ravel()
    denominator = np.linalg.norm(flat1) * np.linalg.norm(flat2)
    # Avoid a NaN (and a runtime warning) for all-zero vectors.
    if denominator == 0:
        return 0.0
    return np.dot(flat1, flat2) / denominator

def retrieve_top_k(query_embedding, vector_db, k=3):
    """Return the texts whose embeddings are most similar to the query.

    Args:
        query_embedding: Embedding of the query, in a form accepted by
            ``cosine_similarity``.
        vector_db: Mapping with an ``'embeddings'`` sequence and a parallel
            ``'texts'`` sequence, where position ``i`` of each refers to the
            same chunk.
        k: Number of results to keep from the front of the ranking.

    Returns:
        list: Entries of ``vector_db['texts']`` in descending order of cosine
        similarity. Ties keep their original relative order.
    """
    scores = [
        cosine_similarity(query_embedding, candidate)
        for candidate in vector_db['embeddings']
    ]
    # Stable sort of positions by score, highest first.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return [vector_db['texts'][position] for position in ranked[:k]]

In [9]:
def generate_response(query, retrieved_chunks, llm):
    """Ask the language model to answer a query using retrieved context.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        retrieved_chunks: Iterable of strings; joined with newlines to form
            the context section of the prompt.
        llm: Object exposing ``generate(prompt)``.

    Returns:
        Whatever ``llm.generate`` returns for the assembled prompt.
    """
    context = "\n".join(retrieved_chunks)
    # Assemble the prompt from explicit pieces: an indented triple-quoted
    # f-string would leak the source indentation into the prompt text.
    prompt = (
        f"Based on this context: {context}\n"
        "\n"
        f"Answer the question: {query}"
    )
    return llm.generate(prompt)