In [1]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline

# Load local embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast + decent quality

# Load local LLM (e.g. Mistral, TinyLLaMA, LLaMA).
# NOTE: GGUF checkpoints such as "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" are
# meant for llama.cpp + bindings, so only the HuggingFace-format checkpoint is
# assigned here (the former GGUF assignment was immediately overwritten).
# NOTE: this repo is gated. Without prior authentication (presumably via
# `huggingface-cli login` after being granted access on the model page) the
# download below fails with
# "OSError: You are trying to access a gated repo" / 401 Client Error.
llm_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Or another model from HF

tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map="auto",
    torch_dtype="auto",
)
# Text-generation pipeline used by generate_answer() to produce completions.
generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

# Load and chunk text
def load_and_split(file_path, chunk_size=500, overlap=100):
    """Read a UTF-8 text file and split it into overlapping character chunks.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order. An empty file yields an
        empty list. The last chunk always ends at the end of the text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero or less would never advance `start` (infinite loop);
        # a negative overlap would silently skip text between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            # The text is fully covered; another iteration would only emit a
            # tail that is already contained in the chunk just appended.
            break
        start += step
    return chunks

# Embed and build index
def build_vector_index(chunks):
    """Embed the chunks and store them in a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks to embed with ``embedding_model``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the embeddings exactly as returned by the embedding model
        (presumably one row per chunk).
    """
    embeddings = embedding_model.encode(chunks)
    # FAISS stores float32 vectors, so convert before adding.
    embeddings_f32 = np.array(embeddings).astype("float32")
    # The second axis of the embedding matrix is the vector dimension.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings_f32)
    return index, embeddings

# Retrieve top chunks
def retrieve_top_k(query, chunks, index, k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Query text to embed with ``embedding_model``.
        chunks: The chunk list the index was built from; search result ids
            are used as positions in this list.
        index: FAISS index to search.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunks in the order returned by the index search. Fewer
        are returned when fewer chunks are available; an empty list when
        there is nothing to retrieve.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(k, len(chunks))
    if k <= 0:
        return []
    query_vec = embedding_model.encode([query])
    _, neighbor_ids = index.search(np.array(query_vec).astype("float32"), k)
    # Drop any padded (-1) or out-of-range ids, e.g. when the index holds
    # fewer vectors than `chunks` has entries.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]

# Generate answer using local LLM
def generate_answer(query, context_chunks):
    """Answer a question with the local LLM, grounded in retrieved context.

    Args:
        query: The user's question.
        context_chunks: Iterable of text chunks, joined with blank lines and
            placed in the prompt as context.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace.
    """
    context = "\n\n".join(context_chunks)
    prompt = f"""You are a helpful assistant. Use the context to answer the question.

Context:
{context}

Question:
{query}

Answer:"""
    output = generator(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
    generated = output[0]["generated_text"]
    # The generated text normally starts with the prompt itself. Cutting the
    # prompt off by length keeps the whole completion even when the context or
    # the model's own answer contains the literal "Answer:".
    if generated.startswith(prompt):
        return generated[len(prompt):].strip()
    # Fallback when the echoed prompt does not match byte-for-byte:
    # keep the text after the last "Answer:" marker.
    return generated.split("Answer:")[-1].strip()

# Run
if __name__ == "__main__":
    # Build the retrieval index once, then answer questions interactively.
    chunks = load_and_split(r"C:\research_work_DIMAAG_AI_SSE\work_progress_reports\work_report_2025_2026\my_projects\building_rag_agents_LLMS_Langchain\input_data.txt")
    index, _ = build_vector_index(chunks)

    while True:
        try:
            query = input("Ask me something: ")
        except (EOFError, KeyboardInterrupt):
            # Ctrl+D / Ctrl+C (or a closed stdin) ends the session cleanly
            # instead of surfacing a traceback.
            print()
            break
        if not query.strip():
            # Skip blank input rather than running retrieval and generation
            # on an empty question.
            continue
        top_chunks = retrieve_top_k(query, chunks, index)
        answer = generate_answer(query, top_chunks)
        print("\nAnswer:\n", answer)
        print("=" * 50)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1.
401 Client Error. (Request ID: Root=1-6805eeac-671684c3475e6c783ceb4596;4b1f37dd-0c59-4da6-8bf2-f88edcca7561)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.