In [2]:
pip install llama-cpp-python faiss-cpu sentence-transformers

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
     ---------------------------------------- 0.0/67.3 MB ? eta -:--:--
     ------- ------------------------------- 13.6/67.3 MB 77.7 MB/s eta 0:00:01
     ------------------ -------------------- 32.8/67.3 MB 83.2 MB/s eta 0:00:01
     ------------------------------ -------- 53.0/67.3 MB 87.4 MB/s eta 0:00:01
     --------------------------------------  67.1/67.3 MB 88.9 MB/s eta 0:00:01
     --------------------------------------- 67.3/67.3 MB 77.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting

In [7]:
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Locations of the quantized GGUF model weights and the text file to index.
MODEL_PATH = "llama-2-7b-chat.Q4_K_M.gguf"
DOC_PATH = "document.txt"

# Settings for the local llama.cpp model that generates answers.
llama_settings = {
    "model_path": MODEL_PATH,
    # Context window in tokens; the captured llama.cpp warning notes this is
    # below the model's training context (4096).
    "n_ctx": 2048,
    # Number of model layers handed to the GPU.
    "n_gpu_layers": 32,
    "verbose": False,
}
llm = Llama(**llama_settings)

# Sentence-embedding model shared by document chunks and user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Number of characters per chunk; chunks are consecutive and do not overlap.
CHUNK_SIZE = 500

with open(DOC_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

# An empty document yields zero chunks, from which no index can be built;
# fail here with a clear message instead of an obscure error further down.
if not full_text:
    raise ValueError(f"{DOC_PATH!r} is empty; there is nothing to index.")

# Split the document into fixed-size character windows and embed each one.
chunks = [full_text[i:i + CHUNK_SIZE] for i in range(0, len(full_text), CHUNK_SIZE)]
# FAISS expects a C-contiguous float32 matrix; make that explicit rather than
# relying on the embedder's default output type.
chunk_embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)

# Exact (brute-force) L2 index over the chunk embeddings.
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)
# Row position in the index -> chunk text, used to map search hits back to text.
chunk_map = dict(enumerate(chunks))

def retrieve(query, k=3):
    """Return the text of up to ``k`` chunks closest to ``query``.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level FAISS ``index``; hits are mapped back to text through
    ``chunk_map``.

    Args:
        query: The question or search string.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings, nearest first. It holds fewer than ``k``
        items when the index contains fewer than ``k`` vectors, and is empty
        when the index is empty or ``k`` is not positive.
    """
    # FAISS pads missing neighbours with id -1 when asked for more results
    # than it stores; never request more than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, neighbor_ids = index.search(query_embedding, k)
    # Drop any remaining -1 padding so it is never used as a chunk_map key.
    return [chunk_map[int(i)] for i in neighbor_ids[0] if i >= 0]

def generate_answer(context, question):
    """Ask the module-level ``llm`` to answer ``question`` using ``context``.

    Args:
        context: Retrieved text placed under the "Context:" heading.
        question: The user's question placed under the "Question:" heading.

    Returns:
        The text of the first completion choice, with surrounding whitespace
        stripped.
    """
    # Assemble the prompt line by line; the result matches the original
    # template character for character.
    prompt = (
        "Use the context below to answer the question.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Answer:"
    )

    # Generation is capped at 256 tokens and halts at the first blank line.
    completion = llm(prompt, max_tokens=256, stop=["\n\n"])
    first_choice = completion["choices"][0]
    return first_choice["text"].strip()

# Interactive question-answering loop: read a question, retrieve the closest
# chunks, and print the model's answer. Typing 'exit' ends the loop.
while True:
    try:
        # Strip so that stray whitespace around "exit" still ends the session.
        user_input = input("\nAsk a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt at the prompt: leave the loop quietly
        # instead of ending the cell with a traceback.
        break
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Nothing was asked; skip the retrieval and generation round-trip.
        continue
    context = "\n\n".join(retrieve(user_input))
    answer = generate_answer(context, user_input)
    print(f"\nAnswer: {answer}")


llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized



Answer: Of course! Here are two quiz questions based on the context provided:
1. What are the two types of worms recommended for composting in the Worm Factory 360?
Answer: The two types of worms recommended for composting in the Worm Factory 360 are red wigglers (Eisenia fetida) and European (Belgian) Nightcrawlers (Eisenia hortensis).
2. How many trays are needed to reach full operating capacity in the Worm Factory 360?
Answer: According to the context, it takes 3 or more operating trays for the vermicomposter to be in full operation.
