In [1]:
import os
import pdfplumber
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# Folder that is scanned for source PDFs (machine-specific path).
pdf_folder = r"D:\game\Terre_chatbot\data"
# Compare the extension case-insensitively so files named "*.PDF" are not
# silently skipped, and sort because os.listdir() returns names in arbitrary
# order -- a stable order keeps chunk positions reproducible between runs.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

# One string per PDF: the text of its pages, each page terminated by "\n".
all_texts = []

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages for which extract_text() gave nothing (None or "") are skipped.
    text = "".join(t + "\n" for t in page_texts if t)
    all_texts.append(text)

print(f"Loaded {len(all_texts)} PDFs")


def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks, each with its words joined by single spaces.
        A text containing no words yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A stride of zero makes range() raise an unrelated-looking error and a
        # negative stride silently produces no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# Flatten the chunks of every document into one list. The position of a chunk
# in this list is the id used to look it up after an index search.
all_chunks = []
for text in all_texts:
    all_chunks.extend(chunk_text(text))

print(f"Total chunks: {len(all_chunks)}")
if not all_chunks:
    # Without any chunk there is no usable (n, dim) embedding matrix, so the
    # `shape[1]` lookup below could only fail with an opaque error. Stop here
    # with a message that points at the real cause.
    raise ValueError(
        "No text chunks were extracted from the PDFs; cannot build the index."
    )
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all chunks
embeddings_array = model.encode(all_chunks, show_progress_bar=True)
print("Embeddings created")
dimension = embeddings_array.shape[1]
# Inner-product index; because the vectors are L2-normalised before being
# added, the inner product equals cosine similarity.
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(embeddings_array)  # normalises the array in place
index.add(embeddings_array)

print("FAISS index built")

def get_answer(question, top_k=3):
    """Return the indexed chunks most similar to ``question``.

    Relies on the notebook-level ``model``, ``index`` and ``all_chunks``.

    Args:
        question: Query text; it is embedded with ``model.encode``.
        top_k: Maximum number of chunks to retrieve from ``index``.

    Returns:
        str: The retrieved chunks, in the order returned by the index search,
        joined with newline characters. Fewer than ``top_k`` chunks are
        returned when the search yields fewer valid hits.
    """
    # Embed the question and normalise it the same way as the indexed vectors.
    q_emb = model.encode([question])
    faiss.normalize_L2(q_emb)

    # Search top chunks
    _, indices = index.search(q_emb, top_k)

    # FAISS fills missing results with the id -1 when it finds fewer than
    # top_k neighbours. Used as a Python list index, -1 would silently return
    # the *last* chunk, so negative ids are dropped.
    hits = [all_chunks[i] for i in indices[0] if i >= 0]

    return "\n".join(hits)  # Most relevant text from PDFs

# Interactive question loop: keeps prompting until the user types "exit".
while True:
    try:
        question = input("Ask a question about Terra (type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel or closing stdin while waiting at the prompt
        # ends the session instead of surfacing a traceback.
        break

    # Ignore surrounding whitespace so that e.g. "exit " still quits.
    question = question.strip()
    if question.lower() == "exit":
        break
    if not question:
        # Nothing to search for; ask again rather than embedding an empty string.
        continue

    answer = get_answer(question)
    print("\nAnswer (from PDFs):")
    print(answer)
    print("-"*50)




  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [2]:
import os
import pdfplumber
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import joblib  # to save embeddings and index

# -----------------------------
# 1️⃣ Load PDFs
# -----------------------------
# Folder that is scanned for source PDFs (machine-specific path).
pdf_folder = r"D:\game\Terre_chatbot\data"
# Compare the extension case-insensitively so files named "*.PDF" are not
# silently skipped, and sort because os.listdir() returns names in arbitrary
# order -- a stable order keeps the saved chunk ids reproducible between runs.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

# One string per PDF: the text of its pages, each page terminated by "\n".
all_texts = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages for which extract_text() gave nothing (None or "") are skipped.
    text = "".join(t + "\n" for t in page_texts if t)
    all_texts.append(text)
print(f"Loaded {len(all_texts)} PDFs")

# -----------------------------
# 2️⃣ Chunk text
# -----------------------------
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks, each with its words joined by single spaces.
        A text containing no words yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A stride of zero makes range() raise an unrelated-looking error and a
        # negative stride silently produces no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# Flatten the chunks of every document into one list. The position of a chunk
# in this list is the id used to look it up after an index search.
all_chunks = []
for text in all_texts:
    all_chunks.extend(chunk_text(text))
print(f"Total chunks: {len(all_chunks)}")
if not all_chunks:
    # Without any chunk there is no usable (n, dim) embedding matrix, so the
    # `shape[1]` lookup below could only fail with an opaque error. Stop here
    # with a message that points at the real cause.
    raise ValueError(
        "No text chunks were extracted from the PDFs; cannot build the index."
    )

# -----------------------------
# 3️⃣ Embeddings
# -----------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_array = model.encode(all_chunks, show_progress_bar=True)
print("Embeddings created")

# -----------------------------
# 4️⃣ FAISS index
# -----------------------------
dimension = embeddings_array.shape[1]
# Inner-product index; because the vectors are L2-normalised before being
# added, the inner product equals cosine similarity.
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(embeddings_array)  # normalises the array in place
index.add(embeddings_array)
print("FAISS index built")

# -----------------------------
# 5️⃣ Save chunks and embeddings/index
# -----------------------------
# All artefacts are written under a single directory, which is created if it
# does not exist yet.
output_dir = "model_data"
os.makedirs(output_dir, exist_ok=True)

# Persist the embedding matrix, the chunk texts and the FAISS index.
np.save(f"{output_dir}/embeddings.npy", embeddings_array)
joblib.dump(all_chunks, f"{output_dir}/chunks.pkl")
faiss.write_index(index, f"{output_dir}/faiss.index")

print("Model saved to 'model_data/'")


KeyboardInterrupt: 