In [2]:
!pip install -q sentence-transformers faiss-cpu PyMuPDF transformers openai tqdm --quiet

In [3]:
import os
import fitz  # PyMuPDF
import faiss
import openai
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [4]:
def load_pdfs(folder_path):
    """Extract the text of every PDF file found directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A list with one string per PDF: the text of all its pages joined by
        newlines. Files are processed in sorted filename order so repeated
        runs produce the same document (and therefore chunk) order.

    Raises:
        OSError: if ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        # Accept ".PDF" as well as ".pdf".
        if not filename.lower().endswith(".pdf"):
            continue
        # The context manager closes the document even if text extraction
        # fails, instead of leaking one open file handle per PDF.
        with fitz.open(os.path.join(folder_path, filename)) as doc:
            documents.append("\n".join(page.get_text() for page in doc))
    return documents


In [5]:
def split_into_chunks(documents, chunk_size=500):
    """Cut each document into consecutive, non-overlapping word windows.

    Every document is split on whitespace; each run of up to ``chunk_size``
    words is re-joined with single spaces to form one chunk. Chunks never
    span two documents, and a document with no words contributes nothing.

    Args:
        documents: Iterable of text strings.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A flat list of chunk strings, in document order.
    """
    return [
        " ".join(tokens[start:start + chunk_size])
        for tokens in (text.split() for text in documents)
        for start in range(0, len(tokens), chunk_size)
    ]

In [6]:
# Sentence-embedding model, loaded once at module level and shared by the
# functions below that reference `embedder` (chunk embedding and query embedding).
# NOTE(review): this model presumably truncates long inputs (reportedly ~256
# word pieces) — confirm that the chunk sizes used downstream fit within it.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def embed_chunks(chunks, batch_size=32):
    """Encode text chunks into a single 2-D float32 embedding matrix.

    Args:
        chunks: Sequence of strings to embed.
        batch_size: Number of chunks handed to ``embedder.encode`` per call.

    Returns:
        A C-contiguous ``np.ndarray`` of dtype float32 with one row per chunk,
        in input order (the layout FAISS indexes expect).

    Raises:
        ValueError: if ``chunks`` is empty or ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(chunks) == 0:
        # Without this guard np.vstack([]) fails with an obscure
        # "need at least one array to concatenate" message.
        raise ValueError(
            "No text chunks to embed; check that the input documents contain extractable text."
        )

    batches = []
    for start in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
        batch = chunks[start:start + batch_size]
        batches.append(embedder.encode(batch, show_progress_bar=False))

    # Normalise dtype/layout once so the result can be fed to FAISS directly,
    # whatever precision the encoder happened to return.
    return np.ascontiguousarray(np.vstack(batches), dtype=np.float32)

In [7]:
def store_in_faiss(embeddings, chunks):
    """Create a flat L2 FAISS index and load the embedding matrix into it.

    Args:
        embeddings: 2-D array; its second axis (``embeddings.shape[1]``) is
            used as the vector dimension of the index.
        chunks: Not used by this function; kept so the call signature stays
            the same for existing callers.

    Returns:
        The ``faiss.IndexFlatL2`` instance after ``embeddings`` were added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

In [8]:
def retrieve_relevant_chunks(query, index, chunks, top_k=5):
    """Return the text chunks whose embeddings are nearest to the query.

    Args:
        query: Question text; embedded with the module-level ``embedder``.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``, as built by ``store_in_faiss``.
        chunks: Chunk strings in the same order their embeddings were added
            to ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings, in the order the index returned them.
        Fewer (possibly zero) are returned when the index holds fewer than
        ``top_k`` vectors.
    """
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with label -1 when it finds fewer than top_k
    # neighbours. Indexing chunks[-1] would silently return the *last* chunk
    # as a bogus hit, so padded slots are dropped.
    return [chunks[i] for i in indices[0] if i >= 0]

In [9]:
# Local fallback generator: a Hugging Face text2text-generation pipeline for
# FLAN-T5 base. device=0 is chosen when CUDA is available, otherwise -1 (CPU).
local_model = pipeline("text2text-generation", model="google/flan-t5-base", device=0 if torch.cuda.is_available() else -1)

# Optional OpenAI credentials, read from the environment rather than hard-coded.
# When the variable is set, the key is registered on the openai module; the
# main loop also uses OPENAI_API_KEY as the switch between OpenAI and the
# local model.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY

Device set to use cpu


In [10]:
def call_local_llm(prompt):
    """Generate an answer for ``prompt`` with the module-level local pipeline.

    Args:
        prompt: Full prompt text passed to ``local_model``.

    Returns:
        The ``'generated_text'`` field of the first generation returned by
        the pipeline (called with ``max_length=512``).
    """
    generations = local_model(prompt, max_length=512)
    first_generation = generations[0]
    return first_generation['generated_text']

In [11]:
def call_openai_llm(prompt):
    """Send ``prompt`` to OpenAI's ``gpt-4`` chat model and return the reply text.

    Works with both generations of the ``openai`` package:

    * ``openai>=1.0`` — detected by the presence of ``openai.OpenAI``; the
      legacy ``openai.ChatCompletion`` entry point was removed there and
      raises when called, so a client object is used instead.
    * ``openai<1.0`` — falls back to ``openai.ChatCompletion.create``.

    Args:
        prompt: Text sent as a single ``user`` message.

    Returns:
        The content string of the first choice in the response.
    """
    messages = [{"role": "user", "content": prompt}]

    if hasattr(openai, "OpenAI"):
        # Reuse a key registered on the module if there is one; passing None
        # lets the client fall back to the OPENAI_API_KEY environment variable.
        client = openai.OpenAI(api_key=getattr(openai, "api_key", None))
        response = client.chat.completions.create(model="gpt-4", messages=messages)
        return response.choices[0].message.content

    response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
    return response['choices'][0]['message']['content']

In [None]:
# Interactive question-answering session: build the retrieval index from the
# PDFs in `folder_path`, then answer questions until the user types 'exit'.
folder_path = "data"

print("Loading documents...")
documents = load_pdfs(folder_path)
# NOTE(review): 1000-word chunks (x5 retrieved per question) are presumably much
# longer than what the embedding model and the local LLM can attend to —
# confirm, and consider a smaller chunk_size.
chunks = split_into_chunks(documents, chunk_size=1000)
if not chunks:
    # Fail early with an actionable message instead of crashing later inside
    # the embedding step when there is nothing to index.
    raise ValueError(
        f"No extractable PDF text found in folder '{folder_path}'; "
        "add PDF files and re-run."
    )
embeddings = embed_chunks(chunks, batch_size=32)
faiss_index = store_in_faiss(embeddings, chunks)

print("\nWelcome to Structural Engineering Help Tool! \U0001f3d7\ufe0f")

while True:
    # Strip so that inputs like " exit " or "exit\t" still end the session.
    query = input("\nEnter your structural engineering question (or type 'exit'): ").strip()
    if query.lower() == 'exit':
        break
    if not query:
        # Nothing was typed; don't spend a retrieval + LLM call on it.
        continue

    # Retrieve the most similar chunks and splice them into the prompt.
    relevant_chunks = retrieve_relevant_chunks(query, faiss_index, chunks)
    context = "\n".join(relevant_chunks)
    augmented_prompt = f"""
You are a structural engineering assistant.
Use the following context to answer the question:

{context}

Question: {query}
Answer:
"""

    print("\nAnswering...\n")

    # Use OpenAI when an API key was configured, otherwise the local model.
    if OPENAI_API_KEY:
        answer = call_openai_llm(augmented_prompt)
    else:
        answer = call_local_llm(augmented_prompt)

    print(answer)

print("Goodbye!")
