In [1]:
!pip install -q sentence-transformers faiss-cpu PyMuPDF transformers openai

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import fitz  # PyMuPDF
import faiss
import openai
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [3]:
def load_pdfs(folder_path):
    """Extract the text of every PDF file located directly inside a folder.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        list[str]: One string per PDF, made of the text of its pages joined
        with newlines. Files are processed in sorted filename order so the
        result is reproducible across runs and platforms.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        # The context manager closes the document (and its file handle) even
        # if text extraction fails part-way through.
        with fitz.open(os.path.join(folder_path, filename)) as doc:
            documents.append("\n".join(page.get_text() for page in doc))
    return documents


In [5]:
def split_into_chunks(documents, chunk_size=500, overlap=0):
    """Split documents into chunks of whitespace-separated words.

    Args:
        documents: Iterable of document strings.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks of the
            same document. The default of 0 yields back-to-back chunks.

    Returns:
        list[str]: Chunks from all documents, in document order. Words inside
        a chunk are joined with single spaces. Documents without any words
        contribute no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for doc in documents:
        words = doc.split()
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            # Once a chunk reaches the end of the document, any further
            # window would only repeat words already emitted (this can only
            # happen when overlap > 0).
            if start + chunk_size >= len(words):
                break
    return chunks

In [6]:
# Sentence-embedding model, loaded once at module level and shared by
# embed_chunks() and retrieve_relevant_chunks() so that documents and queries
# are encoded by the same model.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def embed_chunks(chunks):
    """Encode text chunks with the shared module-level ``embedder``.

    Args:
        chunks: The texts to encode.

    Returns:
        Whatever ``embedder.encode`` produces for ``chunks``. A progress bar
        is displayed while encoding.
    """
    vectors = embedder.encode(chunks, show_progress_bar=True)
    return vectors

In [7]:
def store_in_faiss(embeddings, chunks):
    """Build an exact (brute-force) L2 FAISS index over the embeddings.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.
            It is converted to a C-contiguous float32 array before being
            handed to FAISS.
        chunks: Not used here; kept so existing callers keep working. Row
            ``i`` of the index corresponds to row ``i`` of ``embeddings``,
            which callers map back to their own chunk list.

    Returns:
        faiss.IndexFlatL2: Index containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example the empty 1-D
            array produced when there was nothing to embed), because the
            vector dimension cannot be determined.
    """
    # Local import: the notebook's import cell does not import numpy itself.
    import numpy as np

    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

In [8]:
def retrieve_relevant_chunks(query, index, chunks, top_k=5):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Question text; encoded with the module-level ``embedder``.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``, as built by ``store_in_faiss``.
        chunks: Chunk texts, positionally aligned with the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks in the order returned by the index
        (nearest first). Fewer are returned when the index holds fewer than
        ``top_k`` vectors.
    """
    query_embedding = embedder.encode([query])
    _distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when it finds fewer than top_k neighbours.
    # Indexing a list with -1 would silently return the LAST chunk, so those
    # placeholder entries are dropped.
    return [chunks[i] for i in indices[0] if i >= 0]

In [4]:
# Local text2text generator used when no OpenAI key is configured. It is
# placed on CUDA device 0 when CUDA is available, otherwise on the CPU
# (device=-1).
local_model = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=0 if torch.cuda.is_available() else -1,
)

# The OpenAI key is optional: it is read from the environment and handed to
# the openai module only when present.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu


In [9]:
def call_local_llm(prompt):
    """Generate a completion for ``prompt`` with the module-level ``local_model``.

    Args:
        prompt: Full prompt text passed to the pipeline.

    Returns:
        The ``generated_text`` field of the first result returned by the
        pipeline, which is called with ``max_length=512``.
    """
    results = local_model(prompt, max_length=512)
    first_result = results[0]
    return first_result['generated_text']

In [None]:
def call_openai_llm(prompt, model="gpt-4"):
    """Send ``prompt`` as a single user message to an OpenAI chat model.

    Works with both generations of the ``openai`` package: the client-based
    API of openai>=1.0 (where ``openai.ChatCompletion`` was removed) and the
    legacy module-level ``openai.ChatCompletion`` API of older releases.

    Args:
        prompt: Text sent as the content of the user message.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice's message.
    """
    messages = [{"role": "user", "content": prompt}]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: requests go through a client object. The key assigned
        # to openai.api_key is passed along; when it is None the client falls
        # back to the OPENAI_API_KEY environment variable.
        client = openai.OpenAI(api_key=getattr(openai, "api_key", None))
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content

    # Legacy SDK (openai<1.0).
    response = openai.ChatCompletion.create(model=model, messages=messages)
    return response['choices'][0]['message']['content']

In [None]:
# Build the retrieval index from the PDFs in `folder_path`, then answer
# questions interactively until the user types 'exit'.
folder_path = "data"

print("Loading documents...")
documents = load_pdfs(folder_path)
chunks = split_into_chunks(documents)
if not chunks:
    # Without any text there is nothing to embed or search; stop here with a
    # clear message instead of failing later inside the index code.
    raise ValueError(f"No extractable PDF text found in folder {folder_path!r}")
embeddings = embed_chunks(chunks)
# The embeddings are passed straight to the index builder; wrapping them in a
# torch tensor and converting back only produced a copy.
faiss_index = store_in_faiss(embeddings, chunks)

print("\nWelcome to Structural Engineering Help Tool! \U0001f3d7\ufe0f")

while True:
    try:
        query = input("\nEnter your structural engineering question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt at the prompt ends the session cleanly.
        break
    if query.lower() == 'exit':
        break
    if not query:
        # Nothing was typed; ask again rather than querying the model.
        continue

    relevant_chunks = retrieve_relevant_chunks(query, faiss_index, chunks)
    context = "\n".join(relevant_chunks)
    augmented_prompt = f"""
You are a structural engineering assistant.
Use the following context to answer the question:

{context}

Question: {query}
Answer:
"""

    print("\nAnswering...\n")

    # Prefer the OpenAI model when an API key is configured, otherwise fall
    # back to the local model.
    if OPENAI_API_KEY:
        answer = call_openai_llm(augmented_prompt)
    else:
        answer = call_local_llm(augmented_prompt)

    print(answer)

print("Goodbye!")

Loading documents...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]