In [1]:
!pip install pymupdf sentence-transformers faiss-cpu transformers accelerate bitsandbytes --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h

**Uploading Sample Document**

In [6]:
from google.colab import files

# Open Colab's upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; upload a PDF to continue.")

# The first uploaded file is the document the rest of the notebook processes.
pdf_path = next(iter(uploaded))
print("Using PDF:", pdf_path)

Saving sample-service-manual 1.pdf to sample-service-manual 1.pdf
Using PDF: sample-service-manual 1.pdf


In [9]:
import fitz
import re
import numpy as np
import faiss
import torch
import json

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

**PDF to Text (PDF Data Loader)**

In [10]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One string made of each page's text followed by a newline.
    """
    # The context manager closes the document even if a page fails to extract.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text("text") + "\n" for page in doc)

# Extract the uploaded PDF once and report how much text came out of it.
raw_text = extract_text_from_pdf(pdf_path)
print(f"Total characters: {len(raw_text)}")

Total characters: 857788


**Preprocessing and Chunking**

In [11]:
def clean_text(text):
    """Collapse each run of whitespace to one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string with no leading or trailing whitespace.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

# Normalise whitespace across the whole extracted text before it is chunked.
cleaned_text = clean_text(raw_text)

def chunk_by_section(text):
    """Split text into one chunk per ``SECTION <n>-<n>[LETTER]:`` header.

    Args:
        text: The text to split; headers are matched anywhere in it.

    Returns:
        A list of strings, each a header, a space, and the text that follows
        it up to the next header. Text before the first header is dropped.
        When no header is found, the whole text is returned as a single chunk
        (or an empty list if the text is blank) so later steps still have
        something to index.
    """
    # The capture group makes re.split keep the headers, so the result
    # alternates [preamble, header, body, header, body, ...].
    parts = re.split(r"(SECTION\s+\d+-\d+[A-Z]?:)", text)
    headers, bodies = parts[1::2], parts[2::2]

    if not headers:
        return [text] if text.strip() else []

    return [header + " " + body for header, body in zip(headers, bodies)]

# Split the cleaned text into per-section chunks and report how many were found.
chunks = chunk_by_section(cleaned_text)
print(f"Total sections: {len(chunks)}")

Total sections: 177


In [12]:
!pip install -U sentence-transformers



**Embedding**

In [13]:
# Load the embedding model used for both the chunks and the queries.
embed_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Every chunk is embedded with the same instruction prefix in front of it.
documents = [f"Represent this document for retrieval: {chunk}" for chunk in chunks]

# Encode all chunks and hold the vectors as a NumPy array.
embeddings = np.array(embed_model.encode(documents, show_progress_bar=True))

print(f"Embedding shape: {embeddings.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Embedding shape: (177, 768)


**Vector Store**

In [14]:
# Size the index from the second axis of the embedding array.
dimension = embeddings.shape[1]
# Flat L2-distance FAISS index holding every chunk embedding.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index built")

FAISS index built


**Retrieval**

In [15]:
def retrieve_relevant_sections(query, top_k=5):
    """Return the chunks whose embeddings are closest to the query.

    Args:
        query: The question text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` entries of ``chunks``, in the order the FAISS search
        returns them. An empty list when ``top_k`` is not positive or the
        index holds no vectors.
    """
    if top_k <= 0:
        return []

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, index.ntotal)
    if k == 0:
        return []

    query_embedding = embed_model.encode(
        "Represent this query for retrieving relevant passages: " + query
    )

    # The search takes a batch of queries, hence the single-row 2-D array.
    _, indices = index.search(np.array([query_embedding]), k)

    # Keep the -1 guard as a second line of defence against padded results.
    return [chunks[i] for i in indices[0] if i >= 0]

In [16]:
def extract_torque_table(text):
    """Pull torque-table rows out of free text with a regular expression.

    A row is a label (letters, whitespace, ``-``, ``(``, ``)``, ``/``)
    followed by two whitespace-separated integers. The first integer is kept
    as the value and reported with unit ``"Nm"``; the second integer must be
    present for the row to match but is not stored.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with keys ``component``, ``spec_type``, ``value``
        (the matched digits, as a string) and ``unit``.
    """
    row_re = re.compile(r"([A-Za-z\s\-\(\)\/]+?)\s+(\d+)\s+(\d+)")
    # Labels starting with these words are procedure steps, not components.
    skipped_prefixes = ("tighten", "remove", "install", "note")

    specs = []
    for row in row_re.finditer(text):
        label, first_number, _ = row.groups()
        label = label.strip()

        # Drop junk rows: very short labels and instruction-like sentences.
        is_junk = len(label) < 5 or label.lower().startswith(skipped_prefixes)
        if not is_junk:
            specs.append({
                "component": label,
                "spec_type": "Torque",
                "value": first_number,
                "unit": "Nm"
            })

    return specs

**Semantic Retrieval**

In [28]:
def semantic_component_filter(query, specs):
    """Pick the spec whose component name is most similar to the query.

    Similarity is the dot product between the query embedding and each
    component-name embedding, both produced by ``embed_model``.

    Args:
        query: The question text.
        specs: Dicts that each carry a ``"component"`` string.

    Returns:
        A one-element list holding the best-scoring spec, or an empty list
        when ``specs`` is empty.
    """
    specs = list(specs)
    if not specs:
        # Nothing to rank, so skip the model call entirely.
        return []

    query_embedding = embed_model.encode(query)

    # One batched encode instead of one model call per spec.
    component_embeddings = embed_model.encode(
        [spec["component"] for spec in specs]
    )

    # Dot-product score of every component name against the query.
    scores = component_embeddings @ query_embedding

    # argmax returns the first maximum, so the earliest spec wins ties.
    return [specs[int(np.argmax(scores))]]

In [29]:
def query_pipeline(query):
    """Answer a query: retrieve sections, extract specs, keep the best match.

    Args:
        query: The question text.

    Returns:
        Whatever ``semantic_component_filter`` returns for the specs extracted
        from the retrieved sections.
    """
    # Retrieved sections are separated by blank lines before extraction.
    context = "\n\n".join(retrieve_relevant_sections(query))
    candidates = extract_torque_table(context)
    return semantic_component_filter(query, candidates)

In [30]:
# Example questions that the demo loop sends through query_pipeline.
queries = [
    "Brake disc shield bolts torque",
    "Lower ball joint nut torque",
    "Stabilizer bar bracket nuts torque",
    "Upper ball joint nut torque"
]

In [31]:
# Run every example query and pretty-print the spec selected for it.
for query in queries:
    print(f"\nQuery: {query}")
    results = query_pipeline(query)
    print(json.dumps(results, indent=2))


Query: Brake disc shield bolts torque
[
  {
    "component": "Brake caliper guide pin bolts",
    "spec_type": "Torque",
    "value": "37",
    "unit": "Nm"
  }
]

Query: Lower ball joint nut torque
[
  {
    "component": "Lower ball joint nut",
    "spec_type": "Torque",
    "value": "175",
    "unit": "Nm"
  }
]

Query: Stabilizer bar bracket nuts torque
[
  {
    "component": "Stabilizer bar bracket nuts",
    "spec_type": "Torque",
    "value": "55",
    "unit": "Nm"
  }
]

Query: Upper ball joint nut torque
[
  {
    "component": "Upper ball joint nut",
    "spec_type": "Torque",
    "value": "115",
    "unit": "Nm"
  }
]
