In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pdf_path = "/content/drive/MyDrive/KnowledgeDatabase/R048r12e.pdf"

In [None]:
store_dir = "/content/drive/MyDrive/KnowledgeGraphResults"

In [None]:
import os

In [None]:
os.path.exists(store_dir),os.path.exists(pdf_path)

(True, True)

In [None]:
# prompt: check current working directory

print(os.getcwd())


/content


In [None]:
os.chdir("/content/drive/MyDrive/")

# Using LLM Approach :

In [None]:
! pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.2


In [None]:
import os
import json
import re
import fitz  # PyMuPDF for PDF text extraction
import networkx as nx
from transformers import pipeline

def extract_text(file_path):
    """
    Extracts text from a PDF file using PyMuPDF.
    """
    text = ""
    # Open the PDF file
    with fitz.open(file_path) as pdf:
        for page_num in range(len(pdf)):
            page = pdf[page_num]
            # Extract text from each page
            text += page.get_text()
    return text

def clean_text(raw_text):
    """
    Cleans and structures text while retaining key content.
    """
    # Normalize whitespace
    cleaned_text = re.sub(r"[ \t]+", " ", raw_text)  # Replace tabs and multiple spaces with single space

    # Remove headers, footers, and artifacts
    cleaned_text = re.sub(r"E/ECE/.*", "", cleaned_text)  # Remove specific headers
    cleaned_text = re.sub(r"Page\s*\d+", "", cleaned_text)  # Remove page numbers
    cleaned_text = re.sub(r"GE\.\d+-", "", cleaned_text)  # Remove generic document codes

    # Add newline before numbered sections or headings for better segmentation
    cleaned_text = re.sub(r"(\d+\.\s+[A-Z].*?:)", r"\n\1\n", cleaned_text)

    # Retain numbered lists while removing excess formatting
    cleaned_text = re.sub(r"\s{2,}", " ", cleaned_text)  # Reduce multiple spaces
    cleaned_text = re.sub(r"(\d+\.?\d*)\s+", r"\1 ", cleaned_text)  # Remove spaces within numbered lists

    # Remove excessive newlines
    cleaned_text = re.sub(r"\n\s*\n+", "\n\n", cleaned_text)
    # Convert all text to lowercase
    cleaned_text = cleaned_text.lower()

    return cleaned_text


def save_text_to_file(text, output_path):
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(text)

In [None]:
extracted_text = extract_text(pdf_path)
cleaned_text = clean_text(extracted_text)
cleaned_txt_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"
save_text_to_file(cleaned_text, cleaned_txt_path)

In [None]:
len(cleaned_text)

250574

In [None]:
import time
import json
from transformers import pipeline
import numpy as np

def extract_entities(text, chunk_size=1000):
    """
    Extract entities from a large text using a pre-trained NER model with time estimation.
    """
    # Load the NER pipeline
    ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", grouped_entities=True)

    entities = []
    text_chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    total_chunks = len(text_chunks)

    start_time = time.time()  # Start timer

    for idx, chunk in enumerate(text_chunks):
        chunk_start_time = time.time()  # Start time for this chunk

        # Process the current chunk
        chunk_entities = ner_pipeline(chunk)
        entities.extend(chunk_entities)

        # Estimate time remaining
        elapsed_time = time.time() - start_time
        avg_time_per_chunk = elapsed_time / (idx + 1)
        remaining_chunks = total_chunks - (idx + 1)
        estimated_time_remaining = avg_time_per_chunk * remaining_chunks

        print(f"Processed chunk {idx + 1}/{total_chunks}. "
              f"Estimated time remaining: {estimated_time_remaining:.2f} seconds.")

    return entities

def json_serializable(obj):
    """
    Custom function to convert non-serializable objects to serializable types.
    """
    if isinstance(obj, np.float32):  # Convert NumPy float32 to Python float
        return float(obj)
    if isinstance(obj, np.integer):  # Convert NumPy int to Python int
        return int(obj)
    return str(obj)  # Fallback: convert to string

# Extract entities with progress tracking
entities = extract_entities(cleaned_text)

# Save extracted entities to a JSON file
entities_json_path = "/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities.json"
with open(entities_json_path, "w", encoding="utf-8") as file:
    json.dump(entities, file, indent=4, default=json_serializable)

print(f"Extracted {len(entities)} entities. Results saved to {entities_json_path}.")


In [None]:
! pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import evaluate

def preprocess_text(text):
    """
    Normalize text: remove unnecessary headings, extra whitespace, and format sections.
    """
    text = re.sub(r"Page\s*\d+", "", text)  # Remove page numbers
    text = re.sub(r"[ \t]+", " ", text)  # Normalize whitespace
    text = re.sub(r"\n\s*\n+", "\n\n", text)  # Remove excessive newlines
    return text.strip()

def chunk_text_by_sections(text):
    """
    Split text into logical sections based on headings or numbering patterns.
    """
    sections = re.split(r"^\d+\.\s|\n\d+\.\d+\.\s", text, flags=re.MULTILINE)  # Split on numbered sections
    return [section.strip() for section in sections if section.strip()]

def extract_labeled_data(text):
    """
    Automatically extract labeled examples from the text for NER fine-tuning.
    """
    labeled_data = []
    sentences = re.split(r"\. |\n", text)  # Split text into sentences

    for sentence in sentences:
        entities = []

        # Technical terms
        for match in re.finditer(r"\b[A-Za-z]+(?:[- ]?[A-Za-z]+)* system\b", sentence):
            entities.append({"start": match.start(), "end": match.end(), "label": "TECH_TERM"})

        # Dates
        for match in re.finditer(r"\b\d{1,2}\s\w+\s\d{4}\b", sentence):
            entities.append({"start": match.start(), "end": match.end(), "label": "DATE"})

        # Section references
        for match in re.finditer(r"paragraph\s\d+\.\d+", sentence, flags=re.IGNORECASE):
            entities.append({"start": match.start(), "end": match.end(), "label": "SECTION_REF"})

        if entities:
            labeled_data.append({"text": sentence.strip(), "entities": entities})

    return labeled_data


def fine_tune_ner_model(labeled_data, model_name="bert-base-uncased", num_epochs=3):
    """
    Fine-tune a pre-trained transformer model for domain-specific NER.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3)

    # Prepare datasets
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples["text"], truncation=True, padding=True)
        labels = []
        for i, label in enumerate(examples["entities"]):
            tokens = tokenized_inputs.tokens(i)
            label_ids = [0] * len(tokens)
            for entity in label:
                start, end, tag = entity["start"], entity["end"], entity["label"]
                for idx, token in enumerate(tokens):
                    if start <= token.offset and token.offset < end:
                        label_ids[idx] = tag
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    dataset = Dataset.from_list(labeled_data)
    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

    # Split into train and validation sets
    train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
    train_dataset = train_test_split["train"]
    val_dataset = train_test_split["test"]

    # Evaluation metric
    metric = evaluate.load("seqeval")  # Use the seqeval metric for NER

    def compute_metrics(p):
        predictions, labels = p
        predictions = predictions.argmax(axis=2)
        true_predictions = [
            [model.config.id2label[p] for p, l in zip(pred, label) if l != -100]
            for pred, label in zip(predictions, labels)
        ]
        true_labels = [
            [model.config.id2label[l] for p, l in zip(pred, label) if l != -100]
            for pred, label in zip(predictions, labels)
        ]
        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"]}

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_total_limit=2,
        logging_dir="./logs",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()

    return trainer, model, tokenizer


def apply_fine_tuned_model(text, model, tokenizer):
    """
    Use the fine-tuned model for inference on new text.
    """
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
    return ner_pipeline(text)

# Main pipeline
def main_pipeline(file_path):
    # Step 1: Load and preprocess the text
    with open(file_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    normalized_text = preprocess_text(raw_text)
    sections = chunk_text_by_sections(normalized_text)

    # Step 2: Extract labeled data
    labeled_data = []
    for section in sections:
        labeled_data.extend(extract_labeled_data(section))

    # Step 3: Fine-tune the NER model
    trainer, fine_tuned_model, tokenizer = fine_tune_ner_model(labeled_data)

    # Step 4: Apply the fine-tuned model to the text
    ner_results = []
    for section in sections:
        ner_results.append(apply_fine_tuned_model(section, fine_tuned_model, tokenizer))

    return ner_results

# Run the pipeline
file_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"
results = main_pipeline(file_path)

# Save results to a file
results_path = "/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities.json"
with open(results_path, "w", encoding="utf-8") as file:
    json.dump(results, file, indent=4)

print(f"NER results saved to {results_path}.")


In [None]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
import spacy

def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(f"{ent.text} - {ent.start_char} - {ent.end_char} - {ent.label_} - {spacy.explain(ent.label_)}")
    else:
        print("No named entities found.")

# Example usage:
nlp = spacy.load("en_core_web_sm")
text = "Research suggests that including apples in a balanced diet may promote weight loss and improve overall health. Apples have several properties, for example, that may boost blood sugar control, heart health, and brain function. Regarding weight loss, it’s worth noting that apples are high in water, low in calorie density, and low in overall calories."

doc = nlp(text)
show_ents(doc)

No named entities found.


In [None]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char,ent.label_)

Apple 0 5 ORG
Steve Jobs 21 31 PERSON
Cupertino 35 44 GPE
California 46 56 GPE


In [None]:
import spacy

# Function to display named entities
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(f"{ent.text} - {ent.start_char} - {ent.end_char} - {ent.label_} - {spacy.explain(ent.label_)}")
    else:
        print("No named entities found.")

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Read a large text file
file_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"  # Replace with your actual file path
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()  # Read the entire file into a string

# Process the text with spaCy
doc = nlp(text)

# Show named entities
show_ents(doc)




2 - 281 - 282 - CARDINAL - Numerals that do not fall under another type
16 october 1995 - 336 - 351 - DATE - Absolute or relative dates or periods
47 - 362 - 364 - CARDINAL - Numerals that do not fall under another type
48 - 381 - 383 - CARDINAL - Numerals that do not fall under another type
12 - 393 - 395 - CARDINAL - Numerals that do not fall under another type
06 - 430 - 432 - CARDINAL - Numerals that do not fall under another type
1 - 466 - 467 - CARDINAL - Numerals that do not fall under another type
06 - 475 - 477 - CARDINAL - Numerals that do not fall under another type
15 july 2013 - 544 - 556 - DATE - Absolute or relative dates or periods
1 - 569 - 570 - CARDINAL - Numerals that do not fall under another type
1 - 585 - 586 - CARDINAL - Numerals that do not fall under another type
06 - 594 - 596 - CARDINAL - Numerals that do not fall under another type
13 - 664 - 666 - CARDINAL - Numerals that do not fall under another type
march 2013 - 667 - 677 - DATE - Absolute or relative d

* After Lunch 30-Jan-2025

In [None]:
import spacy

In [None]:
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
! pip install spacy-transformers
! python -m spacy download en_core_web_trf  # Transformer-based model


Collecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (fro

In [None]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher

# Load transformer-based spaCy model
nlp = spacy.load("en_core_web_trf")  # More accurate than "en_core_web_sm"

# Define a custom NER Entity Ruler
def add_entity_patterns(nlp):
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    patterns = [
        {"label": "R131_REGULATION", "pattern": "R131 Regulation"},
        {"label": "AEBS_TECHNOLOGY", "pattern": "Advanced Emergency Braking System"},
        {"label": "VEHICLE_TYPE", "pattern": [{"LOWER": "truck"}]},
        {"label": "VEHICLE_TYPE", "pattern": [{"LOWER": "bus"}]},
        {"label": "MANUFACTURER", "pattern": "Volvo"},
        {"label": "MANUFACTURER", "pattern": "Daimler"},
        {"label": "CERTIFICATION_BODY", "pattern": "TÜV Süd"},
        {"label": "CERTIFICATION_BODY", "pattern": "DEKRA"},
        {"label": "TESTING_STANDARD", "pattern": "UNECE Standard"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "EU"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "Japan"},
    ]
    ruler.add_patterns(patterns)

# Function to load text from a saved text file
def load_text_from_file(text_file_path):
    with open(text_file_path, "r", encoding="utf-8") as file:
        return file.read()

# Function for rule-based Relation Extraction
def extract_relationships(doc):
    matcher = Matcher(nlp.vocab)

    relations = [
        ("R131 Regulation", "applies_to", "VEHICLE_TYPE"),
        ("AEBS Technology", "uses", "COMPONENT"),
        ("MANUFACTURER", "must_comply_with", "R131 Regulation"),
        ("CERTIFICATION_BODY", "certifies", "MANUFACTURER"),
        ("AEBS Technology", "tested_via", "TESTING_STANDARD"),
        ("R131 Regulation", "enforced_in", "GEOGRAPHIC_REGION"),
        ("COMPONENT", "integrated_into", "AEBS Technology"),
    ]

    extracted_relations = []

    for ent1, rel, ent2 in relations:
        pattern = [
            {"ENT_TYPE": ent1},
            {"LOWER": rel, "OP": "?"},
            {"ENT_TYPE": ent2}
        ]
        matcher.add(rel, [pattern])

    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        relation = nlp.vocab.strings[match_id]
        extracted_relations.append((span[0].text, relation, span[-1].text))

    return extracted_relations


  model.load_state_dict(torch.load(filelike, map_location=device))


In [None]:
# Load PDF
text_file_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"

text = load_text_from_file(text_file_path)

# Apply spaCy NLP
add_entity_patterns(nlp)
doc = nlp(text)

# Print Named Entities
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

# Extract relationships
relations = extract_relationships(doc)
print("\nExtracted Relationships:")
for rel in relations:
    print(rel)

  with torch.cuda.amp.autocast(self._mixed_precision):


Named Entities:
2 -> CARDINAL
16 october 1995 -> DATE
47 -> CARDINAL
12 -> CARDINAL
06 -> DATE
1 -> CARDINAL
06 -> DATE
15 july 2013 -> DATE
1 -> CARDINAL
1 -> CARDINAL
06 -> DATE
13 march 2013 -> DATE
2 -> CARDINAL
06 -> DATE
3 november 2013 -> DATE
3 -> CARDINAL
06 -> DATE
10 june 2014 -> DATE
4 -> CARDINAL
06 -> DATE
9 october 2014 -> DATE
united nations -> GPE
geneva -> FAC
20 march 1958 -> DATE
16 october 2014 -> DATE
3 -> CARDINAL
1 -> CARDINAL
2 -> CARDINAL
6.2.6.1.1. -> CARDINAL
129 14 -> CARDINAL
2.34. -> CARDINAL
2 -> CARDINAL
2.1 -> CARDINAL
2.2 -> CARDINAL
2.2.1 -> CARDINAL
2.2 -> CARDINAL
4 -> CARDINAL
2.2.1 -> CARDINAL
2.2.4. -> CARDINAL
2.3 -> CARDINAL
2.4 -> CARDINAL
2.5 -> CARDINAL
5 -> CARDINAL
2.6 -> CARDINAL
one -> CARDINAL
2.6.1 -> CARDINAL
2.6.2 -> CARDINAL
1 -> CARDINAL
the consolidated resolution on the construction of vehicles -> LAW
6 2.7 -> CARDINAL
regulation no. 107 -> LAW
m3 -> PRODUCT
2.7.1 -> CARDINAL
one or more -> CARDINAL
one or more -> CARDINAL
2.7.1

In [None]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher

# Load a transformer-based model for better NER
nlp = spacy.load("en_core_web_trf")  # More accurate than en_core_web_sm

# Add an entity ruler to improve NER
def add_entity_patterns(nlp):
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    patterns = [
        {"label": "REGULATION", "pattern": "Regulation No. 48"},
        {"label": "REGULATION", "pattern": "Regulation No. 107"},
        {"label": "VEHICLE_TYPE", "pattern": "M3"},
        {"label": "VEHICLE_TYPE", "pattern": "N2"},
        {"label": "VEHICLE_TYPE", "pattern": "trailers"},
        {"label": "COMPONENT", "pattern": "headlamps"},
        {"label": "COMPONENT", "pattern": "rear fog lamps"},
        {"label": "TECHNOLOGY", "pattern": "adaptive front-lighting system"},
        {"label": "CERTIFICATION_BODY", "pattern": "UNECE"},
        {"label": "CERTIFICATION_BODY", "pattern": "Geneva agreement"},
        {"label": "TESTING_STANDARD", "pattern": "photometric measurements"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "European Union"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "United Nations"},
    ]
    ruler.add_patterns(patterns)

# Function to load text from a saved file
def load_text_from_file(text_file_path):
    with open(text_file_path, "r", encoding="utf-8") as file:
        return file.read()

# Function for rule-based Relation Extraction
def extract_relationships(doc):
    matcher = Matcher(nlp.vocab)

    relations = [
        ("REGULATION", "applies_to", "VEHICLE_TYPE"),
        ("TECHNOLOGY", "uses", "COMPONENT"),
        ("CERTIFICATION_BODY", "certifies", "REGULATION"),
        ("REGULATION", "enforced_in", "GEOGRAPHIC_REGION"),
        ("TESTING_STANDARD", "evaluates", "TECHNOLOGY"),
        ("COMPONENT", "integrated_into", "TECHNOLOGY"),
    ]

    extracted_relations = []

    for ent1, rel, ent2 in relations:
        pattern = [
            {"ENT_TYPE": ent1},
            {"LOWER": rel, "OP": "?"},
            {"ENT_TYPE": ent2}
        ]
        matcher.add(rel, [pattern])

    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        relation = nlp.vocab.strings[match_id]
        extracted_relations.append((span[0].text, relation, span[-1].text))

    return extracted_relations






In [None]:
# Load text from the provided file
text_file_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"
text = load_text_from_file(text_file_path)

# Apply enhanced spaCy NLP
add_entity_patterns(nlp)
doc = nlp(text)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [None]:
# Print improved Named Entities
print("🔹 Improved Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


🔹 Improved Named Entities:
2 -> CARDINAL
16 october 1995 -> DATE
47 -> CARDINAL
12 -> CARDINAL
06 -> DATE
1 -> CARDINAL
06 -> DATE
15 july 2013 -> DATE
1 -> CARDINAL
1 -> CARDINAL
06 -> DATE
13 march 2013 -> DATE
2 -> CARDINAL
06 -> DATE
3 november 2013 -> DATE
3 -> CARDINAL
06 -> DATE
10 june 2014 -> DATE
4 -> CARDINAL
06 -> DATE
9 october 2014 -> DATE
united nations -> GPE
geneva -> FAC
20 march 1958 -> DATE
16 october 2014 -> DATE
3 -> CARDINAL
1 -> CARDINAL
2 -> CARDINAL
headlamps -> COMPONENT
headlamps -> COMPONENT
6.2.6.1.1. -> CARDINAL
headlamps -> COMPONENT
129 14 -> CARDINAL
photometric measurements -> TESTING_STANDARD
2.34. -> CARDINAL
trailers -> VEHICLE_TYPE
2 -> CARDINAL
2.1 -> CARDINAL
2.2 -> CARDINAL
2.2.1 -> CARDINAL
2.2 -> CARDINAL
4 -> CARDINAL
2.2.1 -> CARDINAL
2.2.4. -> CARDINAL
2.3 -> CARDINAL
2.4 -> CARDINAL
2.5 -> CARDINAL
5 -> CARDINAL
2.6 -> CARDINAL
one -> CARDINAL
2.6.1 -> CARDINAL
2.6.2 -> CARDINAL
1 -> CARDINAL
the consolidated resolution on the constructio

In [None]:
# Extract improved relationships
relations = extract_relationships(doc)
print("\n🔹 Extracted Relationships:")
for rel in relations:
    print(rel)


🔹 Extracted Relationships:


In [None]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher
import re
from spacy.language import Language

# Load transformer-based model for better NER
nlp = spacy.load("en_core_web_trf")  # More accurate than en_core_web_sm

# Add an entity ruler to improve NER
def add_entity_patterns(nlp):
    ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

    patterns = [
        # Regulations
        {"label": "REGULATION", "pattern": [{"LOWER": "regulation"}, {"LOWER": "no."}, {"IS_DIGIT": True}]},
        {"label": "REGULATION", "pattern": [{"LOWER": "regulation"}, {"LOWER": "no"}, {"IS_DIGIT": True}]},
        {"label": "REGULATION", "pattern": [{"LOWER": "regulation"}, {"LOWER": "n"}, {"IS_DIGIT": True}]},

        # Vehicle Types
        {"label": "VEHICLE_TYPE", "pattern": "M3"},
        {"label": "VEHICLE_TYPE", "pattern": "N2"},
        {"label": "VEHICLE_TYPE", "pattern": "trailers"},

        # Components
        {"label": "COMPONENT", "pattern": "headlamps"},
        {"label": "COMPONENT", "pattern": "rear fog lamps"},

        # Technologies
        {"label": "TECHNOLOGY", "pattern": "adaptive front-lighting system"},

        # Certification Bodies
        {"label": "CERTIFICATION_BODY", "pattern": "UNECE"},
        {"label": "CERTIFICATION_BODY", "pattern": "Geneva agreement"},

        # Testing Standards
        {"label": "TESTING_STANDARD", "pattern": "photometric measurements"},

        # Geographic Regions
        {"label": "GEOGRAPHIC_REGION", "pattern": "European Union"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "United Nations"},
    ]
    ruler.add_patterns(patterns)

# Add entity patterns
add_entity_patterns(nlp)

# Function to identify section numbers (e.g., 2.7.16.3) as ENTITY_TYPE "SECTION_NUMBER"
def custom_tokenizer(doc):
    section_regex = re.compile(r'\b\d+(\.\d+)+\b')  # Matches patterns like "2.7.16.3"
    entities = []
    for match in section_regex.finditer(doc.text):
        start, end = match.span()
        entities.append((start, end, "SECTION_NUMBER"))

    return entities

# Register the custom component
@Language.component("section_number_ner")
def add_section_number_entities(doc):
    section_entities = custom_tokenizer(doc)
    for start, end, label in section_entities:
        span = doc.char_span(start, end, label=label)
        if span:
            doc.ents = list(doc.ents) + [span]
    return doc

# Add the registered component after the "ner" pipeline
nlp.add_pipe("section_number_ner", after="ner")

# Function for rule-based Relation Extraction
def extract_relationships(doc):
    matcher = Matcher(nlp.vocab)

    relations = [
        ("REGULATION", "applies_to", "VEHICLE_TYPE"),
        ("TECHNOLOGY", "uses", "COMPONENT"),
        ("CERTIFICATION_BODY", "certifies", "REGULATION"),
        ("REGULATION", "enforced_in", "GEOGRAPHIC_REGION"),
        ("TESTING_STANDARD", "evaluates", "TECHNOLOGY"),
        ("COMPONENT", "integrated_into", "TECHNOLOGY"),
    ]

    extracted_relations = []

    for ent1, rel, ent2 in relations:
        pattern = [
            {"ENT_TYPE": ent1},
            {"LOWER": rel, "OP": "?"},
            {"ENT_TYPE": ent2}
        ]
        matcher.add(rel, [pattern])

    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        relation = nlp.vocab.strings[match_id]
        extracted_relations.append((span[0].text, relation, span[-1].text))

    return extracted_relations

# Process the text and extract entities
def process_text(text):
    doc = nlp(text)

    print("\nExtracted Entities:")
    for ent in doc.ents:
        print(f"{ent.text} -> {ent.label_}")

    print("\nExtracted Relations:")
    relations = extract_relationships(doc)
    for rel in relations:
        print(rel)

    return doc

# Example Usage
# Example Usage
if __name__ == "__main__":
    text_file_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"
    sample_text = load_text_from_file(text_file_path)
    process_text(sample_text)


ValueError: [E1010] Unable to set entity information for token 493 which is included in more than one span in entities, blocked, missing or outside.

In [None]:
import re
import spacy

# Load a high-accuracy NLP model
nlp = spacy.load("en_core_web_trf")

# Function to preprocess text
def preprocess_text(file_path):
    # Read the file
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Remove unwanted characters and extra spaces
    text = re.sub(r"\n+", "\n", text)  # Remove excessive newlines
    text = re.sub(r"\s{2,}", " ", text)  # Remove extra spaces
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII characters

    # Standardize regulation references
    text = re.sub(r"Regulation No\.? (\d+)", r"R\1", text)

    # Standardize AEBS variations
    text = text.replace("Advanced Emergency Braking System", "AEBS")

    # Sentence Segmentation using Spacy
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    return sentences  # Return cleaned sentences for further processing

# Apply preprocessing
file_path = "/content/drive/MyDrive/KnowledgeGraphResults/cleaned_text1.txt"
processed_sentences = preprocess_text(file_path)

# Save cleaned text to a new file
output_path = "/content/drive/MyDrive/KnowledgeGraphResults/preprocessed_text.txt"
with open(output_path, "w", encoding="utf-8") as file:
    for sentence in processed_sentences:
        file.write(sentence + "\n")

print(f"Preprocessed text saved to: {output_path}")


  model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):


Preprocessed text saved to: /content/drive/MyDrive/KnowledgeGraphResults/preprocessed_text.txt


In [None]:
import re
import spacy

# Load Spacy transformer model
nlp = spacy.load("en_core_web_trf")

# Function to clean and refine text
def refine_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Remove headers, table of contents, and unwanted formatting
    text = re.sub(r"\n+", "\n", text)  # Normalize new lines
    text = re.sub(r"(^|\s)[0-9]+\s*\.\s*", "", text)  # Remove numbered sections
    text = re.sub(r"\s{2,}", " ", text)  # Remove excessive spaces
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII characters

    # Standardize regulation names (e.g., "Regulation No. 48" → "R48")
    text = re.sub(r"Regulation No\.? (\d+)", r"R\1", text)

    # Standardize technical terms
    text = text.replace("Advanced Emergency Braking System", "AEBS")

    # Sentence segmentation for better entity recognition
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    return sentences  # Return refined sentences for NER and relation extraction

# Apply refining function
input_file = "/content/drive/MyDrive/KnowledgeGraphResults/preprocessed_text.txt"
cleaned_sentences = refine_text(input_file)

# Save final cleaned text
final_output_path = "/content/drive/MyDrive/KnowledgeGraphResults/final_cleaned_text.txt"
with open(final_output_path, "w", encoding="utf-8") as file:
    for sentence in cleaned_sentences:
        file.write(sentence + "\n")

print(f"Final cleaned text is saved at: {final_output_path}")


Final cleaned text is saved at: /content/drive/MyDrive/KnowledgeGraphResults/final_cleaned_text.txt


In [None]:
# Process the file line by line to handle memory constraints
def process_file_line_by_line(input_path, output_path):
    with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            # Remove unwanted formatting from each line
            line = re.sub(r"(?i)contents\s+page\s+\d+", "", line)  # Remove table of contents
            line = re.sub(r"\n+", " ", line)  # Normalize new lines
            line = re.sub(r"(^|\s)\d+\.\d+\s*", "", line)  # Remove numbered sections (e.g., 2.1, 7.10)
            line = re.sub(r"\s{2,}", " ", line)  # Remove excessive spaces
            line = re.sub(r"[^\x00-\x7F]+", " ", line)  # Remove non-ASCII characters
            line = re.sub(r"Regulation No\.? (\d+)", r"R\1", line)  # Standardize regulation references

            # Skip empty lines
            if line.strip():
                outfile.write(line.strip() + "\n")

# Define paths
input_file = "/content/drive/MyDrive/KnowledgeGraphResults/final_cleaned_text.txt"
output_file = "/content/drive/MyDrive/KnowledgeGraphResults/fully_preprocessed_text.txt"

# Process the file line by line
process_file_line_by_line(input_file, output_file)

# Return the path of the cleaned file
output_file


'/content/drive/MyDrive/KnowledgeGraphResults/fully_preprocessed_text.txt'

In [None]:
# Final Cleaning Script for Fully Preprocessed Text

def final_text_cleaning(file_path, output_path):
    cleaned_lines = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()

            # Remove legal metadata and revision history
            line = re.sub(r"addendum\s+\d+.*", "", line, flags=re.IGNORECASE)
            line = re.sub(r"date of entry into force:.*", "", line, flags=re.IGNORECASE)

            # Remove numbered sections but keep regulatory references like R131
            line = re.sub(r"^\d+\.\d+\.\d+.*", "", line)  # E.g., "2.7.1.2.2"
            line = re.sub(r"^\d+\.\d+.*", "", line)  # E.g., "7.10"

            # Remove table of contents and annex references
            line = re.sub(r"contents\s+page\s+\d+", "", line, flags=re.IGNORECASE)
            line = re.sub(r"annex\s+\d+.*", "", line, flags=re.IGNORECASE)

            # Remove excessive whitespace
            line = re.sub(r"\s{2,}", " ", line)

            # Skip empty lines
            if line.strip():
                cleaned_lines.append(line)

    # Save the final cleaned file
    with open(output_path, "w", encoding="utf-8") as outfile:
        for line in cleaned_lines:
            outfile.write(line + "\n")

# Define paths
input_file = "/content/drive/MyDrive/KnowledgeGraphResults/fully_preprocessed_text.txt"
final_output_file = "/content/drive/MyDrive/KnowledgeGraphResults/ready_text.txt"

# Run final cleaning process
final_text_cleaning(input_file, final_output_file)

# Return path to the final cleaned file
final_output_file

'/content/drive/MyDrive/KnowledgeGraphResults/ready_text.txt'

# Entity Extraction

In [None]:
import spacy
from spacy.pipeline import EntityRuler

# Load Spacy's transformer-based model
nlp = spacy.load("en_core_web_trf")

# Function to add entity patterns
def add_custom_entities(nlp):
    ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

    patterns = [
        # Regulations
        {"label": "REGULATION", "pattern": "R131"},
        {"label": "REGULATION", "pattern": "UNECE R131"},
        {"label": "REGULATION", "pattern": "Regulation 48"},
        {"label": "REGULATION", "pattern": "UNECE R48"},

        # AEBS Technology
        {"label": "AEBS_TECHNOLOGY", "pattern": "Advanced Emergency Braking System"},
        {"label": "AEBS_TECHNOLOGY", "pattern": "AEBS"},

        # Vehicle Types
        {"label": "VEHICLE_TYPE", "pattern": "trucks"},
        {"label": "VEHICLE_TYPE", "pattern": "buses"},
        {"label": "VEHICLE_TYPE", "pattern": "passenger cars"},
        {"label": "VEHICLE_TYPE", "pattern": "M3"},

        # Manufacturers
        {"label": "MANUFACTURER", "pattern": "Volvo"},
        {"label": "MANUFACTURER", "pattern": "Daimler"},
        {"label": "MANUFACTURER", "pattern": "Mercedes"},
        {"label": "MANUFACTURER", "pattern": "Tesla"},

        # Components
        {"label": "COMPONENT", "pattern": "radar sensors"},
        {"label": "COMPONENT", "pattern": "LiDAR sensors"},
        {"label": "COMPONENT", "pattern": "control units"},
        {"label": "COMPONENT", "pattern": "actuators"},

        # Testing Standards
        {"label": "TESTING_STANDARD", "pattern": "stationary target tests"},
        {"label": "TESTING_STANDARD", "pattern": "dynamic target detection"},

        # Geographic Regions
        {"label": "GEOGRAPHIC_REGION", "pattern": "European Union"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "Japan"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "United States"},

        # Certification Bodies
        {"label": "CERTIFICATION_BODY", "pattern": "TÜV Süd"},
        {"label": "CERTIFICATION_BODY", "pattern": "UNECE"},
        {"label": "CERTIFICATION_BODY", "pattern": "KBA"},
        {"label": "CERTIFICATION_BODY", "pattern": "NHTSA"},

        # Approval Authorities
        {"label": "APPROVAL_AUTHORITY", "pattern": "KBA"},
        {"label": "APPROVAL_AUTHORITY", "pattern": "NHTSA"},
    ]

    ruler.add_patterns(patterns)

# Add the custom entity patterns
add_custom_entities(nlp)

# Function to process text and extract entities
def extract_entities(file_path, output_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Process the text using Spacy's pipeline
    doc = nlp(text)

    # Extract entities
    extracted_entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Save extracted entities
    with open(output_path, "w", encoding="utf-8") as file:
        for entity, label in extracted_entities:
            file.write(f"{entity} -> {label}\n")

    return output_path

# Define file paths
input_text_file = "/content/drive/MyDrive/KnowledgeGraphResults/ready_text.txt"
output_entities_file = "/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities.txt"

# Extract and save entities
extracted_entities_path = extract_entities(input_text_file, output_entities_file)

# Return path to extracted entities
extracted_entities_path


'/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities.txt'

In [None]:
import spacy
from spacy.pipeline import EntityRuler
import re

# Load Spacy transformer model
nlp = spacy.load("en_core_web_trf")

# Function to add improved entity patterns
def add_improved_entities(nlp):
    ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

    patterns = [
        # Regulations (Explicitly Define Instead of Relying on Default "LAW")
        {"label": "REGULATION", "pattern": "R131"},
        {"label": "REGULATION", "pattern": "UNECE R131"},
        {"label": "REGULATION", "pattern": "Regulation 48"},
        {"label": "REGULATION", "pattern": "UNECE R48"},

        # AEBS Technology
        {"label": "AEBS_TECHNOLOGY", "pattern": "Advanced Emergency Braking System"},
        {"label": "AEBS_TECHNOLOGY", "pattern": "AEBS"},
        {"label": "AEBS_TECHNOLOGY", "pattern": "afs"},  # Fix misclassification

        # Vehicle Types (Override Incorrect "PRODUCT" Classification)
        {"label": "VEHICLE_TYPE", "pattern": "trucks"},
        {"label": "VEHICLE_TYPE", "pattern": "buses"},
        {"label": "VEHICLE_TYPE", "pattern": "passenger cars"},
        {"label": "VEHICLE_TYPE", "pattern": "M3"},
        {"label": "VEHICLE_TYPE", "pattern": "N2"},
        {"label": "VEHICLE_TYPE", "pattern": "N3"},

        # Manufacturers
        {"label": "MANUFACTURER", "pattern": "Volvo"},
        {"label": "MANUFACTURER", "pattern": "Daimler"},
        {"label": "MANUFACTURER", "pattern": "Mercedes"},
        {"label": "MANUFACTURER", "pattern": "Tesla"},

        # Components
        {"label": "COMPONENT", "pattern": "radar sensors"},
        {"label": "COMPONENT", "pattern": "LiDAR sensors"},
        {"label": "COMPONENT", "pattern": "control units"},
        {"label": "COMPONENT", "pattern": "actuators"},

        # Testing Standards
        {"label": "TESTING_STANDARD", "pattern": "stationary target tests"},
        {"label": "TESTING_STANDARD", "pattern": "dynamic target detection"},

        # Geographic Regions
        {"label": "GEOGRAPHIC_REGION", "pattern": "European Union"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "Japan"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "United States"},

        # Certification Bodies
        {"label": "CERTIFICATION_BODY", "pattern": "TÜV Süd"},
        {"label": "CERTIFICATION_BODY", "pattern": "UNECE"},
        {"label": "CERTIFICATION_BODY", "pattern": "KBA"},
        {"label": "CERTIFICATION_BODY", "pattern": "NHTSA"},

        # Approval Authorities
        {"label": "APPROVAL_AUTHORITY", "pattern": "KBA"},
        {"label": "APPROVAL_AUTHORITY", "pattern": "NHTSA"},
    ]

    ruler.add_patterns(patterns)

# Add improved entity patterns
add_improved_entities(nlp)

# Function to remove unwanted numbers and filter entities
def filter_entities(file_path, output_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Process the text using Spacy's pipeline
    doc = nlp(text)

    filtered_entities = []

    for ent in doc.ents:
        # Remove isolated numbers that are likely paragraph references
        if ent.label_ == "CARDINAL" and re.match(r"^\d+$", ent.text):
            continue
        # Ignore meaningless paragraph references (e.g., "paragraph 5.2.6.3")
        if re.match(r"paragraph\s*\d+(\.\d+)*", ent.text, re.IGNORECASE):
            continue
        # Ignore single-word numbers
        if ent.text.lower() in {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"}:
            continue

        filtered_entities.append((ent.text, ent.label_))

    # Save filtered entities
    with open(output_path, "w", encoding="utf-8") as file:
        for entity, label in filtered_entities:
            file.write(f"{entity} -> {label}\n")

    return output_path

# Define file paths
input_text_file = "/content/drive/MyDrive/KnowledgeGraphResults/ready_text.txt"
filtered_entities_file =  "/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities1.txt"

# Extract and save improved entities
filtered_entities_path = filter_entities(input_text_file, filtered_entities_file)

# Return path to extracted entities
filtered_entities_path


'/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities1.txt'

In [None]:
import spacy
from spacy.pipeline import EntityRuler
import re

# Load Spacy transformer model
nlp = spacy.load("en_core_web_trf")

# Function to add improved entity patterns
def add_improved_entities(nlp):
    ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

    patterns = [
        # Regulations
        {"label": "REGULATION", "pattern": "R131"},
        {"label": "REGULATION", "pattern": "UNECE R131"},
        {"label": "REGULATION", "pattern": "Regulation 48"},
        {"label": "REGULATION", "pattern": "UNECE R48"},

        # AEBS Technology
        {"label": "AEBS_TECHNOLOGY", "pattern": "Advanced Emergency Braking System"},
        {"label": "AEBS_TECHNOLOGY", "pattern": "AEBS"},
        {"label": "AEBS_TECHNOLOGY", "pattern": "afs"},

        # Vehicle Types (Force Correct Classification)
        {"label": "VEHICLE_TYPE", "pattern": "M3"},
        {"label": "VEHICLE_TYPE", "pattern": "N2"},
        {"label": "VEHICLE_TYPE", "pattern": "N3"},
        {"label": "VEHICLE_TYPE", "pattern": "trucks"},
        {"label": "VEHICLE_TYPE", "pattern": "buses"},
        {"label": "VEHICLE_TYPE", "pattern": "passenger cars"},

        # Manufacturers
        {"label": "MANUFACTURER", "pattern": "Volvo"},
        {"label": "MANUFACTURER", "pattern": "Daimler"},
        {"label": "MANUFACTURER", "pattern": "Mercedes"},
        {"label": "MANUFACTURER", "pattern": "Tesla"},

        # Components
        {"label": "COMPONENT", "pattern": "radar sensors"},
        {"label": "COMPONENT", "pattern": "LiDAR sensors"},
        {"label": "COMPONENT", "pattern": "control units"},
        {"label": "COMPONENT", "pattern": "actuators"},

        # Testing Standards
        {"label": "TESTING_STANDARD", "pattern": "stationary target tests"},
        {"label": "TESTING_STANDARD", "pattern": "dynamic target detection"},

        # Geographic Regions
        {"label": "GEOGRAPHIC_REGION", "pattern": "European Union"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "Japan"},
        {"label": "GEOGRAPHIC_REGION", "pattern": "United States"},

        # Certification Bodies
        {"label": "CERTIFICATION_BODY", "pattern": "TÜV Süd"},
        {"label": "CERTIFICATION_BODY", "pattern": "UNECE"},
        {"label": "CERTIFICATION_BODY", "pattern": "KBA"},
        {"label": "CERTIFICATION_BODY", "pattern": "NHTSA"},
    ]

    ruler.add_patterns(patterns)

# Add improved entity patterns
add_improved_entities(nlp)

# Function to remove unwanted numbers, paragraph references, and irrelevant entities
def filter_entities(file_path, output_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Process the text using Spacy's pipeline
    doc = nlp(text)

    filtered_entities = []

    for ent in doc.ents:
        # Remove standalone numbers that are likely paragraph references
        if ent.label_ in ["CARDINAL", "QUANTITY", "PERCENT"] and re.match(r"^\d+(\.\d+)?$", ent.text):
            continue
        # Ignore meaningless paragraph references (e.g., "paragraph 5.2.6.3")
        if re.match(r"paragraph\s*\d+(\.\d+)*", ent.text, re.IGNORECASE):
            continue
        # Remove unexpected LAW or WORK_OF_ART classifications
        if ent.label_ in ["LAW", "WORK_OF_ART"] and not ent.text.startswith("Regulation"):
            continue

        filtered_entities.append((ent.text, ent.label_))

    # Save filtered entities
    with open(output_path, "w", encoding="utf-8") as file:
        for entity, label in filtered_entities:
            file.write(f"{entity} -> {label}\n")

    return output_path

# Define file paths
input_text_file = "/content/drive/MyDrive/KnowledgeGraphResults/ready_text.txt"
filtered_entities_file =  "/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities2.txt"

# Extract and save improved entities
filtered_entities_path = filter_entities(input_text_file, filtered_entities_file)

# Return path to extracted entities
filtered_entities_path


  model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):


'/content/drive/MyDrive/KnowledgeGraphResults/extracted_entities2.txt'