In [0]:
%pip install azure-storage-blob

In [0]:
dbutils.library.restartPython()

In [0]:
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import *

In [0]:
# Azure Blob Storage connection details
storage_account_name = "heauksdsdevcdosa"
# The account key is fetched from the "cdo-kv" secret scope instead of being written in the notebook.
storage_account_access_key = dbutils.secrets.get(scope="cdo-kv", key="heauksdsdevcdosa-key")
container_name = 'lb-container'

# Host name shared by the SDK client URL, the Spark config key and every wasbs:// path below.
blob_host = f"{storage_account_name}.blob.core.windows.net"

# Initialize the Azure Blob Service Client
block_blob_service = BlobServiceClient(account_url=f"https://{blob_host}", credential=storage_account_access_key)

# Set up Spark to authenticate with the Blob Storage
spark.conf.set(f'fs.azure.account.key.{blob_host}', storage_account_access_key)

# Paths of the PDF files in the container. The prefix is built once so that a change of
# account or container only has to be made in one place.
container_root = f"wasbs://{container_name}@{blob_host}"
file1 = f"{container_root}/MCE0110B.pdf"
file2 = f"{container_root}/MCE0107B.pdf"
file3 = f"{container_root}/MCE1157E.pdf"
file4 = f"{container_root}/MCG1093J.pdf"
file5 = f"{container_root}/MCG1107B.pdf"
file6 = f"{container_root}/MCG1202A.pdf"
file7 = f"{container_root}/MCH1744H.pdf"
file8 = f"{container_root}/MCH1759F.pdf"
file9 = f"{container_root}/MCH1798H.pdf"
file10 = f"{container_root}/MCH2629A.pdf"


# Read the binary content of one file (file4 = MCG1093J.pdf) into a Spark DataFrame and display it
pdf_df = spark.read.format("binaryFile").load(file4).cache()
display(pdf_df)

In [0]:
%pip install pdfplumber

!python -m spacy download en_core_web_sm

%pip install prettytable

In [0]:
import pdfplumber
from io import BytesIO

def extract_text(binary_data):
  """Extract the plain text of every page of an in-memory PDF.

  Args:
    binary_data: Raw bytes of a PDF document.

  Returns:
    The text of all pages in page order, joined with newlines. A page for
    which no text is returned contributes an empty string.
  """
  # pdfplumber.open() is given a file-like object, so wrap the raw bytes.
  pdf_file = BytesIO(binary_data)

  with pdfplumber.open(pdf_file) as pdf:
    # A page without extractable text can yield None (seen with older
    # pdfplumber releases); `or ""` keeps the join below from failing on it.
    page_texts = [page.extract_text() or "" for page in pdf.pages]

  # Join with a newline so the last word of one page and the first word of the
  # next are not fused into a single token.
  return "\n".join(page_texts)


In [0]:
# Pull the "content" column back to the driver as a Python list of rows; the PDF
# bytes are taken from the first row.
content_rows = pdf_df.select("content").collect()
binary_pdf = content_rows[0]["content"]

# Turn the PDF bytes into plain text.
extracted_text = extract_text(binary_pdf)

In [0]:
def clean_text(text):
    """Normalise text: collapse every whitespace run to one space, then lower-case.

    Leading and trailing whitespace is removed as a side effect of the split/join.
    """
    words = text.split()
    single_spaced = " ".join(words)
    return single_spaced.lower()

# Normalise the extracted PDF text and preview its first 1000 characters.
cleaned_text = clean_text(extracted_text)
print(cleaned_text[:1000])


from langchain.text_splitter import CharacterTextSplitter

# Video walkthrough explaining the chunking function below: https://www.youtube.com/watch?v=dXxQ0LR-3Hg&list=LL&index=1&t=1820s

def get_text_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
  """Split text into overlapping chunks with langchain's CharacterTextSplitter.

  Args:
    text: The text to split.
    chunk_size: Target maximum chunk length, measured with len().
    chunk_overlap: Number of characters shared between neighbouring chunks.
    separator: String the splitter breaks the text on. Defaults to a newline;
      the previous hard-coded value "/n" was a typo for "\n".

  Returns:
    The list of chunks returned by CharacterTextSplitter.split_text().

  NOTE(review): clean_text() in this notebook collapses newlines to spaces, so
  text that went through it contains no "\n" to split on - confirm whether
  separator=" " is wanted for that input.
  """
  text_splitter = CharacterTextSplitter(
    separator=separator,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
  )
  return text_splitter.split_text(text)

# Chunk the cleaned document text; the result is reused by later cells.
chunked_text = get_text_chunks(cleaned_text)

In [0]:
# Small hand-written sample containing spaced, unspaced and colon-suffixed
# document references; printed after cleaning to check what clean_text produces.
tests = """
The TR 2144 M:3952 document specifies requirements.
Also see MCE 1234:123 and TR 2144 M for details.
Some MCE1234B document and TR 2144M reference.
"""


print(clean_text(tests))

In [0]:
'''USEFUL LINKS
https://www.babelstreet.com/blog/what-is-entity-extraction#:~:text=Entity%20extraction%20(aka%2C%20named%20entity,%2C%20webpages%2C%20text%20fields).

https://medium.com/@sanskrutikhedkar09/mastering-information-extraction-from-unstructured-text-a-deep-dive-into-named-entity-recognition-4aa2f664a453

https://www.microfocus.com/documentation/relativity/relativity1217/reldbdsn/GUID-7C2DF185-41A1-4448-81E7-3252AA8DEBB3.html 

'''

import spacy
from spacy.pipeline import EntityRuler
import re
from typing import List, Dict, Any

def create_nlp_pipeline():
    """Build the spaCy pipeline used for domain entity recognition.

    Loads the "en_core_web_sm" model and inserts an entity ruler before the
    statistical "ner" component, then registers token patterns for the custom
    labels TECH_DOC, SYSTEM_COMPONENT, HARDWARE_COMPONENT,
    COMMUNICATION_COMPONENT, SUBSYSTEM_COMPONENT, CONTROL_COMPONENT and
    SPECIFICATION_TYPE.

    Returns:
        The loaded spaCy Language object with the ruler patterns added.
    """
    nlp = spacy.load("en_core_web_sm")
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    patterns = [
        # --- TECH_DOC ---
        # Pattern 1: prefix and number in one token, optional letter (mce0107b or mce0107)
        {"label": "TECH_DOC", "pattern": [
            {"LOWER": {"REGEX": r"^(mce|mch|mcx|mcg|tr)\d{4}[a-z]?$"}}
        ]},

        # Pattern 2: space after the prefix (MCE 0107B or MCE 0107)
        {"label": "TECH_DOC", "pattern": [
            {"LOWER": {"IN": ["mce", "mch", "mcx", "mcg", "tr"]}},
            # Exactly four digits with an optional single-letter suffix
            {"TEXT": {"REGEX": r"^\d{4}[A-Za-z]?$"}}
        ]},

        # Pattern 3: prefix, number and a separate suffix letter (mce 0107 b)
        {"label": "TECH_DOC", "pattern": [
            {"LOWER": {"IN": ["mce", "mch", "mcx", "mcg", "tr"]}},
            {"TEXT": {"REGEX": r"^\d{4}$"}},
            {"LOWER": {"REGEX": r"^[a-z]$"}},
        ]},

        # --- SYSTEM_COMPONENT ---
        {"label": "SYSTEM_COMPONENT", "pattern": [
            {"LOWER": {"REGEX": r"^(midas|nmcs2?|hadecs|hatms)$"}}
        ]},

        {"label": "SYSTEM_COMPONENT", "pattern": [
            {"LOWER": "midas"},
            {"LOWER": "gold"}
        ]},

        # --- HARDWARE_COMPONENT ---
        {"label": "HARDWARE_COMPONENT", "pattern": [
            {"LOWER": {"REGEX": r"^(cabinet|plinth|lantern|post|frame|skirt)$"}},
            {"LOWER": "type"},
            # Matched on LOWER (not TEXT): the suffix class is [a-z], so a
            # case-sensitive match rejected upper-case sizes such as "1A" or "600A".
            {"LOWER": {"REGEX": r"^\d+[a-z]?$"}}
        ]},

        {"label": "HARDWARE_COMPONENT", "pattern": [
            {"LOWER": {"REGEX": r"^(ms[1-4]r?|ami|ert)$"}}
        ]},

        {"label": "HARDWARE_COMPONENT", "pattern": [
            {"LOWER": {"IN": ["indicator", "signal", "sensor", "detector", "camera", "telephone"]}}
        ]},

        # --- COMMUNICATION_COMPONENT ---
        {"label": "COMMUNICATION_COMPONENT", "pattern": [
            {"LOWER": {"REGEX": r"^(rs485|rs422|tcp\/ip|lan|wan)$"}}
        ]},

        # NOTE(review): spaCy's default tokenizer appears to split "tcp/ip" into
        # "tcp", "/", "ip", which the single-token regex above cannot match;
        # this three-token form covers that tokenization - confirm on real text.
        {"label": "COMMUNICATION_COMPONENT", "pattern": [
            {"LOWER": "tcp"},
            {"LOWER": "/"},
            {"LOWER": "ip"}
        ]},

        {"label": "COMMUNICATION_COMPONENT", "pattern": [
            {"LOWER": "rs"},
            {"TEXT": {"REGEX": r"^(485|422)$"}}
        ]},

        {"label": "COMMUNICATION_COMPONENT", "pattern": [
            {"LOWER": "ethernet"},
            {"LOWER": {"IN": ["lan", "connection", "interface"]}}
        ]},

        # --- SUBSYSTEM_COMPONENT ---
        {"label": "SUBSYSTEM_COMPONENT", "pattern": [
            {"LOWER": {"IN": ["signal", "message", "meteorological", "tidal", "tunnel"]}},
            {"LOWER": "subsystem"}
        ]},

        {"label": "SUBSYSTEM_COMPONENT", "pattern": [
            {"LOWER": {"REGEX": r"^(lcc|pdu|cobs|ceclb|ceceb|cecr)$"}}
        ]},

        # --- CONTROL_COMPONENT ---
        {"label": "CONTROL_COMPONENT", "pattern": [
            {"LOWER": {"IN": ["control", "monitoring", "outstation", "instation"]}},
            {"LOWER": {"IN": ["system", "unit", "equipment", "interface"]}}
        ]},

        {"label": "CONTROL_COMPONENT", "pattern": [
            {"LOWER": "cctv"},
            {"LOWER": {"IN": ["system", "camera", "equipment"]}}
        ]},

        # --- SPECIFICATION_TYPE ---
        {"label": "SPECIFICATION_TYPE", "pattern": [
            {"LOWER": {"IN": ["requirements", "specification", "instructions", "overview", "process"]}},
            {"LOWER": "document"}
        ]},

        {"label": "SPECIFICATION_TYPE", "pattern": [
            {"LOWER": "technical"},
            {"LOWER": "requirements"}
        ]}
    ]

    ruler.add_patterns(patterns)
    return nlp



In [0]:

def debug_document_code(nlp, text):
    """
    Print a token-by-token and entity-by-entity breakdown of how `nlp` handles `text`.

    Three sections are printed: every token with its character span and entity
    type, every entity in doc.ents with its label, and finally the tokens that
    carry no entity type (only when there are any). Nothing is returned.
    """
    print(f"\nAnalyzing document code: '{text}'")

    doc = nlp(text)

    # Section 1: raw tokenization
    print("\nTokenization details:")
    for token in doc:
        token_end = token.idx + len(token.text)
        print(
            f"Token: '{token.text}'",
            f"  Position: {token.idx} to {token_end}",
            f"  Is part of entity: {token.ent_type_ != ''}",
            f"  Entity type: {token.ent_type_ if token.ent_type_ else 'None'}",
            "",
            sep="\n",
        )

    # Section 2: complete entities
    print("\nComplete entities found:")
    for ent in doc.ents:
        covers_all = all(member.ent_type_ == ent.label_ for member in ent)
        print(
            f"Entity: '{ent.text}'",
            f"  Label: {ent.label_}",
            f"  Includes all tokens: {covers_all}",
            "",
            sep="\n",
        )

    # Section 3: tokens without any entity type
    unmatched = [token.text for token in doc if not token.ent_type_]
    if not unmatched:
        return
    print("\nUnmatched tokens:")
    print(", ".join(unmatched))
     

def test_pattern_variations(nlp) -> List[Dict[str, Any]]:
    """
    Tests various pattern matching scenarios to verify entity recognition.
    Includes comprehensive test cases for all entity types and their variations.
    """
    # We organize test cases by category for better clarity and coverage
    test_cases = [
    # === Technical Document Tests ===
    # Positive Cases
    "MCE 0107B",             # Traditional uppercase format
    "mce0107b",             # Complete lowercase
    "MCE-0107B",            # With hyphen uppercase
    "MCE 0107 B",           # Spaced format
    
    # Negative Cases
    "MCE01007B",           # Too many digits
    "MCE107B",             # Too few digits
    "MCEA0107B",           #  Extra character in prefix
    "MCE0107BC",           #  Multiple suffix letters
    "MC-0107B",            # Wrong prefix
    "MCE/0107B",           # Wrong separator
    "MCE_0107_B",          #  Underscore separator
    "MCE.0107.B",          #  Dot separator
    
    # === System Component Tests ===
    # Positive Cases
    "MIDAS",               #Basic system name
    "NMCS2",               #With number
    "MIDAS Gold",          #Multi-word system
    
    # Negative Cases
    "MIDAS3",              # Unexpected number
    "MIDAS-Gold",          #Hyphenated format
    "MIDASGold",           #Run together
    "MIDAS_GOLD",          #  Underscore format
    
    # === Hardware Component Tests ===
    # Positive Cases
    "Cabinet Type 600",     # Standard cabinet
    "Plinth Type 1A",      #  With letter suffix
    "MS3R",                # Signal type
    
    # Negative Cases
    "Cabinet Type ABC",     # : Non-numeric type
    "Cabinet Types 600",    # Wrong word
    "MS5R",                # Wrong number
    "MS3RR",               #Extra letter
    
    # === Communication Component Tests ===
    # Positive Cases
    "RS485",               # No space format
    "RS 485",              # Spaced format
    "Ethernet LAN",        #  Multi-word
    
    # Negative Cases
    "RS-485",              # Hyphenated
    "RS_485",              #  Underscore
    "RS 4855",             #  Wrong number
    "Ethernet_LAN",        #  Wrong separator
    
    # Subsystem Component Tests  
    # Positive Cases
    "Signal Subsystem",     #  Standard format
    "LCC",                  #   Short form
    
    # Negative Cases
    "Signal-Subsystem",     #  Hyphenated
    "SignalSubsystem",      #   Run together
    "LCCC",                 #   Extra character
    
    #  Control Component Tests 
    # Positive Cases
    "Control System",       #  : Basic format
    "CCTV System",         #  With prefix
    
    # Negative Cases
    "Control-System",       #  Hyphenated
    "ControlSystem",        #   Run together
    "CCTV_System",         #    Wrong separator
    
    # === Complex Test Cases ===
    # Positive Cases
    "The MCE0107B document describes the MIDAS Gold system",    # Valid: Multiple valid entities
    
    # Negative Cases
    "The MCE01007B document connects to RS-485",               #  Multiple invalid formats
    "The MIDAS_Gold system uses Cabinet Type ABC",             #   Multiple wrong separators
    "MCE0107BB is connected to RS_485 through CCTV_System"     #   Multiple format errors
]
    
    results = []
    for test_text in test_cases:
        doc = nlp(test_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        results.append({
            'input': test_text,
            'entities': entities,
            'matched': len(entities) > 0,
            'context': test_text
        })
    
    return results
  

def analyze_test_results(results):
    """
    Print a report over the result dicts produced by test_pattern_variations.

    The report lists matches grouped by entity label, matches found in inputs
    of more than three words ("complex context"), a per-label match count, and
    short inputs (three words or fewer) in which nothing matched.
    """
    divider = "-" * 60

    entity_groups = {}      # label -> list of {'text', 'context'}
    context_matches = []    # entities found in inputs longer than three words
    missed_matches = []     # short inputs with no entity at all

    for result in results:
        for entity, label in result['entities']:
            entity_groups.setdefault(label, []).append({
                'text': entity,
                'context': result['context'],
            })

            # More than 3 words indicates complex context
            if len(result['context'].split()) > 3:
                context_matches.append({
                    'entity': entity,
                    'label': label,
                    'context': result['context'],
                })

        # Simple cases that didn't match
        if not result['matched'] and len(result['input'].split()) <= 3:
            missed_matches.append(result['input'])

    labels = sorted(entity_groups)

    print("=== Pattern Matching Analysis ===\n")

    # Matches by entity type
    for label in labels:
        group = entity_groups[label]
        print(f"\n{label} Matches ({len(group)} total):")
        print(divider)
        for match in group:
            print(f"Entity: {match['text']}", f"Context: {match['context']}", "", sep="\n")

    # Matches inside longer sentences
    if context_matches:
        print("\n=== Complex Context Matches ===")
        print(divider)
        for match in context_matches:
            print(
                f"Found {match['entity']} ({match['label']})",
                f"In context: {match['context']}",
                "",
                sep="\n",
            )

    # Per-label statistics
    print("\n=== Match Statistics ===")
    print(divider)
    for label in labels:
        print(f"{label}: {len(entity_groups[label])} matches")

    # Inputs where nothing was recognised
    if missed_matches:
        print("\n=== Potential Missed Matches ===")
        print(divider)
        for text in missed_matches:
            print(f"No entities found in: {text}")

def run_pattern_tests(nlp):
    """
    Run the pattern catalogue against `nlp`, print the analysis, and hand the
    raw result dicts back to the caller.
    """
    print("Showing pattern matching analysis...\n")

    outcome = test_pattern_variations(nlp)
    analyze_test_results(outcome)
    return outcome


# Function to print test results, with type hints for parameter and return value
def print_test_results(results: List[Dict[str, Any]]):
    """
    Print one summary line per test result.

    Each line shows a tick or cross (from result['matched']), the input padded
    to 15 characters, and either the recognised entities as "text (label)"
    pairs or "No match".
    """
    print("Pattern Matching Test Results:")
    print("-" * 50)

    for result in results:
        if result['matched']:
            status = "✓"
        else:
            status = "✗"

        found = result['entities']
        if found:
            entities_str = ', '.join(f"{pair[0]} ({pair[1]})" for pair in found)
        else:
            entities_str = "No match"

        print(f"{status} Input: {result['input']:<15} -> {entities_str}")





In [0]:
# Hand-written sample texts of increasing size that mention the custom entity
# types (document codes, RS485/RS 422, MIDAS Gold, Cabinet Type ..., etc.).

# Three-sentence sample.
small_chunk = """
The MCE0107B document connects to RS485 while MCH 1070B uses RS 422.
MIDAS Gold system interfaces with the Ethernet LAN through Cabinet Type 600.
The Signal Subsystem monitors the CCTV System and AMI-EE devices. (AMI bobo)
"""

# Two-paragraph sample.
medium_chunk = """
The assembly manual for MCH0107B specifies that RS485 components must be configured 
alongside RS422 adapters, with additional references to MCE 0150C outlined in 
Section 4.3 of the document. TR 2043 further details the integration with 
Ethernet LAN systems, enabling high-speed communication protocols compliant 
with IEEE 802.3 standards. The MIDAS Gold system connects through Cabinet Type 600A 
to the Control System, while monitoring occurs via the CCTV System and Signal Subsystem.

The MCE 1070B requirements document describes interfacing with NMCS2 through standard 
protocols. Local connections use RS 485 for primary communication, supported by 
AMI-EE devices and monitored by the Outstation Equipment.
"""

# Multi-section sample laid out like a specification document.
large_chunk = """
Technical Requirements Document: System Integration Specification

1. Overview
The MCE0107B specification, in conjunction with MCH 1070B and TR 2043, defines the 
integration requirements for the MIDAS Gold system. Primary communication occurs through 
RS485 interfaces, while secondary protocols utilize RS 422 and Ethernet LAN connections.

2. Hardware Components
Cabinet Type 600 houses the main control units, with additional Cabinet Type 450A units 
for auxiliary systems. The AMI-EE devices interface with MS3R indicators and standard 
AMI units. Signal sensors and detector units provide environmental monitoring capabilities.

3. System Architecture
The NMCS2 framework integrates with HADECS and HATMS subsystems through standardized 
interfaces. The Signal Subsystem and Message Subsystem handle primary control operations, 
while the Meteorological Subsystem provides environmental data. CECLB and CECEB units 
coordinate with the PDU for power distribution.

4. Communication Infrastructure
Primary TCP/IP networks connect through LAN and WAN interfaces. The Ethernet LAN provides 
local connectivity, supported by RS485 and RS 422 serial connections. Each Control System 
interfaces with its respective Monitoring Unit through dedicated channels.

5. Monitoring and Control
The CCTV System provides visual monitoring capabilities, integrated with the Control System 
and Monitoring Equipment. Outstation Equipment handles remote operations, while the 
Instation Interface manages central control functions.

6. Reference Documentation
MCE 1080B describes the detailed protocols, while TR 2044 and MCH 1075B provide 
supplementary specifications. The Requirements Document and Technical Requirements 
specify additional integration parameters.

7. System Components
Multiple AMI-EE installations connect through Cabinet Type 600B units, monitored by 
the Signal Subsystem. The MIDAS Gold deployment utilizes standard NMCS2 protocols for 
primary operations.
"""
# Lower-case the small and medium samples.
# NOTE(review): large_chunk is left in its original case - confirm this is intentional.
small_chunk= small_chunk.lower()
medium_chunk = medium_chunk.lower()


In [0]:

# List of our custom entity labels; process_text only keeps entities whose label is in this list
custom_labels = ['TECH_DOC', 'SYSTEM_COMPONENT', 'HARDWARE_COMPONENT', 'COMMUNICATION_COMPONENT', 'SUBSYSTEM_COMPONENT', 'CONTROL_COMPONENT', 'SPECIFICATION_TYPE']


def process_text(nlp, text: str):
    """
    Run `nlp` over `text` and collect every entity whose label is in custom_labels.

    All whitespace runs in `text` are collapsed to a single space before the
    pipeline is applied, so the reported character offsets refer to the
    collapsed text.

    Returns a 3-tuple:
    - entities: list of dicts with 'text', 'label', 'original_text', 'start', 'end'
    - entity_type_counts: dict mapping each custom label to its number of mentions
    - entity_frequencies: dict keyed by (text, label) holding 'count', 'text', 'label'
    """
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text)

    entity_type_counts = dict.fromkeys(custom_labels, 0)
    entity_frequencies = {}
    entities = []

    for ent in doc.ents:
        label = ent.label_
        if label not in custom_labels:
            continue

        mention = ent.text
        entities.append({
            'text': mention,
            'label': label,
            'original_text': text[ent.start_char:ent.end_char],
            'start': ent.start_char,
            'end': ent.end_char,
        })

        entity_type_counts[label] += 1

        # Keyed by (text, label) so identical text under different labels stays separate
        tally = entity_frequencies.setdefault(
            (mention, label),
            {'count': 0, 'text': mention, 'label': label},
        )
        tally['count'] += 1

    return entities, entity_type_counts, entity_frequencies

def print_document_results(entities, type_counts, frequencies):
    """
    Print the three outputs of process_text as a readable report:
    - every entity with its text, label and character span
    - the number of mentions per entity type (types with zero mentions are skipped)
    - per-type lists of specific entities, most frequent first
    """
    wide_rule = "-" * 50
    narrow_rule = "-" * 30

    print("\nDocument Processing Results:")
    print(wide_rule)

    # Each entity that was found
    print("Entities Found in Context:")
    for entity in entities:
        print(
            f"Found: {entity['text']} ({entity['label']})",
            f"Original text: '{entity['original_text']}'",
            f"Position: {entity['start']} to {entity['end']}",
            narrow_rule,
            sep="\n",
        )

    # Totals per entity type
    print("\nEntity Type Summary:")
    print(wide_rule)
    for label, count in type_counts.items():
        if count > 0:
            print(f"{label}: {count} total mentions")

    # Frequencies of specific entities, grouped by type
    print("\nDetailed Entity Frequencies:")
    print(wide_rule)

    by_label = {}
    for (_text, label), info in frequencies.items():
        by_label.setdefault(label, []).append(info)

    # Walk the labels in custom_labels order so the report layout is stable
    for label in custom_labels:
        if label not in by_label:
            continue
        print(f"\n{label}:")
        ranked = sorted(by_label[label], key=lambda item: item['count'], reverse=True)
        for item in ranked:
            print(f"  {item['text']}: {item['count']} mentions")

    


In [0]:
# Initialize the NLP pipeline; `nlp` is reused by the cells below
nlp = create_nlp_pipeline()

# Optional one-off pattern test run (left disabled):
#test = run_pattern_tests(nlp)
#print_test_results(test)

# Print the token/entity breakdown for a line mixing several document-code spellings
debug_document_code(nlp, "MCE0107B , MCE2344B x  TR 0543 C more, MCE 234 bro nn, MCE 3342C cap, MCE 3333 B bag, MCE3234 A stir TR 2144 M:3952") 

    
   

In [0]:
 
# Merge the chunks back into one plain string for entity extraction.
# str(chunked_text) produced the list's repr ("['...', '...']"), which puts brackets,
# quotes and escape sequences into the text that the NLP pipeline sees.
# NOTE(review): get_text_chunks asks for a 200-character overlap, so if the text is
# ever split into several chunks the overlapping regions are repeated here and
# their entities counted twice - confirm whether cleaned_text should be used instead.
big_chunk = " ".join(chunked_text)

# Process the document text and print the entity report
entities, counts, freq = process_text(nlp, big_chunk)
print_document_results(entities, counts, freq)

In [0]:

# Reference mention counts per PDF, keyed by the document ID as written
# (manual_countsN pairs with fileN in get_accuracies_for_all_pdfs).
# The same ID can appear in several spellings (e.g. 'TR 2070' and 'TR2070');
# validate_tech_doc_recognition merges those by stripping whitespace and lower-casing.
manual_counts1 = {'TR 1100': 10, 'TR 2070': 8, 'TR 2142': 4, 'TR 2043': 3, 'TR 2067': 4, 'TR 2130': 3, 'TR2070': 3, 'TR 2072': 2, 'TR2139': 1, 'MCE 1349': 3, 'MCE0110': 1, 'MCE0107': 3, 'MCH 1618': 2, 'MCX 0731': 1, 'MCX 0925': 1, 'MCX 1069': 1}


manual_counts2 = {
        'TR 2033': 5,'TR 2043': 14,'TR 1100': 12,'TR 2070': 6,'TR 2130': 5,'TR 2142': 5,'TR 2067': 3,'TR2070': 4,'TR 2072': 2,'TR2139': 1,'TR 1173': 1,'TR 1238': 1,'TR 2110': 1,'MCX1031':17,'MCX0920': 3,'MCX0918': 1,'MCX0733': 1,'MCH 1618': 2,'MCH 1689': 2,'MCH1349': 1,'MCH 1621': 1,'MCE 0110': 1,'MCE0110': 2,'MCE0107': 1,'MCG 1069': 1,
    }

manual_counts3 = {'mce 1157': 2, 'mce 1157 a': 1, 'mce 1157 b': 1, 'mce 1157 c': 1, 'mce 1157 d': 1, 'mce 1157 e': 1, 'tr 1100': 7, 'mcx 0708': 4, 'tr 2130': 4, 'mch 1616': 3, 'mch 1618': 3, 'tr 2199': 3, 'mcg 1107': 2, 'mch 1349': 2, 'tr2199': 1}

manual_counts4 = {'mch 1744': 23, 'tr 2144': 11, 'mcg 1091': 3, 'tr2144': 3, 'tr 2144 m': 7, 'mcg 1069': 3, 'mcg 1092': 2, 'mcg 1093': 1, 'mch 1714': 1, 'mch 1753': 1, 'mch 1748': 1}

manual_counts5 = {'tr 2199': 138, 'tr 2130': 16, 'tr 1100': 14, 'tr 2067': 13, 'tr 2070': 10, 'mcg 1069': 5, 'tr 2516': 5, 'mce 1137': 4, 'mch 1689': 3, 'mcx 0028': 3, 'tr 2195': 2, 'mcx 0071': 2, 'tr 2045': 2, 'mcg 1107': 1, 'mch 1616': 1}

manual_counts6 = {'mcg 1069': 3, 'mce 0110': 3, 'tr 1100': 2, 'tr 2199': 2, 'mce 0107': 2, 'tr 2195': 2, 'mce 2214': 2, 'mch 1616': 2, 'mcg 1202': 1}

manual_counts7 = {'mch 1753': 13, 'tr 2144': 4, 'mch 1744': 1}

manual_counts8 = {'mch 1748': 39, 'tr 2163': 23, 'tr 2133': 5, 'mch 1726': 4, 'tr 2139': 3, 'mch 1689': 3, 'mch 1617': 2, 'mch 1618': 2, 'mch 1655': 2, 'tr 2072': 1, 'mch 1700': 2, 'mch 1759': 1, 'mch1619': 1, 'mch 1124': 1}

manual_counts9 = {'mch 1748': 38, 'mch 1689': 14, 'mce 2103': 4, 'mch 1700': 4, 'mch 1616': 3, 'tr 2072': 3, 'tr 2133': 2, 'mch 1798': 1, 'mch 1619': 1}

manual_counts10 = {'mch2624': 4, 'mch1689': 2, 'mch 2629': 1}




In [0]:
# Second set of reference mention counts per PDF (copilot_countN pairs with fileN
# in get_accuracies_for_all_pdfs). There is no copilot_count5 in this notebook.
copilot_count1 = {'MCE0110': 1,'MCH1618': 2,'MCE0107': 3,'TR2043': 3,'TR2067': 4,'TR2070': 11,'TR2072': 2,'TR1100': 9,'MCE1349': 3,'TR2130': 3,'MCX0731': 1,'MCX0925': 1,'TR2033': 2,'MCX0910': 1,'TR2142': 4,'MCG1069': 1,'TR2139': 1
}

copilot_count2 = {'MCE0107': 1, 'MCE0110': 3, 'MCH1618': 2, 'MCX1031': 15, 'TR2043': 13, 'MCH1689': 2, 'TR2070': 9, 'MCX0920': 3, 'TR1100': 12, 'MCH1349': 1, 'TR2130': 4, 'TR2067': 3, 'MCX0918': 1, 'TR2033': 4, 'MCX0733': 1, 'TR2142': 5, 'MCG1069': 1, 'TR2072': 2, 'TR2139': 1, 'MCH1621': 1, 'TR1173': 1, 'TR1238': 1, 'TR2110': 1, 'MCE 0107 B': 1, 'MCX 1031': 2}

copilot_count3 = {'MCE1157': 7, 'MCH1616': 3, 'MCH1618': 3, 'TR2199': 4, 'MCG1107': 2, 'TR1100': 7, 'MCX0708': 4, 'MCH1349': 2, 'TR2130': 4}

copilot_count4 = {'MCG1093': 1, 'TR2144': 14, 'MCH1744': 23, 'MCG1069': 3, 'MCG1091': 3, 'MCG1092': 2, 'MCH1714': 1, 'MCH1753': 1, 'MCH1748': 1}

copilot_count6 = {'MCG1202': 1, 'TR1100': 2, 'MCG1069': 3, 'TR2199': 2, 'MCE0107': 2, 'TR2195': 2, 'MCE2214': 2, 'MCE0110': 3, 'MCH1616': 2}

copilot_count7 = {'MCH1744': 1, 'TR2144': 4, 'MCH1753': 11}

copilot_count8= {'MCH1759': 1, 'MCH1726': 4, 'MCH1617': 2, 'MCH1618': 2, 'MCH1655': 2, 'TR2139': 3, 'TR2163': 23, 'MCH1748': 39, 'MCH1689': 3, 'TR2133': 5, 'MCH1700': 2, 'TR2072': 1, 'MCH1619': 1, 'MCH1124': 1}

copilot_count9 = {'MCH1798': 1, 'MCH1616': 3, 'MCH1748': 38, 'MCE2103': 4, 'MCH1689': 14, 'TR2072': 3, 'MCH1700': 4, 'TR2133': 1, 'MCH1619': 1}

copilot_count10 = {'MCH2629': 1, 'MCH2624': 4, 'MCH1689': 2}


In [0]:
def get_accuracies_for_all_pdfs(nlp, reference="manual"):
    """Score TECH_DOC recognition on every PDF against a set of reference counts.

    For each PDF the file is read from blob storage, converted to text, cleaned
    and passed to validate_tech_doc_recognition together with its reference
    counts. One accuracy line is printed per PDF.

    Args:
        nlp: spaCy pipeline, passed through to validate_tech_doc_recognition.
        reference: Which reference counts to score against: "manual" (default,
            the previous behaviour) or "copilot". The copilot set has no entry
            for file5 / MCG1107B.

    Returns:
        Dict mapping PDF name (e.g. "MCE0110B") to its overall accuracy in percent.

    Raises:
        ValueError: If `reference` is neither "manual" nor "copilot".
    """
    # (path, reference counts, display name) for each PDF
    files_man = [
        (file1, manual_counts1, "MCE0110B"),
        (file2, manual_counts2, "MCE0107B"),
        (file3, manual_counts3, "MCE1157E"),
        (file4, manual_counts4, "MCG1093J"),
        (file5, manual_counts5, "MCG1107B"),
        (file6, manual_counts6, "MCG1202A"),
        (file7, manual_counts7, "MCH1744H"),
        (file8, manual_counts8, "MCH1759F"),
        (file9, manual_counts9, "MCH1798H"),
        (file10, manual_counts10, "MCH2629A")
    ]

    files_copilot = [
        (file1, copilot_count1, "MCE0110B"),
        (file2, copilot_count2, "MCE0107B"),
        (file3, copilot_count3, "MCE1157E"),
        (file4, copilot_count4, "MCG1093J"),
        (file6, copilot_count6, "MCG1202A"),
        (file7, copilot_count7, "MCH1744H"),
        (file8, copilot_count8, "MCH1759F"),
        (file9, copilot_count9, "MCH1798H"),
        (file10, copilot_count10, "MCH2629A")
    ]

    reference_sets = {"manual": files_man, "copilot": files_copilot}
    if reference not in reference_sets:
        raise ValueError(
            f"reference must be one of {sorted(reference_sets)}, got {reference!r}"
        )

    pdf_accuracies = {}
    for file_path, expected_counts, pdf_name in reference_sets[reference]:
        pdf_df = spark.read.format("binaryFile").load(file_path).cache()
        try:
            binary_pdf = pdf_df.select("content").collect()[0]["content"]

            # Validation runs on the whole cleaned text; chunking is not needed
            # here (its result was previously computed and then discarded).
            cleaned_text = clean_text(extract_text(binary_pdf))

            _, accuracy = validate_tech_doc_recognition(nlp, cleaned_text, expected_counts)
        finally:
            # Release the cached DataFrame even when extraction or validation fails.
            pdf_df.unpersist()

        pdf_accuracies[pdf_name] = accuracy
        print(f"{pdf_name} Accuracy: {accuracy:.1f}%")

    return pdf_accuracies

In [0]:
from prettytable import PrettyTable
from typing import Dict, List, Tuple
from builtins import min  # Explicitly import the built-in min function


def normalize_tech_doc_id(doc_id: str) -> str:
    """
    Return the canonical form of a technical document ID: all whitespace
    removed and every letter lower-cased.
    Example: 'MCE 0107 B' -> 'mce0107b'
    """
    pieces = doc_id.split()
    compact = ''.join(pieces)
    return compact.lower()

def validate_tech_doc_recognition(nlp, text: str, manual_counts: Dict[str, int]) -> Tuple["PrettyTable", float]:
    """
    Compare the pipeline's TECH_DOC detections in `text` with reference counts.

    Document IDs are compared in normalized form (normalize_tech_doc_id), so
    different spellings of the same ID are treated as one document.

    Args:
        nlp: spaCy pipeline whose entities labelled "TECH_DOC" are counted.
        text: Text to analyse.
        manual_counts: Reference mention count per document ID, in any spelling.

    Returns:
        (table, overall_accuracy). The table has one row per reference ID,
        then one row per ID that only the model found (manual count 0), then a
        TOTAL row. overall_accuracy is the percentage of reference mentions
        matched: sum(min(model, manual)) / sum(manual) * 100, or 0 when there
        are no reference mentions. Detections beyond the reference count do
        not lower this figure; they show up in the "Difference" column.
    """
    doc = nlp(text)

    # Merge the reference counts per normalized ID and remember each spelling seen.
    normalized_manual_counts = {}
    variations_map = {}  # normalized ID -> set of spellings (reference and model)
    for original_id, count in manual_counts.items():
        normalized_id = normalize_tech_doc_id(original_id)
        normalized_manual_counts[normalized_id] = normalized_manual_counts.get(normalized_id, 0) + count
        variations_map.setdefault(normalized_id, set()).add(original_id)

    # Count the model's TECH_DOC entities per normalized ID.
    predicted_counts = {}
    for ent in doc.ents:
        if ent.label_ != "TECH_DOC":
            continue
        normalized_ent = normalize_tech_doc_id(ent.text)
        predicted_counts[normalized_ent] = predicted_counts.get(normalized_ent, 0) + 1
        # Track the spelling for every detected ID, including IDs that are
        # absent from the reference counts, so those rows can be displayed too.
        variations_map.setdefault(normalized_ent, set()).add(ent.text)

    table = PrettyTable()
    table.field_names = [
        "Technical Document",
        "Variations Found",
        "Manual Count",
        "Model Count",
        "Difference",
        "Accuracy %"
    ]
    table.align = "l"

    total_manual = 0
    total_predicted = 0
    total_correct = 0

    # Reference IDs first (in the order given), then IDs only the model found.
    # The second group was previously dropped, which hid false positives and
    # under-reported the model total.
    model_only_ids = sorted(set(predicted_counts) - set(normalized_manual_counts))
    for normalized_id in list(normalized_manual_counts) + model_only_ids:
        manual_count = normalized_manual_counts.get(normalized_id, 0)
        predicted_count = predicted_counts.get(normalized_id, 0)

        variations = sorted(variations_map[normalized_id])
        correct = min(predicted_count, manual_count)
        # Model-only rows have no reference mentions; they are reported as 0%.
        accuracy = correct / manual_count * 100 if manual_count > 0 else 0

        total_manual += manual_count
        total_predicted += predicted_count
        total_correct += correct

        table.add_row([
            variations[0],  # first spelling in sort order is used as the display ID
            ", ".join(variations),
            manual_count,
            predicted_count,
            predicted_count - manual_count,
            f"{accuracy:.1f}%"
        ])

    overall_accuracy = (total_correct / total_manual * 100) if total_manual > 0 else 0

    table.add_row([
        "TOTAL",
        "",
        total_manual,
        total_predicted,
        total_predicted - total_manual,
        f"{overall_accuracy:.1f}%"
    ])

    return table, overall_accuracy

# Run validation of the document loaded earlier (file4 = MCG1093J.pdf) against
# its reference counts (manual_counts4) and show the comparison table
results_table, overall_accuracy = validate_tech_doc_recognition(nlp, big_chunk, manual_counts4)
print(results_table)

In [0]:
def plot_accuracies(accuracies):
    """Draw a bar chart of recognition accuracy per PDF with an average line.

    Args:
        accuracies: Dict mapping PDF name to accuracy in percent, e.g. the
            return value of get_accuracies_for_all_pdfs.

    The chart is shown with plt.show(); nothing is returned. With an empty
    dict the (empty) chart is still shown and the average line is skipped.
    """
    plt.figure(figsize=(12, 6))

    pdfs = list(accuracies.keys())
    acc_values = list(accuracies.values())

    bars = plt.bar(pdfs, acc_values, color='skyblue')

    plt.title('Technical Document Recognition Accuracy Across PDFs', pad=20)
    plt.xlabel('PDF Documents')
    plt.ylabel('Accuracy (%)')

    # Value label on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1f}%',
                ha='center', va='bottom')

    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.ylim(0, 100)

    # Average line. Computed with plain Python: the notebook never imports
    # numpy, so the former np.mean call raised NameError.
    if acc_values:
        avg_accuracy = sum(acc_values) / len(acc_values)
        plt.axhline(y=avg_accuracy, color='r', linestyle='--', alpha=0.8)
        plt.text(len(pdfs)-1, avg_accuracy, f'Average: {avg_accuracy:.1f}%',
                 va='bottom', ha='right', color='r')

    plt.tight_layout()
    plt.show()

In [0]:
import matplotlib.pyplot as plt
def main():
    """
    End-to-end run: build the pipeline, validate the currently loaded document
    (big_chunk against manual_counts4) and print the table, then score every
    PDF and plot the per-PDF accuracies.
    """
    nlp = create_nlp_pipeline()

    # Validation of the single document already loaded in the notebook
    results_table, overall_accuracy = validate_tech_doc_recognition(nlp, big_chunk, manual_counts4)
    print(
        "\nTechnical Document Recognition Validation",
        "=" * 80,
        results_table,
        f"\nOverall Model Accuracy: {overall_accuracy:.1f}%",
        sep="\n",
    )

    # Accuracy for every PDF, then the bar chart
    plot_accuracies(get_accuracies_for_all_pdfs(nlp))
    


# Run the full validation and plotting flow when this code is executed as the main module.
if __name__ == "__main__":
    main()