# LER and AR evaluation

### Library imports, read env variables, define constants

In [None]:
%pip install -r requirements.txt

In [None]:
import os
import time
import requests
import pandas as pd
import json
from collections import defaultdict
import openai

from azure.storage.blob import BlobServiceClient
from azure.ai.documentintelligence.models import ParagraphRole
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (AnalyzeResult, AnalyzeDocumentRequest, DocumentFieldType,DocumentSelectionMarkState)

from dotenv import load_dotenv
load_dotenv()

# --- Azure Document Intelligence settings (read from the environment / .env file) ---
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
client_secret = os.getenv("AZURE_CLIENT_SECRET")

# --- Azure OpenAI settings ---
openai_endpoint = os.getenv("AZURE_OPENAI_SERVICE_URI")
openai_key = os.getenv("AZURE_OPENAI_SERVICE_KEY")
openai_deployment = "gpt-4o"
openai_version = "2024-02-15-preview"

# --- Azure Blob Storage settings ---
container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

# --- Local working directories, created up front if they do not exist yet ---
ler_pdf_directory = "ler_pdfs"
processed_ler_directory = "processed_LERs"
incident_dir = "incident_reports"
output_dir = "outputs"
for _working_dir in (ler_pdf_directory, processed_ler_directory, incident_dir, output_dir):
    os.makedirs(_working_dir, exist_ok=True)


# Lower and upper bound, in seconds, for the pause between requests
# (the pause exists to avoid rate limits).
MIN_SLEEP_TIME = 5
MAX_SLEEP_TIME = 100

### Download all LER pdfs from NRC ADAMS

In [None]:
def downloadLERpdfs(url_list_filepath, download_folder="ler_pdfs", timeout=60):
    """Download every PDF listed in a URL file into a local folder.

    The URL file is read line by line; blank lines are ignored. A URL whose
    last path component does not end in ".pdf" is skipped, as is any PDF that
    already exists in ``download_folder``. After each attempted download the
    function sleeps; the pause grows after a failure (capped at
    MAX_SLEEP_TIME) and shrinks back towards MIN_SLEEP_TIME after a success.

    Args:
        url_list_filepath: Path of a text file with one URL per line.
        download_folder: Directory that receives the PDFs (created if needed).
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the run indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

    with open(url_list_filepath, "r") as file:
        urls = [line.strip() for line in file if line.strip()]

    os.makedirs(download_folder, exist_ok=True)
    sleeptime = MIN_SLEEP_TIME

    for url in urls:
        pdfname = url.split("/")[-1]
        if not pdfname.endswith(".pdf"):
            print(f"Skipping non-PDF URL: {url}")
            continue

        filepath = os.path.join(download_folder, pdfname)
        if os.path.exists(filepath):
            print(f"File already exists - skipping: {pdfname}")
            continue

        # Stream into a temporary ".part" file and rename only on success.
        # Otherwise an interrupted download would leave a truncated PDF that
        # the "already exists" check above would skip on every later run.
        partial_path = filepath + ".part"
        try:
            with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as pdf_file:
                    for chunk in response.iter_content(chunk_size=8192):
                        pdf_file.write(chunk)
            os.replace(partial_path, filepath)
            print(f"Downloaded: {pdfname}")
            sleeptime = max(MIN_SLEEP_TIME, sleeptime - 1)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)
            # Back off after a failure, up to the configured ceiling.
            sleeptime = min(sleeptime + MIN_SLEEP_TIME, MAX_SLEEP_TIME)

        time.sleep(sleeptime)
    

### Uncomment the following line to download the LER PDFs from the provided URLs.
### Otherwise, the PDFs should be fetched from the Azure Blob Storage container.
#downloadLERpdfs("inputs/ler_urls.txt")

### Upload to Azure

In [None]:
def uploadToAzure(local_dir):
    """Upload every regular file directly inside ``local_dir`` to blob storage.

    Uses the module-level ``connection_string`` and ``container_name``.
    Each blob is named after the local path (``local_dir`` joined with the
    file name) and overwrites any existing blob of the same name.

    Args:
        local_dir: Directory whose files are uploaded (not recursive).

    Returns:
        None. One line is printed per uploaded or skipped entry.
    """
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    for fln in os.listdir(local_dir):
        local_file_path = os.path.join(local_dir, fln)
        # os.listdir also returns sub-directories; opening one with "rb"
        # raises and would abort the remaining uploads.
        if not os.path.isfile(local_file_path):
            print(f"Skipping non-file entry: {fln}")
            continue
        with open(local_file_path, "rb") as form:
            container_client.upload_blob(name=local_file_path, data=form, overwrite=True)
        print(f"Uploaded: {fln}")

# Uncomment the following line to upload LER PDFs to Azure Blob Storage
#uploadToAzure(ler_pdf_directory) 

### Read the LERs using Document Intelligence

In [None]:

# Title text of an LER continuation sheet. isLERContinutationSection compares
# the content of a section's first TITLE paragraph against this exact string.
LER_CONTINUATION_TITLE = "LICENSEE EVENT REPORT (LER) CONTINUATION SHEET"

# Form boilerplate that processContinuationSections leaves out of the
# narrative: a paragraph is dropped when its full content is exactly equal to
# one of these strings (plain set membership, no normalization).
# NOTE(review): the parenthesized entries rely on implicit string-literal
# concatenation, which inserts no separator - e.g. "...completing this form"
# is joined directly to "http://...". Confirm this matches the paragraph text
# produced by the extraction, otherwise that entry never matches.
EXCLUDED_PARAGRAPH_CONTENT = {
    'LICENSEE EVENT REPORT (LER) CONTINUATION SHEET',
    'NARRATIVE',
    'NRC FORM 366A (04-02-2024)',
    (
        '(See NUREG-1022, R.3 for instruction and guidance for completing this form'
        'http://www.nrc.gov/reading-rm/doc-collections/nuregs/staff/sr1022/r3/)'
    ),
    'APPROVED BY OMB: NO. 3150-0104 EXPIRES: 04/30/2027',
    (
        'Estimated burden per response to comply with this mandatory collection request: 80 hours. Reported lessons '
        'learned are incorporated into the licensing process and fed back to industry. Send comments regarding burden '
        'estimate to the FOIA, Library, and Information Collections Branch (T-6 A10M), U. S. Nuclear Regulatory '
        'Commission, Washington, DC 20555-0001, or by email to Infocollects.Resource@nrc.gov, and the OMB reviewer at: '
        'OMB Office of Information and Regulatory Affairs, (3150-0104), Attn: Desk Officer for the Nuclear Regulatory '
        'Commission, 725 17th Street NW, Washington, DC 20503. The NRC may not conduct or sponsor, and a person is not '
        'required to respond to, a collection of information unless the document requesting or requiring the collection'
        ' displays a currently valid OMB control number.'
    )
}


def isLERContinutationSection(section, analyzed_result):
    """Tell whether a section opens with the LER continuation-sheet title.

    Args:
        section: Object whose ``elements`` is a list of reference strings that
            split on '/' into three parts; the second part is the element
            kind (e.g. 'paragraphs') and the third its index.
        analyzed_result: Object whose ``paragraphs`` list the index refers to.

    Returns:
        True only when the first element is a paragraph with the TITLE role
        whose content equals LER_CONTINUATION_TITLE; False otherwise,
        including for a section without any elements.
    """
    # A section with no elements cannot start with the title.
    if not section.elements:
        return False

    _, first_element_kind, index = section.elements[0].split('/')
    if first_element_kind != 'paragraphs':
        return False

    first_paragraph = analyzed_result.paragraphs[int(index)]
    return (
        first_paragraph.role == ParagraphRole.TITLE
        and first_paragraph.content == LER_CONTINUATION_TITLE
    )


def processContinuationSections(section_index, analyzed_result, narrative_paragraphs):
    """Collect the narrative paragraphs of a section and of its sub-sections.

    Walks the elements of ``analyzed_result.sections[section_index]`` in
    order. The content of each referenced paragraph is appended to
    ``narrative_paragraphs`` unless it is listed in
    EXCLUDED_PARAGRAPH_CONTENT. Nested section references are processed
    recursively; any other element kind is ignored.

    Args:
        section_index: Index into ``analyzed_result.sections``.
        analyzed_result: Object exposing ``sections`` and ``paragraphs``.
        narrative_paragraphs: List extended in place with paragraph text.
    """
    for reference in analyzed_result.sections[section_index].elements:
        _, element_kind, element_index = reference.split('/')

        if element_kind == 'sections':
            processContinuationSections(int(element_index), analyzed_result, narrative_paragraphs)
            continue
        if element_kind != 'paragraphs':
            continue

        text = analyzed_result.paragraphs[int(element_index)].content
        # Boilerplate of the form is not part of the narrative.
        if text not in EXCLUDED_PARAGRAPH_CONTENT:
            narrative_paragraphs.append(text)


def processRootSection(analyzed_result, narrative_paragraphs):
    """Collect narrative text from every LER continuation section.

    Sections are organized as a tree: ``analyzed_result.sections[0]`` is the
    root and its elements reference the separate top-level sections. Only
    children recognized by isLERContinutationSection are processed, since
    those hold the narrative information.

    Args:
        analyzed_result: Object exposing ``sections`` and ``paragraphs``.
        narrative_paragraphs: List extended in place with paragraph text.
    """
    if not analyzed_result.sections:
        return

    section_tree_root = analyzed_result.sections[0]
    for element in section_tree_root.elements or []:
        _, kind, index = element.split('/')
        # Only section references index into ``sections``. Using the index of
        # another element kind here would select an unrelated section.
        if kind != 'sections':
            continue
        child_index = int(index)
        child_section = analyzed_result.sections[child_index]
        if isLERContinutationSection(child_section, analyzed_result):
            processContinuationSections(child_index, analyzed_result, narrative_paragraphs)


In [None]:
def _required_field_content(document, field_name):
    """Return the ``content`` of an extracted field, failing clearly if it is absent."""
    field = document.fields.get(field_name)
    if field is None:
        raise ValueError(f"Required field '{field_name}' was not extracted from the document")
    return field.content


def analyze_layout(pdf_path, model_id="custom-ler-2025-03-26"):
    """Analyze one LER PDF with Document Intelligence and return its key data.

    The PDF bytes are sent to the model ``model_id``. The narrative is built
    from the continuation sections of the result; the remaining values come
    from the extracted fields of the single returned document.

    Args:
        pdf_path: Path of the PDF file to analyze.
        model_id: Model used for the analysis. The returned document's
            ``doc_type`` is expected to equal this value.

    Returns:
        A dict with the keys doc_name, ler_number, report_date, event_date,
        facility_name, title, cfr_requirements, abstract and narrative, or
        None when the result does not hold exactly one document or the
        document has a different ``doc_type``.

    Raises:
        ValueError: If one of the required fields was not extracted.
    """
    document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    with open(pdf_path, "rb") as f:
        file_bytes = f.read()

    poller = document_intelligence_client.begin_analyze_document(
        model_id, AnalyzeDocumentRequest(bytes_source=file_bytes))
    result: AnalyzeResult = poller.result()

    # ``documents`` may be absent altogether; treat that like an empty list.
    documents = result.documents or []
    if len(documents) != 1:
        print(f"Expected 1 document, but got {len(documents)}")
        return None

    narrative_paragraphs = []
    processRootSection(result, narrative_paragraphs)

    document = documents[0]
    if document.doc_type != model_id:
        return None

    event_year = _required_field_content(document, "Event Date Year")
    event_month = _required_field_content(document, "Event Date Month")
    event_day = _required_field_content(document, "Event Date Day")
    event_datetime = f"{event_year}-{event_month}-{event_day}T00:00:00Z"

    report_year = _required_field_content(document, "Report Date Year")
    report_day = _required_field_content(document, "Report Date Day")
    report_month = _required_field_content(document, "Report Date Month")
    report_datetime = f"{report_year}-{report_month}-{report_day}T00:00:00Z"

    ler_year = _required_field_content(document, "LER Number Year")
    ler_seq_no = _required_field_content(document, "LER Number Seq No")
    ler_rev_no = _required_field_content(document, "LER Number Rev No")
    ler_number = f"{ler_year}-{ler_seq_no}-{ler_rev_no}"

    # Every selection-mark field that is ticked names a reported requirement.
    cfr_requirements = [
        name
        for name, field in document.fields.items()
        if field.type == DocumentFieldType.SELECTION_MARK
        and field.value_selection_mark == DocumentSelectionMarkState.SELECTED
    ]

    document_data = {
        "doc_name": f"{pdf_path}",
        "ler_number": f"{ler_number}",
        "report_date": report_datetime,
        "event_date": event_datetime,
        "facility_name": _required_field_content(document, "Facility Name"),
        "title": _required_field_content(document, "Title"),
        "cfr_requirements": cfr_requirements,
        "abstract": _required_field_content(document, "Abstract"),
        "narrative": '\n'.join(narrative_paragraphs)
    }

    print(json.dumps(document_data, indent=4))
    return document_data

In [None]:
# File names of LERs that failed processing. The processing cell appends to
# this list and skips anything already in it, so re-running that cell does
# not retry the problematic LERs.
issue_LERs = []

### Process LERs into text files

In [None]:
# Run every downloaded LER PDF through analyze_layout and store the result as
# JSON text in processed_ler_directory. PDFs that already have an output file,
# or that failed in an earlier run of this cell, are skipped.
original_LERs = [f for f in os.listdir(ler_pdf_directory) if f.endswith('.pdf')]

for fln in original_LERs:
    pdf_path = os.path.join(ler_pdf_directory, fln)
    # splitext removes only the final extension. Splitting on the first '.'
    # would truncate names containing extra dots and could make two PDFs
    # share one output file.
    form_name = os.path.splitext(fln)[0]
    output_path = os.path.join(processed_ler_directory, f"{form_name}.txt")

    if os.path.exists(output_path):
        print(f"File already exists - skipping: {form_name}")
        continue
    if fln in issue_LERs:
        continue

    try:
        output = analyze_layout(pdf_path)
    except Exception as e:
        # Deliberately broad: one bad PDF must not stop the whole batch.
        print(f"Error processing {fln}: {e}")
        issue_LERs.append(fln)
        continue

    if not output:
        print(f"Failed to process {fln}")
        issue_LERs.append(fln)
        continue

    with open(output_path, 'w') as f:
        json.dump(output, f, indent=4)

print("DONE")

### LERs with issues

In [None]:
# Report how many LERs could not be processed, then one file name per line.
print(f"Issue LERs:{len(issue_LERs)}")
if issue_LERs:
    print("\n".join(issue_LERs))

### Upload LER contents to Azure

In [None]:
# Uncomment the following line to upload processed LER txt contents to Azure Blob Storage
#uploadToAzure(processed_ler_directory) 

### EDA on LERs

In [None]:
# Tallies gathered over all processed LER files.
noabstract = 0                    # LERs whose abstract is empty
nonarrative = 0                   # LERs whose narrative is empty
counts = defaultdict(int)         # CFR requirement -> number of LERs listing it
mult_counts = defaultdict(int)    # number of requirements in an LER -> number of LERs
processed_files = [name for name in os.listdir(processed_ler_directory) if name.endswith('.txt')]

for fln in processed_files:
    ler_path = os.path.join(processed_ler_directory, fln)
    with open(ler_path, 'r') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {fln}: {e}")
            continue

    requirements = data['cfr_requirements']
    mult_counts[len(requirements)] += 1
    for cfr in requirements:
        counts[cfr] += 1

    noabstract += 0 if data["abstract"] else 1
    nonarrative += 0 if data["narrative"] else 1


print("Total LERs Processed: ", len(processed_files))

print("Box Counts:")
for box_total, ler_total in mult_counts.items():
    print(f"{box_total}: {ler_total}")


print("\n Section Counts:")
for requirement, ler_total in counts.items():
    print(f"{requirement}: {ler_total}")

print(f"\nNo Abstract Count: {noabstract}")
print(f"No Narrative Count: {nonarrative}")

### Read the structured NUREG JSON output

In [None]:
# Load the structured NUREG output and list the 'sub_section_5073' value of
# each entry.
with open("inputs/structured_output.json", 'r') as f:
    structured_output = json.load(f)

print(f"Structure Output Entries:{len(structured_output)}\n\n\n")
for x in structured_output:
    # The cells that consume structured_output skip falsy entries, so the
    # list may contain them; indexing one here would raise.
    if not x:
        continue
    print(x['sub_section_5073'])

### Common sections between the structured NUREG JSON file and the CFR paragraphs reported in the LERs

In [None]:
# Gather the distinct 'sub_section_5073' references of all non-empty entries.
# References longer than 7 characters are kept, with their first two
# characters removed.
subsection_lists = [entry['sub_section_5073'] for entry in structured_output if entry]
flattened_refs = [ref for refs in subsection_lists if refs for ref in refs]
structured_sections = list(set(flattened_refs))
structured_sections = [ref[2:] for ref in structured_sections if len(ref) > 7]
print(structured_sections)

# CFR requirements seen in the processed LERs, in sorted order.
ler_sections = sorted(counts)

for section in ler_sections:
    if section in structured_sections:
        print(f"Section {section} found in structured sections.")
    else:
        print(f"Section {section} not found in structured sections.")

### Read the LLM prompts

In [None]:
# System prompt used when cleaning an LER narrative.
with open("inputs/remove_references.txt", "r") as file:
    remove_refs_prompt = file.read()

# Prompt template that is filled in per section via str.format.
with open("inputs/specific_reportability.txt", "r") as file:
    specific_prompt = file.read()

### Test the LLM

In [None]:
# Create the Azure OpenAI client and send one trivial chat request to verify
# that the endpoint, key and deployment work.
client = openai.AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_version,
)

smoke_test_messages = [
    {"role": "system", "content": "You are a scientist"},
    {"role": "user", "content": "Tell me a short science joke"},
]
response = client.chat.completions.create(
    model=openai_deployment,
    messages=smoke_test_messages,
    temperature=0.0,
)
print(response.choices[0].message.content)

### Evaluate LERs for each of the sections

In [None]:
# Only the sections common to the LERs and the structured JSON are checked
# for now.
TARGET_SUBSECTIONS = ("50.73(a)(2)(iii)", "50.73(a)(2)(vii)", "50.73(a)(2)(x)")

processed_files = [f for f in os.listdir(processed_ler_directory) if f.endswith('.txt')]
processed_files = processed_files[:10]  # TODO REMOVE AFTER TESTING

ler_vector = list()
subsection_vector = list()
reported_vector = list()
llm_reportability = list()
llm_rationale = list()

# Read and filter the LER files once, instead of once per section.
eligible_lers = []
for fln in processed_files:
    with open(os.path.join(processed_ler_directory, fln), 'r') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {fln}: {e}")
            continue

    # Only test LERs that report a single CFR requirement for now.
    if len(data['cfr_requirements']) != 1:
        continue
    if data["cfr_requirements"][0] not in TARGET_SUBSECTIONS:
        continue
    if not data["abstract"] or not data["narrative"]:
        continue
    eligible_lers.append((fln, data))

# Cleaned narrative per file. It does not depend on the section, so it is
# requested from the LLM once per file and reused for every section.
clean_narratives = {}

for section in structured_output:
    if not section:
        continue

    title = section['Title']
    paragraph = section['Section']
    subsection = section['sub_section_5073'][0] if section['sub_section_5073'] else None
    description = section['Description']
    discussion = section['Discussion']
    examples = section['Examples']

    if not title or not paragraph or not subsection or not description or not discussion or not examples:
        continue

    subsection = subsection[2:]   # Remove the paragraph prefix for easier matching

    if subsection not in TARGET_SUBSECTIONS:
        continue

    section_prompt = specific_prompt.format(section_description=description, section_discussion=discussion, section_examples=examples)

    for fln, data in eligible_lers:
        print(f"Processing file:{fln}  LER Number:{data['ler_number']} subsection:{subsection}")

        if fln not in clean_narratives:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": remove_refs_prompt},
                    {"role": "user", "content": data["narrative"]},
                ],
                temperature=0.0,
                model=openai_deployment
            )
            clean_narratives[fln] = response.choices[0].message.content

        clean_narrative = clean_narratives[fln]
        if not clean_narrative:
            continue

        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": section_prompt},
                {"role": "user", "content": clean_narrative},
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
            model=openai_deployment
        )

        reportability = response.choices[0].message.content
        try:
            report = json.loads(reportability)
        except (TypeError, json.JSONDecodeError) as e:
            # Keep the run (and the results gathered so far) alive; the row
            # is recorded with the same "N/A" placeholders used for missing keys.
            print(f"Could not parse LLM response for {fln} ({subsection}): {e}")
            report = {}

        ler_vector.append(data['ler_number'])
        subsection_vector.append(subsection)
        reported_vector.append(data["cfr_requirements"][0])
        llm_reportability.append(report.get("answer", "N/A"))
        llm_rationale.append(report.get("rationale", "N/A"))

        time.sleep(MIN_SLEEP_TIME)  # Sleep to avoid hitting rate limits

print("DONE")

df = pd.DataFrame({"LER_NUMBER": ler_vector, "SECTION": subsection_vector, "REPORTED_SECTION": reported_vector, "LLM_ANSWER": llm_reportability, "LLM_RATIONALE": llm_rationale})

# A row is correct when the LLM says YES for the section the LER was reported
# under, or NO for any other section.
section_matches = df["SECTION"] == df["REPORTED_SECTION"]
df["Correct"] = (
    (section_matches & (df["LLM_ANSWER"] == "YES"))
    | (~section_matches & (df["LLM_ANSWER"] == "NO"))
)
df.to_csv(os.path.join(output_dir, "ler_reportability.csv"), index=False)

# Per-section summary: 'Count' is the number of evaluated rows (as before);
# 'Correct' and 'Accuracy' add how many of them were right.
dg = (
    df.groupby(['SECTION'])['Correct']
    .agg(Count='count', Correct='sum', Accuracy='mean')
    .reset_index()
)
dg.to_csv(os.path.join(output_dir, "ler_reportability_eval.csv"), index=False)


In [None]:
# Evaluate every AR (rows with AR_NUMBER and CONTENT columns) against each
# structured section that has exactly one 'sub_section_5073' reference.
# NOTE(review): this currently reads the test table - point it at the full
# table for a real run.
ar_table = pd.read_csv("inputs/irtable_test.csv")
ars = list()
subsections = list()
rv = list()
tv = list()

for section in structured_output:
    # structured_output may contain empty entries (other cells skip them too).
    if not section or not section['sub_section_5073']:
        continue
    if len(section['sub_section_5073']) != 1:
        continue
    subsection = section['sub_section_5073'][0]

    section_prompt = specific_prompt.format(section_description=section['Description'],
                                            section_discussion=section['Discussion'],
                                            section_examples=section['Examples'])

    for _, row in ar_table.iterrows():
        ar_number = row["AR_NUMBER"]
        content = row["CONTENT"]
        # pandas reads an empty cell as NaN, which is truthy, so a plain
        # "if not content" would let it through to the LLM.
        if pd.isna(content) or not str(content).strip():
            continue
        print(f"Processing AR: {ar_number}")

        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": section_prompt},
                {"role": "user", "content": str(content)}
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
            model=openai_deployment
        )

        reportability = response.choices[0].message.content
        try:
            report = json.loads(reportability)
        except (TypeError, json.JSONDecodeError) as e:
            # Do not lose the results gathered so far over one bad reply.
            print(f"Could not parse LLM response for AR {ar_number} ({subsection}): {e}")
            report = {}

        ars.append(ar_number)
        subsections.append(subsection)
        rv.append(report.get("answer", "N/A"))
        tv.append(report.get("rationale", "N/A"))

        if reportability is not None:
            with open(os.path.join(incident_dir, f"{ar_number}_{subsection}_reportability.txt"), 'w') as report_file:
                report_file.write(reportability)

        time.sleep(MIN_SLEEP_TIME)  # Sleep to avoid hitting rate limits

df = pd.DataFrame({"AR_NUMBER": ars, "REPORTED_SECTION": subsections, "REPORTABILITY": rv, "REASONING": tv})
dg = df.groupby(['REPORTED_SECTION', 'REPORTABILITY']).size().reset_index(name='count')
# No leading "/" in the file name: os.path.join discards output_dir when a
# later component is absolute, which would target the filesystem root.
df.to_csv(os.path.join(output_dir, "incident_reportability.csv"), index=False)
dg.to_csv(os.path.join(output_dir, "incident_reportability_grouped.csv"), index=False)