In [None]:
%pip install python-dotenv
%pip install -r requirements.txt

In [None]:
from io import StringIO
import requests
import os
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
import json
from urllib.parse import quote
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentFieldType, DocumentSelectionMarkState
from collections import defaultdict
import json
import pandas as pd
from datetime import datetime
from pathlib import Path

In [None]:
# Load variables from the .env file into the process environment
load_dotenv()

# Azure Document Intelligence (endpoint + API key)
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Service principal used for credential-based Blob Storage access
AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
AZURE_CLIENT_ID = os.getenv("AZURE_CLIENT_ID")
AZURE_CLIENT_SECRET = os.getenv("AZURE_CLIENT_SECRET")

# Blob Storage account / container / connection string
AZURE_STORAGE_ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
# The container name used to be read twice, from AZURE_CONTAINER_NAME and then
# from AZURE_STORAGE_CONTAINER_NAME, with the second read always overwriting the
# first. Keep AZURE_STORAGE_CONTAINER_NAME as the primary source and use the
# other variable only as a fallback when the primary one is unset or empty.
AZURE_STORAGE_CONTAINER_NAME = (
    os.getenv("AZURE_STORAGE_CONTAINER_NAME") or os.getenv("AZURE_CONTAINER_NAME")
)
AZURE_CONNECTION_STRING = os.getenv('AZURE_CONNECTION_STRING')

# Blob folder holding the LER PDFs
BLOB_FOLDER = 'LER_CONSTELLATION'
# Blob folder and full blob path of the ground-truth CSV
blob_folder2 = 'ler_constellation_ground_truth'
blob_name = f'{blob_folder2}/ler_constellation_ground_truth.csv'

# Local paths (both names are reassigned in a later cell of this notebook)
data_dir = Path("index_data")
output_csv = Path("output/data/ground_truth.csv")

#### Getting Constellation LER PDF URLs from ADAMS

In [None]:
def urlResponse(author_affiliation, start_date, end_date, document_type, timeout=60):
    """Run an NRC ADAMS advanced search and return the raw HTTP response.

    Parameters
    ----------
    author_affiliation : str
        Value used in the ``AuthorAffiliation`` "starts" filter.
    start_date, end_date : str
        Left/right bounds of the ``DocumentDate`` range filter. The caller in
        this notebook passes ``MM/DD/YYYY`` strings.
    document_type : str
        Value used in the ``DocumentType`` "starts" filter.
    timeout : float or tuple, optional
        Seconds passed to ``requests.get``. Without it a stalled connection
        would block the notebook indefinitely.

    Returns
    -------
    requests.Response
        The response as returned by ``requests.get``; the HTTP status is NOT
        checked here.

    Notes
    -----
    The filter values are embedded between single quotes in the query string,
    so a value that itself contains a single quote would corrupt the query.
    """
    # Base API endpoint for NRC ADAMS advanced search
    base_url = "https://adams.nrc.gov/wba/services/search/advanced/nrc"

    # Query expression: public library only, no folder restriction, and three
    # property filters (date range, author affiliation, document type).
    q_param = (
        "(mode:sections,sections:(filters:(public-library:!t),"
        "options:(within-folder:(enable:!f,insubfolder:!f,path:'')),"
        "properties_search_all:!("
        f"!(DocumentDate,range,(left:'{start_date}',right:'{end_date}'),''),"
        f"!(AuthorAffiliation,starts,'{author_affiliation}',''),"
        f"!(DocumentType,starts,'{document_type}',''))))"
    )

    params = {
        "q": q_param,
        "qn": "New",
        "tab": "advanced-search-pars",
        "z": "0"
    }

    # requests URL-encodes the parameters when building the final URL
    return requests.get(base_url, params=params, timeout=timeout)

def extractXMLProperties(urlResponse):
    """Extract accession numbers and publish dates from an ADAMS search response.

    Parameters
    ----------
    urlResponse : object with a ``content`` attribute
        The XML payload (e.g. a ``requests.Response``). Every ``<result>``
        element anywhere in the tree is inspected.

    Returns
    -------
    (list[str], list[datetime.date])
        Two lists of equal length: the ``AccessionNumber`` of each result and
        the date part of its ``PublishDatePARS``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the payload is not well-formed XML.
    ValueError
        If a publish date is present but does not match
        ``MM/DD/YYYY HH:MM AM|PM``.

    Notes
    -----
    Results lacking either field are skipped with a message. Previously a
    missing ``PublishDatePARS`` raised ``AttributeError`` on ``None.split()``.
    """
    accessionNoList = []
    publishedDateList = []

    root = ET.fromstring(urlResponse.content)
    for result in root.findall(".//result"):
        # findtext returns None when the child element is absent
        accession = (result.findtext("AccessionNumber") or "").strip()
        publishedDate = (result.findtext("PublishDatePARS") or "").strip()

        if not accession or not publishedDate:
            print(
                "Skipping ADAMS result with missing fields: "
                f"AccessionNumber={accession!r}, PublishDatePARS={publishedDate!r}"
            )
            continue

        # Keep only the first three tokens, e.g. "04/01/2025 08:10 AM",
        # dropping anything that follows (such as a time-zone suffix).
        cleaned = " ".join(publishedDate.split()[:3])
        dt = datetime.strptime(cleaned, "%m/%d/%Y %I:%M %p")

        # Append as a pair so both lists stay index-aligned
        accessionNoList.append(accession)
        publishedDateList.append(dt.date())

    return accessionNoList, publishedDateList

In [None]:
# Search ADAMS for Constellation Licensee Event Reports in the given date window
response = urlResponse(
    author_affiliation="Constellation Energy Generation, LLC",
    start_date="01/01/2021",
    end_date="05/15/2025",
    document_type="Licensee Event Report (LER)"
)
print(response.url)  # to inspect the full generated URL
# Stop here on an HTTP error instead of handing an error page to the XML parser
response.raise_for_status()
# Parallel lists: accession numbers and their publish dates
accessionNo_List, publishedDate_List = extractXMLProperties(response)

#### Uploading Constellation LER PDFs to Blob Storage

In [None]:

# Upload new PDFs to Azure Blob    
def uploadPDFsToAzure(aList, pubDateList, blob_folder, timeout=120):
    """Download each LER from ADAMS and upload it to Azure Blob Storage.

    Parameters
    ----------
    aList : list[str]
        Accession numbers to fetch.
    pubDateList : list
        Publish dates, index-aligned with ``aList``; used in the blob name.
    blob_folder : str
        Folder prefix inside the container; blobs are written as
        ``<blob_folder>/<accession>_<publish date>.pdf``.
    timeout : float or tuple, optional
        Seconds passed to ``requests.get`` for each download.

    Returns
    -------
    None

    Notes
    -----
    Processing is best-effort: a failure on one document is reported and the
    loop moves on. Existing blobs are never overwritten. Unlike the previous
    version, non-200 downloads and unexpected upload errors are printed rather
    than silently ignored, and a download error no longer aborts the batch.
    """
    # Local import: azure.core is already a dependency of this notebook
    from azure.core.exceptions import ResourceExistsError

    # Create Azure Blob Service Client
    blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)

    for aNo, pubDate in zip(aList, pubDateList):
        # Built before the try block so every handler can reference it
        file_name = f"{aNo}_{pubDate}.pdf"
        downloadURL = f"https://adamswebsearch2.nrc.gov/webSearch2/main.jsp?AccessionNumber={aNo}"
        try:
            pdf_response = requests.get(downloadURL, timeout=timeout)
            if pdf_response.status_code != 200:
                print(f"{file_name}: download returned HTTP {pdf_response.status_code}, skipping")
                continue

            blob_path = f"{blob_folder}/{file_name}"  # upload path in blob
            blob_client = blob_service_client.get_blob_client(
                container=AZURE_STORAGE_CONTAINER_NAME, blob=blob_path
            )

            # Upload directly from memory; overwrite=False makes an existing
            # blob raise instead of being replaced.
            blob_client.upload_blob(pdf_response.content, overwrite=False)
            print(f"Uploaded '{file_name}' to Azure Blob Storage.")

        except ResourceExistsError:
            print(f'{file_name}:already exists, skipping uploading it')
        except Exception as e:
            # Keep going, but make the failure visible
            print(f"{file_name}: failed ({type(e).__name__}: {e}), skipping")

In [None]:
# Report the number of PDFs found, then send them directly to blob storage
curDate = datetime.now()
curMonth, curYear = curDate.month, curDate.year
# Month number (1-12) -> three-letter upper-case label
months = dict(enumerate(
    ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'],
    start=1,
))
print(f'Total PDFs from {months[curMonth]},{curYear}: {len(accessionNo_List)}')
# Upload every search hit into the LER folder of the container
uploadPDFsToAzure(accessionNo_List, publishedDate_List, BLOB_FOLDER)

#### Getting PDFs from Blob Storage and Extracting Them Using Document Intelligence

In [None]:
def extract_clean_narrative(result, keyword="NARRATIVE"):
    """Collect the narrative text that follows ``keyword`` on every page.

    Parameters
    ----------
    result : object with a ``paragraphs`` attribute
        Each paragraph is read by key: ``['boundingRegions'][0]['pageNumber']``
        and ``['content']``. ``paragraphs`` may be ``None`` or empty.
    keyword : str, optional
        Marker that switches capturing on for the rest of a page.

    Returns
    -------
    str
        Always starts with the literal line ``'NARRATIVE '`` (regardless of
        ``keyword``), followed by one stripped paragraph per line.

    Notes
    -----
    Pages are handled independently and in ascending page order. On each page,
    paragraphs before the first one containing ``keyword`` are dropped, and
    every paragraph containing ``keyword`` is itself omitted.
    """
    # Group paragraphs by the page of their first bounding region.
    # `or []` covers a result whose `paragraphs` attribute is None.
    pages = defaultdict(list)
    for paragraph in result.paragraphs or []:
        page_number = paragraph['boundingRegions'][0]['pageNumber']
        pages[page_number].append(paragraph)

    # Collect lines in a list and join once instead of growing a string
    lines = ['NARRATIVE ']
    for page_number in sorted(pages):
        capture = False  # reset per page: each page needs its own marker
        for paragraph in pages[page_number]:
            content = paragraph['content'].strip()
            if keyword in content:
                capture = True
                continue
            if capture:
                lines.append(content)

    return '\n'.join(lines) + '\n'
# Service-principal credential built from the tenant / client id / client secret
credential = ClientSecretCredential(
    tenant_id=AZURE_TENANT_ID,
    client_id=AZURE_CLIENT_ID,
    client_secret=AZURE_CLIENT_SECRET,
)
# Module-level Blob service client (US Government cloud endpoint) used by list_pdf_blobs
blob_service_client = BlobServiceClient(
    f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.usgovcloudapi.net",
    credential=credential,
)

def list_pdf_blobs():
    """List URLs of LER PDFs in blob storage that are not yet in the ground truth.

    Downloads the ground-truth CSV (``blob_name``), reads its
    ``accession_number`` column, then walks every blob whose name starts with
    ``BLOB_FOLDER`` and ends in ``.pdf``. The accession number of a blob is
    the part of its file name before the first underscore.

    Returns
    -------
    (list[str], pandas.DataFrame)
        URLs of the PDFs whose accession number is absent from the CSV, and
        the ground-truth DataFrame as downloaded.
    """
    container_client = blob_service_client.get_container_client(AZURE_STORAGE_CONTAINER_NAME)

    # Read the accession numbers that were already processed
    gt_blob_client = blob_service_client.get_blob_client(
        container=AZURE_STORAGE_CONTAINER_NAME, blob=blob_name
    )
    csv_data = gt_blob_client.download_blob().content_as_text()
    df_gt = pd.read_csv(StringIO(csv_data))
    # A set gives constant-time membership checks in the loop below
    processed_accession_no = set(df_gt['accession_number'].astype(str).str.strip())

    container_url = (
        f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.usgovcloudapi.net/"
        f"{AZURE_STORAGE_CONTAINER_NAME}"
    )

    pdf_urls = []
    # BLOB_FOLDER is the same folder the PDFs are uploaded to
    for blob in container_client.list_blobs(name_starts_with=BLOB_FOLDER):
        if not blob.name.endswith(".pdf"):
            continue
        file_stem = blob.name.split("/")[-1].replace(".pdf", "")
        accession_no = file_stem.split("_")[0].strip()
        if accession_no in processed_accession_no:
            continue
        # Percent-encode the blob name ('/' is left as-is by quote)
        pdf_urls.append(f"{container_url}/{quote(blob.name)}")

    return pdf_urls, df_gt

def analyze_pdf(blob_url, model_id="custom-ler-2025-03-26"):
    """Analyze one LER PDF with Document Intelligence and save the result as JSON.

    Parameters
    ----------
    blob_url : str
        URL of the PDF. Its last path segment must look like
        ``<accession number>_<publish date>.pdf``.
    model_id : str, optional
        Model to run; also the ``doc_type`` a returned document must have to be
        kept. Defaults to the id that was previously hard-coded in two places.

    Returns
    -------
    None
        In every case. On success the extracted documents are written to
        ``index_data/<accession number>_index.json``. If any document fails to
        convert, the error is printed and nothing is written for this PDF.

    Raises
    ------
    ValueError
        If the file name contains no underscore.
    """
    # Local import: only `quote` is imported at the top of the notebook
    from urllib.parse import unquote

    # unquote is the exact inverse of the quote() used to build the URL;
    # the former .replace("%20", " ") only decoded spaces.
    pdf_name = unquote(blob_url.split("/")[-1])
    blob_prefix = pdf_name.replace(".pdf", "")
    accession_number, pub_date = blob_prefix.split("_", 1)

    client = DocumentIntelligenceClient(
        endpoint=AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
        credential=AzureKeyCredential(AZURE_DOCUMENT_INTELLIGENCE_KEY)
    )
    poller = client.begin_analyze_document(
        model_id,
        AnalyzeDocumentRequest(url_source=blob_url),
    )
    result = poller.result()

    documents = []
    for document in result.documents:
        if document.doc_type != model_id:
            continue

        try:
            fields = document.fields

            # Dates are assembled from separate year/month/day fields
            event_date = (
                f"{fields['Event Date Year'].content}-"
                f"{fields['Event Date Month'].content}-"
                f"{fields['Event Date Day'].content}T00:00:00Z"
            )

            report_date = (
                f"{fields['Report Date Year'].content}-"
                f"{fields['Report Date Month'].content}-"
                f"{fields['Report Date Day'].content}T00:00:00Z"
            )

            ler_number = (
                f"{fields['LER Number Year'].content}-"
                f"{fields['LER Number Seq No'].content}-"
                f"{fields['LER Number Rev No'].content}"
            )

            # Names of all selection-mark fields that are ticked
            cfr_list = [
                name for name, field in fields.items()
                if field.type == DocumentFieldType.SELECTION_MARK and
                   field.value_selection_mark == DocumentSelectionMarkState.SELECTED
            ]

            # Index access (not .get) so a missing field is reported by name
            # instead of as "'NoneType' object has no attribute 'content'".
            doc_data = {
                "ler_number": ler_number,
                "accession_number": accession_number,
                "accession_published_date": pub_date,
                "report_date": report_date,
                "event_date": event_date,
                "facility_name": fields["Facility Name"].content,
                "title": fields["Title"].content,
                "cfr_requirements": cfr_list,
                "abstract": fields["Abstract"].content,
                "narrative": extract_clean_narrative(result)
            }

            documents.append(doc_data)

        except Exception as e:
            print(f"Error processing {pdf_name}: {e}")
            return None

    # Save extracted result to JSON for indexing
    os.makedirs("index_data", exist_ok=True)
    with open(f"index_data/{accession_number}_index.json", "w") as f:
        json.dump(documents, f, indent=4)

In [None]:
# URLs of PDFs still to be analysed, plus the ground truth processed so far
pdf_urls, df_gt_parsed = list_pdf_blobs()
# Normalise the same way list_pdf_blobs does (string + strip) so that both
# "already processed" checks agree even if the CSV holds padded values.
parsed_accession_number_list = (
    df_gt_parsed["accession_number"].astype(str).str.strip().tolist()
)

In [None]:
# Run Document Intelligence on every PDF that is not in the ground truth yet
print(f"Found {len(pdf_urls)} LER PDFs.")
for pdf_url in pdf_urls:
    print(f"Processing: {pdf_url}")
    analyze_pdf(pdf_url)

#### Extracting new LERs' data

In [None]:
# Local folders and file names used by the ground-truth extraction below
data_dir = Path("index_data")
ground_truth_file_name = "ground_truth.csv"
subsection_file_name = "subsection.csv"
# Both CSV files are written to the same output folder
_output_dir = Path("output1/data")
output_csv = _output_dir / ground_truth_file_name
subsection_output_csv = _output_dir / subsection_file_name

def extract_ground_truth(blob_name, parsed_accession_number_list, source_dir=None):
    """Build ground-truth rows from the extracted ``*.json`` files.

    Parameters
    ----------
    blob_name : str
        Not used by this function; kept so existing calls keep working.
    parsed_accession_number_list : iterable of str
        Accession numbers that are already in the ground truth; records with
        one of these numbers are skipped.
    source_dir : str or pathlib.Path, optional
        Folder to scan for ``*.json`` files. Defaults to the module-level
        ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        One row per new record with the columns ``content``, ``subsections``,
        ``recommendation``, ``title``, ``facility_name``, ``ler_number``,
        ``accession_number`` and ``accession_published_date``. The columns are
        present even when there are no rows, so column selection downstream
        does not fail on an empty result.

    Notes
    -----
    Files that are not valid JSON are reported and skipped. Files are read in
    sorted name order so the row order is reproducible.
    """
    columns = [
        "content",
        "subsections",
        "recommendation",
        "title",
        "facility_name",
        "ler_number",
        "accession_number",
        "accession_published_date",
    ]
    json_dir = data_dir if source_dir is None else Path(source_dir)
    # Set membership instead of scanning a list for every record
    already_parsed = set(parsed_accession_number_list)

    rows = []
    for file in sorted(json_dir.glob("*.json")):
        with open(file, "r") as f:
            try:
                records = json.load(f)
            except json.JSONDecodeError:
                print(f"Failed to parse {file}")
                continue

        for record in records:
            accNo = record.get("accession_number", "")
            if accNo in already_parsed:
                continue
            rows.append({
                "content": f"Abstract:\n{record.get('abstract', '')}\n\n{record.get('narrative', '')}",
                "subsections": ", ".join(
                    f"10 CFR {subsection}" for subsection in record.get("cfr_requirements", [])
                ),
                "recommendation": "reportable",
                "title": record.get("title", ""),
                "facility_name": record.get("facility_name", ""),
                "ler_number": record.get("ler_number", ""),
                "accession_number": accNo,
                "accession_published_date": record.get("accession_published_date", ""),
            })

    df = pd.DataFrame(rows, columns=columns)
    # Short summary instead of printing the whole frame
    print(f"Extracted {len(df)} new ground-truth record(s) from {json_dir}")

    return df

In [None]:
# Ground-truth rows for LERs that are not yet part of the stored CSV
df_gt_new = extract_ground_truth(blob_name, parsed_accession_number_list)
print(df_gt_new.shape)

In [None]:
# required ground truth as per Acceptance Criteria
req_col = ['content', 'subsections', 'recommendation']
# reindex selects the same columns as df_gt_new[req_col] but also copes with
# an empty frame that has no columns (no new LERs found in this run)
df_gt_req = df_gt_new.reindex(columns=req_col)

# Create the output folders BEFORE the first write; previously the subsection
# CSV was written before the directory was created, failing on a fresh run.
subsection_output_csv.parent.mkdir(parents=True, exist_ok=True)
output_csv.parent.mkdir(parents=True, exist_ok=True)

# subsection: how often each combination of cited subsections occurs
df_subsection = df_gt_req['subsections'].value_counts().reset_index()
df_subsection.columns = ['sub_section', 'count']
print(df_subsection.head())
df_subsection.to_csv(subsection_output_csv, index=None, header=True)

# saving to output directory
df_gt_req.to_csv(output_csv, index=None, header=True)

##### Merging new LER output with previously processed LERs and uploading to Blob Storage

In [None]:
# Merge newly extracted ground truth with the already processed ground truth.
# ignore_index=True renumbers the rows; without it both frames keep their own
# 0..n labels and the merged frame ends up with duplicate index values.
df_gt_updated = pd.concat([df_gt_parsed, df_gt_new], ignore_index=True)
df_gt_updated.head()

In [None]:
# Upload the ground-truth CSV to Azure Blob
def uploadGroundTruthToAzure(blob_folder_groundTruth, df):
    """Serialize ``df`` to CSV and upload it, replacing any existing blob.

    Parameters
    ----------
    blob_folder_groundTruth : str
        Folder prefix inside the container; the blob is written as
        ``<blob_folder_groundTruth>/ler_constellation_ground_truth.csv``.
    df : pandas.DataFrame
        Frame to upload; the index is not written.

    Returns
    -------
    None
    """
    filename_ground_truth = "ler_constellation_ground_truth.csv"
    target_blob = f"{blob_folder_groundTruth}/{filename_ground_truth}"

    # Connection-string based client, then a client for the target blob
    service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
    target_client = service.get_blob_client(
        container=AZURE_STORAGE_CONTAINER_NAME, blob=target_blob
    )

    # With no path argument to_csv returns the CSV text, so the data is
    # uploaded straight from memory.
    csv_text = df.to_csv(index=False)
    target_client.upload_blob(csv_text, overwrite=True)
    print(f"Uploaded '{filename_ground_truth}' to Azure Blob Storage.")

In [None]:
# Write the merged ground truth back to the folder it was read from
uploadGroundTruthToAzure(blob_folder_groundTruth=blob_folder2, df=df_gt_updated)

#### reviewing output

In [None]:
# Read back the CSV written earlier in this notebook. The old literal
# "output\data\ground_truth.csv" had invalid backslash escapes, only worked on
# Windows, and pointed at a different folder than the one output_csv is saved to.
df = pd.read_csv(output_csv)
df.head(10)

In [None]:
# Read the saved ground truth via output_csv (portable path, same file that was
# written earlier) and show how often each subsection combination occurs.
# The former mid-cell `df.head(10)` displayed nothing and was removed.
df = pd.read_csv(output_csv)
df['subsections'].value_counts()

##### checking subsection counts

In [None]:
# Frequency table of subsection combinations, saved next to the notebook
df_subsection = (
    df['subsections']
    .value_counts()
    .rename_axis('sub_section')
    .reset_index(name='count')
)
print(df_subsection.head())
df_subsection.to_csv('subsection.csv', index=None, header=True)