## Generating Ground Truth

This notebook helps generate ground truth data for 10 CFR 50.72 and 10 CFR 50.73. The last cell in this notebook merges
the ground truth from each 10 CFR section into a single ground truth file to be used for evaluation.

In [None]:
%pip install -r requirements.txt

### Set up
Load the environment variables, create the Azure OpenAI client, and define the output file paths.

In [None]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

# Names of the environment variables that carry the Azure OpenAI settings.
ENV_VAR_OPENAI_KEY = 'AZURE_OPENAI_SERVICE_KEY'
ENV_VAR_OPENAI_URI = 'AZURE_OPENAI_SERVICE_URI'
ENV_VAR_OPENAI_DEPLOYMENT_ID = 'AZURE_OPENAI_DEPLOYMENT'
ENV_VAR_OPENAI_API_VERSION = 'AZURE_OPENAI_API_VERSION'

# Make values from a local .env file available through the process environment.
load_dotenv()

# Look the four settings up in one pass; each one is None when its variable is unset.
open_ai_uri, open_ai_key, open_ai_deployment_id, open_ai_api_version = (
    os.environ.get(variable_name)
    for variable_name in (
        ENV_VAR_OPENAI_URI,
        ENV_VAR_OPENAI_KEY,
        ENV_VAR_OPENAI_DEPLOYMENT_ID,
        ENV_VAR_OPENAI_API_VERSION,
    )
)

# Client shared by every cell that calls the chat-completions API.
openai_client = AzureOpenAI(
    azure_endpoint=open_ai_uri,
    azure_deployment=open_ai_deployment_id,
    api_version=open_ai_api_version,
    api_key=open_ai_key,
)

# Output files: one per 10 CFR section, plus the merged file written by the last cell.
GROUND_TRUTH_5072 = './ground_truth_5072.csv'
GROUND_TRUTH_5073 = './ground_truth_5073.csv'
SINGLE_GROUND_TRUTH = './ground_truth_single.csv'

### Generate Ground Truth for 50.72 sections

Cells below will generate ground truth for the 50.72 sections only.

In [None]:
import pickle
import pandas as pd
from pathlib import Path
import re

# Lift every pandas display limit so previewed frames are rendered untruncated.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth', 'display.width'):
    pd.set_option(option_name, None)

file_path = Path('IRsWith5072.pkl')  # 500 records from Manoj's data file
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
with file_path.open('rb') as file:
    data = pickle.load(file)

# Preview the first record to confirm the load worked.
display(data.iloc[:1])

# System prompt sent with every description that is scrubbed of CFR references.
# The text is part of the model input, so any edit to it changes the generated ground truth.
remove_cfr_references_system_prompt = """
You are an expert technical editor for nuclear event reports. 
Your task is to receive an excerpt from a report to the Nuclear Regulatory Commission (NRC) describing an incident. 
Remove all references to any section of the Code of Federal Regulations (CFR), especially 10 CFR 50.72 and 
10 CFR 50.73. 
Do not alter the factual content or meaning of the description. 
Do not alter the format, punctuation, spacing, or capitalization of the description in any way.
Return only the cleaned incident description, with no additional commentary or formatting.
"""

In [None]:
import re

# List of 10 CFR 50.72 section identifiers, written in the same "50.72(b)(n)(roman)(LETTER)"
# form that extract_5072_sections produces.
# NOTE(review): no code visible in this notebook reads this list - presumably it was meant
# to restrict the extracted labels to these sections; confirm before relying on it.
SECTIONS_5072 = [
    "50.72(b)(1)",
    "50.72(b)(2)(i)",
    "50.72(b)(2)(iv)(A)",
    "50.72(b)(2)(iv)(B)",
    "50.72(b)(2)(xi)",
    "50.72(b)(3)(ii)(A)",
    "50.72(b)(3)(ii)(B)",
    "50.72(b)(3)(iv)(A)",
    "50.72(b)(3)(v)(A)",
    "50.72(b)(3)(v)(B)",
    "50.72(b)(3)(v)(C)",
    "50.72(b)(3)(v)(D)",
    "50.72(b)(3)(xii)",
    "50.72(b)(3)(xiii)"
]

# Matches citations such as "10 CFR 50.72(b)(3)(v)(A)". The "10 CFR" prefix, the roman-numeral
# paragraph and the item letter are all optional.
#   group 1     - paragraph digit (1, 2 or 3)
#   group 2     - roman-numeral sub-paragraph (a lone "x" is accepted too, so "(x)" is not
#                 mistaken for an item letter)
#   group 3 / 4 - item letter, written in parentheses / written bare
# The bare letter must end at a word boundary. Because the pattern is case-insensitive,
# without that boundary the first letter of whatever word follows the citation would be
# captured ("50.72(b)(2)(xi) for ..." -> "(F)").
# NOTE: a stand-alone one-letter word right after a citation (e.g. "... (xi) A pump") is
# still read as an item letter.
SECTION_5072_PATTERN = re.compile(
    r"(?:10\s*CFR\s*)?50\.72\s*\(b\)\s*\(\s*(1|2|3)\s*\)"
    r"(?:\s*\(\s*((?:x)?(?:ix|iv|vi{0,3}|i{1,3}|xi|xii|xiii)|x)\s*\))?"
    r"(?:\s*\(\s*([A-Z])\s*\)|\s*([A-Z])\b)?",
    re.IGNORECASE
)

def extract_5072_sections(text: str) -> str:
    """Return every distinct 50.72(b) citation found in ``text``.

    Citations are normalised to ``50.72(b)(n)(roman)(LETTER)`` with a lower-case roman
    numeral and an upper-case letter, de-duplicated, sorted, and joined with ``', '``.

    Args:
        text: Free text to scan. Any non-string value (e.g. NaN or None) is treated as
            containing no citations.

    Returns:
        The comma-separated citations, or an empty string when none are found.
    """
    if not isinstance(text, str):
        return ''
    found = set()
    for match in SECTION_5072_PATTERN.finditer(text):
        paragraph, numeral = match.group(1), match.group(2)
        section = f"50.72(b)({paragraph})"
        if numeral:
            section += f"({numeral.lower()})"
        # The letter is captured by group 3 when parenthesised and by group 4 when bare.
        letter = match.group(3) or match.group(4)
        if letter:
            section += f"({letter.upper()})"
        found.add(section)
    return ', '.join(sorted(found))

def combine_5072_sections(row: pd.Series) -> str:
    """Collect the 50.72 citations found in a record's 'IMME_ACTN' and 'CONTENT' fields.

    Args:
        row: One record; a missing field is treated as an empty string.

    Returns:
        The distinct citations from both fields, sorted and joined with ``', '``.
    """
    merged = {
        token.strip()
        for column in ('IMME_ACTN', 'CONTENT')
        for token in extract_5072_sections(row.get(column, '')).split(',')
        if token.strip()
    }
    return ', '.join(sorted(merged))

def scrub_cfr_references(descriptions: list[str], openai_client: AzureOpenAI, open_ai_deployment_id: str, 
                         remove_cfr_references_system_prompt: str) -> list[str]:
    """Send each description to the chat model and collect the stripped replies.

    One chat-completion request is issued per description, in order.

    Args:
        descriptions: Texts to clean.
        openai_client: Client used to issue the chat-completion requests.
        open_ai_deployment_id: Value passed as ``model`` on every request.
        remove_cfr_references_system_prompt: System message sent with every request.

    Returns:
        The first choice of each reply with surrounding whitespace removed, in input order.
    """
    def _scrub_one(description: str) -> str:
        completion = openai_client.chat.completions.create(
            messages=[
                {"role": "system", "content": remove_cfr_references_system_prompt},
                {"role": "user", "content": description},
            ],
            model=open_ai_deployment_id,
        )
        # NOTE(review): if the service ever returns a reply without text, .strip() raises
        # AttributeError here - confirm whether that case needs explicit handling.
        return completion.choices[0].message.content.strip()

    return [_scrub_one(description) for description in descriptions]

def get_cleaned_cond_desc(result_5072: pd.DataFrame, openai_client: AzureOpenAI, open_ai_deployment_id: str, remove_cfr_references_system_prompt: str) -> pd.Series:
    """Scrub CFR references from the 'COND_DESC' column of ``result_5072``.

    Args:
        result_5072: Frame holding a 'COND_DESC' column.
        openai_client: Client forwarded to ``scrub_cfr_references``.
        open_ai_deployment_id: Deployment forwarded to ``scrub_cfr_references``.
        remove_cfr_references_system_prompt: System prompt forwarded to ``scrub_cfr_references``.

    Returns:
        The cleaned descriptions as a Series that reuses the frame's index, so it can be
        assigned straight back as a column.
    """
    scrubbed = scrub_cfr_references(
        result_5072['COND_DESC'].tolist(),
        openai_client,
        open_ai_deployment_id,
        remove_cfr_references_system_prompt,
    )
    return pd.Series(scrubbed, index=result_5072.index)

def remove_excess_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    pieces = re.split(r'\s+', text)
    return ' '.join(pieces).strip()

def section_string_to_list(section_str: str) -> list[str]:
    """Split a comma-separated section string into a list of trimmed, non-empty items.

    Non-string and blank inputs yield an empty list.
    """
    if isinstance(section_str, str) and section_str.strip():
        trimmed = (piece.strip() for piece in section_str.split(','))
        return [piece for piece in trimmed if piece]
    return []

def flatten_newlines(text: str) -> str:
    """Replace each run of CR/LF characters with one space and trim both ends."""
    segments = re.split(r'[\r\n]+', text)
    return ' '.join(segments).strip()

# Label each record with the 50.72 citations found in its text, and normalise the
# description's whitespace.
data['50.72 section'] = data.apply(combine_5072_sections, axis=1)
data['COND_DESC'] = data['COND_DESC'].apply(remove_excess_whitespace)
data['50.72 section'] = data['50.72 section'].apply(section_string_to_list)

# Keep only records that cite at least one section. The explicit .copy() makes `result`
# an independent frame, so the column assignment below does not write to a slice of
# `data` (which triggers pandas' SettingWithCopyWarning).
result = data[['COND_DESC', '50.72 section']]
result = result[result['50.72 section'].apply(bool)].copy()

# Remove the CFR citations from the descriptions (one model call per row).
result['COND_DESC'] = get_cleaned_cond_desc(
    result,
    openai_client,
    open_ai_deployment_id,
    remove_cfr_references_system_prompt
)
result = result.rename(columns={'COND_DESC': 'content', '50.72 section': 'subsections'})
result['content'] = result['content'].apply(flatten_newlines)

# os.makedirs('') raises, so only create the parent when the path actually has one.
output_dir = os.path.dirname(GROUND_TRUTH_5072)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
result.to_csv(GROUND_TRUTH_5072, index=False)

### Generate Ground Truth For 50.73 Section
Cells below will generate ground truth for the 50.73 sections only.

In [None]:
import pandas as pd
import os
import json

processed_ler_directory = '../search/data/ler_processed'  # Run the index eval if you haven't downloaded LERs from Azure
# os.listdir() returns names in arbitrary order; sorting them means the head() cut-off
# further down selects the same records on every run.
processed_files = sorted(f for f in os.listdir(processed_ler_directory) if f.endswith('.txt'))

# Number of LER records kept once the CSV is loaded back (use a small value for quick tests).
MAX_LER_RECORDS = 430


def _to_single_line(text: str) -> str:
    """Replace commas, line feeds and carriage returns in ``text`` with spaces."""
    return text.replace(',', ' ').replace('\n', ' ').replace('\r', ' ')


filenames = list()
titles = list()
abstracts = list()
reported_sections = list()
facility_names = list()

for fln in processed_files:
    # Each processed LER is a JSON document stored with a .txt extension.
    with open(os.path.join(processed_ler_directory, fln), 'r') as f:
        ler_content = json.load(f)
    if not ler_content:
        continue

    title = ler_content.get('title')
    abstract = ler_content.get('abstract')
    cfr_sections = ler_content.get('cfr_requirements', [])

    # Skip records that lack a title, abstract or reported section, or whose abstract
    # is shorter than 30 characters.
    if not title or not abstract or not cfr_sections or len(abstract) < 30:
        continue

    filenames.append(fln.split('.')[0])
    # Store the record's title here (the abstract used to be written into this column).
    titles.append(_to_single_line(title))
    abstracts.append(_to_single_line(abstract))
    reported_sections.append(",".join([("10 CFR " + s) for s in cfr_sections]))
    facility_names.append(ler_content.get('facility_name', 'NONE'))

df = pd.DataFrame({
    'filename': filenames,
    'title': titles,
    'content': abstracts,
    'subsections': reported_sections,
    'facility_name': facility_names
})

os.makedirs(os.path.dirname(GROUND_TRUTH_5073), exist_ok=True)
df.to_csv(GROUND_TRUTH_5073, index=False)

# Load the CSV written above back into a pandas DataFrame.
df = pd.read_csv(GROUND_TRUTH_5073)   # Use ground_truth_file_path for Constellation-only LERs

df = df.head(MAX_LER_RECORDS)

print(f"Dataframe loaded {len(df)} LER records")

In [None]:
import sys
import ast
import re
sys.path.append("../api")
from eval_helpers import clean_all_ler_content, DataFrameColumnNames

# Prompt passed to clean_all_ler_content together with the LER frame.
# The text is part of the model input, so any edit to it changes the generated ground truth.
# NOTE(review): "Nuclear Regulation Commission" reads like a slip for "Nuclear Regulatory
# Commission" (the wording used by the 50.72 prompt) - confirm before changing the prompt.
clean_ler_prompt = """
    You are going to receive an excerpt text from a report to the Nuclear Regulation Commission (NRC) containing a
    detailed description of an incident.
    You need to remove any references to any section of the CFR code under which the incident is reported, especially
    relating to 10 CFR 50.72, and 10 CFR 50.73.
    The output should consist of the updated description only and no further changes.
"""


def _split_subsections(raw):
    """Split a comma-separated subsections string; any non-string value becomes []."""
    if isinstance(raw, str):
        return raw.split(",")
    return []


# Turn the stored comma-separated string into a list of section identifiers.
df[DataFrameColumnNames.SUBSECTIONS] = df[DataFrameColumnNames.SUBSECTIONS].apply(_split_subsections)

def flatten_newlines(text: str) -> str:
    """Turn each run of carriage returns / line feeds into one space, then trim the ends."""
    line_breaks = re.compile(r'[\r\n]+')
    single_line = line_breaks.sub(' ', text)
    return single_line.strip()


df = await clean_all_ler_content(df, openai_client, open_ai_deployment_id, clean_ler_prompt)
os.makedirs(os.path.dirname(GROUND_TRUTH_5073), exist_ok=True)
df.to_csv(GROUND_TRUTH_5073, index=False)

df = pd.read_csv(GROUND_TRUTH_5073)
df_subset = df[["content", "subsections"]]
df_subset['content'] = df_subset['content'].apply(flatten_newlines)
df_subset.to_csv(GROUND_TRUTH_5073, index=False)

In [None]:
import pandas as pd
import sys
import ast
sys.path.append("../api")
from eval_helpers import DataFrameColumnNames

CFR_PREFIX = "10 CFR "


def _parse_subsections(raw):
    """Turn a stored subsections cell into a list of "10 CFR "-prefixed section ids.

    The cell normally holds the text form of a Python list (e.g. "['50.72(b)(1)']").
    Entries that already carry the prefix are left alone, so re-running this cell does
    not prepend it a second time. Empty entries are dropped, and a non-string cell
    yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    try:
        items = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        items = None
    if not isinstance(items, (list, tuple)):
        # Not a list literal: fall back to stripping the list punctuation by hand.
        items = raw.replace('[', '').replace(']', '').replace("'", "").split(",")
    cleaned = (str(item).strip() for item in items)
    return [s if s.startswith(CFR_PREFIX) else CFR_PREFIX + s for s in cleaned if s]


df = pd.read_csv(GROUND_TRUTH_5072)
df[DataFrameColumnNames.SUBSECTIONS] = df[DataFrameColumnNames.SUBSECTIONS].apply(_parse_subsections)

print(df[DataFrameColumnNames.SUBSECTIONS].tolist())

df.to_csv(GROUND_TRUTH_5072, index=False)

### Merge Into One Ground Truth File

In [None]:
import pandas as pd

def merge_ground_truth_files(
    file1_path: str,
    file2_path: str,
    SINGLE_GROUND_TRUTH: str
) -> None:
    """Concatenate two ground-truth CSV files into one.

    Only the "content" and "subsections" columns are read from each input. Rows from
    ``file1_path`` come first, followed by rows from ``file2_path``, and the result is
    re-indexed.

    Args:
        file1_path: Path of the first CSV file.
        file2_path: Path of the second CSV file.
        SINGLE_GROUND_TRUTH: Path of the merged CSV to write. Missing parent
            directories are created; a bare file name is written to the current
            working directory.
    """
    wanted_columns = ["content", "subsections"]
    frames = [pd.read_csv(path, usecols=wanted_columns) for path in (file1_path, file2_path)]
    merged_df = pd.concat(frames, ignore_index=True)

    # os.path.dirname() is '' for a bare file name and os.makedirs('') raises, so the
    # directory is only created when the path actually has a parent component.
    output_dir = os.path.dirname(SINGLE_GROUND_TRUTH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    merged_df.to_csv(SINGLE_GROUND_TRUTH, index=False)

# Write the combined file: the 50.73 rows come first, followed by the 50.72 rows.
merge_ground_truth_files(
    file1_path=GROUND_TRUTH_5073,
    file2_path=GROUND_TRUTH_5072,
    SINGLE_GROUND_TRUTH=SINGLE_GROUND_TRUTH,
)