## Generating Ground Truth for 10 CFR 50.72 Sections

In [None]:
%pip install -r requirements.txt

### Set Up

In [None]:
# Names of the environment variables that hold the Azure OpenAI connection
# settings. The values themselves are never stored in the notebook; they are
# looked up with os.getenv after load_dotenv() has run.
ENV_VAR_OPENAI_KEY = 'AZURE_OPENAI_SERVICE_KEY'  # API key
ENV_VAR_OPENAI_URI = 'AZURE_OPENAI_SERVICE_URI'  # service endpoint
ENV_VAR_OPENAI_DEPLOYMENT_ID = 'AZURE_OPENAI_DEPLOYMENT'  # model deployment
ENV_VAR_OPENAI_API_VERSION = 'AZURE_OPENAI_API_VERSION'  # REST API version

import os
from dotenv import load_dotenv
import re

# Pull variables from a local .env file (if any) into the process environment
# before reading the connection settings.
load_dotenv()

# Each setting is None when its environment variable is not defined.
open_ai_uri = os.environ.get(ENV_VAR_OPENAI_URI)
open_ai_key = os.environ.get(ENV_VAR_OPENAI_KEY)
open_ai_deployment_id = os.environ.get(ENV_VAR_OPENAI_DEPLOYMENT_ID)
open_ai_api_version = os.environ.get(ENV_VAR_OPENAI_API_VERSION)

# Destination of the generated ground-truth CSV.
GROUND_TRUTH_5072 = './ground_truth_5072.csv'

# The 50.72 subsections that are counted as categories in the plots below,
# grouped by paragraph number.
SECTIONS_5072 = [
    # paragraph (b)(1)
    "50.72(b)(1)",
    # paragraph (b)(2)
    "50.72(b)(2)(i)",
    "50.72(b)(2)(iv)(A)",
    "50.72(b)(2)(iv)(B)",
    "50.72(b)(2)(xi)",
    # paragraph (b)(3)
    "50.72(b)(3)(ii)(A)",
    "50.72(b)(3)(ii)(B)",
    "50.72(b)(3)(iv)(A)",
    "50.72(b)(3)(v)(A)",
    "50.72(b)(3)(v)(B)",
    "50.72(b)(3)(v)(C)",
    "50.72(b)(3)(v)(D)",
    "50.72(b)(3)(xii)",
    "50.72(b)(3)(xiii)",
]

# Matches references such as "10 CFR 50.72(b)(3)(v)(A)", tolerating optional
# whitespace between the parts. Capture groups:
#   1 - paragraph number (1, 2 or 3)
#   2 - optional roman-numeral subparagraph
#   3 - optional item letter written in parentheses, e.g. "(A)"
#   4 - optional item letter written bare, e.g. "50.72(b)(3)(v) A"
# The bare letter must end at a word boundary. Without it, and because the
# pattern is case-insensitive, the first letter of whatever word follows the
# citation ("... (xii) for transport" -> "F") was captured as an item letter.
# NOTE(review): a standalone one-letter word after a citation (e.g. the
# article "a") is still read as an item letter - confirm this is acceptable.
SECTION_5072_PATTERN = re.compile(
    r"(?:10\s*CFR\s*)?50\.72\s*\(b\)\s*\(\s*(1|2|3)\s*\)"
    r"(?:\s*\(\s*(xiii|xii|xi|iv|v|i{1,3})\s*\))?"
    r"(?:\s*\(\s*([A-Z])\s*\)|\s*([A-Z])\b)?",
    re.IGNORECASE
)


In [None]:
from openai import AzureOpenAI

# Azure OpenAI client configured from the settings read from the environment
# in the set-up cell.
openai_client = AzureOpenAI(
    azure_endpoint=open_ai_uri,
    azure_deployment=open_ai_deployment_id,
    api_version=open_ai_api_version,
    api_key=open_ai_key,
)

### Look Through Current Dataset from Manoj

In [None]:
# 500 records from Manoj
import pickle
import pandas as pd
from pathlib import Path
import re

# Lift every pandas display limit so frames are rendered without truncation.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
    'display.width',
):
    pd.set_option(display_option, None)

file_path = Path('IRsWith5072.pkl')
# CAUTION: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
with file_path.open('rb') as file:
    data = pickle.load(file)

# Preview the first record only.
display(data.iloc[:1])

### Full Data Exploration

Run this cell only if you want to see the full table of extracted information, including the AR number, ENS number, and `LER_LABEL`. Otherwise, skip it.

In [None]:
def extract_5072_sections(text: str) -> str:
    """Return every 50.72 section cited in ``text`` in normalised form.

    Each match of ``SECTION_5072_PATTERN`` is rebuilt as
    ``50.72(b)(<n>)[(<roman>)][(<LETTER>)]`` with the roman numeral
    lower-cased and the item letter upper-cased. Duplicates are dropped and
    the result is a sorted, comma-separated string. Non-string input (for
    example NaN) gives an empty string.
    """
    if not isinstance(text, str):
        return ''

    def _normalise(match) -> str:
        pieces = [f"50.72(b)({match.group(1)})"]
        roman = match.group(2)
        if roman:
            pieces.append(f"({roman.lower()})")
        # The item letter is captured either in parentheses or bare.
        item_letter = match.group(3) or match.group(4)
        if item_letter:
            pieces.append(f"({item_letter.strip().upper()})")
        return ''.join(pieces)

    unique_sections = {_normalise(m) for m in SECTION_5072_PATTERN.finditer(text)}
    return ', '.join(sorted(unique_sections))

def extract_ens_number(text: str) -> str:
    """Return the first ENS event number found in ``text``.

    A number is recognised when at least four digits follow one of the
    labels ``ENS Notification``, ``EN`` or ``event #`` (case-insensitive),
    optionally separated by ``#``, ``:``, ``-`` and whitespace.

    The label must start at a word boundary. Without that requirement the
    ``EN`` label matched the tail of ordinary words, so "taken 2019" was
    reported as ENS number 2019.

    Args:
        text: Free text to search; non-string values (e.g. NaN) are accepted.

    Returns:
        The digits of the first match, or an empty string when there is no
        match or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    match = re.search(
        r'\b(?:ENS Notification#?|EN#?|event\s*#)\s*:?\s*-?\s*([0-9]{4,})',
        text,
        re.IGNORECASE
    )
    return match.group(1) if match else ''

def combine_5072_sections(row: pd.Series) -> str:
    """Merge the 50.72 sections cited in a record's free-text columns.

    Runs ``extract_5072_sections`` over the ``IMME_ACTN`` and ``CONTENT``
    values of ``row`` and returns the de-duplicated sections as a sorted,
    comma-separated string (empty when nothing is found).
    """
    merged = set()
    for column in ('IMME_ACTN', 'CONTENT'):
        extracted = extract_5072_sections(row.get(column, ''))
        if not extracted:
            continue
        for piece in extracted.split(','):
            piece = piece.strip()
            if piece:
                merged.add(piece)
    return ', '.join(sorted(merged))

def combine_ens_sections(row: pd.Series) -> str:
    """Merge the ENS numbers found in a record's free-text columns.

    Runs ``extract_ens_number`` over the ``IMME_ACTN``, ``CONTENT`` and
    ``REPO_BASI`` values of ``row`` and returns the distinct numbers as a
    sorted, comma-separated string (empty when nothing is found).
    """
    numbers = set()
    for column in ('IMME_ACTN', 'CONTENT', 'REPO_BASI'):
        extracted = extract_ens_number(row.get(column, ''))
        if not extracted:
            continue
        for piece in extracted.split(','):
            piece = piece.strip()
            if piece:
                numbers.add(piece)
    return ', '.join(sorted(numbers))

# Derive the 50.72 sections and ENS numbers for every record.
data['50.72 section'] = data.apply(combine_5072_sections, axis=1)
data['ENS'] = data.apply(combine_ens_sections, axis=1)

# Keep the ENS column in the working view: the filters below read it. The
# view used to be overwritten with only COND_DESC and '50.72 section', which
# made every ENS lookup raise KeyError.
result = data[['AR_NUMBER', 'COND_DESC', '50.72 section', 'ENS', 'LER_LABEL']]

has_5072 = result['50.72 section'].str.strip() != ''
has_ens = result['ENS'].str.strip() != ''

result_5072 = result[has_5072]
result_ens = result[has_ens]
result_both = result[has_5072 & has_ens]

print(f"Total records: {len(result)}")
print(f"Records with 50.72 section: {len(result_5072)}")
print(f"Records with both 50.72 section and ENS: {len(result_both)}")

### Generate Ground Truth for 50.72 Sections

The ground truth uses the Conditional Description (`COND_DESC`) as input and the extracted 50.72 sections as output.
In some known cases the Conditional Description itself references the 50.72 sections, so the contents of this column
are scrubbed with the LLM to remove those references.

In [None]:
# System prompt for the scrubbing step: it tells the model to strip CFR
# citations from a description while leaving all other content and formatting
# untouched, and to return nothing but the cleaned text.
remove_cfr_references_system_prompt = """
You are an expert technical editor for nuclear event reports. 
Your task is to receive an excerpt from a report to the Nuclear Regulatory Commission (NRC) describing an incident. 
Remove all references to any section of the Code of Federal Regulations (CFR), especially 10 CFR 50.72 and 
10 CFR 50.73. 
Do not alter the factual content or meaning of the description. 
Do not alter the format, punctuation, spacing, or capitalization of the description in any way.
Return only the cleaned incident description, with no additional commentary or formatting.
"""

In [None]:
import re

def extract_5072_sections(text: str) -> str:
    """Return every 50.72 section cited in ``text`` in normalised form.

    Each match of ``SECTION_5072_PATTERN`` is rebuilt as
    ``50.72(b)(<n>)[(<roman>)][(<LETTER>)]`` with the roman numeral
    lower-cased and the item letter upper-cased. Duplicates are dropped and
    the result is a sorted, comma-separated string. Non-string input (for
    example NaN) gives an empty string.
    """
    if not isinstance(text, str):
        return ''

    def _normalise(match) -> str:
        pieces = [f"50.72(b)({match.group(1)})"]
        roman = match.group(2)
        if roman:
            pieces.append(f"({roman.lower()})")
        # The item letter is captured either in parentheses or bare.
        item_letter = match.group(3) or match.group(4)
        if item_letter:
            pieces.append(f"({item_letter.strip().upper()})")
        return ''.join(pieces)

    unique_sections = {_normalise(m) for m in SECTION_5072_PATTERN.finditer(text)}
    return ', '.join(sorted(unique_sections))

def combine_5072_sections(row: pd.Series) -> str:
    """Merge the 50.72 sections cited in a record's free-text columns.

    Runs ``extract_5072_sections`` over the ``IMME_ACTN`` and ``CONTENT``
    values of ``row`` and returns the de-duplicated sections as a sorted,
    comma-separated string (empty when nothing is found).
    """
    merged = set()
    for column in ('IMME_ACTN', 'CONTENT'):
        extracted = extract_5072_sections(row.get(column, ''))
        if not extracted:
            continue
        for piece in extracted.split(','):
            piece = piece.strip()
            if piece:
                merged.add(piece)
    return ', '.join(sorted(merged))

def scrub_cfr_references(descriptions: list[str], openai_client: AzureOpenAI, open_ai_deployment_id: str, 
                         remove_cfr_references_system_prompt: str) -> list[str]:
    """Ask the model to strip CFR references from each description.

    One chat-completion request is sent per description, in order, with
    ``remove_cfr_references_system_prompt`` as the system message and the
    description as the user message.

    Args:
        descriptions: Texts to clean.
        openai_client: Client used to send the requests.
        open_ai_deployment_id: Value passed as ``model`` on every request.
        remove_cfr_references_system_prompt: System prompt for the model.

    Returns:
        The stripped text of the first choice of every response, in the same
        order as ``descriptions``.
    """

    def _scrub_one(description: str) -> str:
        completion = openai_client.chat.completions.create(
            messages=[
                {"role": "system", "content": remove_cfr_references_system_prompt},
                {"role": "user", "content": description},
            ],
            model=open_ai_deployment_id,
        )
        return completion.choices[0].message.content.strip()

    return [_scrub_one(description) for description in descriptions]

def get_cleaned_cond_desc(result_5072: pd.DataFrame, openai_client: AzureOpenAI, open_ai_deployment_id: str, remove_cfr_references_system_prompt: str) -> pd.Series:
    """Return the ``COND_DESC`` column of ``result_5072`` with CFR references removed.

    The descriptions are passed to ``scrub_cfr_references`` and the cleaned
    texts are returned as a Series that carries the index of ``result_5072``,
    so it can be assigned straight back onto that frame.
    """
    scrubbed = scrub_cfr_references(
        result_5072['COND_DESC'].tolist(),
        openai_client,
        open_ai_deployment_id,
        remove_cfr_references_system_prompt,
    )
    return pd.Series(scrubbed, index=result_5072.index)

def remove_excess_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is removed as well.

    Args:
        text: Text to normalise. Non-string values (for example NaN or None
            from an empty cell) are accepted.

    Returns:
        The normalised text, or an empty string for non-string input. This
        matches the extraction helpers, which also map non-strings to ''
        instead of raising.
    """
    if not isinstance(text, str):
        return ''
    return re.sub(r'\s+', ' ', text).strip()

def section_string_to_list(section_str: str) -> list[str]:
    """Split a comma-separated section string into a list of sections.

    Every item is stripped of surrounding whitespace and empty items are
    dropped. Non-string or blank input gives an empty list.
    """
    if not isinstance(section_str, str):
        return []
    stripped_items = (item.strip() for item in section_str.split(','))
    return [item for item in stripped_items if item]

# Derive the section labels and normalise the description text on the
# source frame.
data['50.72 section'] = data.apply(combine_5072_sections, axis=1)
data['COND_DESC'] = data['COND_DESC'].apply(remove_excess_whitespace)

# Build the ground truth on an independent copy. `data['50.72 section']`
# stays a comma-separated string because the plotting cells split it on
# commas; only the exported frame holds lists.
result = data[['COND_DESC', '50.72 section']].copy()
result['50.72 section'] = result['50.72 section'].apply(section_string_to_list)

# Keep only records with at least one section. The explicit copy makes the
# column assignment below act on a real frame rather than on a slice.
result = result[result['50.72 section'].apply(bool)].copy()
result['COND_DESC'] = get_cleaned_cond_desc(
    result,
    openai_client,
    open_ai_deployment_id,
    remove_cfr_references_system_prompt
)
result = result.rename(columns={'COND_DESC': 'content', '50.72 section': 'subsections'})

# os.makedirs('') raises, so only create the directory when the output path
# actually has a directory component.
output_dir = os.path.dirname(GROUND_TRUTH_5072)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
result.to_csv(GROUND_TRUTH_5072, index=False)

### Visual Representation of Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def _row_sections(value) -> list:
    """Return the lower-cased sections of one '50.72 section' cell.

    Accepts the comma-separated string form as well as a list/tuple/set of
    sections, so the plot works whichever form the column currently holds.
    """
    if isinstance(value, str):
        parts = value.split(',')
    elif isinstance(value, (list, tuple, set)):
        parts = value
    else:
        return []
    return [str(part).strip().lower() for part in parts if str(part).strip()]


# Count, per category, the records that cite it. A section counts for a
# category when it equals the category or is a sub-item of it (the category
# followed by another parenthesised part). Plain substring matching counted
# every "(xiii)" record under "(xii)" as well.
category_counts = {cat: 0 for cat in SECTIONS_5072}
for value in data['50.72 section'].dropna():
    row_sections = _row_sections(value)
    for cat in SECTIONS_5072:
        prefix = cat.lower()
        if any(s == prefix or s.startswith(prefix + '(') for s in row_sections):
            category_counts[cat] += 1

cat_df = pd.DataFrame(list(category_counts.items()), columns=['Category', 'Count'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=cat_df, x='Category', y='Count', ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel('50.72 Section Category')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of 50.72 Section Categories')
fig.tight_layout()

# Add count labels on top of bars
for i, count in enumerate(cat_df['Count']):
    ax.text(i, count, str(count), ha='center', va='bottom', fontweight='bold')

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def get_category_counts(df: pd.DataFrame, categories: "list[str] | None" = None) -> pd.DataFrame:
    """Count how many records of ``df`` cite each 50.72 category.

    A record counts for a category when one of its sections equals the
    category or is a sub-item of it (the category followed by another
    parenthesised part); the comparison is case-insensitive. Plain substring
    matching is not used because it counted every "(xiii)" record under
    "(xii)" too.

    Args:
        df: Frame with a '50.72 section' column. Each cell may be a
            comma-separated string or a list/tuple/set of sections; missing
            values are skipped.
        categories: Categories to count. Defaults to ``SECTIONS_5072``.

    Returns:
        A frame with one row per category and the columns 'Category' and
        'Count', in the order of ``categories``.
    """
    if categories is None:
        categories = SECTIONS_5072

    def _row_sections(value) -> list:
        # Normalise one cell to a list of lower-cased section strings.
        if isinstance(value, str):
            parts = value.split(',')
        elif isinstance(value, (list, tuple, set)):
            parts = value
        else:
            return []
        return [str(part).strip().lower() for part in parts if str(part).strip()]

    counts = {cat: 0 for cat in categories}
    for value in df['50.72 section'].dropna():
        row_sections = _row_sections(value)
        for cat in categories:
            prefix = cat.lower()
            if any(s == prefix or s.startswith(prefix + '(') for s in row_sections):
                counts[cat] += 1
    return pd.DataFrame(list(counts.items()), columns=['Category', 'Count'])

# Split the records by LER label and count the categories in each half.
df_ler_1 = data[data['LER_LABEL'] == 1.0]
df_ler_0 = data[data['LER_LABEL'] == 0.0]

cat_df_1 = get_category_counts(df_ler_1)
cat_df_0 = get_category_counts(df_ler_0)

total_1 = len(df_ler_1)
total_0 = len(df_ler_0)

# One panel per label, sharing the y axis so the bar heights are comparable.
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
panels = (
    (axes[0], cat_df_1, f'LER_LABEL = 1.0 (n={total_1})'),
    (axes[1], cat_df_0, f'LER_LABEL = 0.0 (n={total_0})'),
)
for panel_ax, panel_counts, panel_title in panels:
    sns.barplot(data=panel_counts, x='Category', y='Count', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('50.72 Section Category')
    panel_ax.tick_params(axis='x', rotation=45)
    # Add count labels on top of bars
    for position, count in enumerate(panel_counts['Count']):
        panel_ax.text(position, count, str(count), ha='center', va='bottom', fontweight='bold')

# Only the left panel gets the explicit y-axis label.
axes[0].set_ylabel('Frequency')

plt.tight_layout()
plt.show()