## 1. Importing Dependencies

In [1]:
# PDF parsing
from pdfminer.high_level import extract_text

# NLP libraries
import spacy

# Text processing
import re

# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# JSON data
import json

# System operations
import os

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline once; later cells call `nlp(...)` and read `doc.sents` from it.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Smoke test: run the pipeline on a one-sentence string and print the sentences it detects.
# NOTE: `doc` and `sentences` are rebound further down when the PDF text is segmented.
sample_text = 'We are excited to simplify the IRB approval process!'

doc = nlp(sample_text)

sentences = [sent.text for sent in doc.sents]
print(f'Sentences: {sentences}')

Sentences: ['We are excited to simplify the IRB approval process!']


## 2. Sample IRB PDF Parsing

In [4]:
# Relative path to the input PDF (data/raw/nhsr_guidance.pdf), resolved against the working directory.
pdf_path = os.path.join('data', 'raw', 'nhsr_guidance.pdf')

### 2.1. Text Extraction

In [5]:
# Extract the text of the PDF and preview its first 800 characters.
try:
    text = extract_text(pdf_path)
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `text`, so
    # swallowing the error here would only resurface as a NameError downstream.
    print(f'An error occurred during PDF text extraction: {e}')
    raise
else:
    print('PDF text extraction successful!')
    print(f'First 800 characters of extracted text: {text[:800]}')

PDF text extraction successful!
First 800 characters of extracted text: Guidance Document 
Version Date: 5/11/2023 

Not Human Subjects Research Determination 

45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or 
otherwise subject to regulation by any Federal department or agency that takes appropriate 
administrative action to make the policy applicable to such research.  As a result, IRBs must first 
determine that proposed activities meet the definitions of ‘research’ and ‘human subjects’ defined in the 
regulations. 

Introduction 

This document is designed to provide an approach to determining when a proposed activity is human 
subjects research.  Note, if WCM is not engaged in the research, there is no need to approach the 
human subjects research definition.  If WCM is engaged, first, the proposed activity must 


### 2.2. Text Pre-processing

In [6]:
def clean_text(text):
    """Remove page markers and stray one-character lines, then flatten whitespace.

    Args:
    ----
        text (str): Raw text, possibly spanning many lines.

    Returns:
    -------
        str: A single-line string with 'Page X of Y' markers removed, lines
        consisting of exactly one word character dropped, and every run of
        whitespace collapsed to one space (no leading/trailing whitespace).

    """
    # Drop "Page X of Y" markers wherever they occur
    without_page_markers = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text)

    # Keep every line except those that are exactly one word character once stripped
    kept_lines = []
    for raw_line in without_page_markers.split('\n'):
        if re.fullmatch(r'\w', raw_line.strip()) is None:
            kept_lines.append(raw_line)

    # Join the surviving lines and squeeze all whitespace runs down to one space
    flattened = re.sub(r'\s+', ' ', ' '.join(kept_lines))
    return flattened.strip()

In [7]:
# Apply clean_text to the raw extraction and preview the first 800 characters of the result.
cleaned_text = clean_text(text)

print(f'First 800 characters of cleaned text:\n\n{cleaned_text[:800]}')

First 800 characters of cleaned text:

Guidance Document Version Date: 5/11/2023 Not Human Subjects Research Determination 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research. As a result, IRBs must first determine that proposed activities meet the definitions of ‘research’ and ‘human subjects’ defined in the regulations. Introduction This document is designed to provide an approach to determining when a proposed activity is human subjects research. Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition. If WCM is engaged, first, the proposed activity must satisfy the defini


In [8]:
def normalize_text(text):
    """Delete disallowed symbols from the text and collapse whitespace.

    Any character that is not a word character, whitespace, or one of
    ``. , ! ? ; : /`` is removed (the slash is kept, e.g. for dates). Quotes,
    hyphens and parentheses are therefore deleted as well.

    Args:
    ----
        text (str): Text to normalize.

    Returns:
    -------
        str: The filtered text with whitespace runs reduced to a single space
        and no leading/trailing whitespace.

    """
    symbols_removed = re.sub(r'[^\w\s\.,!?;:/]', '', text)
    single_spaced = re.sub(r'\s+', ' ', symbols_removed)
    return single_spaced.strip()

In [9]:
# Apply normalize_text to the cleaned text and preview the first 800 characters of the result.
normalized_text = normalize_text(cleaned_text)

print(f'First 800 characters of normalized text:\n\n{normalized_text[:800]}')

First 800 characters of normalized text:

Guidance Document Version Date: 5/11/2023 Not Human Subjects Research Determination 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research. As a result, IRBs must first determine that proposed activities meet the definitions of research and human subjects defined in the regulations. Introduction This document is designed to provide an approach to determining when a proposed activity is human subjects research. Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition. If WCM is engaged, first, the proposed activity must satisfy the definition


### 2.3. Sentence Segmentation

In [10]:
# Run the spaCy pipeline over the normalized document text (rebinds `doc` from the earlier sample cell).
doc = nlp(normalized_text)

In [11]:
# Collect the text of every sentence spaCy detected (rebinds `sentences` from the sample cell),
# then report the total and show the first ten, numbered from 1.
sentences = [sent.text for sent in doc.sents]

print(f'Total sentences extracted: {len(sentences)}\n')
print('First 10 sentences:\n')
for i, sentence in enumerate(sentences[:10], 1):
    print(f'{i}. {sentence}')

Total sentences extracted: 165

First 10 sentences:

1. Guidance Document Version Date: 5/11/2023 Not Human Subjects Research Determination 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research.
2. As a result, IRBs must first determine that proposed activities meet the definitions of research and human subjects defined in the regulations.
3. Introduction This document is designed to provide an approach to determining when a proposed activity is human subjects research.
4. Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition.
5. If WCM is engaged, first, the proposed activity must satisfy the definition of research.
6. If the proposed activity meets this definition, and only if it meets this definition, IRBs may then approach the hu

### 2.4. Save Processed Data

In [12]:
# Output location for the normalized text; create the parent directory if it does not exist yet.
processed_text_path = os.path.join('data', 'clean', 'nhsr_guidance_clean.txt')
os.makedirs(os.path.dirname(processed_text_path), exist_ok=True)

In [13]:
# Write the normalized text as UTF-8, overwriting any existing file at that path.
with open(processed_text_path, 'w', encoding='utf-8') as f:
    f.write(normalized_text)

print(f'Cleaned text saved to {processed_text_path}')

Cleaned text saved to data\clean\nhsr_guidance_clean.txt


In [14]:
# Output location for the sentence list. It shares the data/clean directory with the text
# output, so with exist_ok=True this makedirs call does nothing if that directory already exists.
sentences_json_path = os.path.join('data', 'clean', 'nhsr_guidance_sentences.json')
os.makedirs(os.path.dirname(sentences_json_path), exist_ok=True)

In [15]:
# Serialize the sentence list as a JSON array; ensure_ascii=False writes non-ASCII characters
# as-is (UTF-8) instead of \u escapes, and indent=4 pretty-prints the array.
with open(sentences_json_path, 'w', encoding='utf-8') as f:
    json.dump(sentences, f, ensure_ascii=False, indent=4)

print(f'Sentences saved to {sentences_json_path}')

Sentences saved to data\clean\nhsr_guidance_sentences.json


## 3. Heading Extraction

In [16]:
# Section titles to look for in the segmented sentences. They are matched against text that
# has already been through normalize_text, which deletes characters such as parentheses —
# presumably why 'NHSR' appears here without surrounding brackets; verify against the PDF.
known_headings = [
    'Not Human Subjects Research Determination',
    'Introduction',
    'Step 1: Research Definition',
    'Step 2: Human Subjects Definition',
    'Public Health Surveillance Activities',
    'Food and Drug Administration',
    'Department of Defense',
    'Examples of Not Human Subjects Research NHSR',
    'Not Human Subjects Research Determination Checklist',
]

In [17]:
def identify_headings_and_split(sentences, known_headings):
    """Identify headings within sentences and split sentences accordingly.

    When a heading is found within a sentence, the sentence is split into:
        - before_heading
        - heading
        - after_heading
    The text before the heading stays with the previous section, the heading
    becomes its own entry, and the text after it starts the new section.

    Matching is case-sensitive and literal. Each heading is matched at most
    once across the whole input: after its first accepted match it is no
    longer searched for. Headings are tried longest first, and a match that
    overlaps a span already claimed by a longer heading in the same sentence
    is rejected (the shorter heading stays available for later text). This
    prevents a heading that is a substring of another from being emitted a
    second time for the same span. Empty heading strings are ignored.

    Args:
    ----
        sentences (list): List of sentence strings.
        known_headings (list): List of known heading titles.

    Returns:
    -------
        tuple:
            new_sentences (list): Updated list of sentences with headings as separate entries.
            headings_found (list): List of tuples (heading_title, new_sentence_index),
                where new_sentence_index is the 0-based position in new_sentences.

    """
    new_sentences = []
    headings_found = []

    # Precompile literal, case-sensitive patterns, longest heading first.
    # The dict also de-duplicates repeated titles.
    pending_patterns = {
        heading: re.compile(re.escape(heading))
        for heading in sorted(known_headings, key=len, reverse=True)
        if heading
    }

    for sentence in sentences:
        accepted_spans = []  # (start, end, heading) of non-overlapping matches

        # Iterate over a snapshot because accepted headings are removed from the dict.
        for heading, pattern in list(pending_patterns.items()):
            for match in pattern.finditer(sentence):
                overlaps = any(
                    match.start() < taken_end and taken_start < match.end()
                    for taken_start, taken_end, _ in accepted_spans
                )
                if overlaps:
                    # Span already belongs to a longer heading; try a later occurrence.
                    continue
                accepted_spans.append((match.start(), match.end(), heading))
                # Each heading is only reported once across the document.
                del pending_patterns[heading]
                break

        if not accepted_spans:
            # No headings found; keep the sentence as is.
            new_sentences.append(sentence)
            continue

        # Walk the headings left to right, emitting the text between them.
        accepted_spans.sort(key=lambda span: span[0])

        cursor = 0
        for start, end, heading in accepted_spans:
            before = sentence[cursor:start].strip()
            if before:
                new_sentences.append(before)

            # The heading becomes its own entry; record where it landed.
            new_sentences.append(heading)
            headings_found.append((heading, len(new_sentences) - 1))

            cursor = end

        # Whatever follows the last heading belongs to the new section.
        after = sentence[cursor:].strip()
        if after:
            new_sentences.append(after)

    return new_sentences, headings_found

In [18]:
# Split the segmented sentences around the known headings; keeps the expanded list and the (title, index) pairs.
new_sentences, headings_found = identify_headings_and_split(sentences, known_headings)

In [19]:
# Report how many of the expected headings were located, then list each (title, index) pair.
print(f'Total headings found: {len(headings_found)} out of {len(known_headings)} \n')
print('Headings:\n')
for heading in headings_found:
    print(heading)

# Plain string literal: nothing is interpolated here, so no f-prefix is needed.
print('\nUpdated Sentences:\n')
for i, sentence in enumerate(new_sentences[:10], 1):
    print(f'{i}. {sentence}')

Total headings found: 9 out of 9 

Headings:

('Not Human Subjects Research Determination', 1)
('Introduction', 4)
('Step 1: Research Definition', 14)
('Step 2: Human Subjects Definition', 35)
('Public Health Surveillance Activities', 43)
('Food and Drug Administration', 60)
('Department of Defense', 65)
('Examples of Not Human Subjects Research NHSR', 73)
('Not Human Subjects Research Determination Checklist', 129)

Updated Sentences:

1. Guidance Document Version Date: 5/11/2023
2. Not Human Subjects Research Determination
3. 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research.
4. As a result, IRBs must first determine that proposed activities meet the definitions of research and human subjects defined in the regulations.
5. Introduction
6. This document is designed to provide an approac

## 4. Hierarchy Structuring

In [20]:
import pprint

In [21]:
def structure_hierarchy(new_sentences, headings_found, known_headings):
    """Structures the document into a hierarchical dictionary based on identified headings.

    An entry of new_sentences starts a new section only when headings_found
    records that exact title at that exact index. A later ordinary sentence
    that merely equals a heading title is kept as content and does not reset
    the section. Sentences that appear before the first heading are collected
    under 'General Information'.

    Args:
    ----
        new_sentences (list): Updated list of sentences with headings as separate entries.
        headings_found (list): List of tuples (heading_title, new_sentence_index).
        known_headings (list): List of known heading titles. Currently unused;
            kept so existing callers do not break.

    Returns:
    -------
        dict: Maps each section title to {'Content': [sentence, ...]}.

    """
    structured_data = {}
    current_heading = None

    # Position -> title lookup, so headings are recognised by where the
    # splitter placed them rather than by text alone.
    heading_at = {idx: heading for heading, idx in headings_found}

    for position, sentence in enumerate(new_sentences):
        if position in heading_at and heading_at[position] == sentence:
            current_heading = sentence
            # setdefault: a repeated title extends the section instead of wiping it.
            structured_data.setdefault(current_heading, {'Content': []})
        elif current_heading is not None:
            structured_data[current_heading]['Content'].append(sentence)
        else:
            # Text that precedes the first heading.
            general = structured_data.setdefault('General Information', {'Content': []})
            general['Content'].append(sentence)

    return structured_data

In [22]:
# Group the split sentences under their headings, then print each section title
# followed by its {'Content': [...]} dictionary.
hierarchical_data = structure_hierarchy(new_sentences, headings_found, known_headings)

print('Hierarchical Structure:\n')
for data in hierarchical_data:
    print(f'{data}:\n{hierarchical_data[data]}\n\n')

Hierarchical Structure:

General Information:
{'Content': ['Guidance Document Version Date: 5/11/2023']}


Not Human Subjects Research Determination:
{'Content': ['45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research.', 'As a result, IRBs must first determine that proposed activities meet the definitions of research and human subjects defined in the regulations.']}


Introduction:
{'Content': ['This document is designed to provide an approach to determining when a proposed activity is human subjects research.', 'Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition.', 'If WCM is engaged, first, the proposed activity must satisfy the definition of research.', 'If the proposed activity meets this definition, and only if it meets this 