## 1. Installing + Importing Dependencies

In [1]:
# PDF parsing
from pdfminer.high_level import extract_text

# NLP libraries
import spacy

# Text processing
import re

# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# JSON data
import json

# System operations
import os

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline once; later cells call `nlp(...)`
# and read sentence boundaries from the resulting Doc.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Smoke test: push one short sentence through the pipeline to confirm the
# model loaded and that sentence segmentation is available.
sample_text = 'We are excited to simplify the IRB approval process!'
doc = nlp(sample_text)

# Pull the raw text out of every sentence span the pipeline detected.
sentences = [span.text for span in doc.sents]

print(f'Sentences: {sentences}')

Sentences: ['We are excited to simplify the IRB approval process!']


## 2. Sample IRB PDF Parsing

In [4]:
# Relative path to the source PDF; os.path.join picks the separator for the host OS.
pdf_path = os.path.join('data', 'raw', 'nhsr_guidance.pdf')

### 2.1. Text Extraction

In [5]:
# Extract the full text of the PDF at `pdf_path` into `text`.
#
# On failure the error is reported and then re-raised: every later cell
# depends on `text`, so continuing would either hit a NameError far from the
# real cause or silently reuse a stale `text` left over from an earlier run.
try:
    text = extract_text(pdf_path)
except Exception as e:
    print(f'An error occurred during PDF text extraction: {e}')
    raise
else:
    # Only reached when extraction succeeded, so `text` is guaranteed fresh.
    print('PDF text extraction successful!')
    print(f'First 800 characters of extracted text: {text[:800]}')

PDF text extraction successful!
First 800 characters of extracted text: Guidance Document 
Version Date: 5/11/2023 

Not Human Subjects Research Determination 

45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or 
otherwise subject to regulation by any Federal department or agency that takes appropriate 
administrative action to make the policy applicable to such research.  As a result, IRBs must first 
determine that proposed activities meet the definitions of ‘research’ and ‘human subjects’ defined in the 
regulations. 

Introduction 

This document is designed to provide an approach to determining when a proposed activity is human 
subjects research.  Note, if WCM is not engaged in the research, there is no need to approach the 
human subjects research definition.  If WCM is engaged, first, the proposed activity must 


### 2.2. Text Pre-processing

In [6]:
def clean_text(text):
    """Strip pagination artifacts from extracted PDF text and collapse whitespace.

    Steps, in order:
      1. Delete 'Page X of Y' markers.
      2. Drop every line whose stripped content is exactly one word character
         (letter, digit, or underscore).
      3. Join the surviving lines with spaces and squeeze each run of
         whitespace down to a single space.

    Returns the resulting string with leading/trailing whitespace removed.
    """
    without_page_markers = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text)

    kept_lines = []
    for raw_line in without_page_markers.split('\n'):
        # A lone word character on its own line is treated as layout residue.
        if re.fullmatch(r'\w', raw_line.strip()) is None:
            kept_lines.append(raw_line)

    collapsed = re.sub(r'\s+', ' ', ' '.join(kept_lines))
    return collapsed.strip()

In [7]:
# Run the cleaning pass over the raw extracted text.
cleaned_text = clean_text(text)

# Preview the first 800 characters to spot-check the result.
print(f'First 800 characters of cleaned text:\n\n{cleaned_text[:800]}')

First 800 characters of cleaned text:

Guidance Document Version Date: 5/11/2023 Not Human Subjects Research Determination 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research. As a result, IRBs must first determine that proposed activities meet the definitions of ‘research’ and ‘human subjects’ defined in the regulations. Introduction This document is designed to provide an approach to determining when a proposed activity is human subjects research. Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition. If WCM is engaged, first, the proposed activity must satisfy the defini


In [8]:
def normalize_text(text):
    """Remove disallowed characters from `text` and collapse whitespace.

    Only word characters, whitespace, and the punctuation marks . , ! ? ; : /
    are kept; every other character (quotes, hyphens, brackets, ...) is
    deleted outright. Each run of whitespace is then reduced to one space.

    Returns the resulting string with leading/trailing whitespace removed.
    """
    disallowed = re.compile(r'[^\w\s\.,!?;:/]')
    whitespace_run = re.compile(r'\s+')

    filtered = disallowed.sub('', text)
    # Deleting characters can leave adjacent spaces behind, so collapse afterwards.
    single_spaced = whitespace_run.sub(' ', filtered)
    return single_spaced.strip()

In [9]:
# Apply character filtering on top of the already-cleaned text.
normalized_text = normalize_text(cleaned_text)

# Preview the first 800 characters to spot-check the result.
print(f'First 800 characters of normalized text:\n\n{normalized_text[:800]}')

First 800 characters of normalized text:

Guidance Document Version Date: 5/11/2023 Not Human Subjects Research Determination 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research. As a result, IRBs must first determine that proposed activities meet the definitions of research and human subjects defined in the regulations. Introduction This document is designed to provide an approach to determining when a proposed activity is human subjects research. Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition. If WCM is engaged, first, the proposed activity must satisfy the definition


### 2.3. Sentence Segmentation

In [10]:
# Run the spaCy pipeline over the normalized text; this rebinds `doc`,
# which previously held the sample sentence's result.
doc = nlp(normalized_text)

In [11]:
# Collect the text of every sentence span detected in the document.
sentences = [span.text for span in doc.sents]

# Report the total, then list the first ten sentences numbered from 1.
print(f'Total sentences extracted: {len(sentences)}')
print('First 10 sentences:\n')
for i, sentence in enumerate(sentences[:10], start=1):
    print(f'{i}. {sentence}')

Total sentences extracted: 165
First 10 sentences:

1. Guidance Document Version Date: 5/11/2023 Not Human Subjects Research Determination 45CFR46 and 21CFR50 apply to all research involving human subjects conducted, supported, or otherwise subject to regulation by any Federal department or agency that takes appropriate administrative action to make the policy applicable to such research.
2. As a result, IRBs must first determine that proposed activities meet the definitions of research and human subjects defined in the regulations.
3. Introduction This document is designed to provide an approach to determining when a proposed activity is human subjects research.
4. Note, if WCM is not engaged in the research, there is no need to approach the human subjects research definition.
5. If WCM is engaged, first, the proposed activity must satisfy the definition of research.
6. If the proposed activity meets this definition, and only if it meets this definition, IRBs may then approach the hum

### 2.4. Save Processed Data

In [13]:
# Destination file for the normalized text.
processed_text_path = os.path.join('data', 'clean', 'nhsr_guidance_clean.txt')
# Create the parent directory if needed; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.dirname(processed_text_path), exist_ok=True)

In [15]:
# Write the normalized text as UTF-8, overwriting any existing file at that path.
with open(processed_text_path, 'w', encoding='utf-8') as f:
    f.write(normalized_text)

print(f'Cleaned text saved to {processed_text_path}')

Cleaned text saved to data\clean\nhsr_guidance_clean.txt


In [16]:
# Destination file for the sentence list, stored next to the cleaned text.
sentences_json_path = os.path.join('data', 'clean', 'nhsr_guidance_sentences.json')
# Create the parent directory if needed; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.dirname(sentences_json_path), exist_ok=True)

In [None]:
# Persist the segmented sentences as a JSON array at `sentences_json_path`.
# (Previously this cell re-wrote `normalized_text` to `processed_text_path`,
# so the sentences file was never produced.)
with open(sentences_json_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
    json.dump(sentences, f, ensure_ascii=False, indent=2)

print(f'Sentences saved to {sentences_json_path}')