## 1. Installing + Importing Dependencies

In [None]:
# PDF parsing
from pdfminer.high_level import extract_text

# NLP libraries
import spacy

# Text processing
import re

# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# JSON data
import json

# System operations
import os

In [None]:
# Load the spaCy pipeline named 'en_core_web_sm'. The model package must already
# be installed (e.g. `python -m spacy download en_core_web_sm`), otherwise
# spacy.load raises. `nlp` is reused by the later cells in this notebook.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Smoke test: run the loaded pipeline on a short string and show the sentences
# it detects. NOTE: `doc` and `sentences` are rebound in section 2.3.
sample_text = 'We are excited to simplify the IRB approval process!'

doc = nlp(sample_text)

# Keep only the text of each sentence span produced by the pipeline.
sentences = [sent.text for sent in doc.sents]
print(f'Sentences: {sentences}')

## 2. Sample IRB PDF Parsing

In [None]:
# Path to the sample PDF, joined with the OS-specific separator. It is relative,
# so it resolves against the current working directory of the kernel.
pdf_path = os.path.join('data', 'raw', 'nhsr_guidance.pdf')

### 2.1. Text Extraction

In [None]:
# Extract the full text of the PDF at `pdf_path` into `text`.
# Only the extraction call is guarded, so a failure while printing the preview
# is not misreported as an extraction error.
try:
    text = extract_text(pdf_path)
except Exception as e:
    # Best-effort notebook cell: report the problem and continue. On this path
    # `text` is NOT (re)bound, so later cells will fail or reuse a stale value.
    print(f'An error occurred during PDF text extraction: {e}')
else:
    print('PDF text extraction successful!')
    print(f'First 800 characters of extracted text: {text[:800]}')

### 2.2. Text Pre-processing

In [None]:
def clean_text(text):
    """Strip page markers and stray one-character lines, then flatten whitespace.

    Steps, in order:
      1. Delete every ``Page X of Y`` marker (case-sensitive; the whitespace
         between the parts may include newlines).
      2. Drop each line that, once stripped, is exactly one word character
         (a letter, digit, or underscore).
      3. Join the surviving lines with spaces and collapse every run of
         whitespace into a single space.

    Returns the resulting string with leading/trailing whitespace removed.
    """
    without_page_markers = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text)

    kept_lines = []
    for raw_line in without_page_markers.split('\n'):
        # A lone word character on its own line is treated as extraction noise.
        if re.fullmatch(r'\w', raw_line.strip()) is None:
            kept_lines.append(raw_line)

    flattened = re.sub(r'\s+', ' ', ' '.join(kept_lines))
    return flattened.strip()

In [None]:
# Clean the raw extracted text and preview the result. Requires `text` from the
# extraction cell; if extraction failed, `text` may be undefined or stale.
cleaned_text = clean_text(text)

print(f'First 800 characters of cleaned text:\n\n{cleaned_text[:800]}')

In [None]:
def normalize_text(text):
    """Remove disallowed symbols and squeeze whitespace.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ? ; : /`` is deleted (so hyphens, quotes, and brackets disappear,
    while '/' survives, e.g. for dates). Whitespace runs are then reduced to a
    single space and the ends are trimmed.
    """
    # Inner call drops the symbols; outer call collapses the gaps left behind.
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s\.,!?;:/]', '', text)).strip()

In [None]:
# Normalize the cleaned text (symbol removal + whitespace squeeze) and preview it.
normalized_text = normalize_text(cleaned_text)

print(f'First 800 characters of normalized text:\n\n{normalized_text[:800]}')

### 2.3. Sentence Segmentation

In [None]:
# Run the spaCy pipeline over the entire normalized document. This rebinds
# `doc`, replacing the object created from the sample sentence in section 1.
doc = nlp(normalized_text)

In [None]:
# Collect the text of every sentence span found in the parsed document
# (rebinds `sentences` from the sample cell in section 1).
sentences = [sent.text for sent in doc.sents]

print(f'Total sentences extracted: {len(sentences)}')
print('First 10 sentences:\n')
# Number the preview starting at 1; the slice is safe when fewer than 10 exist.
for i, sentence in enumerate(sentences[:10], 1):
    print(f"{i}. {sentence}")