In [None]:
# pipeline-api
import os
import tempfile

from unstructured.documents.pdf import PDFPage, PDFDocument

In [None]:
import logging
# Notebook-level logger; INFO is the lowest level it lets through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [None]:
import os

# Sample documents are resolved relative to the current working directory,
# so the notebook must be started from the directory holding "sample-docs".
DIRECTORY = os.getcwd()
SAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "sample-docs")

# Path of the sample OER PDF that the cells below partition and structure.
filename = os.path.join(SAMPLE_DOCS_DIRECTORY, "fake-oer.pdf")

In [None]:
# pipeline-api
import warnings

def partition_oer(filename: str):
    """Loads the PDF stored at ``filename`` and returns its pages as a list"""
    document = PDFDocument(filename)

    # NOTE(robinson) - Reading the document triggers a warning that comes
    # from the detectron2 code. We have an issue to contribute back a fix
    # for that; until then warnings are silenced while the pages are read.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        document._read()
        page_list = list(document.pages)

    return page_list

In [None]:
# Partition the sample PDF; `pages` is reused by the cells that follow.
pages = partition_oer(filename)







## Cleaning Brick

In [None]:
# Print every element found on the first page, separated by blank lines.
for element in pages[0].elements:
    print(f"\n\n{element}")



c. SIGNIFICANT DUTIES AND RESPONSIBILITIES Personnel and Administration Officer (S1) for a training battalion in the U.S. Army reserve. Principal staff assistant to the battalion commander. Exercise staff supervisor in matters pertaining to strength management, personnel qualifications and evaluations, personnel assignment, clearance, recruiting, retention, and battalion administration. Responsible for the overall supervision of the battalion Personnel Administration Center (PAC) and its activities. Serves as commander of Headquarters and Headquarters Detachment. Additional duties include; Battalion Safety Officer, Equal Opportunity Officer, Records Management Officer, and Retention Officer.


1LT X performed flawlessly in the execution of an overseas detention and area security mission at Guantanamo Bay, Cuba. Exceptional performance during this limited rating period by CPT X.


In [None]:
# pipeline-api
import re


# Matches the block headers that extraction leaves inside a narrative, such as
# "c. SIGNIFICANT DUTIES AND RESPONSIBILITIES". The period is escaped so that
# only a literal "c." prefix counts; unescaped, it matches any character and
# would also strip the tail of an ordinary word ("tactics COMMENTS ...").
BLOCK_TITLE_RE = re.compile(r"c\. (SIGNIFICANT DUTIES AND RESPONSIBILITIES"
                            r"|COMMENTS ON POTENTIAL)")

def clean_block_titles(narrative: str) -> str:
    """Cleans the name of the block from the extracted narrative text

    Every occurrence of a known block title is removed, then leading and
    trailing whitespace is stripped from what remains.
    """
    return BLOCK_TITLE_RE.sub("", narrative).strip()

In [None]:
# Show the first element of page one with its block title removed.
clean_block_titles(pages[0].elements[0].text)

'Personnel and Administration Officer (S1) for a training battalion in the U.S. Army reserve. Principal staff assistant to the battalion commander. Exercise staff supervisor in matters pertaining to strength management, personnel qualifications and evaluations, personnel assignment, clearance, recruiting, retention, and battalion administration. Responsible for the overall supervision of the battalion Personnel Administration Center (PAC) and its activities. Serves as commander of Headquarters and Headquarters Detachment. Additional duties include; Battalion Safety Officer, Equal Opportunity Officer, Records Management Officer, and Retention Officer.'

In [None]:
# Print every element found on the second page, separated by blank lines.
for element in pages[1].elements:
    print(f"\n\n{element}")



1LT X’s exceptional command presence and resilience lends itself to consistent mission accomplishment, good order and discipline, and a positive climate. 1LT X’s outstanding attitude and thirst for knowledge exceeds those around him which contributes to his overall exceptional character.


1LT X is able to analyze a situation and introduce new ideas when opportunities exist, approaching challenging circumstances with creativity and intellect. 1LT X is highly proficient in interacting with others, effectively adjusting behaviors when interacting with superiors, peers, and subordinates.


1LT X demonstrates the full range of required influence techniques enabling him to speak, lead and motivate every person in his unit. 1LT X works with the Alameda County Sheriff’s office, as well as other outside agencies, in order to build positive relationships established that have enhanced unit training.


Absolute professional and squared away for duty; current on all applicable skills, knowledge

### Staging

In [None]:
# pipeline-api
# Keys under which structure_oer stores the leading elements of the second
# page. Order matters: the i-th element is stored under the i-th name.
COMMENT_BLOCKS = [
    "character",
    "presence",
    "intellect",
    "leads",
    "develops",
    "achieves",
]

In [None]:
# pipeline-api
def structure_oer(pages):
    """Creates a dictionary with the extracted elements of the OER

    Args:
        pages: sequence of page objects. Each page must expose an
            ``elements`` sequence whose items have a ``text`` attribute.
            Only the first two pages are read.

    Returns:
        A dict with ``duty_description`` (first element of page one with the
        block title removed), ``rater_comments`` (last element of page one),
        one entry per name in ``COMMENT_BLOCKS`` (the leading elements of
        page two, in order) and ``intermediate_rater`` (second-to-last
        element of page two).

    Raises:
        ValueError: if there are fewer than two pages, fewer than two
            elements on the first page, or fewer elements on the second page
            than there are names in ``COMMENT_BLOCKS``.
    """
    if len(pages) < 2:
        raise ValueError(f"Pages length is {len(pages)}. "
                          "Expected 2 pages.")

    structured_oer = dict()

    first_page = pages[0].elements
    if len(first_page) < 2:
        raise ValueError(f"Number of narrative text elements on the "
                         f"first page is {len(first_page)}. "
                          "Expected at least two.")

    duty_description = clean_block_titles(first_page[0].text)
    structured_oer["duty_description"] = duty_description
    structured_oer["rater_comments"] = first_page[-1].text

    second_page = pages[1].elements
    num_sections = len(COMMENT_BLOCKS)

    # This check must look at the second page. Testing the first page again
    # let a short second page through: the result was then silently missing
    # comment blocks, or the [-2] lookup below failed with an IndexError.
    if len(second_page) < num_sections:
        raise ValueError(f"Number of narrative text elements on the "
                         f"second page is {len(second_page)}. "
                         f"Expected at least {num_sections}.")

    # The length check above guarantees one element per comment block.
    for key, section in zip(COMMENT_BLOCKS, second_page):
        structured_oer[key] = section.text

    structured_oer["intermediate_rater"] = second_page[-2].text

    return structured_oer

In [None]:
# Structure the pages partitioned earlier into a dictionary.
oer = structure_oer(pages)

### API Definition

In [None]:
# pipeline-api
def pipeline_api(text):
    """Partitions and structures an OER supplied as raw PDF bytes

    ``text`` is written to a named temporary file because ``partition_oer``
    takes a filename. The file is closed before it is read back and is
    removed once processing finishes, whether or not it succeeded.

    Returns the dictionary built by ``structure_oer``.
    """
    tmp = tempfile.NamedTemporaryFile(prefix="tmp_", delete=False)
    try:
        # Leaving the with-block closes the handle, even if the write fails.
        with tmp:
            tmp.write(text)

        pages = partition_oer(tmp.name)
        return structure_oer(pages)
    finally:
        # delete=False means nothing else removes the file; without this
        # every call would leave a copy of the upload in the temp directory.
        os.unlink(tmp.name)

In [None]:
# Read the sample PDF as bytes and run it through the full pipeline.
with open(filename, "rb") as f:
    oer = pipeline_api(f.read())







In [None]:
import json

# Pretty-print the structured OER as JSON.
print(json.dumps(oer, indent=4))

{
    "duty_description": "Personnel and Administration Officer (S1) for a training battalion in the U.S. Army reserve. Principal staff assistant to the battalion commander. Exercise staff supervisor in matters pertaining to strength management, personnel qualifications and evaluations, personnel assignment, clearance, recruiting, retention, and battalion administration. Responsible for the overall supervision of the battalion Personnel Administration Center (PAC) and its activities. Serves as commander of Headquarters and Headquarters Detachment. Additional duties include; Battalion Safety Officer, Equal Opportunity Officer, Records Management Officer, and Retention Officer.",
    "rater_comments": "1LT X performed flawlessly in the execution of an overseas detention and area security mission at Guantanamo Bay, Cuba. Exceptional performance during this limited rating period by CPT X.",
    "character": "1LT X\u2019s exceptional command presence and resilience lends itself to consisten