In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# pipeline-api
import requests

In [3]:
import os
# Note(yuming): Move `import json` from next block because API covertion will redefine it
import json

# Note(yuming): If you are running the jupter notebook locally, please use DIRECTORY path with: 
DIRECTORY = os.path.split(os.getcwd())[0]
# DIRECTORY = os.getcwd()
SAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "sample-docs")

In [4]:
# pipeline-api
def partition_oer(file, filename, file_content_type=None, include_elems=["Text", "Title", "Table"]):
    response = requests.post(
        "https://dev.ml.unstructured.io/layout/pdf",
        files={"file": (filename, file, file_content_type)},
        data={"include_elems" : include_elems}
    )
    # NOTE(yuming): return the result from post request as a dictionary
    partition_result = json.loads(response.content.decode("utf-8"))
    return partition_result

In [5]:
filename = os.path.join(SAMPLE_DOCS_DIRECTORY, "fake-oer.pdf")
file = open(filename, "rb")

pages = partition_oer(file, filename)["pages"]

## Cleaning Brick

In [6]:
for element in pages[0]["elements"]:
    if element["type"] == "Text":
        print(f"\n\n{element['text']}")



c. SIGNIFICANT DUTIES AND RESPONSIBILITIES Personnel and Administration Officer (S1) for a training battalion in the U.S. Army reserve. Principal staff assistant to the battalion commander. Exercise staff supervisor in matters pertaining to strength management, personnel qualifications and evaluations, personnel assignment, clearance, recruiting, retention, and battalion administration. Responsible for the overall supervision of the battalion Personnel Administration Center (PAC) and its activities. Serves as commander of Headquarters and Headquarters Detachment. Additional duties include; Battalion Safety Officer, Equal Opportunity Officer, Records Management Officer, and Retention Officer.


1LT X performed flawlessly in the execution of an overseas detention and area security mission at Guantanamo Bay, Cuba. Exceptional performance during this limited rating period by CPT X.


In [7]:
# pipeline-api
import re

BLOCK_TITLE_PATTTERN = (r"c. (SIGNIFICANT DUTIES AND RESPONSIBILITIES"
                        r"|COMMENTS ON POTENTIAL):?")


BLOCK_TITLE_RE = re.compile(r"c. (SIGNIFICANT DUTIES AND RESPONSIBILITIES"
                            r"|COMMENTS ON POTENTIAL)")

def clean_block_titles(narrative: str) -> str:
    """Cleans the name of the block from the extracted narrative text"""
    return BLOCK_TITLE_RE.sub("", narrative).strip()

### Staging

In [8]:
pages[1]["elements"][0]

{'type': 'Table',
 'coordinates': [[7.690715312957764, 51.52677917480469],
  [594.323974609375, 51.52677917480469],
  [594.323974609375, 367.3587341308594],
  [7.690715312957764, 367.3587341308594]],
 'text': 'c. 1) Character : Developed AAR reporting template that standardized information across the battalion and ensured compliance with Army Regulations. She consistently presented appropriate and useful monthly reports on security clearances, weather effects, and threat assessments. (Gets Results) c. 6) Achieves : Absolute professional and squared away for duty; current on all applicable skills, knowledge, and mental toughness by engaging in engages in continual self-development. Using his extensive experience, 1LT X works well after normal duty hours, provides coaching, and counseling and mentoring. (Creates a positive command/workplace environment/Fosters Esprit de Corps, Prepares Self, Develops Others, Stewards the Profession) c. 5) Develops : 1LT X demonstrates the full range of r

In [9]:
# pipeline-api

RATER_SECTIONS = [
    "character",
    "presence",
    "intellect",
    "leads",
    "develops",
    "achieves",
]
SECTION_PATTERN = r"c. [1-6]\) ({0}) :".format("|".join(RATER_SECTIONS))


DESCRIPTIONS = [
    "Gets Results",
    "Creates a positive command/workplace environment/Fosters "
    "Esprit de Corps, Prepares Self, Develops Others, Stewards the Profession",
    "Leads Others, Builds Trust, Extends Influence beyond the Chain of Command, Leads by Example, Communicates",
    "Mental Agility, Sound Judgment, Innovation, Interpersonal Tact, Expertise",
    "Military and Professional Bearing, Fitness, Confident, Resilient",
    "Adherence to Army Values, Empathy, and Warrior Ethos/Service Ethos and Discipline. "
    "Fully supports SHARP, EO, and EEO."   
]
DESCRIPTION_PATTERN = r"({0})".format("|".join(DESCRIPTIONS))


def get_rater_sections(pages):
    """Extracts the Character, Presence, Intellect, Leads, Develops, and Achieves blocks
    from the rater comments and converts them to a dictionary."""
    rater_sections = dict()
    for element in pages[1]["elements"]:
        if re.search(SECTION_PATTERN, element["text"], flags=re.IGNORECASE):
            section_split = re.split(SECTION_PATTERN, element["text"], flags=re.IGNORECASE)
            for i, chunk in enumerate(section_split):
                if chunk.lower() in RATER_SECTIONS and i < len(section_split) - 1:
                    rater_sections[chunk.lower()] = section_split[i+1]
    return rater_sections

In [10]:
get_rater_sections(pages)

{'character': ' Developed AAR reporting template that standardized information across the battalion and ensured compliance with Army Regulations. She consistently presented appropriate and useful monthly reports on security clearances, weather effects, and threat assessments. (Gets Results) ',
 'achieves': ' Absolute professional and squared away for duty; current on all applicable skills, knowledge, and mental toughness by engaging in engages in continual self-development. Using his extensive experience, 1LT X works well after normal duty hours, provides coaching, and counseling and mentoring. (Creates a positive command/workplace environment/Fosters Esprit de Corps, Prepares Self, Develops Others, Stewards the Profession) ',
 'develops': ' 1LT X demonstrates the full range of required influence techniques enabling him to speak, lead and motivate every person in his unit. 1LT X works with the Alameda County Sheriff’s office, as well as other outside agencies, in order to build positiv

In [11]:
pages[1]["elements"][-2]

{'type': 'Table',
 'coordinates': [[29.08255958557129, 515.68212890625],
  [588.7683715820312, 515.68212890625],
  [588.7683715820312, 744.3233642578125],
  [29.08255958557129, 744.3233642578125]],
 'text': 'PART VI - SENIOR RATER POTENTIAL COMPARED WITH OFFICERS SENIOR RATED IN SAME GRADE (OVERPRINTED BY DA) MOST QUALIFIED (limited to 49%) HIGHLY QUALIFIED QUALIFIED NOT QUALIFIED b. I currently senior rate Army Officers in this grade. 1LT X is an intelligent and creative Officer with the potential to progress in rank as a leader. 1LT X is ready for positions of increased responsibilities; he will excel as a Staff Officer followed by Company Command if given the opportunity. Select for Military Police Captains Career Course and promote to captain when eligible. c. COMMENTS ON POTENTIAL: d. List 3 future SUCCESSIVE assignments for which this Officer is best suited: Battalion FDO; Battalion AS3; Battalion S4'}

In [12]:
# pipeline-api
from unstructured.cleaners.core import clean_postfix, clean_prefix
from unstructured.cleaners.extract import extract_text_after, extract_text_before


SENIOR_RATER_PREFIX = (r"PART VI - SENIOR RATER POTENTIAL COMPARED WITH OFFICERS SENIOR RATED IN SAME GRADE "
                       r"\(OVERPRINTED BY DA\) MOST QUALIFIED "
                       r"\(limited to 49%\) HIGHLY QUALIFIED QUALIFIED NOT QUALIFIED b. ")
        
NEXT_ASSIGNMENT_PREFIX = "d. List 3 future SUCCESSIVE assignments for which this Officer is best suited: "

def get_senior_rater_comments(pages):
    for element in pages[1]["elements"]:
        if re.search(SENIOR_RATER_PREFIX, element["text"]):
            raw_comments = clean_prefix(element["text"], SENIOR_RATER_PREFIX)
            
            sr_rater_comments = extract_text_before(raw_comments, NEXT_ASSIGNMENT_PREFIX)
            sr_rater_comments = clean_postfix(sr_rater_comments, BLOCK_TITLE_PATTTERN)
            
            
            next_assigments = extract_text_after(raw_comments, NEXT_ASSIGNMENT_PREFIX)
            
            return {"comments": sr_rater_comments, "next_assignment": next_assigments.split(";")}
    
    return dict()

In [17]:
get_senior_rater_comments(pages)

{'comments': 'I currently senior rate Army Officers in this grade. 1LT X is an intelligent and creative Officer with the potential to progress in rank as a leader. 1LT X is ready for positions of increased responsibilities; he will excel as a Staff Officer followed by Company Command if given the opportunity. Select for Military Police Captains Career Course and promote to captain when eligible.',
 'next_assignment': ['Battalion FDO', ' Battalion AS3', ' Battalion S4']}

In [18]:
# pipeline-api
def structure_oer(pages):
    """Creates a dictionary with the extracted elements of the OER.
    Input is a list of dictionaries,
    each dictionary contains raw information of a page as extracted from PDF parsing.
    Output is a dictionary,
    each key is a block name from COMMENT_BLOCKS 
    and the value is the extracted texts from the block.
    """
    if len(pages) < 2:
        raise ValueError(f"Pages length is {len(pages)}. "
                          "Expected 2 pages.")
    
    structured_oer = dict()
        
    first_page = [element for element in pages[0]["elements"] if element["type"] == "Text"]
    if len(first_page) < 2:
        raise ValueError(f"Number of narrative text elements on the "
                         f"first page is {len(first_page)}. "
                          "Expected at least two.")
    
    duty_description = clean_block_titles(first_page[0]["text"])
    structured_oer["duty_description"] = duty_description
    structured_oer["rater_comments"] = first_page[-1]["text"]
    structured_oer["rater_sections"] = get_rater_sections(pages)
    structured_oer["senior_rater_comments"] = get_senior_rater_comments(pages)
    
    second_page = [element for element in pages[1]["elements"] if element["type"] == "Text"]
    structured_oer["intermediate_rater"] = second_page[-2]["text"]
        
    return structured_oer

In [19]:
oer = structure_oer(pages)

In [20]:
oer

{'duty_description': 'Personnel and Administration Officer (S1) for a training battalion in the U.S. Army reserve. Principal staff assistant to the battalion commander. Exercise staff supervisor in matters pertaining to strength management, personnel qualifications and evaluations, personnel assignment, clearance, recruiting, retention, and battalion administration. Responsible for the overall supervision of the battalion Personnel Administration Center (PAC) and its activities. Serves as commander of Headquarters and Headquarters Detachment. Additional duties include; Battalion Safety Officer, Equal Opportunity Officer, Records Management Officer, and Retention Officer.',
 'rater_comments': '1LT X performed flawlessly in the execution of an overseas detention and area security mission at Guantanamo Bay, Cuba. Exceptional performance during this limited rating period by CPT X.',
 'rater_sections': {'character': ' Developed AAR reporting template that standardized information across the

### API Definition

In [21]:
# pipeline-api
def pipeline_api(file, file_content_type=None, filename=None):
    pages = partition_oer(file, filename, file_content_type=file_content_type)["pages"]

    return structure_oer(pages)

In [22]:
with open(filename, "rb") as f:
    oer = pipeline_api(f, filename=filename)

In [23]:
print(json.dumps(oer, indent=4))

{
    "duty_description": "Personnel and Administration Officer (S1) for a training battalion in the U.S. Army reserve. Principal staff assistant to the battalion commander. Exercise staff supervisor in matters pertaining to strength management, personnel qualifications and evaluations, personnel assignment, clearance, recruiting, retention, and battalion administration. Responsible for the overall supervision of the battalion Personnel Administration Center (PAC) and its activities. Serves as commander of Headquarters and Headquarters Detachment. Additional duties include; Battalion Safety Officer, Equal Opportunity Officer, Records Management Officer, and Retention Officer.",
    "rater_comments": "1LT X performed flawlessly in the execution of an overseas detention and area security mission at Guantanamo Bay, Cuba. Exceptional performance during this limited rating period by CPT X.",
    "rater_sections": {
        "character": " Developed AAR reporting template that standardized in