# Gather background data on the Holocaust 
## Dates, people, places and events

Material is extracted from the glossary page of the United States Holocaust Memorial Museum, https://www.ushmm.org/. 
    
The page was downloaded as a flat HTML file, and processed using the code below.

Because it is difficult to determine the correct superclass of each noun automatically, this notebook uses an interactive spreadsheet (qgrid) in which the ontologist inserts the appropriate superclasses and indicates whether a term is a type or an instance.

Note that this notebook must be executed in the classic Jupyter Notebook rather than in JupyterLab, because of problems with qgrid in JupyterLab.

In [1]:
## Imports

from bs4 import BeautifulSoup
import string
import pandas as pd
import qgrid

In [2]:
## Constants
# Turtle header written at the top of the generated file: one '@prefix' declaration
# per (prefix, namespace IRI) pair, followed by a blank line.
ttl_prefix = ''.join(
    f'@prefix {prefix}: <{iri}> . \n'
    for prefix, iri in (
        ('', 'urn:ontoinsights:ontology:dna:'),
        ('dna', 'urn:ontoinsights:ontology:dna:'),
        ('owl', 'http://www.w3.org/2002/07/owl#'),
        ('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
        ('rdfs', 'http://www.w3.org/2000/01/rdf-schema#'),
        ('xsd', 'http://www.w3.org/2001/XMLSchema#'),
    )
) + '\n'

In [12]:
def fix_up_name(text: str) -> tuple[str, str]:
    """
    Derive an ontology class name and a human-readable label from a glossary term.

    Quotation marks (straight and curly) are removed, the text is title-cased and
    surrounding whitespace is dropped. If the term contains a parenthetical, e.g.,
    'Kristallnacht (Night of Broken Glass)', the text before the parenthesis becomes
    the class name and the parenthetical becomes an alternative label, appended to
    the main label after '; '.

    The input parameter is the text that will become the class name.
    The output is a tuple of the class name (commas and spaces removed) and the
    'label' for the text.

    Note that str.title() lower-cases everything but the first letter of each word,
    so acronyms (e.g., 'SS' becomes 'Ss') must be corrected by hand afterwards.
    """

    # Remove straight and curly double quotation marks in a single pass
    label = text.translate(str.maketrans('', '', '"“”'))
    label = label.title().strip()  # Upper camel-cased

    main_text, open_paren, remainder = label.partition('(')
    main_text = main_text.strip()  # Avoid a stray space before the '; ' separator
    if open_paren:
        # Keep only what is inside the parentheses; a missing ')' keeps the whole remainder
        alt_text = remainder.partition(')')[0].strip()
        label_text = f'{main_text}; {alt_text}' if alt_text else main_text
    else:
        label_text = main_text
    # The class name is the main text without commas and spaces
    class_name = main_text.replace(',', '').replace(' ', '')
    return class_name, label_text
    

In [13]:
# Process the glossary
# NOTE(review): assumes the saved page is UTF-8 encoded - confirm. An explicit encoding
# keeps the curly quotes in the text from depending on the platform default.
with open('Glossary _ Holocaust Encyclopedia.html', 'r', encoding='utf-8') as gloss_in:
    gloss_page = gloss_in.read()

soup = BeautifulSoup(gloss_page, 'html.parser')
terms = soup.find('div', {'class': 'article-main-text'})
# print(terms) results are:
# <div class="article-main-text" id="story-text">
# <p><strong><a href="/narrative/3225/en">Antisemitism</a>:</strong> hostility toward or hatred of Jews as a religious or ethnic group, often accompanied by social, economic, or political discrimination.</p>
# <p><strong><em>Appellplatz</em>:</strong> German word for roll call square where prisoners were forced to assemble.</p>
# ...
# </div>
if terms is None:
    raise ValueError('No <div class="article-main-text"> element was found in the glossary page')

# Create lists of glossary terms, their labels and their definitions
gloss_terms = list()
gloss_labels = list()
gloss_defns = list()
gloss = ''
label = ''
new_defn = ''
for term in terms.find_all('p'):
    para_text = term.get_text()
    strong = term.find('strong')
    heading = strong.get_text().strip() if strong is not None else ''
    # Split on the FIRST colon only, so that colons inside the definition are kept
    name_text, colon, defn = para_text.partition(':')
    name_text = name_text.strip()
    # A new term is being defined only if the text before the first colon is the
    # (start of the) bold heading; a colon elsewhere in a paragraph is ordinary text
    if colon and name_text and heading.startswith(name_text):
        # Save the info of the previous term
        if gloss:
            gloss_terms.append(gloss)
            gloss_labels.append(label)
            gloss_defns.append(new_defn)
        gloss, label = fix_up_name(name_text)
        defn = defn.strip()
        # Make sure that the first character of the defn is upper case and that double quotes are escaped
        # (slicing instead of indexing tolerates an empty definition)
        new_defn = (defn[:1].upper() + defn[1:]).replace('"', '\\"')
    elif gloss:
        # Another paragraph but not a new term: append it to the current definition,
        # separated by a space so that sentences do not run together
        extra_text = para_text.strip().replace('"', '\\"')
        if extra_text:
            new_defn = f'{new_defn} {extra_text}'.strip()
    # Paragraphs that precede the first term are ignored
# When finished, save the last term (if any term was found)
if gloss:
    gloss_terms.append(gloss)
    gloss_labels.append(label)
    gloss_defns.append(new_defn)

# Turn the lists into a dataframe (the name avoids shadowing the built-in dict)
gloss_columns = {'Term': gloss_terms, 'Label': gloss_labels, 'Defn': gloss_defns}
gloss_df = pd.DataFrame(gloss_columns)
# Add columns to be hand-edited (using qgrid, next)
gloss_df['Superclass'] = ''
gloss_df['IsInstance'] = 'False'

In [14]:
# To run qgrid, need to 1) import it and 2) have executed:
#   jupyter nbextension enable --py --sys-prefix qgrid
#   jupyter nbextension enable --py --sys-prefix widgetsnbextension # only required if you have not enabled the ipywidgets nbextension yet
# Currently qgrid does not work in JupyterLab 3 

# QGrid is used to hand-edit the dataframe:
#   - Add superclass and instance info for the new concepts
#   - Fix up any acronyms (e.g., 'SS'), since they are not capitalized correctly due to .title()
#   - Add synonyms from the text to the labels (will be done automatically later)
grid_widget = qgrid.show_grid(gloss_df, show_toolbar=True)
# The widget is the last expression of the cell so that the notebook renders it
grid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [29]:
# Retrieve the dataframe from the grid widget (presumably including the hand edits made above - verify against qgrid docs)
updated_gloss = grid_widget.get_changed_df()
# Spot-check a single row (position 25) to confirm that the edits are present
print(updated_gloss.iloc[[25]])

             Term                                 Label  \
25  Kristallnacht  Kristallnacht; Night of Broken Glass   

                                                 Defn   Superclass IsInstance  
25  Usually referred to as the \"Night of Broken G...  PointInTime       True  


In [30]:
# Write out the dataframe as turtle
# Turtle documents are UTF-8 encoded, so do not rely on the platform default encoding
with open('holocaust-gloss.ttl', 'w', encoding='utf-8') as gloss_out:
    # Write the prefix details
    gloss_out.write(ttl_prefix)
    # Write out each gloss term
    for _, row in updated_gloss.iterrows():
        superclass_text = row['Superclass'].strip()
        if superclass_text == 'XXX':  # Term will be addressed manually
            continue
        # Superclasses are comma-separated; each one gets the ':' prefix regardless of
        # the spacing that was typed around the commas
        superclasses = ', '.join(f':{name.strip()}' for name in superclass_text.split(',') if name.strip())
        term_text = row['Term']
        # Labels are semicolon-separated; each one becomes its own quoted literal
        label_text = '", "'.join(part.strip() for part in row['Label'].split(';') if part.strip())
        defn_text = row['Defn']
        # Accept 'True' in any capitalization (or a real boolean) from the hand-edited column
        is_instance = str(row['IsInstance']).strip().lower() == 'true'
        if is_instance:
            # Without a superclass, fall back to a generic individual instead of the bare ':' name
            type_text = superclasses if superclasses else 'owl:NamedIndividual'
            gloss_out.write(f':{term_text} a {type_text} ;\n')
        else:
            gloss_out.write(f':{term_text} a owl:Class ;\n')
            # Omit the subclass statement when no superclass was entered
            if superclasses:
                gloss_out.write(f'  rdfs:subClassOf {superclasses} ;\n')
        gloss_out.write(f'  rdfs:label "{label_text}" ;\n  :defn "{defn_text}" .\n\n')

In [31]:
# Note that the resulting Turtle is then further hand-edited and stored in the /ontologies directory