# Using GLiNER and spaCy for Named Entity Recognition

In [1]:
import glob

import datasets
import pandas as pd
import spacy
import torch
from tqdm import tqdm

In [2]:
# Load the "placingholocaust/testimony-metadata" dataset and keep its "train" split.
metadata = datasets.load_dataset("placingholocaust/testimony-metadata")["train"]
# Convert it to a DataFrame; later cells look rows up by the "RG Number" column.
testimonies_metadata = pd.DataFrame(metadata)
# Bare last expression so the notebook renders the table as the cell output.
testimonies_metadata

Unnamed: 0,RG Number,PDF URL,USHMM URL,First Name,Middle Name,Last Name,Birth Name,Gender,Birth Date,Birth Year,...,Ghetto,Camp(s) Encyclopedia,Camp,Non-SS Camp,Region,Needs Research,Data Entry,Accession,Notes:,Revisit
0,RG-50.549.02.0033,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Hetty,d'Ancona de,Leeuwe,Hetty D'Ancona,F,1930-05-01,1930.0,...,,,,,,,CL,1999.A.0293,,
1,RG-50.549.02.0072,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Emanuel,,Mandel,,M,,1936.0,...,,,,,,checked,GG,2003.205,Follow-up interview,
2,RG-50.549.02.0035,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Judith,,Meisel,,F,,1929.0,...,Kaunas,,,,,checked,GG,1999.A.0024,This is a follow-up interview to one already d...,checked
3,RG-50.471.0015,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Esther,,Lurie,,F,,,...,,,,,,,CL,1998.A.0119.15,,
4,RG-50.030.0585,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Eugene,,Miller,,M,1923-10-16,1923.0,...,Lodz,"Auschwitz,Dachau",,,,checked,GG,2010.249,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,RG-50.549.02.0073,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Flory,,Jagoda,,F,1923-12-21,1923.0,...,,,,,,,GG,2004.48,Follow-up,checked
973,RG-50.030.0137,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Cornelius,,Loen,,M,1922-05-02,1922.0,...,,,,,,,CL,1990.437.1,,
974,RG-50.030.0058,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Isaac,,Danon,,M,,1929.0,...,,,,,,,GG,,,
975,RG-50.549.02.0078,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,Lucie,,Rosenberg,,F,,1921.0,...,,,,,,checked,CL,2004.214,"Not a survivor, volunteered for the museum?",


In [3]:
# Show the metadata row(s) whose "RG Number" equals this one testimony id.
testimonies_metadata.loc[
    testimonies_metadata["RG Number"].eq("RG-50.030.0001")
]

Unnamed: 0,RG Number,PDF URL,USHMM URL,First Name,Middle Name,Last Name,Birth Name,Gender,Birth Date,Birth Year,...,Ghetto,Camp(s) Encyclopedia,Camp,Non-SS Camp,Region,Needs Research,Data Entry,Accession,Notes:,Revisit
602,RG-50.030.0001,https://collections.ushmm.org/oh_findingaids/R...,https://collections.ushmm.org/search/catalog/i...,David,A.,Kochalski,,M,1928-05-05,1928.0,...,,,,,West,,CL,,,


In [4]:
# Display the first record as a Series: one line per column with its value.
testimonies_metadata.iloc[0]

RG Number                                                 RG-50.549.02.0033
PDF URL                   https://collections.ushmm.org/oh_findingaids/R...
USHMM URL                 https://collections.ushmm.org/search/catalog/i...
First Name                                                            Hetty
Middle Name                                                     d'Ancona de
Last Name                                                            Leeuwe
Birth Name                                                   Hetty D'Ancona
Gender                                                                    F
Birth Date                                                       1930-05-01
Birth Year                                                           1930.0
Place of Birth                                                         None
Country                                                                None
Experience Group                                                   Survivor
Ghetto(s) En

In [5]:
import spacy
from bs4 import BeautifulSoup
import os
from pathlib import Path
import pandas as pd

def create_yaml_header(row):
    """Build the YAML-style header block for one testimony.

    Args:
        row: A pandas Series holding one row of the testimony metadata,
            indexed by column name (e.g. "RG Number", "First Name").

    Returns:
        A string that starts with a ``---`` line, carries one ``key: value``
        line per metadata field, and ends with ``---`` followed by a blank
        line. A column that is absent from the row, or whose value is
        missing (NaN/None), is written as ``none``.
    """

    def safe_get(column, default='none'):
        # Lower-cased string form of the cell, or `default` when the column
        # is not in the row or the cell holds a missing value.
        # NOTE(review): this lower-cases every value, URLs and RG numbers
        # included - confirm the consumer does not need the original casing.
        if column in row.index and pd.notna(row[column]):
            return str(row[column]).lower()
        return default

    # Build the name only from the parts that are present. Formatting all
    # three parts unconditionally put the placeholder word inside the name
    # (e.g. "emanuel none mandel") whenever the middle name was missing.
    name_parts = (
        safe_get(column, '')
        for column in ('First Name', 'Middle Name', 'Last Name')
    )
    interviewee = ' '.join(part for part in name_parts if part) or 'none'

    # (header key, metadata column) pairs, in the order they are emitted.
    fields = (
        ('rg_number', 'RG Number'),
        ('pdf_url', 'PDF URL'),
        ('ushmm_url', 'USHMM URL'),
        ('gender', 'Gender'),
        ('birth_date', 'Birth Date'),
        ('birth_year', 'Birth Year'),
        ('place_of_birth', 'Place of Birth'),
        ('country', 'Country'),
        ('experience_group', 'Experience Group'),
        ('ghetto(s)_encyclopedia', 'Ghetto(s) Encyclopedia'),
        ('ghetto', 'Ghetto'),
        ('camp(s)_encyclopedia', 'Camp(s) Encyclopedia'),
        ('camp', 'Camp'),
        ('non_ss_camp', 'Non-SS Camp'),
        ('region', 'Region'),
        ('needs_research', 'Needs Research'),
        ('data_entry', 'Data Entry'),
        ('accession', 'Accession'),
        ('revisit', 'Revisit'),
    )

    lines = ["---", "layout: transcript", f"interviewee: {interviewee}"]
    lines.extend(f"{key}: {safe_get(column)}" for key, column in fields)
    lines.extend(["tags: transcripts", "---"])

    # Closing delimiter is followed by one empty line, as before.
    return "\n".join(lines) + "\n\n"

def process_html_with_spacy(html_content, nlp_model, yaml_header):
    """Prepend the header and wrap every recognised entity in a <span>.

    Args:
        html_content: HTML markup containing <sentence> elements.
        nlp_model: Callable that takes a string and returns an object whose
            ``ents`` items expose ``start_char``, ``end_char`` and ``label_``.
        yaml_header: Text placed inside a <pre> element at the very start of
            the document.

    Returns:
        The modified document serialised back to a string.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The header travels in a <pre> element inserted ahead of everything else.
    pre = soup.new_tag("pre")
    pre.string = yaml_header
    soup.insert(0, pre)

    for node in soup.find_all('sentence'):
        # Work from the plain text; the element is emptied and rebuilt, so
        # only its text content carries over.
        raw = node.get_text()
        doc = nlp_model(raw)
        node.clear()

        cursor = 0  # character offset up to which `raw` has been re-emitted
        for entity in doc.ents:
            begin, end = entity.start_char, entity.end_char

            # Untagged text between the previous entity and this one.
            node.append(raw[cursor:begin])

            # NOTE(review): the label is used verbatim as the class value; a
            # label containing a space reads as several class names in HTML -
            # confirm the stylesheet expects that.
            wrapper = soup.new_tag("span", attrs={"class": entity.label_})
            wrapper.string = raw[begin:end]
            node.append(wrapper)

            cursor = end

        # Whatever follows the last entity (the whole text if there was none).
        node.append(raw[cursor:])

    return str(soup)

def process_files(input_folder, output_folder, nlp_model, testimonies_data):
    """Annotate every ``.html`` file of a folder and write the results.

    Args:
        input_folder: Directory whose ``.html`` files are read.
        output_folder: Directory the annotated files are written to under the
            same file names; created (with parents) if it does not exist.
        nlp_model: Passed through to ``process_html_with_spacy``.
        testimonies_data: DataFrame with an "RG Number" column; the text
            before the first underscore of each file name is matched
            against it.

    Files without a matching metadata row are reported and skipped.
    """
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    for filename in tqdm(sorted(os.listdir(input_folder))):
        if not filename.endswith('.html'):
            continue

        # The file name starts with the RG number, followed by "_".
        rg_number = filename.split('_')[0]
        matches = testimonies_data[testimonies_data['RG Number'] == rg_number]
        if matches.empty:
            print(f"No matching data found for {filename}. Skipping this file.")
            continue

        # Only the first matching row is used for the header.
        yaml_header = create_yaml_header(matches.iloc[0])

        with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as source:
            html_content = source.read()

        annotated = process_html_with_spacy(html_content, nlp_model, yaml_header)

        with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as target:
            target.write(annotated)

# Entity categories the GLiNER component is asked to tag.
labels = ["dlf", "populated place", "country", "region", "interior space", "env feature", "building", "spatial object"]

# Choose the torch device at run time. "mps" stays the first choice, so a
# machine that has it behaves exactly as before; other machines fall back to
# CUDA and then CPU instead of failing on a hard-coded "mps".
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Empty English pipeline with GLiNER as its only component.
nlp = spacy.blank("en")
nlp.add_pipe("gliner_spacy", config={"gliner_model": "placingholocaust/gliner_small-v2.1-holocaust", "labels": labels, "chunk_size": 250, "map_location": device})

# Annotate every sentence-level HTML file and prepend its metadata header.
process_files("../data/03_html_sentences/", "../data/04_html_ner", nlp, testimonies_metadata)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 71392.41it/s]
  state_dict = torch.load(model_file, map_location=torch.device(map_location))
  0%|          | 0/979 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
import spacy
from bs4 import BeautifulSoup
import os
from pathlib import Path

def process_html_with_spacy(html_content, nlp_model):
    """Wrap every entity found inside <sentence> elements in a <span>.

    Args:
        html_content: HTML markup containing <sentence> elements.
        nlp_model: Callable that takes a string and returns an object whose
            ``ents`` items expose ``start_char``, ``end_char`` and ``label_``.

    Returns:
        The modified document serialised back to a string.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    def annotate(element):
        # Rebuild one <sentence>: plain text is kept as-is, each entity is
        # wrapped in <span class="LABEL">.
        plain = element.get_text()
        entities = nlp_model(plain).ents
        element.clear()

        offset = 0  # how much of `plain` has been written back so far
        for ent in entities:
            element.append(plain[offset:ent.start_char])

            # NOTE(review): a label containing a space reads as several class
            # names in HTML - confirm the stylesheet expects that.
            tag = soup.new_tag("span", attrs={"class": ent.label_})
            tag.string = plain[ent.start_char:ent.end_char]
            element.append(tag)

            offset = ent.end_char

        # Tail after the last entity (the full text when none were found).
        element.append(plain[offset:])

    for element in soup.find_all('sentence'):
        annotate(element)

    return str(soup)

def process_files(input_folder, output_folder, nlp_model, limit=1):
    """Annotate the first ``limit`` ``.html`` files of a folder (trial run).

    NOTE: this definition replaces the four-argument ``process_files`` from
    the earlier cell; this variant writes no metadata header.

    Args:
        input_folder: Directory whose ``.html`` files are read.
        output_folder: Directory the annotated files are written to under the
            same file names; created (with parents) if it does not exist.
        nlp_model: Passed through to ``process_html_with_spacy``.
        limit: Maximum number of ``.html`` files to process, taken in sorted
            name order. Defaults to 1, the original trial-run size; pass
            ``None`` to process all of them.
    """
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # Filter BEFORE limiting: slicing the raw listing first meant that a
    # non-HTML entry sorting ahead of the transcripts (e.g. ".DS_Store")
    # used up the single slot and nothing was processed.
    html_files = sorted(
        name for name in os.listdir(input_folder) if name.endswith('.html')
    )
    if limit is not None:
        html_files = html_files[:limit]

    for filename in html_files:
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        with open(input_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        processed_html = process_html_with_spacy(html_content, nlp_model)

        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(processed_html)

        print(f"Processed {filename}")

# Entity categories the GLiNER component is asked to tag.
labels = [
    "dlf",
    "populated place",
    "country",
    "region",
    "interior space",
    "env feature",
    "building",
    "spatial object",
]

# Empty English pipeline with GLiNER as its only component. No "map_location"
# is passed here, so presumably the component's default device is used.
nlp = spacy.blank("en")
nlp.add_pipe(
    "gliner_spacy",
    config={
        "gliner_model": "placingholocaust/gliner_small-v2.1-holocaust",
        "labels": labels,
        "chunk_size": 250,
    },
)

# Trial run on the sentence-level HTML folder.
# Alternative pipeline noted by the author: spacy.load("en_core_web_sm").
process_files("../data/03_html_sentences/", "../data/04_html_ner", nlp)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 33554.43it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processed RG-50.030.0001_trs_en_cleaned.html
