In [1]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer the helper functions below rely on.
# Replace MODEL_NAME with your own checkpoint to use a different tokenizer.
MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



In [177]:
def extract_entities_from_tokens(text, entities, tok=None):
    """
    Align entity annotations with tokenizer tokens by start offset.

    The text is tokenized with offset mappings and scanned once, left to right.
    Each time a token's start offset equals the ``start`` of the next pending
    entity, one record is emitted holding that token's decoded text (with the
    ``##`` continuation marker removed) and the entity's type/start/end; the
    scan then moves on to the following entity.

    Because the scan is a single forward pass, entities are expected in order
    of ``start``. An entity whose start never coincides with a token start is
    never emitted and also blocks the entities listed after it.

    :param text: Input text for tokenization.
    :param entities: List of entities with start, end, type, and text fields.
    :param tok: Optional tokenizer. Defaults to the notebook-level ``tokenizer``.
        It must be callable as ``tok(text, return_offsets_mapping=True)``,
        returning an object with ``input_ids`` and ``offset_mapping``, and
        provide ``decode(ids, skip_special_tokens=True)``.
    :return: List of dicts with ``text``, ``type``, ``start`` and ``end`` keys,
        one per entity that aligned with a token start.
    """
    if tok is None:
        # Fall back to the tokenizer created in the notebook's setup cell.
        tok = tokenizer

    # Tokenize the text with offset mappings
    encoding = tok(text, return_offsets_mapping=True)
    token_ids = encoding.input_ids
    offsets = encoding.offset_mapping

    gold_entities = []
    index = 0  # position of the next entity still waiting for a matching token

    for token_id, (tok_start, tok_end) in zip(token_ids, offsets):
        if index >= len(entities):
            break  # every entity has been matched

        # Zero-width spans are special tokens ([CLS]/[SEP] are reported as
        # (0, 0) by Hugging Face fast tokenizers). Without this guard, [CLS]
        # would "match" an entity starting at character 0 and consume it with
        # empty text.
        if tok_start == tok_end:
            continue

        entity = entities[index]
        if tok_start != entity["start"]:
            continue

        # Decode only this token, so each record carries its own text rather
        # than the concatenation of everything matched so far.
        token_text = tok.decode([token_id], skip_special_tokens=True).replace("##", "")
        gold_entities.append({
            "text": token_text,          # Text of the aligned token
            "type": entity["type"],      # Entity type
            "start": entity["start"],    # Entity start position
            "end": entity["end"],        # Entity end position
        })
        index += 1

    return gold_entities

In [181]:
def _build_merged_entity(merged_text, types, start, end):
    """Build one merged entity record from an accumulated group of pieces."""
    # max() over the list (not a set) returns the first element among equally
    # frequent types, so ties resolve to the earliest type in the group instead
    # of depending on set/hash iteration order.
    return {
        "text": merged_text,
        "type": max(types, key=types.count),
        "start": start,
        "end": end,
    }


def new_extract_entities_from_tokens(text, entities):
    """
    Merge character-contiguous sub-token entities into whole entities.

    Entities are scanned in the order given. An entity extends the current
    group when its ``start`` equals the group's current ``end``; otherwise the
    current group is emitted and a new one begins. ``##`` markers are removed
    from each entity's text before concatenation. A merged entity takes the
    most frequent type in its group; on a tie, the type seen first wins.

    A group whose accumulated text is empty is treated as "no group in
    progress" and is not emitted.

    :param text: Source text. Not used by the merge itself (all spans come from
        ``entities``); kept so existing calls keep working.
    :param entities: List of entities with start, end, type, and text fields,
        expected in order of position since merging is a single forward pass.
    :return: List of merged entities with ``text``, ``type``, ``start`` and
        ``end`` keys.
    """
    merged_entities = []
    temp_text = ""      # Concatenated text of the group being built
    temp_types = []     # Types of the entities in the current group
    start_pos = None    # Start of the current group
    end_pos = None      # End of the current group (end of its last entity)

    for entity in entities:
        token_text = entity["text"].replace("##", "")
        print(f"token text: {token_text}")  # trace of each cleaned piece

        if temp_text and entity["start"] == end_pos:
            # Contiguous with the current group: extend it.
            temp_text += token_text
            temp_types.append(entity["type"])
        else:
            # Gap (or first entity): emit the finished group, start a new one.
            if temp_text:
                merged_entities.append(
                    _build_merged_entity(temp_text, temp_types, start_pos, end_pos)
                )
            temp_text = token_text
            temp_types = [entity["type"]]
            start_pos = entity["start"]
        end_pos = entity["end"]

    # Emit the trailing group, if any.
    if temp_text:
        merged_entities.append(
            _build_merged_entity(temp_text, temp_types, start_pos, end_pos)
        )
    return merged_entities

In [179]:
# Sample sentence plus token-level NER predictions for it. The first five
# entries are back-to-back sub-token spans (53-63); the last one is separated
# from them by a gap.
text = "A new ransomware-as-a-service (RaaS) operation named Cicada3301 has already listed 19 victims on its extortion portal."
entities = [
    {"start": 53, "end": 54, "type": "I-ORG", "text": "C"},
    {"start": 54, "end": 57, "type": "I-MISC", "text": "##ica"},
    {"start": 57, "end": 59, "type": "I-MISC", "text": "##da"},
    {"start": 59, "end": 61, "type": "I-ORG", "text": "##33"},
    {"start": 61, "end": 63, "type": "I-ORG", "text": "##01"},
    {"start": 65, "end": 67, "type": "O", "text": "test"},
]

In [106]:
# `tokens` is never assigned at notebook level, so compute it here explicitly;
# otherwise this cell only works with state left over from an earlier session.
tokens = tokenizer(text).input_ids
print(tokens)

[101, 138, 1207, 25057, 7109, 118, 1112, 118, 170, 118, 1555, 113, 16890, 1161, 1708, 114, 2805, 1417, 140, 4578, 1810, 23493, 24400, 1144, 1640, 2345, 1627, 5256, 1113, 1157, 4252, 2772, 2116, 10823, 119, 102]


In [108]:
# Show the first entity record, then its start offset on the next line.
print(entities[0], entities[0]['start'], sep="\n")

{'start': 53, 'end': 54, 'type': 'I-ORG', 'text': 'C'}
53


In [91]:
# Check whether the sixth entity starts exactly where the fifth one ends,
# i.e. whether the two spans are contiguous.
entity1 = entities[5]['start']
entity2 = entities[4]['end']
print(
    f"True because: {entity1} = {entity2}"
    if entity1 == entity2
    else f"False because: {entity1} not equal {entity2}"
)

False because: 65 not equal 63


In [162]:
# Print the first five entity records, one per line.
print(*entities[:5], sep="\n")

{'start': 53, 'end': 54, 'type': 'I-ORG', 'text': 'C'}
{'start': 54, 'end': 57, 'type': 'I-MISC', 'text': '##ica'}
{'start': 57, 'end': 59, 'type': 'I-MISC', 'text': '##da'}
{'start': 59, 'end': 61, 'type': 'I-ORG', 'text': '##33'}
{'start': 61, 'end': 63, 'type': 'I-ORG', 'text': '##01'}


In [182]:
# Merge the contiguous sub-token entities of the sample and show the result.
token_aligned_entities = new_extract_entities_from_tokens(text=text, entities=entities)
print(token_aligned_entities)

token text: C
token text: ica
token text: da
token text: 33
token text: 01
token text: test
[{'text': 'Cicada3301', 'type': 'I-ORG', 'start': 53, 'end': 63}, {'text': 'test', 'type': 'O', 'start': 65, 'end': 67}]
