In [20]:
#! pip install pandas transformers
import json
import pandas as pd
from transformers import AutoTokenizer

with open('../labeled_data/gold_labels/reconstructed_gold_labels_2.json', 'r') as f:
    data = json.load(f)

texts = [item['text'] for item in data]
entities = [item['entities'] for item in data]
df = pd.DataFrame({'text': texts, 'entities': entities})

In [21]:
def find_nl_positions(text):
    nl_pos = []
    current = 0
    while text.find("\n", current) != -1:
        nl_pos.append(text.find("\n", current))
        current = text.find("\n", current) +1
    return nl_pos

In [22]:
df['nl_positions'] = df['text'].apply(
    lambda x: find_nl_positions(x)
)

In [23]:
df

Unnamed: 0,text,entities,nl_positions
0,A new ransomware-as-a-service (RaaS) operation...,"[{'start': 716, 'end': 732, 'type': 'ORG', 'te...","[162, 359, 532, 702, 869, 1140, 1238, 1434, 14..."
1,\nThe popular Docker-OSX project has been remo...,"[{'start': 14, 'end': 24, 'type': 'Software', ...","[0, 183, 411, 581, 702, 848, 1102, 1288, 1445,..."
2,\nA former core infrastructure engineer at an ...,"[{'start': 82, 'end': 97, 'type': 'LOC', 'text...","[0, 221, 524, 752, 1136, 1350, 1722, 2064, 226..."
3,\nThreat actors target Middle Eastern organiza...,"[{'start': 23, 'end': 37, 'type': 'LOC', 'text...","[0, 220, 520, 682, 871, 1013, 1164, 1369, 1494..."
4,"\nSince surfacing in February 2024, RansomHub ...","[{'start': 37, 'end': 46, 'type': 'MAL-ORG', '...","[0, 156, 529, 915, 1041, 1299, 1547, 1983, 198..."
...,...,...,...
56,\n​Russian law enforcement detained almost 100...,"[{'start': 2, 'end': 10, 'type': 'LOC', 'text'...","[0, 239, 464, 787, 1076, 1241, 1472, 1719, 202..."
57,\nThe Russian state-sponsored APT29 hacking gr...,"[{'start': 5, 'end': 13, 'type': 'LOC', 'text'...","[0, 209, 389, 524, 735, 972, 1082, 1313, 1486,..."
58,\nThe South Korea-aligned cyberespionage group...,"[{'start': 6, 'end': 25, 'type': 'LOC', 'text'...","[0, 207, 366, 571, 799, 974, 1178, 1332, 1517,..."
59,\nThe Federal Communications Commission (FCC) ...,"[{'start': 6, 'end': 39, 'type': 'ORG', 'text'...","[0, 196, 436, 610, 831, 1056, 1225, 1450, 1869..."


In [25]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
df['tokenized'] = df['text'].apply(
    lambda x: tokenizer(x, return_offsets_mapping=True, truncation=True, padding=True)
)

In [26]:
def align_labels_to_tokens(text, entities, tokenized, nl_positions):
    offset_mapping = tokenized['offset_mapping']
    labels = ["O"] * len(offset_mapping) # Initialize all tokens with "O"

    for entity in entities:
        start, end, label_type = entity['start'], entity['end'], entity['type']
        nls_before_entity = 0
        for nl_pos in nl_positions:
            if nl_pos < start:
                nls_before_entity +=1
            else:
                break
        start -= nls_before_entity
        end -= nls_before_entity
        entity_started = False
        #print(entity)

        for idx, (token_start, token_end) in enumerate(offset_mapping):
            #print(token_start, token_end)
            if token_start is None or token_end is None:
                continue
            if token_start >= start and token_end <= end:
                if entity_started:
                    labels[idx] = f"I-{label_type}"
                else:
                    labels[idx] = f"B-{label_type}"
                    entity_started = True
            else:
                entity_started = False

    return labels

df['labels'] = df.apply(lambda row: align_labels_to_tokens(row['text'], row['entities'], row['tokenized'], row['nl_positions']), axis=1)

In [27]:
df

Unnamed: 0,text,entities,nl_positions,tokenized,labels
0,A new ransomware-as-a-service (RaaS) operation...,"[{'start': 716, 'end': 732, 'type': 'ORG', 'te...","[162, 359, 532, 702, 869, 1140, 1238, 1434, 14...","[input_ids, token_type_ids, attention_mask, of...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,\nThe popular Docker-OSX project has been remo...,"[{'start': 14, 'end': 24, 'type': 'Software', ...","[0, 183, 411, 581, 702, 848, 1102, 1288, 1445,...","[input_ids, token_type_ids, attention_mask, of...","[O, O, O, B-Software, I-Software, I-Software, ..."
2,\nA former core infrastructure engineer at an ...,"[{'start': 82, 'end': 97, 'type': 'LOC', 'text...","[0, 221, 524, 752, 1136, 1350, 1722, 2064, 226...","[input_ids, token_type_ids, attention_mask, of...","[O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-..."
3,\nThreat actors target Middle Eastern organiza...,"[{'start': 23, 'end': 37, 'type': 'LOC', 'text...","[0, 220, 520, 682, 871, 1013, 1164, 1369, 1494...","[input_ids, token_type_ids, attention_mask, of...","[O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O..."
4,"\nSince surfacing in February 2024, RansomHub ...","[{'start': 37, 'end': 46, 'type': 'MAL-ORG', '...","[0, 156, 529, 915, 1041, 1299, 1547, 1983, 198...","[input_ids, token_type_ids, attention_mask, of...","[O, O, O, O, O, O, O, O, O, O, B-MAL-ORG, I-MA..."
...,...,...,...,...,...
56,\n​Russian law enforcement detained almost 100...,"[{'start': 2, 'end': 10, 'type': 'LOC', 'text'...","[0, 239, 464, 787, 1076, 1241, 1472, 1719, 202...","[input_ids, token_type_ids, attention_mask, of...","[O, B-LOC, B-Event, I-Event, I-Event, I-Event,..."
57,\nThe Russian state-sponsored APT29 hacking gr...,"[{'start': 5, 'end': 13, 'type': 'LOC', 'text'...","[0, 209, 389, 524, 735, 972, 1082, 1313, 1486,...","[input_ids, token_type_ids, attention_mask, of...","[O, O, B-LOC, O, O, O, B-MAL-ORG, I-MAL-ORG, I..."
58,\nThe South Korea-aligned cyberespionage group...,"[{'start': 6, 'end': 25, 'type': 'LOC', 'text'...","[0, 207, 366, 571, 799, 974, 1178, 1332, 1517,...","[input_ids, token_type_ids, attention_mask, of...","[O, O, B-LOC, I-LOC, I-LOC, I-LOC, O, O, O, O,..."
59,\nThe Federal Communications Commission (FCC) ...,"[{'start': 6, 'end': 39, 'type': 'ORG', 'text'...","[0, 196, 436, 610, 831, 1056, 1225, 1450, 1869...","[input_ids, token_type_ids, attention_mask, of...","[O, O, B-ORG, I-ORG, I-ORG, O, B-ORG, O, B-Eve..."


In [28]:
# Save to JSON or CSV format
output_data = []
for _, row in df.iterrows():
    tokens = tokenizer.convert_ids_to_tokens(row['tokenized']['input_ids'], skip_special_tokens=False)
    labels = row['labels']
    output_data.append({'tokens': tokens, 'labels': labels})

# Save the processed data
#with open('tokenized_ner_data_6.json', 'w') as f:
    #json.dump(output_data, f, indent=4)

In [33]:
for i, label in enumerate(output_data[0]['labels']):
    if label != "O":
        print("Position: ", "\t Token: ", output_data[0]['tokens'][i] + "\t Label: ", output_data[0]['labels'][i])

Token:  C	 Label:  B-MAL-ORG
Token:  ##ica	 Label:  I-MAL-ORG
Token:  ##da	 Label:  I-MAL-ORG
Token:  ##33	 Label:  I-MAL-ORG
Token:  ##01	 Label:  I-MAL-ORG
Token:  C	 Label:  B-MAL-ORG
Token:  ##ica	 Label:  I-MAL-ORG
Token:  ##da	 Label:  I-MAL-ORG
Token:  ##33	 Label:  I-MAL-ORG
Token:  ##01	 Label:  I-MAL-ORG
Token:  B	 Label:  B-ORG
Token:  ##lee	 Label:  I-ORG
Token:  ##ping	 Label:  I-ORG
Token:  ##C	 Label:  I-ORG
Token:  ##om	 Label:  I-ORG
Token:  ##pute	 Label:  I-ORG
Token:  ##r	 Label:  I-ORG
Token:  C	 Label:  B-MAL-ORG
Token:  ##ica	 Label:  I-MAL-ORG
Token:  ##da	 Label:  I-MAL-ORG
Token:  C	 Label:  B-MAL-ORG
Token:  ##ica	 Label:  I-MAL-ORG
Token:  ##da	 Label:  I-MAL-ORG
Token:  ##33	 Label:  I-MAL-ORG
Token:  ##01	 Label:  I-MAL-ORG
Token:  conducts	 Label:  B-Event
Token:  double	 Label:  I-Event
Token:  -	 Label:  I-Event
Token:  ex	 Label:  I-Event
Token:  ##tor	 Label:  I-Event
Token:  ##tion	 Label:  I-Event
Token:  tactics	 Label:  I-Event
Token:  where	 Labe

In [12]:
def find_nl_positions(text):
    nl_pos = []
    current = 0
    while text.find("\n", current) != -1:
        nl_pos.append(text.find("\n", current))
        current = text.find("\n", current) +1
    return nl_pos

In [13]:
nl_pos

NameError: name 'nl_pos' is not defined

In [14]:
labels = align_labels_to_tokens(text, df.iloc[0]['entities'], df.iloc[0]['tokenized'], nl_pos)

NameError: name 'text' is not defined

In [15]:
for i, label in enumerate(labels):
    if label != "O":
        print(output_data[0]['tokens'][i])

A
new
ransom
##ware
-
as
-
a
-
service
(
Ra
##a
##S
)
operation
named
C
##ica
##da
##33
##01
has
already
listed
19
victims
on
its
ex
##tor
##tion
portal
,
as
it
quickly
attacked
companies
worldwide
.
The
new
c
##y
##ber
##c
##rim
##e
operation
game
that
cry
##pt
##ographic
legitimate
project
has
issued
a
forum
post
##ware
and
c
known
as
RAM
##P
are
then
used
paying
a
ransom
of
the
ma
##l
##ware
by
True
involving
about
an
stole
a
massive
$
22
##tus
b
That
b
##ot
Pa
##H
shut
-
based


IndexError: list index out of range