In [None]:
from pathlib import Path
import jsonlines
import torch

from nlp_cyber_ner.dataset import read_cyner, read_aptner, read_attacker, read_dnrti
from nlp_cyber_ner.dataset import read_iob2_file
from nlp_cyber_ner.dataset import unify_labels_aptner
from nlp_cyber_ner.dataset import clean_aptner, clean_dnrti
from nlp_cyber_ner.dataset import transform_dataset
from nlp_cyber_ner.config import PROCESSED_DATA_DIR, RAW_DATA_DIR, INTERIM_DATA_DIR, TOKENPROCESSED_DATA_DIR
from nlp_cyber_ner.dataset import Preprocess 
import copy

# For reference, the code that produces the data in tokenprocessed is in this notebook. The training scripts for the tokenmodels expect this data.

#### APTNER: We keep the labels that we ended up mapping to CYNER, plus the remaining labels that are not trivial to recognise, so that the dataset keeps more of its own identity. Labels we consider trivial or generic (IP addresses, hashes, locations, times) are dropped.

Note: CYNER describes Vulnerability as follows: "Vulnerability includes both CVE ID (e.g., CVE-2012-2825) and mention of exploits (e.g., master key vulnerability).". Given that description, I think we should keep VULID and map it to Vulnerability in CYNER in the datasets where we actually do label mapping. This is not relevant for this notebook; it is only written down here for reference.

In [2]:
#this is just a repurposed unify_labels_aptner() function, but changed so that it drops instead of doing any mapping

def drop_irrelevant_aptner_labels(path: Path) -> None:
    """
    Convert a cleaned APTNER file to IOB2 format, dropping irrelevant labels.

    Reads ``path`` (one whitespace-separated ``token tag`` pair per line, blank
    lines kept as-is; the tag prefixes of the cleaned file do not follow IOB2
    yet) and writes the result next to it, with the file suffix replaced by
    ``.tokenready``.

    - All ``E-`` prefixes are converted to ``I-``.
    - All ``S-`` prefixes are converted to ``B-``.
    - MD5, SHA1, SHA2, LOC, TIME and IP are re-tagged as ``O``; every other
      label is kept.

    Args:
        path: Cleaned APTNER file to convert.

    Raises:
        AssertionError: If a non-empty line does not consist of exactly two
            whitespace-separated fields.
        ValueError: If a tag other than ``O`` contains no hyphen.
    """
    dropped_labels = {"MD5", "SHA1", "SHA2", "LOC", "TIME", "IP"}
    # Prefixes used by the cleaned file that IOB2 does not know, and their
    # IOB2 replacements.
    prefix_to_iob2 = {"E": "I", "S": "B"}

    with (
        open(path, "r", encoding="utf-8") as f,
        open(path.with_suffix(".tokenready"), "w", encoding="utf-8") as f_out,
    ):
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines are passed through unchanged.
                f_out.write("\n")
                continue

            tok = line.split()
            assert len(tok) == 2, (
                f"{path}:{line_no}: expected 'token tag', got {line!r}"
            )
            token, tag = tok

            if tag != "O":
                # Split on the first hyphen only, so a label that itself
                # contains '-' does not break the unpacking.
                prefix, label = tag.split("-", 1)
                if label in dropped_labels:
                    tag = "O"
                else:
                    tag = f"{prefix_to_iob2.get(prefix, prefix)}-{label}"
            f_out.write(f"{token} {tag}\n")

In [None]:
# Write a label-filtered ".tokenready" file next to each cleaned APTNER split.
aptner_path = INTERIM_DATA_DIR / "APTNer"
aptner_train_path, aptner_dev_path, aptner_test_path = (
    aptner_path / "APTNERtrain.cleaned",
    aptner_path / "APTNERdev.cleaned",
    aptner_path / "APTNERtest.cleaned",
)

for aptner_cleaned_path in (aptner_train_path, aptner_dev_path, aptner_test_path):
    drop_irrelevant_aptner_labels(aptner_cleaned_path)

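# Manual step: move the generated .tokenready files to TOKENPROCESSED_DATA_DIR / "APTNer" and rename them to train.tokenready, valid.tokenready and test.tokenready (the names the next cell reads).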

In [4]:
# Read the token-ready APTNER splits back in.
aptner_path = TOKENPROCESSED_DATA_DIR / "APTNer"
aptner_train_path, aptner_dev_path, aptner_test_path = (
    aptner_path / "train.tokenready",
    aptner_path / "valid.tokenready",
    aptner_path / "test.tokenready",
)
aptner_train_data, aptner_dev_data, aptner_test_data = [
    read_iob2_file(split_path)
    for split_path in (aptner_train_path, aptner_dev_path, aptner_test_path)
]

In [5]:
# Encode the three APTNER splits; transform_dataset also returns the index
# lists and the maximum length it used.
(
    aptner_train_X,
    aptner_train_y,
    aptner_dev_X,
    aptner_dev_y,
    aptner_test_X,
    aptner_test_y,
    aptner_idx2word,
    aptner_idx2label,
    aptner_max_len,
) = transform_dataset(aptner_train_data, aptner_dev_data, aptner_test_data)

In [6]:
# Sanity check: shape of the encoded APTNER training inputs
# (presumably sentences x padded length - confirm against transform_dataset).
aptner_train_X.shape

torch.Size([7679, 82])

In [None]:
# Inspect the label list returned by transform_dataset for APTNER, to verify
# that the dropped labels are gone.
aptner_idx2label


#### DNRTI: Same approach as for APTNER.

In [8]:
def drop_irrelevant_dnrti_labels(path: Path) -> None:
    """
    Rewrite a DNRTI IOB2 file, dropping irrelevant labels.

    Reads ``path`` (one whitespace-separated ``token tag`` pair per line, blank
    lines kept as-is) and writes the result next to it, with the file suffix
    replaced by ``.tokenready``.

    We agreed some time ago to drop Way, Area, Purp, Exp and Features, so those
    are re-tagged as ``O`` here; every other tag is written back unchanged.
    Like for APTNER, this keeps some labels that were not originally mapped to
    CYNER.

    Args:
        path: Cleaned DNRTI file in IOB2 format.

    Raises:
        AssertionError: If a non-empty line does not consist of exactly two
            whitespace-separated fields.
        ValueError: If a tag other than ``O`` contains no hyphen.
    """
    dropped_labels = {"Way", "Area", "Purp", "Exp", "Features"}

    with (
        open(path, "r", encoding="utf-8") as f,
        open(path.with_suffix(".tokenready"), "w", encoding="utf-8") as f_out,
    ):
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines are passed through unchanged.
                f_out.write("\n")
                continue

            tok = line.split()
            assert len(tok) == 2, (
                f"{path}:{line_no}: expected 'token tag', got {line!r}"
            )
            token, tag = tok

            if tag != "O":
                # Split on the first hyphen only, so a label that itself
                # contains '-' does not break the unpacking. The prefix is
                # not needed: kept tags are written back verbatim.
                _prefix, label = tag.split("-", 1)
                if label in dropped_labels:
                    tag = "O"
            f_out.write(f"{token} {tag}\n")

In [9]:
# Write a label-filtered ".tokenready" file next to each cleaned DNRTI split.
dnrti_path = INTERIM_DATA_DIR / "DNRTI"
dnrti_train_path, dnrti_dev_path, dnrti_test_path = (
    dnrti_path / "train.cleaned",
    dnrti_path / "valid.cleaned",
    dnrti_path / "test.cleaned",
)

for dnrti_cleaned_path in (dnrti_train_path, dnrti_dev_path, dnrti_test_path):
    drop_irrelevant_dnrti_labels(dnrti_cleaned_path)

# Manual step: move the generated train/valid/test.tokenready files to TOKENPROCESSED_DATA_DIR / "DNRTI" before running the next cell.


In [10]:
# Read the token-ready DNRTI splits back in (token in column 0, tag in column 1).
dnrti_path = TOKENPROCESSED_DATA_DIR / "DNRTI"
dnrti_train_path, dnrti_dev_path, dnrti_test_path = (
    dnrti_path / "train.tokenready",
    dnrti_path / "valid.tokenready",
    dnrti_path / "test.tokenready",
)

dnrti_train_data, dnrti_dev_data, dnrti_test_data = [
    read_iob2_file(split_path, word_index=0, tag_index=1)
    for split_path in (dnrti_train_path, dnrti_dev_path, dnrti_test_path)
]

In [11]:
# Encode the three DNRTI splits; transform_dataset also returns the index
# lists and the maximum length it used.
(
    dnrti_train_X,
    dnrti_train_y,
    dnrti_dev_X,
    dnrti_dev_y,
    dnrti_test_X,
    dnrti_test_y,
    dnrti_idx2word,
    dnrti_idx2label,
    dnrti_max_len,
) = transform_dataset(dnrti_train_data, dnrti_dev_data, dnrti_test_data)

In [12]:
# Sanity check: shape of the encoded DNRTI training inputs
# (presumably sentences x padded length - confirm against transform_dataset).
dnrti_train_X.shape

torch.Size([5251, 82])

In [None]:
# Inspect the label list returned by transform_dataset for DNRTI, to verify
# that the dropped labels are gone.
dnrti_idx2label

#### ATTACKNER: most labels seem neither trivial nor irrelevant. The exception is location, which general-purpose NER models can arguably already handle, so I drop it here to stay consistent with what was dropped for DNRTI and APTNER. Everything else stays as is.

In [14]:
# Using the function from nlp_cyber_ner.dataset.py to get the attacker dataset in IOB2 format. Just repurposing so there is no mapping/merging 
# with CYNER 

def attacker_to_iob2_format(path: Path) -> None:
    """
    Convert an AttackNER JSON-lines file to CoNLL/IOB2 format, keeping the original labels.

    Every record read from ``path`` must provide a ``"tokens"`` and a ``"tags"``
    entry that are indexed in parallel. One ``token tag`` line is written per
    token and a blank line after every record. The output is written next to
    ``path``, with the file suffix replaced by ``.tokenready``.

    The only label that is dropped is LOCATION: tokens carrying it are
    re-tagged as ``O``. Tokens that are exactly a single space are skipped.

    Args:
        path: AttackNER JSON-lines file to convert.

    Raises:
        ValueError: If a tag other than ``O`` contains no hyphen.
    """
    with (
        jsonlines.open(path) as reader,
        open(path.with_suffix(".tokenready"), "w", encoding="utf-8") as f_out,
    ):
        for obj in reader:
            tags = obj["tags"]
            tokens = obj["tokens"]
            for i, token in enumerate(tokens):
                current_tag = tags[i]
                if token == " ":
                    # TODO: this is a cleaning step; if there is time, move it
                    # into a separate function.
                    continue
                if current_tag != "O":
                    # Split on the first hyphen only, so a label that itself
                    # contains '-' does not break the unpacking. The prefix is
                    # not needed: kept tags are written back verbatim.
                    _prefix, label = current_tag.split("-", 1)
                    if label == "LOCATION":
                        current_tag = "O"
                f_out.write(f"{token} {current_tag}\n")
            # Blank line terminates the record.
            f_out.write("\n")

In [15]:
# Convert every raw AttackNER split to a ".tokenready" IOB2 file written next
# to the source file.
attackner_path = RAW_DATA_DIR / "attackner"
attackner_train_path, attackner_dev_path, attackner_test_path = (
    attackner_path / "train.json",
    attackner_path / "dev.json",
    attackner_path / "test.json",
)

for attackner_raw_path in (attackner_test_path, attackner_train_path, attackner_dev_path):
    attacker_to_iob2_format(attackner_raw_path)

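# Manual step: move the generated train/dev/test.tokenready files to TOKENPROCESSED_DATA_DIR / "attacker" (note the different folder name) before running the next cell.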

In [17]:
# Read the token-ready AttackNER splits back in (token in column 0, tag in column 1).
attackner_path = TOKENPROCESSED_DATA_DIR / "attacker"
attackner_train_path, attackner_dev_path, attackner_test_path = (
    attackner_path / "train.tokenready",
    attackner_path / "dev.tokenready",
    attackner_path / "test.tokenready",
)

attackner_train_data, attackner_dev_data, attackner_test_data = [
    read_iob2_file(split_path, word_index=0, tag_index=1)
    for split_path in (attackner_train_path, attackner_dev_path, attackner_test_path)
]

In [18]:
# Encode the three AttackNER splits; transform_dataset also returns the index
# lists and the maximum length it used.
(
    attackner_train_X,
    attackner_train_y,
    attackner_dev_X,
    attackner_dev_y,
    attackner_test_X,
    attackner_test_y,
    attackner_idx2word,
    attackner_idx2label,
    attackner_max_len,
) = transform_dataset(attackner_train_data, attackner_dev_data, attackner_test_data)

In [19]:
# Sanity check: shape of the encoded AttackNER training inputs
# (presumably sentences x padded length - confirm against transform_dataset).
attackner_train_X.shape

torch.Size([2481, 107])

In [None]:
# Inspect the label list returned by transform_dataset for AttackNER, to
# verify that LOCATION is gone.
attackner_idx2label

#### CYNER: no processing specific to the token method is required. We keep Indicator dropped for the same reason as stated for the other datasets: Indicator bundles sub-categories of entities (email, hash, port number, etc.) that we believe to be trivial. So this section just loads the processed CYNER data.

In [56]:
#Just download the bug fixed cyner datasets from github and manually replace for now.
#There is no token specific version of cyner.

# Read the processed (unified) CYNER splits.
cyner_path = PROCESSED_DATA_DIR / "cyner"
cyner_train_path, cyner_dev_path, cyner_test_path = (
    cyner_path / "train.unified",
    cyner_path / "valid.unified",
    cyner_path / "test.unified",
)
cyner_train_data, cyner_dev_data, cyner_test_data = [
    read_iob2_file(split_path)
    for split_path in (cyner_train_path, cyner_dev_path, cyner_test_path)
]

In [57]:
# Encode the three CYNER splits; transform_dataset also returns the index
# lists and the maximum length it used.
(
    cyner_train_X,
    cyner_train_y,
    cyner_dev_X,
    cyner_dev_y,
    cyner_test_X,
    cyner_test_y,
    cyner_idx2word,
    cyner_idx2label,
    cyner_max_len,
) = transform_dataset(cyner_train_data, cyner_dev_data, cyner_test_data)

In [58]:
# Inspect the label list returned by transform_dataset for CYNER.
cyner_idx2label

# FIXME: this output was bugged; the unify-CYNER function in dataset.py needs to be changed.

['<PAD>',
 'B-Malware',
 'I-Malware',
 'O',
 'B-System',
 'I-System',
 'B-Organization',
 'I-Organization',
 'B-Vulnerability',
 'I-Vulnerability']

In [69]:
# Show the first two entries of the CYNER label list.
cyner_idx2label[0:2]

['<PAD>', 'B-Malware']

In [59]:
# Number of entries in the CYNER word index (presumably the vocabulary size,
# including any special tokens - confirm against transform_dataset).
len(cyner_idx2word)

7955

In [60]:
# Sanity check: shape of the encoded CYNER training inputs.
cyner_train_X.shape

torch.Size([2811, 106])

In [61]:
# Maximum length returned by transform_dataset; it should match the second
# dimension of cyner_train_X shown above.
cyner_max_len

106

In [None]:
# Marker cell: prints "ok" once execution reaches the end of the notebook.
print("ok")