In [3]:
# Standard-library / third-party imports used by the notebook.
import os
from sklearn.model_selection import train_test_split  # NOTE(review): not referenced in the cells visible here

import sys
# ".." is resolved relative to the current working directory, so this assumes
# the notebook is started from a folder directly below the repository root.
sys.path.append("..")  # points to the root of the repo

# Project-local helpers. Kept after the sys.path tweak above so the package
# can be located when it is not installed into the environment.
from nlp_cyber_ner.dataset import (
    read_iob2_file,
    remove_leakage,
    prepare_cross_dataset,
    Preprocess
)


2025-04-06 14:20:05.868 | INFO     | nlp_cyber_ner.config:<module>:11 - PROJ_ROOT path is: C:\Users\johan\OneDrive\Skrivebord\Explorer\ITU\4_semester\NLP_and_deep_learning\Exam_project\repo_new_push_github\NLP-Cyber-NER


In [11]:
# Per-dataset file layout: (train file, dev file, test file, folder name).
_SPLIT_FILES = {
    "aptner": ("APTNERtrain.unified", "APTNERdev.unified", "APTNERtest.unified", "APTNER"),
    "dnrti": ("train.unified", "valid.unified", "test.unified", "dnrti"),
    "attackner": ("train.unified", "dev.unified", "test.unified", "attackner"),
    "cyner": ("train.unified", "valid.unified", "test.unified", "cyner"),
}

# Dataset registry. Every value is a 7-tuple laid out as
#   (reader, train file, dev file, test file, folder, reader kwargs, data sub-directory)
# All datasets share the same reader, the same reader kwargs and the
# "processed" sub-directory; each entry still gets its own kwargs dict.
# NOTE(review): "word_index"/"tag_index" presumably select the token and tag
# columns of the IOB2 files — confirm against read_iob2_file.
DATASETS = {
    dataset_name: (
        read_iob2_file,
        train_file_name,
        dev_file_name,
        test_file_name,
        folder_name,
        {"word_index": 0, "tag_index": 1},
        "processed",
    )
    for dataset_name, (train_file_name, dev_file_name, test_file_name, folder_name)
    in _SPLIT_FILES.items()
}


In [13]:
# Prepare data for every ordered (train dataset, test dataset) pair from
# DATASETS and keep the outputs in memory, keyed by (train_name, test_name).
all_results = {}

for train_name, train_spec in DATASETS.items():
    train_reader, train_file, dev_file, _, train_folder, train_kwargs, train_subdir = train_spec

    for test_name, test_spec in DATASETS.items():
        # NOTE(review): test_kwargs is unpacked but never used below — the
        # call only forwards reader_kwargs=train_kwargs. Harmless while all
        # entries in DATASETS share the same kwargs; confirm before adding a
        # dataset with a different column layout.
        test_reader, _, _, test_file, test_folder, test_kwargs, test_subdir = test_spec

        print(f"\n Training on {train_name} | Testing on {test_name}")

        # Train and dev files live in the training dataset's folder; the test
        # file comes from the test dataset's folder.
        train_dir = os.path.join("..", "data", train_subdir, train_folder)
        train_path = os.path.join(train_dir, train_file)
        dev_path = os.path.join(train_dir, dev_file)
        test_path = os.path.join("..", "data", test_subdir, test_folder, test_file)

        # Train and test files are mandatory for a pair to be processed.
        required_files_present = os.path.exists(train_path) and os.path.exists(test_path)
        if not required_files_present:
            print(" Missing data files, skipping.")
            continue

        # The dev file is optional: when absent, None is handed on instead.
        # NOTE(review): with dev_split_ratio=None as well, what
        # prepare_cross_dataset does for a None dev_path is not visible here.
        if not os.path.exists(dev_path):
            print(f" Dev file missing for {train_name}, using empty dev set.")
            dev_path = None

        # Run the whole preparation; the dev split is read with the training reader.
        prepared = prepare_cross_dataset(
            train_reader=train_reader,
            dev_reader=train_reader,
            test_reader=test_reader,
            train_path=train_path,
            dev_path=dev_path,
            test_path=test_path,
            remove_leakage_from_test=True,
            dev_split_ratio=None,
            reader_kwargs=train_kwargs,
        )
        (
            transformer,
            train_X,
            train_y,
            dev_X,
            dev_y,
            test_X,
            test_y,
            idx2word,
            idx2label,
        ) = prepared

        # Keep every artefact of this pair for later cells.
        all_results[(train_name, test_name)] = dict(
            transformer=transformer,
            train_X=train_X,
            train_y=train_y,
            dev_X=dev_X,
            dev_y=dev_y,
            test_X=test_X,
            test_y=test_y,
            idx2word=idx2word,
            idx2label=idx2label,
        )

        print(f"Stored {train_X.shape[0]} training samples for {train_name} → {test_name}")



 Training on aptner | Testing on aptner
Removed 105 leaked sentences from training data.
Removed 28 leaked sentences from training data.
Removed 105 train and 28 dev overlapping sentences.
Stored 6640 training samples for aptner → aptner

 Training on aptner | Testing on dnrti
Removed 263 leaked sentences from training data.
Removed 1 leaked sentences from training data.
Removed 263 train and 1 dev overlapping sentences.
Stored 6482 training samples for aptner → dnrti

 Training on aptner | Testing on attackner
Removed 0 leaked sentences from training data.
Removed 0 leaked sentences from training data.
Removed 0 train and 0 dev overlapping sentences.
Stored 6745 training samples for aptner → attackner

 Training on aptner | Testing on cyner
Removed 20 leaked sentences from training data.
Removed 1 leaked sentences from training data.
Removed 20 train and 1 dev overlapping sentences.
Stored 6725 training samples for aptner → cyner

 Training on dnrti | Testing on aptner
Removed 127 le

In [15]:
# Sanity check: show the tensor shapes stored for the aptner -> dnrti pair.
entry = all_results[("aptner", "dnrti")]

for split in ("train", "dev", "test"):
    for part in ("X", "y"):
        key = f"{split}_{part}"
        tensor = entry[key]
        # Only the dev split is reported as "None" when missing; a None
        # train/test tensor still raises, as before.
        if split == "dev" and tensor is None:
            print(f"{key} shape:", "None")
        else:
            print(f"{key} shape:", tensor.shape)


train_X shape: torch.Size([6482, 5862])
train_y shape: torch.Size([6482, 5862])
dev_X shape: torch.Size([1751, 5862])
dev_y shape: torch.Size([1751, 5862])
test_X shape: torch.Size([664, 5862])
test_y shape: torch.Size([664, 5862])
