# In [None]:
import torch
import json
import spacy
import numpy as np
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report
from transformers import EarlyStoppingCallback

# Verify environment
print("Torch version:", torch.__version__)
#print("Transformers version:", transformers.__version__)

# Load spaCy for word-level tokenization
nlp = spacy.load("en_core_web_sm")

# Load the labeled data: the public bnsapa dataset plus a local JSON file of
# annotated tasks.
bnsapa_dataset = load_dataset("bnsapa/cybersecurity-ner")
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open('/content/sample_data/project-3-at-2025-04-16-06-39-9eb2c540.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Debug JSON structure
print("Sample JSON entries:", data[:2])  # Inspect first two entries

# Extract text and annotations, converting to BIO format.

# Tag inventory: the position of each tag in this list is its integer id
# ('O' is 0). Built once, before the loop, so it is defined even when no task
# produces an example.
label_list = ['O', 'B-Organization', 'I-Organization', 'B-System', 'I-System',
              'B-Malware', 'I-Malware', 'B-Indicator', 'I-Indicator',
              'B-Vulnerability', 'I-Vulnerability']
label_to_id = {label: i for i, label in enumerate(label_list)}

examples = []
for task in data:
    text = task.get('data', {}).get('text', '')
    if not isinstance(text, str):
        print(f"Warning: Invalid text in task: {task}")
        continue
    annotations = task.get('annotations', [])
    # Only the first annotation of each task is used.
    # NOTE(review): an annotation without a 'was_cancelled' key is treated as
    # cancelled (default True) -- confirm this is the intended policy.
    if not annotations or annotations[0].get('was_cancelled', True):
        # Use the already-extracted `text`; indexing task['data'] here would
        # raise KeyError for tasks that have no 'data' entry.
        print(f"Warning: No valid annotations in task: {text[:50]}...")
        continue
    results = annotations[0].get('result', [])
    if not results:
        print(f"Warning: No results in task: {text[:50]}...")
        continue

    # Tokenize text and record each token's character span [start, end)
    doc = nlp(text)
    tokens = [token.text for token in doc]
    token_start_end = [(token.idx, token.idx + len(token.text)) for token in doc]
    labels = ['O'] * len(tokens)

    # Assign BIO labels
    for ent in results:
        try:
            label = ent['value']['labels'][0]
            start, end = ent['value']['start'], ent['value']['end']
        except (KeyError, IndexError) as e:
            print(f"Warning: Invalid annotation in task: {text[:50]}... Error: {e}")
            continue

        if 'B-' + label not in label_to_id:
            # Report label types outside the inventory instead of silently
            # turning them into 'O'.
            print(f"Warning: Unknown label {label!r} in task: {text[:50]}... Tokens left as 'O'")
            continue

        # The first token fully inside this annotation's span gets 'B-' and
        # the rest get 'I-'. Deciding from the current annotation (not from
        # the previous token's label) keeps two adjacent entities of the same
        # type as two separate spans.
        inside = False
        for i, (tok_start, tok_end) in enumerate(token_start_end):
            if tok_start >= start and tok_end <= end:
                labels[i] = ('I-' if inside else 'B-') + label
                inside = True

    ner_tags = [label_to_id[tag] for tag in labels]

    # Create example dictionary
    example = {'tokens': tokens, 'ner_tags': ner_tags}

    # Debug example
    print(f"Created example: tokens={len(tokens)}, ner_tags={len(ner_tags)}")
    examples.append(example)

# Validate examples: keep only dict entries that carry both expected keys and
# report every entry that gets dropped.
well_formed = []
for position, candidate in enumerate(examples):
    if isinstance(candidate, dict) and 'tokens' in candidate and 'ner_tags' in candidate:
        well_formed.append(candidate)
        continue
    print(f"Error: Invalid example at index {position}: {candidate}")
examples = well_formed
print("Total valid examples:", len(examples))

# Create Dataset (refuse to continue with nothing to train on)
if not examples:
    raise ValueError("No valid examples to create dataset")
dataset = Dataset.from_list(examples)
print("Type of dataset:", type(dataset))
print("Dataset size:", len(dataset))
print("First entry:", dataset[0])

from datasets import Sequence, ClassLabel

# Get the per-token label feature from bnsapa
class_label = bnsapa_dataset["train"].features["ner_tags"].feature

# The ids in `dataset` were produced from the local `label_list` order, but
# after the cast below they are interpreted through bnsapa's label feature.
# If the two orderings differ, every locally annotated tag would silently
# change meaning, so surface that before going further.
bnsapa_label_names = getattr(class_label, "names", None)
if bnsapa_label_names is not None and list(bnsapa_label_names) != label_list:
    print("Warning: bnsapa label names differ from label_list; "
          f"bnsapa={list(bnsapa_label_names)}, local={label_list}")

# Cast your dataset's ner_tags column to use the same label feature
dataset = dataset.cast_column("ner_tags", Sequence(feature=class_label))


# Debug dataset: inspect up to three samples (fewer if the dataset is smaller,
# since selecting indices past the end would fail).
preview_count = min(3, len(dataset))
for i, example in enumerate(dataset.select(range(preview_count))):
    print(f"Sample {i}: tokens={len(example['tokens'])}, ner_tags={example['ner_tags']}")
    assert len(example['tokens']) == len(example['ner_tags']), f"Mismatch at index {i}"

# Check label distribution (ids are named using the local label_list)
from collections import Counter
all_labels = [label for example in dataset for label in example['ner_tags']]
label_counts = Counter(all_labels)
print("Label distribution:", {label_list[k]: v for k, v in label_counts.items()})

from datasets import concatenate_datasets

# Combine the Hugging Face training split with the locally annotated examples
merged_dataset = concatenate_datasets([bnsapa_dataset["train"], dataset])  # order doesn't matter if you shuffle
merged_dataset = merged_dataset.shuffle(seed=42)

# Then do an 80/20 train/validation split with a fixed seed.
# The result is named `split_datasets` so it does not shadow the
# `train_test_split` function imported from sklearn at the top of the file.
split_datasets = merged_dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_datasets['train']
val_dataset = split_datasets['test']

# Load pre-trained model and tokenizer.
# (Alternative checkpoint previously tried: "bnsapa/cybersecurity-ner".)
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
)

# Set up label mappings so the model config names each class id
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = label_to_id

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Args:
        examples: A batch with a "tokens" entry (one list of words per
            example) and a "ner_tags" entry (one list of per-word label ids
            per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
        For each example, the first sub-token of every word carries that
        word's label id; special tokens and the remaining sub-tokens of a
        word carry -100.
    """
    # No padding here: sequences are left at their natural length and padded
    # per training batch by the data collator. Padding at this stage would
    # stretch every example to the longest sequence of the whole map batch.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for batch_index, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the same word.
                label_ids.append(-100)
            else:
                # First piece of a new word: take the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



# Apply tokenization to both splits, dropping the raw word-level columns
raw_columns = ('tokens', 'ner_tags')
tokenized_train = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=list(raw_columns),
)
tokenized_val = val_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=list(raw_columns),
)

# Set up data collator (pads each batch when it is assembled)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

# Compute metrics for NER
def compute_metrics(eval_pred):
    """Compute weighted-average precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels). The predicted
            class at each position is the argmax of the logits over the last
            axis; positions whose label is -100 are skipped.

    Returns:
        A dict with "precision", "recall" and "f1", taken from the
        "weighted avg" row of the classification report.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predictions, labels):
        gold_names = []
        predicted_names = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            gold_names.append(label_list[gold_id])
            predicted_names.append(label_list[predicted_id])
        true_labels.append(gold_names)
        true_predictions.append(predicted_names)

    results = classification_report(true_labels, true_predictions, output_dict=True)
    print("Classification report:", results)

    weighted = results["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"],
    }

# Set up training arguments: evaluate and checkpoint once per epoch, and
# reload the checkpoint with the best 'f1' when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    logging_strategy='steps',
    logging_steps=10,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=17,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)
print("Number of labels in model:", model.config.num_labels)

# Sanity check: every tag id in the locally built dataset must index into
# label_list (bound derived from the list rather than hard-coded).
num_known_labels = len(label_list)
for idx, ex in enumerate(dataset):
    for tag in ex["ner_tags"]:
        if tag < 0 or tag >= num_known_labels:
            print(f"🚨 Invalid tag {tag} in example {idx}: {ex['tokens']}")

import inspect

# Initialize trainer
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_val,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
    # Early stopping can be enabled with:
    # "callbacks": [EarlyStoppingCallback(early_stopping_patience=3)],
}
# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer declares.
if "processing_class" in inspect.signature(Trainer.__init__).parameters:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer
trainer = Trainer(**trainer_kwargs)

# Train and evaluate
trainer.train()
eval_results = trainer.evaluate()
print("Final evaluation results:", eval_results)

# Save model and tokenizer side by side so the directory can be reloaded
trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

# Check best model checkpoint
print("Best model checkpoint:", trainer.state.best_model_checkpoint)