In [22]:
#! pip install seqeval
#! pip install evaluate
#! pip install pandas
#! pip install datasets
#! pip install torch
#! pip install transformers
#! pip install scikit-learn
#! pip install ninja
#! pip install flash-attn
#! pip install packaging
#! pip install flash-attn --no-build-isolation

Collecting flash-attn
  Using cached flash_attn-2.7.4.post1.tar.gz (6.0 MB)


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\magnu\\AppData\\Local\\Temp\\pip-install-_h0lj359\\flash-attn_a64e364bab4b4bda827435c2541f476a\\csrc\\composable_kernel\\library\\include\\ck\\library\\tensor_operation_instance\\gpu\\grouped_conv_bwd_weight\\device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp'


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import json
import torch
import evaluate
import seqeval
import accelerate
import transformers
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split

#from flash_attn.flash_attention import FlashAttention
# Flash Attention for faster training

torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for better performance
torch.backends.cudnn.benchmark = True  # Enable CuDNN auto-tuner

In [2]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load dataset
DATA_PATH = "tokenized_ner_data_4.json"
with open(DATA_PATH, "r") as f:
    data = json.load(f)

In [12]:
# Extract all unique label types from dataset
unique_labels = list(set(label for entry in data for label in entry["labels"]))

# Create mapping from label name -> index
label_to_id = {label: i for i, label in enumerate(unique_labels)}

# Convert dataset labels from strings to integer IDs
for entry in data:
    entry["labels"] = [label_to_id[label] for label in entry["labels"]]

In [4]:
def preprocess_data(data):
    tokenized_inputs = []
    tokenized_labels = []

    for entry in data:
        tokens = entry["tokens"]
        labels = entry["labels"]

        tokenized_inputs.append(tokens)
        tokenized_labels.append(labels)
    return {"tokens": tokenized_inputs, "labels": tokenized_labels}

dataset = Dataset.from_dict(preprocess_data(data))
train_dataset, eval_dataset = dataset.train_test_split(test_size=0.2).values()

In [5]:
# Load tokenizer and model
MODEL_NAME = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(set([label for entry in data for label in entry["labels"]]))).to(device)

# Enable Flash Attention in the model (if applicable)
#model.config.use_flash_attention = True

Some weights of ModernBertForTokenClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Process dataset with tokenizer and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=512
    )
    labels = []
    
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get mapping of subwords to words
        previous_word_id = None
        label_ids = []
        
        for word_id in word_ids:
            if word_id is None:  # Ignore special tokens (CLS, SEP, PAD)
                label_ids.append(-100)
            elif word_id != previous_word_id:  # Assign correct label only to first subword
                label_ids.append(label[word_id])
            else:
                label_ids.append(-100)  # Assign -100 to subsequent subwords
            
            previous_word_id = word_id

        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels

    # 🔹 Debugging Step: Print a Sample to Verify Alignment
    print("\n==== Sample Debugging ====")
    print("Original Tokens: ", examples["tokens"][0])
    print("Tokenized Input IDs: ", tokenized_inputs["input_ids"][0])
    print("Word IDs Mapping: ", word_ids)
    print("Aligned Labels: ", labels[0])
    print("==========================\n")
    
    return tokenized_inputs

# Convert dataset to Hugging Face Dataset format
dataset = Dataset.from_dict({"tokens": [entry["tokens"] for entry in data], "labels": [entry["labels"] for entry in data]})

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/61 [00:00<?, ? examples/s]


==== Sample Debugging ====
Original Tokens:  ['[CLS]', 'A', 'new', 'ransom', '##ware', '-', 'as', '-', 'a', '-', 'service', '(', 'Ra', '##a', '##S', ')', 'operation', 'named', 'C', '##ica', '##da', '##33', '##01', 'has', 'already', 'listed', '19', 'victims', 'on', 'its', 'ex', '##tor', '##tion', 'portal', ',', 'as', 'it', 'quickly', 'attacked', 'companies', 'worldwide', '.', 'The', 'new', 'c', '##y', '##ber', '##c', '##rim', '##e', 'operation', 'is', 'named', 'after', 'the', 'mysterious', '2012', '-', '2014', 'online', '/', 'real', '-', 'world', 'game', 'that', 'involved', 'elaborate', 'cry', '##pt', '##ographic', 'puzzles', 'and', 'used', 'the', 'same', 'logo', 'for', 'promotion', 'on', 'c', '##y', '##ber', '##c', '##rim', '##e', 'forums', '.', 'However', ',', 'there', "'", 's', 'no', 'connection', 'between', 'the', 'two', ',', 'and', 'the', 'legitimate', 'project', 'has', 'issued', 'a', 'statement', 'to', 're', '##nounce', 'any', 'association', 'and', 'con', '##de', '##m', '##n', 't

In [20]:
# Load metric
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[label for label in label_seq if label != -100] for label_seq in labels]
    true_predictions = [[pred for pred, lab in zip(pred_seq, label_seq) if lab != -100] for pred_seq, label_seq in zip(predictions, labels)]
    return seqeval.compute(predictions=true_predictions, references=true_labels)

In [21]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",     # Output directory
    eval_strategy="epoch", # Evaluate after each epoch
    save_strategy="epoch",      # Save model after each epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",       # Log directory
    logging_steps=10,
    save_total_limit=2,         # Save last 2 models only
    fp16=True,                  # Enable mixed precision training for better performance
    bf16=torch.cuda.is_bf16_supported(),  # Use BF16 if supported for better speed
    optim="adamw_torch_fused",  # Use fused optimizer for better CUDA performance
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [23]:
# Train the model
trainer.train()

# Save the model
trainer.save_model("./ner_model")
tokenizer.save_pretrained("./ner_model")

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []