In [None]:
!pip install datasets
!pip install seqeval
!pip install evaluate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:

In [None]:
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from evaluate import load as load_metric  # Correct import for metrics

# Load dataset and fast tokenizer
dataset = load_dataset("conll2003")
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)

# Get number of labels
ner_labels = dataset['train'].features['ner_tags'].feature.names
num_labels = len(ner_labels)

# Load model with the number of labels
model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize the dataset and align the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Ensure padding
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Align the label to the word
            else:
                label_ids.append(-100)  # Align subword tokens to -100 (ignored in loss calculation)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize and align labels with padding enabled
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Load the evaluation metric for NER
metric = load_metric("seqeval")

# Define compute_metrics function for evaluation
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    true_labels = [[ner_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [[ner_labels[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_first_step=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="tensorboard",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3453 [00:00<?, ? examples/s]



In [None]:
# %load_ext tensorboard

In [None]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0502,0.057007,0.914533,0.925614,0.92004,0.986196


TrainOutput(global_step=878, training_loss=0.1425539295195447, metrics={'train_runtime': 1481.1496, 'train_samples_per_second': 9.48, 'train_steps_per_second': 0.593, 'total_flos': 3669099951393792.0, 'train_loss': 0.1425539295195447, 'epoch': 1.0})

In [None]:
# %tensorboard --logdir ./logs

In [None]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 0.057007428258657455, 'eval_precision': 0.9145327569005653, 'eval_recall': 0.9256142712891282, 'eval_f1': 0.9200401472064236, 'eval_accuracy': 0.9861960204041899, 'eval_runtime': 103.1455, 'eval_samples_per_second': 31.509, 'eval_steps_per_second': 1.978, 'epoch': 1.0}


In [None]:
# After training is complete
model_save_path = "./ner_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/sentencepiece.bpe.model',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

# Inference

In [None]:
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizerFast

model_load_path = "./ner_model"
loaded_model = XLMRobertaForTokenClassification.from_pretrained(model_load_path)
loaded_tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_load_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

In [None]:
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizerFast

model_load_path = "/content/clients_file"
loaded_model = XLMRobertaForTokenClassification.from_pretrained(model_load_path)
loaded_tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_load_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

In [9]:
import torch
import json

ner_labels = loaded_model.config.id2label.values()

def perform_ner(text, model, tokenizer, id2label):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted label indices
    predictions = torch.argmax(outputs.logits, dim=2)

    # Get the confidence scores
    confidence_scores = torch.nn.functional.softmax(outputs.logits, dim=2)

    # Decode the predictions
    decoded_preds = []
    for token, pred, conf in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]),
                                 predictions[0],
                                 confidence_scores[0]):
        if token.startswith("▁"):  # XLM-RoBERTa uses '▁' to denote start of word
            label = id2label[pred.item()]
            confidence = conf[pred].item() * 100  # Convert to percentage
            decoded_preds.append({
                "token": token.lstrip("▁"),
                "label": label,
                "confidence": f"{confidence:.2f}%"
            })

    # Create JSON-like response
    response = {}
    for pred in decoded_preds:
        if pred['label'] != 'O':  # Ignore 'Outside' labels
            entity_type = pred['label'].split('-')[-1]  # Get the entity type (e.g., 'Vehicle' from 'B-Vehicle')
            if entity_type not in response:
                response[entity_type] = []
            response[entity_type].append({
                "token": pred['token'],
                "label": pred['label'],
                "confidence": pred['confidence']
            })

    return json.dumps(response, indent=2)

# Create id2label mapping
id2label = {i: label for i, label in enumerate(ner_labels)}

In [10]:
# Example usage
text = "Hulls señaló que en el sistema jurídico de la Commonwealth , en el que se basa la justicia australiana , es fundamental que la persona sea juzgada únicamente teniendo en cuenta las pruebas presentadas ante el juez ."
result = perform_ner(text, loaded_model, loaded_tokenizer, id2label)
print(result)

{
  "PER": [
    {
      "token": "Hu",
      "label": "B-PER",
      "confidence": "99.72%"
    }
  ],
  "ORG": [
    {
      "token": "Common",
      "label": "B-ORG",
      "confidence": "97.46%"
    }
  ]
}
