In [None]:
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np

dataset = load_dataset("yixuantt/FinEntity")


model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


original_label_list = sorted(list(set([tag for example in dataset["train"]["annotations"] for tag in [anno["tag"] for anno in example]])))
label_list = ["O"]
for label in original_label_list:
    label_list.append(f"B-{label}")
    label_list.append(f"I-{label}")

print(f"Label list (IOB2): {label_list}")
num_labels = len(label_list)
print(f"Number of labels: {num_labels}")


def tokenize_and_align_annotations(examples, label_list):
    tokenized_inputs = tokenizer(
        examples["content"],
        truncation=True,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )

    labels = []

    iob2_label_list = ["O"]
    for label in label_list:
        iob2_label_list.append(f"B-{label}")
        iob2_label_list.append(f"I-{label}")
    iob2_label_map = {label: i for i, label in enumerate(iob2_label_list)}


    for i in range(len(examples["content"])):
        offset_mapping = tokenized_inputs.offset_mapping[i]
        annotations = examples["annotations"][i]
        text = examples["content"][i]
        sequence_labels = []


        char_labels = [iob2_label_map["O"]] * len(text)
        for annotation in annotations:
            annotation_start_char = annotation["start"]
            annotation_end_char = annotation["end"]
            label = annotation["tag"]
            if label in label_list:
                label_id_b = iob2_label_map[f"B-{label}"]
                label_id_i = iob2_label_map[f"I-{label}"]
                
                if annotation_start_char < len(char_labels):
                     char_labels[annotation_start_char] = label_id_b
                
                for char_idx in range(annotation_start_char + 1, annotation_end_char):
                    if char_idx < len(char_labels):
                        char_labels[char_idx] = label_id_i


        previous_word_id = None
        for j, word_id in enumerate(tokenized_inputs.word_ids(batch_index=i)):
            if word_id is None:
                sequence_labels.append(-100)
            elif word_id != previous_word_id:
                
                token_start_char, token_end_char = offset_mapping[j]
                if token_start_char < len(char_labels):
                     sequence_labels.append(char_labels[token_start_char])
                else:
                     sequence_labels.append(iob2_label_map["O"]) 
            else:
                
                sequence_labels.append(-100)
            previous_word_id = word_id

        labels.append(sequence_labels)

    tokenized_inputs["labels"] = labels
    tokenized_inputs.pop("offset_mapping")
    return tokenized_inputs



train_test_split_dataset = dataset["train"].train_test_split(test_size=0.2)
tokenized_dataset = train_test_split_dataset.map(tokenize_and_align_annotations, batched=True, fn_kwargs={"label_list": original_label_list})


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=num_labels
)


metric = load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

   
    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



training_args = TrainingArguments(
    output_dir="./finentity-ner-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"], 
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()

Label list (IOB2): ['O', 'B-Negative', 'I-Negative', 'B-Neutral', 'I-Neutral', 'B-Positive', 'I-Positive']
Number of labels: 7


Map:   0%|          | 0/783 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msarvanperumalla[0m ([33msarvanperumalla-vit-ap[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.196176,0.334579,0.434466,0.378036,0.936548
2,No log,0.113706,0.351852,0.461165,0.39916,0.955901
3,No log,0.090475,0.455102,0.541262,0.494457,0.965525
4,No log,0.079326,0.592511,0.652913,0.621247,0.970495
5,No log,0.078794,0.539749,0.626214,0.579775,0.969437


TrainOutput(global_step=245, training_loss=0.17050217609016263, metrics={'train_runtime': 529.9073, 'train_samples_per_second': 7.388, 'train_steps_per_second': 0.462, 'total_flos': 198182134532976.0, 'train_loss': 0.17050217609016263, 'epoch': 5.0})

In [None]:
trainer.save_model("./finentity-ner-model")   
tokenizer.save_pretrained("./finentity-ner-model")


('./finentity-ner-model/tokenizer_config.json',
 './finentity-ner-model/special_tokens_map.json',
 './finentity-ner-model/vocab.txt',
 './finentity-ner-model/added_tokens.json',
 './finentity-ner-model/tokenizer.json')

In [24]:
ner_pipeline = pipeline(
    "ner",
    model="./finentity-ner-model",
    tokenizer="./finentity-ner-model",
    aggregation_strategy="simple"
)


Device set to use cuda:0


In [None]:
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer, pipeline


label_list = ['O', 'B-Negative', 'I-Negative', 'B-Neutral', 'I-Neutral', 'B-Positive', 'I-Positive']
id2label = {i: l for i, l in enumerate(label_list)}
label2id = {l: i for i, l in enumerate(label_list)}


model_path = "./finentity-ner-model"
config = AutoConfig.from_pretrained(model_path, num_labels=len(label_list), id2label=id2label, label2id=label2id)
model = AutoModelForTokenClassification.from_pretrained(model_path, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_path)


ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


texts = [
    "Tesla shares rose 15% after 1 year",
    "Apple stocks rose 5% and Goldman Sachs stocks remain constant",
    "Goldman Sachs invested $500 million in India."
]


for text in texts:
    results = ner_pipeline(text)
    print(f"\nText: {text}")
    for entity in results:
        print(f"Entity: {entity['word']} | Label: {entity['entity_group']} | Score: {entity['score']:.4f}")


Device set to use cuda:0



Text: Tesla shares rose 15% after 1 year
Entity: tesla | Label: Positive | Score: 0.4087

Text: Apple stocks rose 5% and Goldman Sachs stocks remain constant
Entity: apple | Label: Positive | Score: 0.4501
Entity: goldman sachs | Label: Positive | Score: 0.5102

Text: Goldman Sachs invested $500 million in India.
Entity: goldman sachs | Label: Neutral | Score: 0.4979
