<a href="https://colab.research.google.com/github/TasOishe/NLP/blob/main/nlp_final_project_group_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TASK 1**

In [None]:
def edit_distance(str1, str2):
    """Return the Levenshtein distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1. Only the previous
    and the current row of the dynamic-programming table are kept.
    """
    # Row for the empty prefix of str1: "" -> str2[:j] needs j insertions.
    prev_row = list(range(len(str2) + 1))

    for row_no, ch1 in enumerate(str1, start=1):
        # Column 0: str1[:row_no] -> "" needs row_no deletions.
        cur_row = [row_no]
        for col_no, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                # Matching characters cost nothing extra.
                cur_row.append(prev_row[col_no - 1])
            else:
                cur_row.append(
                    1 + min(
                        prev_row[col_no],      # delete from str1
                        cur_row[col_no - 1],   # insert into str1
                        prev_row[col_no - 1],  # substitute
                    )
                )
        prev_row = cur_row

    return prev_row[-1]


def similarity_score(str1, str2):
    """Return ``1 - edit_distance / len(longer string)``.

    Identical strings score 1.0. Two empty strings are treated as
    identical, which also avoids dividing by zero.
    """
    longest = max(len(str1), len(str2))
    if not longest:
        return 1.0
    return 1.0 - (edit_distance(str1, str2) / longest)


if __name__ == "__main__":
    print("Minimum Edit Distance Similarity Score Calculator")

    string1 = input("Enter the first string: ")
    string2 = input("Enter the second string: ")

    # Raw distance plus its two normalised forms.
    distance = edit_distance(string1, string2)
    score = similarity_score(string1, string2)
    percentage = score * 100

    # A single call with a newline separator emits the same four lines.
    print(
        f"Edit Distance       : {distance}",
        f"Similarity Score    : {score:.4f}",
        f"Similarity Percent  : {percentage:.2f}%",
        "-" * 40,
        sep="\n",
    )


Minimum Edit Distance Similarity Score Calculator
Enter the first string: i am a girl
Enter the second string: im a lady
Edit Distance       : 6
Similarity Score    : 0.4545
Similarity Percent  : 45.45%
----------------------------------------


**TASK 2**

In [None]:
from collections import Counter
import math
import nltk
from nltk.corpus import brown

# Download the Brown corpus data used by `brown.sents` below.
nltk.download("brown")

# One string per sentence of the "news" category, with the corpus tokens
# joined by single spaces.
CORPUS = [" ".join(sent) for sent in brown.sents(categories="news")]

def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

def build_ngrams(corpus):
    """Count unigrams, bigrams and trigrams over the sentences in ``corpus``.

    Every sentence is tokenized and wrapped in one ``<s>`` / ``</s>`` pair
    before counting.

    Returns:
        ``(vocab, unigram, bigram, trigram)``: the set of all tokens seen
        (boundary markers included) and three ``Counter`` objects keyed by
        token, 2-tuple and 3-tuple respectively.
    """
    unigram, bigram, trigram = Counter(), Counter(), Counter()
    vocab = set()
    for sentence in corpus:
        padded = ["<s>", *tokenize(sentence), "</s>"]
        vocab.update(padded)
        unigram.update(padded)
        # Zipping shifted views yields every adjacent pair / triple in order.
        bigram.update(zip(padded, padded[1:]))
        trigram.update(zip(padded, padded[1:], padded[2:]))
    return vocab, unigram, bigram, trigram

def bigram_prob(w1, w2, unigram, bigram, V):
    """Return the add-one smoothed estimate of P(w2 | w1).

    ``unigram`` and ``bigram`` are count mappings; missing keys count as 0.
    ``V`` is the vocabulary size added to the denominator.
    """
    pair_count = bigram.get((w1, w2), 0)
    context_count = unigram.get(w1, 0)
    return (pair_count + 1) / (context_count + V)

def trigram_prob(w1, w2, w3, bigram, trigram, V):
    """Return the add-one smoothed estimate of P(w3 | w1, w2).

    ``bigram`` and ``trigram`` are count mappings; missing keys count as 0.
    ``V`` is the vocabulary size added to the denominator.
    """
    triple_count = trigram.get((w1, w2, w3), 0)
    context_count = bigram.get((w1, w2), 0)
    return (triple_count + 1) / (context_count + V)

def predict_next_bigram(prefix, vocab, unigram, bigram):
    """Return the three most probable next words after ``prefix``.

    Only the last whitespace-separated token of ``prefix`` is used as
    context. An empty, ``None`` or whitespace-only prefix falls back to the
    sentence-start marker ``<s>``.

    Returns:
        Up to three ``(word, probability)`` pairs, highest probability
        first. Equal probabilities are ordered alphabetically by word so
        repeated runs return the same list.
    """
    # Test the token list, not the raw string: "   " is truthy but splits
    # to [], so indexing [-1] on the split result would raise IndexError.
    toks = prefix.lower().split() if prefix else []
    last = toks[-1] if toks else "<s>"
    V = len(vocab)
    scores = [(w, bigram_prob(last, w, unigram, bigram, V)) for w in vocab]
    # With a set-valued vocab the iteration order is not a stable
    # tie-breaker, so the word itself is the secondary sort key.
    return sorted(scores, key=lambda x: (-x[1], x[0]))[:3]

def predict_next_trigram(prefix, vocab, bigram, trigram):
    """Return the three most probable next words after ``prefix``.

    The last two whitespace-separated tokens of ``prefix`` form the
    context. Missing positions are filled with the sentence-start marker
    ``<s>``, so a one-word prefix uses ``("<s>", word)`` and an empty,
    ``None`` or whitespace-only prefix uses ``("<s>", "<s>")``.

    Returns:
        Up to three ``(word, probability)`` pairs, highest probability
        first. Equal probabilities are ordered alphabetically by word so
        repeated runs return the same list.
    """
    toks = (prefix or "").lower().split()
    if len(toks) >= 2:
        w1, w2 = toks[-2], toks[-1]
    elif toks:
        w1, w2 = "<s>", toks[-1]
    else:
        w1, w2 = "<s>", "<s>"
    V = len(vocab)
    scores = [(w, trigram_prob(w1, w2, w, bigram, trigram, V)) for w in vocab]
    # With a set-valued vocab the iteration order is not a stable
    # tie-breaker, so the word itself is the secondary sort key.
    return sorted(scores, key=lambda x: (-x[1], x[0]))[:3]

def prob_sentence_bigram(sent, vocab, unigram, bigram):
    """Return the add-one smoothed bigram probability of ``sent``.

    The sentence is tokenized and wrapped in ``<s>`` / ``</s>``. The
    log-probabilities of all adjacent token pairs are summed and the sum
    is exponentiated.
    """
    padded = ["<s>", *tokenize(sent), "</s>"]
    vocab_size = len(vocab)
    log_terms = (
        math.log(bigram_prob(prev, cur, unigram, bigram, vocab_size))
        for prev, cur in zip(padded, padded[1:])
    )
    return math.exp(sum(log_terms))

def prob_sentence_trigram(sent, vocab, bigram, trigram):
    """Return the add-one smoothed trigram probability of ``sent``.

    The sentence is tokenized and wrapped in one ``<s>`` / ``</s>`` pair.
    Scoring starts at the third padded token, so the first word only ever
    appears as context and contributes no factor of its own.
    """
    padded = ["<s>", *tokenize(sent), "</s>"]
    vocab_size = len(vocab)
    log_terms = (
        math.log(trigram_prob(first, second, third, bigram, trigram, vocab_size))
        for first, second, third in zip(padded, padded[1:], padded[2:])
    )
    return math.exp(sum(log_terms))

if __name__ == "__main__":
    # Build the vocabulary and all three count tables from CORPUS once.
    vocab, unigram, bigram, trigram = build_ngrams(CORPUS)
    text = input("Enter a text: ")
    # Top-3 next-word candidates under each model.
    print("\n--- Predictions ---")
    print("Bigram prediction:", predict_next_bigram(text, vocab, unigram, bigram))
    print("Trigram prediction:", predict_next_trigram(text, vocab, bigram, trigram))
    # The same input is then scored as a complete sentence.
    print("\n--- Sentence Probability ---")
    print("Bigram sentence probability:", prob_sentence_bigram(text, vocab, unigram, bigram))
    print("Trigram sentence probability:", prob_sentence_trigram(text, vocab, bigram, trigram))


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Enter a text: pin

--- Predictions ---
Bigram prediction: [('pricking', 0.0001524971406786123), ('solidarity', 7.624857033930614e-05), ('bomb', 7.624857033930614e-05)]
Trigram prediction: [('solidarity', 7.625438462711606e-05), ('bomb', 7.625438462711606e-05), ('capable', 7.625438462711606e-05)]

--- Sentence Probability ---
Bigram sentence probability: 4.298842551688915e-09
Trigram sentence probability: 7.625438462711609e-05


**TASK 3**

In [None]:
!pip install transformers datasets seqeval evaluate


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=727a68725c66ec0567d7858c1b4824ac3a9d879ff29c9b5c514dc83b0e101634
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.6 seqeval-1.2.2


In [None]:

from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    DataCollatorForTokenClassification, TrainingArguments, Trainer
)
from datasets import load_dataset, Dataset
import numpy as np
import evaluate
import os
# Switch off the Weights & Biases integration for the Trainer runs.
# NOTE(review): the transformers warning captured later in this notebook
# calls this variable deprecated and recommends `report_to="none"`.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the SciERC dataset from the Hugging Face Hub.
scierc = load_dataset("nsusemiehl/SciERC")


# Keep only the first 100 rows of each split to keep the runs short.
scierc_train_sample = scierc["train"].select(range(100))
scierc_dev_sample   = scierc["validation"].select(range(100))
scierc_test_sample  = scierc["test"].select(range(100))


from datasets import DatasetDict

# Re-assemble the three samples under the usual split names.
scierc_small = DatasetDict({
    "train": scierc_train_sample,
    "validation": scierc_dev_sample,
    "test": scierc_test_sample
})


# Show the structure and the first example of every split.
print(scierc_small)
print(scierc_small["train"][0])
print(scierc_small["validation"][0])
print(scierc_small["test"][0])


Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'metadata'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'label', 'metadata'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label', 'metadata'],
        num_rows: 100
    })
})
{'text': 'The agreement in question involves number in [[ nouns ]] and << reflexive pronouns >> and is syntactic rather than semantic in nature because grammatical number in English , like grammatical gender in languages such as French , is partly arbitrary .', 'label': 'CONJUNCTION', 'metadata': [7, 7, 9, 10]}
{'text': 'This paper presents an [[ algorithm ]] for << computing optical flow , shape , motion , lighting , and albedo >> from an image sequence of a rigidly-moving Lambertian object under distant illumination .', 'label': 'USED-FOR', 'metadata': [4, 4, 6, 17]}
{'text': '[[ Recognition of proper nouns ]] in Japanese text has been studied as a part of the more genera

In [None]:
from google.colab import drive
import json
from pathlib import Path


# Make Google Drive available under /content/drive (Colab only).
drive.mount('/content/drive')


# Folder on Drive that holds the BC5CDR files.
bc5cdr_path = Path("/content/drive/MyDrive/BC5CDR")


train_file = bc5cdr_path / "train.json"
test_file  = bc5cdr_path / "test.json"

# Each line of the file is parsed as one JSON object.
with open(train_file, "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f]

with open(test_file, "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f]


# Quick sanity check of what was loaded.
print("Number of training samples:", len(train_data))
print("Number of test samples:", len(test_data))
print("First training sample:", train_data[0])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of training samples: 5228
Number of test samples: 5865
First training sample: {'tags': [1, 0, 0, 0, 0, 0, 1, 0], 'tokens': ['Naloxone', 'reverses', 'the', 'antihypertensive', 'effect', 'of', 'clonidine', '.']}


In [None]:
from datasets import Dataset, DatasetDict


# Wrap the parsed records in Hugging Face Dataset objects.
bc5cdr_dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})


# Keep only the first 100 rows of each split.
bc5cdr_dataset["train"] = bc5cdr_dataset["train"].select(range(100))
bc5cdr_dataset["test"] = bc5cdr_dataset["test"].select(range(100))


print(bc5cdr_dataset["train"][0])

{'tags': [1, 0, 0, 0, 0, 0, 1, 0], 'tokens': ['Naloxone', 'reverses', 'the', 'antihypertensive', 'effect', 'of', 'clonidine', '.']}


In [None]:
def bc5_tags_to_bio(example):
    """Convert the integer ``tags`` of one example into string labels.

    Tag ``0`` becomes ``"O"``; every other tag value becomes
    ``"B-Chemical"``. The tokens are passed through unchanged.

    NOTE(review): any non-zero tag is treated as the start of a Chemical
    mention. Confirm the source tag scheme has no other entity classes or
    inside-mention tags, since they would all collapse into "B-Chemical".
    """
    ner_tags = ["O" if tag == 0 else "B-Chemical" for tag in example["tags"]]
    return {"tokens": example["tokens"], "ner_tags": ner_tags}


# Rebuild the dataset from the first 100 raw records of each split, this
# time with string labels in `ner_tags` instead of integer `tags`.
bc5cdr_dataset = DatasetDict({
    "train": Dataset.from_list([bc5_tags_to_bio(x) for x in train_data[:100]]),
    "test": Dataset.from_list([bc5_tags_to_bio(x) for x in test_data[:100]])
})


print(bc5cdr_dataset["train"][0])
print(bc5cdr_dataset["train"].column_names)

{'tokens': ['Naloxone', 'reverses', 'the', 'antihypertensive', 'effect', 'of', 'clonidine', '.'], 'ner_tags': ['B-Chemical', 'O', 'O', 'O', 'O', 'O', 'B-Chemical', 'O']}
['tokens', 'ner_tags']


In [None]:

def get_label_list(dataset, label_column):
    """Return the sorted distinct labels found in the train split.

    Entries of ``dataset["train"][label_column]`` may be single labels or
    lists of labels; list entries contribute each of their elements.
    """
    found = set()
    for entry in dataset["train"][label_column]:
        found |= set(entry) if isinstance(entry, list) else {entry}
    return sorted(found)


# SciERC stores one relation label per example.
scierc_labels = get_label_list(scierc_small, "label")


# BC5CDR stores a list of per-token labels per example.
bc5_labels = get_label_list(bc5cdr_dataset, "ner_tags")

print("SciERC labels:", scierc_labels)
print("BC5 labels:", bc5_labels)


SciERC labels: ['COMPARE', 'CONJUNCTION', 'EVALUATE-FOR', 'FEATURE-OF', 'HYPONYM-OF', 'PART-OF', 'USED-FOR']
BC5 labels: ['B-Chemical', 'O']


In [None]:
# Inspect the SciERC schema before tokenizing.
print("SciERC columns:", scierc_small["train"].column_names)
print("First train example:", scierc_small["train"][0])


SciERC columns: ['text', 'label', 'metadata']
First train example: {'text': 'The agreement in question involves number in [[ nouns ]] and << reflexive pronouns >> and is syntactic rather than semantic in nature because grammatical number in English , like grammatical gender in languages such as French , is partly arbitrary .', 'label': 'CONJUNCTION', 'metadata': [7, 7, 9, 10]}


In [None]:

def build_label_list_from_column(dataset, column):
    """Return the distinct values of ``column`` in the train split, sorted."""
    distinct = set(dataset["train"][column])
    return sorted(distinct)

# Relation classes, taken from the train split only.
scierc_rel_labels = build_label_list_from_column(scierc_small, "label")
print("SciERC relation labels:", scierc_rel_labels)


# Class ids follow the sorted label order; the second dict is the inverse.
label2id_scierc = {l: i for i, l in enumerate(scierc_rel_labels)}
id2label_scierc = {i: l for l, i in label2id_scierc.items()}


def map_scierc_label_to_id(example):
    """Attach the integer class id of ``example["label"]`` as ``label_id``.

    The example is updated in place and returned. A label that is missing
    from ``label2id_scierc`` raises ``KeyError``.
    """
    example["label_id"] = label2id_scierc[example["label"]]
    return example

# Add the `label_id` column to every split.
scierc_small = scierc_small.map(map_scierc_label_to_id)
print("Example after mapping:", scierc_small["train"][0])


SciERC relation labels: ['COMPARE', 'CONJUNCTION', 'EVALUATE-FOR', 'FEATURE-OF', 'HYPONYM-OF', 'PART-OF', 'USED-FOR']
Example after mapping: {'text': 'The agreement in question involves number in [[ nouns ]] and << reflexive pronouns >> and is syntactic rather than semantic in nature because grammatical number in English , like grammatical gender in languages such as French , is partly arbitrary .', 'label': 'CONJUNCTION', 'metadata': [7, 7, 9, 10], 'label_id': 1}


In [None]:
def tokenize_scierc(examples, tokenizer):
    """Tokenize ``examples["text"]`` with ``tokenizer``.

    Truncation is on and padding is set to ``"max_length"`` with a maximum
    length of 128. The tokenizer's return value is passed through.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)


def tokenize_and_align_labels_bc5(examples, tokenizer, label2id):
    """Tokenize pre-split words and align word-level tags to sub-tokens.

    Args:
        examples: Batch with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of string labels, one per word).
        tokenizer: Callable whose result offers ``word_ids(batch_index=i)``.
        label2id: Mapping from string label to integer id.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. The first
        sub-token of each word carries that word's label id; positions with
        no word and continuation sub-tokens carry ``-100`` (presumably the
        ignore index of the training loss -- confirm against the model).
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    all_label_ids = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        word_ids = encoding.word_ids(batch_index=batch_idx)
        # Pair each position with the word id of the position before it so
        # that continuation sub-tokens can be spotted.
        preceding = [None, *word_ids[:-1]]
        all_label_ids.append([
            -100 if wid is None or wid == before else label2id[word_labels[wid]]
            for wid, before in zip(word_ids, preceding)
        ])

    encoding["labels"] = all_label_ids
    return encoding


In [None]:

# One BERT tokenizer is shared by both tasks in this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


# Sentence-level tokenization for relation classification.
tokenized_scierc = scierc_small.map(
    lambda x: tokenize_scierc(x, tokenizer),
    batched=True
)

print("Tokenized SciERC example:", tokenized_scierc["train"][0])


# Fixed label order for NER: "O" -> 0, "B-Chemical" -> 1. This replaces
# the alphabetically sorted `bc5_labels` computed earlier.
bc5_labels = ["O", "B-Chemical"]
label2id_bc5 = {l: i for i, l in enumerate(bc5_labels)}
id2label_bc5 = {i: l for l, i in label2id_bc5.items()}

# Word-level tokenization with labels aligned to sub-tokens.
tokenized_bc5 = bc5cdr_dataset.map(
    lambda x: tokenize_and_align_labels_bc5(x, tokenizer, label2id_bc5),
    batched=True
)

print("Tokenized BC5CDR example:", tokenized_bc5["train"][0])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenized SciERC example: {'text': 'The agreement in question involves number in [[ nouns ]] and << reflexive pronouns >> and is syntactic rather than semantic in nature because grammatical number in English , like grammatical gender in languages such as French , is partly arbitrary .', 'label': 'CONJUNCTION', 'metadata': [7, 7, 9, 10], 'label_id': 1, 'input_ids': [101, 1996, 3820, 1999, 3160, 7336, 2193, 1999, 1031, 1031, 19211, 1033, 1033, 1998, 1026, 1026, 22259, 3512, 26028, 1028, 1028, 1998, 2003, 19962, 2696, 13306, 2738, 2084, 21641, 1999, 3267, 2138, 24402, 2193, 1999, 2394, 1010, 2066, 24402, 5907, 1999, 4155, 2107, 2004, 2413, 1010, 2003, 6576, 15275, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenized BC5CDR example: {'tokens': ['Naloxone', 'reverses', 'the', 'antihypertensive', 'effect', 'of', 'clonidine', '.'], 'ner_tags': ['B-Chemical', 'O', 'O', 'O', 'O', 'O', 'B-Chemical', 'O'], 'input_ids': [101, 6583, 4135, 22500, 2063, 7901, 2015, 1996, 3424, 10536, 4842, 25808, 3512, 3466, 1997, 18856, 10698, 10672, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification


# Sequence classifier: one output per SciERC relation label.
model_scierc = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(scierc_rel_labels),
    id2label=id2label_scierc,
    label2id=label2id_scierc
)


# Token classifier: one output per BC5CDR tag.
model_bc5 = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(bc5_labels),
    id2label=id2label_bc5,
    label2id=label2id_bc5
)

print("Models initialized successfully!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Models initialized successfully!


In [None]:
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np


# Collator for the token-classification batches.
data_collator_bc5 = DataCollatorForTokenClassification(tokenizer)

# No custom collator is set up for the sequence-classification task.
data_collator_scierc = None


# Metric objects used by the compute_metrics_* functions below.
accuracy_metric = evaluate.load("accuracy")
seqeval_metric = evaluate.load("seqeval")


def compute_metrics_scierc(eval_pred):
    """Return the accuracy of sequence-classification predictions.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over axis 1 of the logits.
    """
    scores, gold = eval_pred
    return accuracy_metric.compute(
        predictions=np.argmax(scores, axis=1),
        references=gold,
    )


def compute_metrics_bc5(eval_pred):
    """Return the seqeval result for token-classification predictions.

    ``eval_pred`` unpacks into ``(logits, labels)``. Positions whose gold
    label is ``-100`` are dropped from both sequences; the remaining ids
    are converted to label strings through ``id2label_bc5``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=2)

    references = []
    hypotheses = []
    for pred_row, gold_row in zip(predicted, gold):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        references.append([id2label_bc5[gold_id] for _, gold_id in kept])
        hypotheses.append([id2label_bc5[pred_id] for pred_id, _ in kept])

    return seqeval_metric.compute(predictions=hypotheses, references=references)


In [None]:

from transformers import TrainingArguments


# Training configuration for the SciERC relation classifier: two epochs,
# batches of 8, a loss log entry every 10 steps.
training_args_scierc = TrainingArguments(
    output_dir="scierc-rel-bert",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs_scierc",
    logging_steps=10,
    learning_rate=2e-5,
    do_train=True,
    do_eval=True,
    logging_strategy="steps",
    # Disable external experiment trackers explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none"
)


# Same configuration for the BC5CDR tagger, with its own output folders.
training_args_bc5 = TrainingArguments(
    output_dir="bc5-ner-bert",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs_bc5",
    logging_steps=10,
    learning_rate=2e-5,
    do_train=True,
    do_eval=True,
    logging_strategy="steps",
    # See the note on training_args_scierc.
    report_to="none"
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:

from transformers import Trainer


# Trainer for SciERC relation classification; metrics are computed on the
# validation split.
trainer_scierc = Trainer(
    model=model_scierc,
    args=training_args_scierc,
    train_dataset=tokenized_scierc["train"],
    eval_dataset=tokenized_scierc["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_scierc
)


# Trainer for BC5CDR tagging; the test split doubles as the evaluation set
# because this dataset was loaded without a validation split.
trainer_bc5 = Trainer(
    model=model_bc5,
    args=training_args_bc5,
    train_dataset=tokenized_bc5["train"],
    eval_dataset=tokenized_bc5["test"],
    tokenizer=tokenizer,
    data_collator=data_collator_bc5,
    compute_metrics=compute_metrics_bc5
)

print("Trainers initialized successfully!")


  trainer_scierc = Trainer(


Trainers initialized successfully!


  trainer_bc5 = Trainer(


In [None]:


from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


# Tokenizer for the BERT baseline on SciERC (rebinds the global `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_text(examples):
    """Tokenize ``examples["text"]`` with the module-level ``tokenizer``.

    Both truncation and padding are switched on (``True``).
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding=True)

# Tokenize every split in batches.
tokenized_scierc = scierc_small.map(tokenize_text, batched=True)


def map_labels_to_int(example):
    """Store the integer class id of ``example["label"]`` under ``labels``.

    A label that is absent from ``label2id_scierc`` falls back to id 0
    instead of raising. The example is updated in place and returned.
    """
    label_name = example["label"]
    example["labels"] = label2id_scierc.get(label_name, 0)
    return example

# Add the integer `labels` column to every split.
tokenized_scierc = tokenized_scierc.map(map_labels_to_int)


# Drop the raw columns so that only the tokenizer outputs and `labels`
# remain; columns missing from a split are skipped.
columns_to_remove = ["text", "label", "metadata", "label_id"]
for split in tokenized_scierc.keys():
    tokenized_scierc[split] = tokenized_scierc[split].remove_columns(
        [c for c in columns_to_remove if c in tokenized_scierc[split].column_names]
    )


# Fresh BERT classifier with one output per SciERC relation label.
model_scierc = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(scierc_labels)
)


training_args_scierc = TrainingArguments(
    output_dir="scierc-rel-bert",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="scierc-logs",
    logging_steps=10,
    # Disable external experiment trackers explicitly, as the SciBERT and
    # BC5CDR runs do, instead of relying on the deprecated WANDB_DISABLED
    # environment variable.
    report_to="none"
)


def compute_metrics_scierc(eval_pred):
    """Return accuracy and weighted F1 / precision / recall.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the
            arg-max over axis 1 of the logits.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    # zero_division=0 keeps the value scikit-learn already falls back to for
    # classes with an empty denominator, but without the
    # UndefinedMetricWarning.
    precision = precision_score(labels, preds, average="weighted", zero_division=0)
    recall = recall_score(labels, preds, average="weighted", zero_division=0)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }


# The validation split stays the default evaluation set of the Trainer.
trainer_scierc = Trainer(
    model=model_scierc,
    args=training_args_scierc,
    train_dataset=tokenized_scierc["train"],
    eval_dataset=tokenized_scierc["validation"],
    compute_metrics=compute_metrics_scierc
)


trainer_scierc.train()
# Report on the test split, the same split the SciBERT run is scored on,
# so that the two SciERC results are comparable.
results_scierc = trainer_scierc.evaluate(tokenized_scierc["test"])
print("SciERC Relation Classification Results(BERT):", results_scierc)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,1.7501
20,1.6748


SciERC Relation Classification Results(BERT-BASE): {'eval_loss': 1.5045363903045654, 'eval_accuracy': 0.58, 'eval_f1': 0.4258227848101266, 'eval_precision': 0.33640000000000003, 'eval_recall': 0.58, 'eval_runtime': 0.6324, 'eval_samples_per_second': 158.133, 'eval_steps_per_second': 20.557, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np


# SciBERT classifier with one output per SciERC relation label, plus its
# matching tokenizer.
model_scibert = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(scierc_labels)
)
tokenizer_scibert = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

def tokenize_scierc(example, tokenizer=None):
    """Tokenize ``example["text"]`` to a fixed length of 128 tokens.

    Args:
        example: Mapping (single example or batch) with a ``"text"`` entry.
        tokenizer: Tokenizer to call. Defaults to the module-level
            ``tokenizer_scibert``. This definition rebinds the name of the
            earlier ``tokenize_scierc(examples, tokenizer)``; accepting the
            tokenizer as an optional second argument keeps calls written
            for that signature working after this cell has run.

    Returns:
        Whatever the tokenizer returns.
    """
    active_tokenizer = tokenizer_scibert if tokenizer is None else tokenizer
    return active_tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

# Re-tokenize SciERC with the SciBERT vocabulary.
tokenized_scierc = scierc_small.map(tokenize_scierc, batched=True)


# Reuse the integer ids computed earlier as the `labels` column.
tokenized_scierc = tokenized_scierc.rename_column("label_id", "labels")

# Expose only these three columns, as torch tensors.
tokenized_scierc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 / precision / recall.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis. Precision and recall are computed with
    ``zero_division=0``. All keys carry an ``eval_`` prefix.
    """
    scores, gold = eval_pred
    guessed = np.argmax(scores, axis=-1)

    report = {"eval_accuracy": accuracy_score(gold, guessed)}
    report["eval_f1"] = f1_score(gold, guessed, average="weighted")
    report["eval_precision"] = precision_score(
        gold, guessed, average="weighted", zero_division=0
    )
    report["eval_recall"] = recall_score(
        gold, guessed, average="weighted", zero_division=0
    )
    return report


# Same schedule as the BERT baseline; experiment trackers are disabled.
training_args_scibert = TrainingArguments(
    output_dir="scierc-rel-scibert",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)


trainer_scibert = Trainer(
    model=model_scibert,
    args=training_args_scibert,
    train_dataset=tokenized_scierc["train"],
    eval_dataset=tokenized_scierc["validation"],
    tokenizer=tokenizer_scibert,
    compute_metrics=compute_metrics
)

trainer_scibert.train()
# The final numbers are computed on the test split, not the validation
# split configured above.
metrics_scibert = trainer_scibert.evaluate(tokenized_scierc["test"])
print("SciERC Relation Classification Results (SciBERT):", metrics_scibert)


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer_scibert = Trainer(


Step,Training Loss
10,1.6218
20,1.5579


SciERC Relation Classification Results (SciBERT): {'eval_accuracy': 0.64, 'eval_f1': 0.4995121951219512, 'eval_precision': 0.4096, 'eval_recall': 0.64, 'eval_loss': 1.288955569267273, 'eval_runtime': 0.8009, 'eval_samples_per_second': 124.857, 'eval_steps_per_second': 16.231, 'epoch': 2.0}


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
import evaluate


tokenizer_bert_bc5 = AutoTokenizer.from_pretrained("bert-base-uncased")

# Label inventory of the train split in sorted order, with the id mappings
# derived from it (rebinds the global label2id_bc5 / id2label_bc5).
all_bc5_labels = sorted({label for ex in bc5cdr_dataset["train"]["ner_tags"] for label in ex})
label2id_bc5 = dict(zip(all_bc5_labels, range(len(all_bc5_labels))))
id2label_bc5 = {idx: name for name, idx in label2id_bc5.items()}

# One classifier output per distinct label.
model_bert_bc5 = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(all_bc5_labels)
)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words with the BERT tokenizer and align labels.

    Uses the module-level ``tokenizer_bert_bc5`` and ``label2id_bc5``. The
    first sub-token of each word carries that word's label id; positions
    with no word and continuation sub-tokens carry ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoding = tokenizer_bert_bc5(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    all_label_ids = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        word_ids = encoding.word_ids(batch_index=batch_idx)
        # Pair each position with the word id of the position before it so
        # that continuation sub-tokens can be spotted.
        preceding = [None, *word_ids[:-1]]
        all_label_ids.append([
            -100 if wid is None or wid == before else label2id_bc5[word_labels[wid]]
            for wid, before in zip(word_ids, preceding)
        ])

    encoding["labels"] = all_label_ids
    return encoding


# Tokenize both splits without fixed-length padding.
tokenized_bc5 = bc5cdr_dataset.map(tokenize_and_align_labels, batched=True)


# Collator for the token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer_bert_bc5)

# Entity-level metric used by compute_metrics below.
seqeval = evaluate.load("seqeval")
def compute_metrics(p):
    """Return overall seqeval precision, recall, F1 and accuracy.

    ``p`` unpacks into ``(logits, labels)``. Positions whose gold label is
    ``-100`` are dropped from both sequences; the remaining ids are
    converted to label strings through ``id2label_bc5``.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    references = []
    hypotheses = []
    for pred_row, gold_row in zip(predicted, gold):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        references.append([id2label_bc5[gold_id] for _, gold_id in kept])
        hypotheses.append([id2label_bc5[pred_id] for pred_id, _ in kept])

    results = seqeval.compute(predictions=hypotheses, references=references)
    # Keep only the flat, overall scores.
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }


# Two epochs, batches of 8; no learning rate is passed here, so the
# TrainingArguments default applies.
training_args_bc5 = TrainingArguments(
    output_dir="bc5-bert",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="bc5-logs",
    logging_steps=10,
    report_to="none"
)


# The test split is used for evaluation; this dataset has no validation
# split in the notebook.
trainer_bc5_bert = Trainer(
    model=model_bert_bc5,
    args=training_args_bc5,
    train_dataset=tokenized_bc5["train"],
    eval_dataset=tokenized_bc5["test"],
    tokenizer=tokenizer_bert_bc5,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


trainer_bc5_bert.train()
bc5_bert_results = trainer_bc5_bert.evaluate(tokenized_bc5["test"])
print("BC5CDR NER Results (BERT):", bc5_bert_results)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer_bc5_bert = Trainer(


Step,Training Loss
10,0.3331
20,0.1228


BC5CDR NER Results (BERT): {'eval_loss': 0.1274339109659195, 'eval_precision': 0.6018957345971564, 'eval_recall': 0.8141025641025641, 'eval_f1': 0.6920980926430517, 'eval_accuracy': 0.9341491841491841, 'eval_runtime': 0.3992, 'eval_samples_per_second': 250.522, 'eval_steps_per_second': 32.568, 'epoch': 2.0}


In [None]:

# SciBERT tokenizer and token classifier; the output size reuses the label
# inventory computed for the BERT run.
tokenizer_scibert_bc5 = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model_scibert_bc5 = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(all_bc5_labels)
)


def tokenize_and_align_labels_scibert(examples):
    """Tokenize pre-split words with the SciBERT tokenizer and align labels.

    Uses the module-level ``tokenizer_scibert_bc5`` and ``label2id_bc5``.
    The first sub-token of each word carries that word's label id;
    positions with no word and continuation sub-tokens carry ``-100``.

    Args:
        examples: Batch with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of string labels, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_inputs = tokenizer_scibert_bc5(
        examples["tokens"],
        truncation=True,
        # This tokenizer reports no predefined maximum length, so
        # truncation=True alone falls back to "no truncation" (see the
        # warning in this cell's output). 512 is the input limit of
        # BERT-style encoders such as SciBERT.
        max_length=512,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: no word behind it.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word.
                label_ids.append(label2id_bc5[label[word_idx]])
            else:
                # Continuation sub-token of the same word.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits with the SciBERT vocabulary.
tokenized_bc5_scibert = bc5cdr_dataset.map(tokenize_and_align_labels_scibert, batched=True)


# Collator for the token-classification batches.
data_collator_scibert = DataCollatorForTokenClassification(tokenizer_scibert_bc5)


# Same schedule as the BERT run, with separate output folders.
training_args_bc5_scibert = TrainingArguments(
    output_dir="bc5-scibert",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="bc5-scibert-logs",
    logging_steps=10,
    report_to="none"
)


# `compute_metrics` is the seqeval-based function defined for the BERT run.
trainer_bc5_scibert = Trainer(
    model=model_scibert_bc5,
    args=training_args_bc5_scibert,
    train_dataset=tokenized_bc5_scibert["train"],
    eval_dataset=tokenized_bc5_scibert["test"],
    tokenizer=tokenizer_scibert_bc5,
    data_collator=data_collator_scibert,
    compute_metrics=compute_metrics
)

trainer_bc5_scibert.train()
bc5_scibert_results = trainer_bc5_scibert.evaluate(tokenized_bc5_scibert["test"])
print("BC5CDR NER Results (SciBERT):", bc5_scibert_results)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer_bc5_scibert = Trainer(


Step,Training Loss
10,0.3309
20,0.1066


BC5CDR NER Results (SciBERT): {'eval_loss': 0.08862539380788803, 'eval_precision': 0.7584269662921348, 'eval_recall': 0.8653846153846154, 'eval_f1': 0.8083832335329341, 'eval_accuracy': 0.9627039627039627, 'eval_runtime': 0.3854, 'eval_samples_per_second': 259.489, 'eval_steps_per_second': 33.734, 'epoch': 2.0}
