<a href="https://colab.research.google.com/github/TasOishe/NLP/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation of BERT and SciBERT for Relation Classification and Named Entity Recognition



In [None]:
!pip install transformers datasets seqeval evaluate


In [None]:
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    DataCollatorForTokenClassification, TrainingArguments, Trainer
)
from datasets import load_dataset, Dataset
import numpy as np
import evaluate
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Fetch the SciERC corpus published as "nsusemiehl/SciERC".
scierc = load_dataset("nsusemiehl/SciERC")

# Keep only the first 100 rows of every split so later cells run quickly.
scierc_train_sample, scierc_dev_sample, scierc_test_sample = (
    scierc[split_name].select(range(100))
    for split_name in ("train", "validation", "test")
)

from datasets import DatasetDict

scierc_small = DatasetDict(
    {
        "train": scierc_train_sample,
        "validation": scierc_dev_sample,
        "test": scierc_test_sample,
    }
)

# Print the container summary followed by the first record of each split.
print(scierc_small)
for split_name in ("train", "validation", "test"):
    print(scierc_small[split_name][0])


In [None]:
from google.colab import drive
import json
from pathlib import Path


def _read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        path: Location of a UTF-8 text file with one JSON document per line.

    Returns:
        The decoded documents in file order. Whitespace-only lines are
        skipped instead of being handed to ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        # A stray blank line (e.g. at the end of the file) is not a record
        # and would otherwise raise json.JSONDecodeError.
        return [json.loads(line) for line in handle if line.strip()]


# Make Google Drive visible under /content/drive (Colab runtime only).
drive.mount('/content/drive')

# Folder on Drive holding the BC5CDR files.
bc5cdr_path = Path("/content/drive/MyDrive/BC5CDR")

train_file = bc5cdr_path / "train.json"
test_file = bc5cdr_path / "test.json"

# Both files are read line by line, one JSON record per line.
train_data = _read_jsonl(train_file)
test_data = _read_jsonl(test_file)

print("Number of training samples:", len(train_data))
print("Number of test samples:", len(test_data))
print("First training sample:", train_data[0])

In [None]:
from datasets import Dataset, DatasetDict


# Wrap the raw record lists as one Hugging Face dataset per split.
bc5cdr_dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(test_data),
    }
)

# Trim each split to its first 100 rows.
for split_name in ("train", "test"):
    bc5cdr_dataset[split_name] = bc5cdr_dataset[split_name].select(range(100))

print(bc5cdr_dataset["train"][0])

In [None]:
def bc5_tags_to_bio(example):
    """Convert one record's integer tags into string NER labels.

    Args:
        example: Mapping with a "tokens" entry and an iterable "tags" entry.

    Returns:
        A new dict holding the original "tokens" value and a "ner_tags" list
        with "O" for every tag equal to 0 and "B-Chemical" for any other tag.
    """
    # NOTE(review): every non-zero tag id collapses to "B-Chemical"; confirm
    # the source tag scheme has no other entity types or continuation tags.
    ner_tags = ["O" if tag == 0 else "B-Chemical" for tag in example["tags"]]
    return {"tokens": example["tokens"], "ner_tags": ner_tags}


# Rebuild the BC5CDR dataset from the first 100 raw records of each split,
# this time with string labels produced by bc5_tags_to_bio.
train_records = list(map(bc5_tags_to_bio, train_data[:100]))
test_records = list(map(bc5_tags_to_bio, test_data[:100]))

bc5cdr_dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_records),
        "test": Dataset.from_list(test_records),
    }
)

print(bc5cdr_dataset["train"][0])
print(bc5cdr_dataset["train"].column_names)

In [None]:
def get_label_list(dataset, label_column):
    """Return the sorted distinct labels found in the training split.

    Args:
        dataset: Mapping whose "train" entry can be indexed by column name.
        label_column: Name of the column to scan.

    Returns:
        A sorted list of the distinct labels. Rows that are lists contribute
        each of their elements; any other row contributes itself.
    """
    seen = set()
    for entry in dataset["train"][label_column]:
        # Per-token columns hold a list per row; flat columns hold one value.
        seen |= set(entry) if isinstance(entry, list) else {entry}
    return sorted(seen)


# Distinct values of the "label" column in the SciERC training subset.
scierc_labels = get_label_list(scierc_small, "label")


# Distinct per-token labels in the BC5CDR training subset.
bc5_labels = get_label_list(bc5cdr_dataset, "ner_tags")

print("SciERC labels:", scierc_labels)
print("BC5 labels:", bc5_labels)


In [None]:
# Inspect the SciERC schema: column names and the first training record.
print("SciERC columns:", scierc_small["train"].column_names)
print("First train example:", scierc_small["train"][0])


In [None]:
def build_label_list_from_column(dataset, column):
    """Return the sorted distinct values of a training-split column.

    Args:
        dataset: Mapping whose "train" entry can be indexed by column name.
        column: Name of the column whose values are collected.

    Returns:
        A sorted list with one entry per distinct value in the column.
    """
    distinct_values = list(set(dataset["train"][column]))
    distinct_values.sort()
    return distinct_values

# Sorted relation labels present in the SciERC training subset.
scierc_rel_labels = build_label_list_from_column(scierc_small, "label")
print("SciERC relation labels:", scierc_rel_labels)

# Position in the sorted list is the class id; build both lookup directions.
id2label_scierc = dict(enumerate(scierc_rel_labels))
label2id_scierc = {name: idx for idx, name in id2label_scierc.items()}


def map_scierc_label_to_id(example):
    """Store the integer class id of a SciERC row under "label_id".

    Args:
        example: Mutable mapping with a "label" entry that is a key of the
            module-level ``label2id_scierc`` dict.

    Returns:
        The same ``example`` object, updated in place.

    Raises:
        KeyError: If the row's label is not a key of ``label2id_scierc``.
    """
    example["label_id"] = label2id_scierc[example["label"]]
    return example

# Add the integer "label_id" column to every split, then show one mapped row.
scierc_small = scierc_small.map(map_scierc_label_to_id)
print("Example after mapping:", scierc_small["train"][0])


In [None]:
def tokenize_scierc(examples, tokenizer):
    """Tokenize SciERC texts into fixed-length inputs.

    Args:
        examples: Mapping with a "text" entry (passed to the tokenizer as is).
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``
            and ``max_length`` keyword arguments.

    Returns:
        Whatever ``tokenizer`` returns for the texts, truncated and padded to
        128 tokens.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)


def tokenize_and_align_labels_bc5(examples, tokenizer, label2id):
    """Tokenize pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per row) and
            "ner_tags" (one label list per row, indexed by word position).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.
        label2id: Mapping from label string to integer id.

    Returns:
        The tokenizer output with an added "labels" entry: one list per row
        carrying the label id on the first sub-token of each word and -100
        everywhere else.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Tokens with no source word (word id None) and the trailing
            # pieces of a split word both receive the sentinel -100.
            if word is None or word == last_word:
                row_labels.append(-100)
            else:
                row_labels.append(label2id[word_tags[word]])
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded


In [None]:
# BERT tokenizer shared by both tasks in this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize every SciERC split in batches.
tokenized_scierc = scierc_small.map(
    lambda batch: tokenize_scierc(batch, tokenizer),
    batched=True,
)

print("Tokenized SciERC example:", tokenized_scierc["train"][0])

# Fixed BC5CDR label inventory; the list position is the class id.
bc5_labels = ["O", "B-Chemical"]
id2label_bc5 = dict(enumerate(bc5_labels))
label2id_bc5 = {name: idx for idx, name in id2label_bc5.items()}

# Tokenize BC5CDR and attach per-token label ids.
tokenized_bc5 = bc5cdr_dataset.map(
    lambda batch: tokenize_and_align_labels_bc5(batch, tokenizer, label2id_bc5),
    batched=True,
)

print("Tokenized BC5CDR example:", tokenized_bc5["train"][0])


In [None]:
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification


# Sequence-classification model for SciERC: one output class per relation
# label, with both label lookup tables passed to from_pretrained.
model_scierc = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(scierc_rel_labels),
    id2label=id2label_scierc,
    label2id=label2id_scierc
)


# Token-classification model for BC5CDR, sized by the bc5_labels list.
model_bc5 = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(bc5_labels),
    id2label=id2label_bc5,
    label2id=label2id_bc5
)

print("Models initialized successfully!")


In [None]:
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np


# Collator used for the BC5CDR token-classification batches.
data_collator_bc5 = DataCollatorForTokenClassification(tokenizer)

# No dedicated collator is configured for SciERC.
data_collator_scierc = None


# Metric objects used by the compute_metrics_* functions below.
accuracy_metric = evaluate.load("accuracy")
seqeval_metric = evaluate.load("seqeval")


def compute_metrics_scierc(eval_pred):
    """Compute accuracy for sequence-classification predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``; the arg-max over axis 1 of
            ``logits`` is taken as the predicted class.

    Returns:
        The result of ``accuracy_metric.compute`` for those predictions.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted, references=gold)


def compute_metrics_bc5(eval_pred):
    """Score token-classification predictions with seqeval.

    Args:
        eval_pred: Pair ``(logits, labels)``; the arg-max over axis 2 of
            ``logits`` is the predicted label id for each token.

    Returns:
        The result of ``seqeval_metric.compute`` over label strings, with
        every position whose gold label is -100 left out.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=2)

    gold_tags = []
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label, then map ids to names.
        gold_tags.append([id2label_bc5[g] for g in gold_row if g != -100])
        predicted_tags.append(
            [id2label_bc5[q] for q, g in zip(predicted_row, gold_row) if g != -100]
        )

    return seqeval_metric.compute(predictions=predicted_tags, references=gold_tags)


In [None]:
from transformers import TrainingArguments


# Hyper-parameters shared by both fine-tuning configurations.
_common_training_kwargs = dict(
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    learning_rate=2e-5,
    do_train=True,
    do_eval=True,
    logging_strategy="steps",
)

# SciERC relation classification.
training_args_scierc = TrainingArguments(
    output_dir="scierc-rel-bert",
    logging_dir="./logs_scierc",
    **_common_training_kwargs,
)

# BC5CDR named entity recognition.
training_args_bc5 = TrainingArguments(
    output_dir="bc5-ner-bert",
    logging_dir="./logs_bc5",
    **_common_training_kwargs,
)


In [None]:
from transformers import Trainer


# Trainer for SciERC relation classification with the BERT model.
# NOTE(review): at this point tokenized_scierc stores the integer class under
# "label_id" (added by map_scierc_label_to_id) next to the original "label"
# column; confirm the Trainer receives the target column it expects before
# calling train() on this object.
trainer_scierc = Trainer(
    model=model_scierc,
    args=training_args_scierc,
    train_dataset=tokenized_scierc["train"],
    eval_dataset=tokenized_scierc["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_scierc
)


# Trainer for BC5CDR NER; the "test" split doubles as the evaluation set.
trainer_bc5 = Trainer(
    model=model_bc5,
    args=training_args_bc5,
    train_dataset=tokenized_bc5["train"],
    eval_dataset=tokenized_bc5["test"],
    tokenizer=tokenizer,
    data_collator=data_collator_bc5,
    compute_metrics=compute_metrics_bc5
)

print("Trainers initialized successfully!")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_text(examples):
    """Tokenize a batch of SciERC texts into fixed-length BERT inputs.

    Args:
        examples: Batch mapping with a "text" entry.

    Returns:
        The tokenizer output, truncated and padded to 128 tokens per row.
    """
    # Pad to a fixed length rather than `padding=True`: inside a batched
    # `map` the latter pads only to the longest text of the current map
    # batch, so rows from different map batches end up with different lengths
    # and cannot be stacked by a collator that does no padding of its own.
    # 128 matches the other SciERC tokenization functions in this notebook.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

tokenized_scierc = scierc_small.map(tokenize_text, batched=True)


def map_labels_to_int(example):
    """Store the integer class id of a SciERC row under "labels".

    Args:
        example: Mutable mapping with a "label" entry.

    Returns:
        The same ``example`` object, updated in place.

    Raises:
        KeyError: If the row's label is not a key of ``label2id_scierc``.
    """
    label = example["label"]
    # Fail loudly on an unknown label: falling back to class 0 would silently
    # train and evaluate on a wrong target.
    if label not in label2id_scierc:
        raise KeyError(f"Unknown SciERC label {label!r}; not present in label2id_scierc")
    example["labels"] = label2id_scierc[label]
    return example

# Add the integer "labels" column to every split.
tokenized_scierc = tokenized_scierc.map(map_labels_to_int)


# Drop the listed columns from each split, skipping any that a split does
# not have.
columns_to_remove = ["text", "label", "metadata", "label_id"]
for split in tokenized_scierc.keys():
    tokenized_scierc[split] = tokenized_scierc[split].remove_columns(
        [c for c in columns_to_remove if c in tokenized_scierc[split].column_names]
    )


# Fresh BERT sequence classifier with one class per entry of scierc_labels.
model_scierc = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(scierc_labels)
)


# Training configuration for the BERT SciERC run (replaces the earlier
# training_args_scierc object).
training_args_scierc = TrainingArguments(
    output_dir="scierc-rel-bert",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="scierc-logs",
    logging_steps=10
)


def compute_metrics_scierc(eval_pred):
    """Compute accuracy and weighted F1/precision/recall for SciERC.

    Args:
        eval_pred: Pair ``(logits, labels)``; the arg-max over axis 1 of
            ``logits`` is taken as the predicted class.

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    # zero_division=0 gives the same score as the default for a class that
    # is never predicted or never present, without the UndefinedMetricWarning;
    # this matches the SciBERT metrics function later in the notebook.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }


# Trainer for the BERT SciERC run; no tokenizer or data collator is passed.
trainer_scierc = Trainer(
    model=model_scierc,
    args=training_args_scierc,
    train_dataset=tokenized_scierc["train"],
    eval_dataset=tokenized_scierc["validation"],
    compute_metrics=compute_metrics_scierc
)


# Fine-tune, then evaluate; evaluate() is called without a dataset argument,
# so the eval_dataset given above (the validation split) is scored.
trainer_scierc.train()
results_scierc = trainer_scierc.evaluate()
print("SciERC Relation Classification Results(BERT):", results_scierc)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np


# SciBERT sequence classifier with one class per entry of scierc_labels,
# plus the tokenizer loaded from the same checkpoint.
model_scibert = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(scierc_labels)
)
tokenizer_scibert = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

def tokenize_scierc(example):
    """Tokenize SciERC texts with the SciBERT tokenizer.

    Args:
        example: Mapping with a "text" entry.

    Returns:
        The output of ``tokenizer_scibert`` for the texts, truncated and
        padded to 128 tokens.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    texts = example["text"]
    return tokenizer_scibert(texts, **encode_options)

# Re-tokenize SciERC with the SciBERT tokenizer (replaces the BERT version).
tokenized_scierc = scierc_small.map(tokenize_scierc, batched=True)


# Expose the integer class ids from "label_id" under the name "labels".
tokenized_scierc = tokenized_scierc.rename_column("label_id", "labels")

# Return torch tensors and restrict output to the three listed columns.
tokenized_scierc.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1/precision/recall for SciERC.

    Args:
        eval_pred: Pair ``(logits, labels)``; the arg-max over the last axis
            of ``logits`` is taken as the predicted class.

    Returns:
        Dict with the keys "eval_accuracy", "eval_f1", "eval_precision" and
        "eval_recall".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    # f1_score gets the same zero_division=0 as precision and recall, so an
    # undefined per-class score is reported as 0 without a warning.
    f1 = f1_score(labels, predictions, average="weighted", zero_division=0)
    prec = precision_score(labels, predictions, average="weighted", zero_division=0)
    rec = recall_score(labels, predictions, average="weighted", zero_division=0)
    return {
        "eval_accuracy": acc,
        "eval_f1": f1,
        "eval_precision": prec,
        "eval_recall": rec
    }


# Training configuration for the SciBERT SciERC run.
# NOTE(review): this run trains for 50 epochs while the BERT run on the same
# task (output_dir "scierc-rel-bert") trains for 10; confirm the difference
# is intended before comparing the two results.
training_args_scibert = TrainingArguments(
    output_dir="scierc-rel-scibert",
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)


trainer_scibert = Trainer(
    model=model_scibert,
    args=training_args_scibert,
    train_dataset=tokenized_scierc["train"],
    eval_dataset=tokenized_scierc["validation"],
    tokenizer=tokenizer_scibert,
    compute_metrics=compute_metrics
)

# Fine-tune, then score the test split explicitly.
# NOTE(review): the BERT run calls evaluate() without arguments and therefore
# reports on the validation split; confirm which split should be compared.
trainer_scibert.train()
metrics_scibert = trainer_scibert.evaluate(tokenized_scierc["test"])
print("SciERC Relation Classification Results (SciBERT):", metrics_scibert)


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
import evaluate


tokenizer_bert_bc5 = AutoTokenizer.from_pretrained("bert-base-uncased")

# Collect the label inventory once from the training split. The sorted order
# fixes the class ids, and the same list sizes the classifier head below.
all_bc5_labels = sorted(
    {label for ex in bc5cdr_dataset["train"]["ner_tags"] for label in ex}
)
label2id_bc5 = {l: i for i, l in enumerate(all_bc5_labels)}
id2label_bc5 = {i: l for l, i in label2id_bc5.items()}

# Pass both lookup tables so the model config records the label names, as
# the earlier model_bc5 initialisation does.
model_bert_bc5 = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(all_bc5_labels),
    id2label=id2label_bc5,
    label2id=label2id_bc5,
)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences with BERT and align labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per row) and
            "ner_tags" (one label-string list per row, indexed by word).

    Returns:
        The ``tokenizer_bert_bc5`` output with an added "labels" entry: the
        ``label2id_bc5`` id on the first sub-token of each word and -100 on
        every other position.
    """
    encoding = tokenizer_bert_bc5(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    all_label_ids = []
    for batch_pos, tags in enumerate(examples["ner_tags"]):
        ids_for_row = []
        prev = None
        for current in encoding.word_ids(batch_index=batch_pos):
            # Only the first piece of a real word carries that word's label.
            starts_new_word = current is not None and current != prev
            ids_for_row.append(label2id_bc5[tags[current]] if starts_new_word else -100)
            prev = current
        all_label_ids.append(ids_for_row)

    encoding["labels"] = all_label_ids
    return encoding


# Tokenize BC5CDR with the BERT tokenizer and attach aligned label ids.
tokenized_bc5 = bc5cdr_dataset.map(tokenize_and_align_labels, batched=True)


# Collator built from the same tokenizer; the tokenization above requests no
# padding, so batching is left to this object.
data_collator = DataCollatorForTokenClassification(tokenizer_bert_bc5)

# seqeval metric used by compute_metrics below.
seqeval = evaluate.load("seqeval")
def compute_metrics(p):
    """Summarise seqeval scores for token-classification predictions.

    Args:
        p: Pair ``(predictions, labels)``; the arg-max over axis 2 of
            ``predictions`` is the predicted label id for each token.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding ``overall_*`` entries of the seqeval result. Positions
        whose gold label is -100 are excluded.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    gold_tags = []
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        gold_tags.append([id2label_bc5[g] for g in gold_row if g != -100])
        predicted_tags.append(
            [id2label_bc5[q] for q, g in zip(predicted_row, gold_row) if g != -100]
        )

    report = seqeval.compute(predictions=predicted_tags, references=gold_tags)
    # Short metric name -> key in the seqeval report.
    wanted = {
        "precision": "overall_precision",
        "recall": "overall_recall",
        "f1": "overall_f1",
        "accuracy": "overall_accuracy",
    }
    return {short: report[full] for short, full in wanted.items()}


# Training configuration for the BERT BC5CDR run.
# NOTE(review): this run trains for 50 epochs while the SciBERT BC5CDR run
# (output_dir "bc5-scibert") trains for 10; confirm the difference is
# intended before comparing the two results.
training_args_bc5 = TrainingArguments(
    output_dir="bc5-bert",
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="bc5-logs",
    logging_steps=10,
    report_to="none"
)


# The "test" split is used both as eval_dataset and for the final evaluation.
trainer_bc5_bert = Trainer(
    model=model_bert_bc5,
    args=training_args_bc5,
    train_dataset=tokenized_bc5["train"],
    eval_dataset=tokenized_bc5["test"],
    tokenizer=tokenizer_bert_bc5,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


# Fine-tune, then report metrics on the test split.
trainer_bc5_bert.train()
bc5_bert_results = trainer_bc5_bert.evaluate(tokenized_bc5["test"])
print("BC5CDR NER Results (BERT):", bc5_bert_results)


In [None]:

# SciBERT tokenizer and token classifier; the head size reuses the label
# inventory (all_bc5_labels) built in the BERT BC5CDR cell.
tokenizer_scibert_bc5 = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model_scibert_bc5 = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(all_bc5_labels)
)


def tokenize_and_align_labels_scibert(examples):
    """Tokenize pre-split sentences with SciBERT and align labels.

    Args:
        examples: Batch mapping with "tokens" (one word list per row) and
            "ner_tags" (one label-string list per row, indexed by word).

    Returns:
        The ``tokenizer_scibert_bc5`` output with an added "labels" entry:
        the ``label2id_bc5`` id on the first sub-token of each word and -100
        on every other position.
    """
    tokenized = tokenizer_scibert_bc5(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for row, tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=row)
        # Pair every token's word id with the word id of the token before it
        # (None before the first token).
        preceding = [None, *word_ids[:-1]]
        labels.append(
            [
                -100 if wid is None or wid == before else label2id_bc5[tags[wid]]
                for wid, before in zip(word_ids, preceding)
            ]
        )

    tokenized["labels"] = labels
    return tokenized

# Tokenize BC5CDR with the SciBERT tokenizer and attach aligned label ids.
tokenized_bc5_scibert = bc5cdr_dataset.map(tokenize_and_align_labels_scibert, batched=True)


# Collator built from the SciBERT tokenizer.
data_collator_scibert = DataCollatorForTokenClassification(tokenizer_scibert_bc5)


# Training configuration for the SciBERT BC5CDR run.
training_args_bc5_scibert = TrainingArguments(
    output_dir="bc5-scibert",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="bc5-scibert-logs",
    logging_steps=10,
    report_to="none"
)


# NOTE(review): compute_metrics is resolved at this point in the notebook;
# this assumes the most recently executed definition is the seqeval-based one
# from the BERT BC5CDR cell — confirm the cell execution order.
trainer_bc5_scibert = Trainer(
    model=model_scibert_bc5,
    args=training_args_bc5_scibert,
    train_dataset=tokenized_bc5_scibert["train"],
    eval_dataset=tokenized_bc5_scibert["test"],
    tokenizer=tokenizer_scibert_bc5,
    data_collator=data_collator_scibert,
    compute_metrics=compute_metrics
)

# Fine-tune, then report metrics on the test split.
trainer_bc5_scibert.train()
bc5_scibert_results = trainer_bc5_scibert.evaluate(tokenized_bc5_scibert["test"])
print("BC5CDR NER Results (SciBERT):", bc5_scibert_results)
