In [5]:
import warnings
warnings.filterwarnings("ignore")
import torch

print("CUDA Available:", torch.cuda.is_available())

CUDA Available: True


## Load and Preprocess Data
Parse the .conll files into a format suitable for training.

In [7]:
def load_conll_data(file_path):
    sentences, labels = [], []
    with open(file_path, 'r') as file:
        sentence, label = [], []
        for line in file:
            line = line.strip()
            if line == "":
                if sentence:  # If the sentence is not empty, add it
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.split()
                if len(parts) == 1:  # Handle lines with a single token
                    token = parts[0]
                    tag = "O"  # Assign a default tag
                    sentence.append(token)
                    label.append(tag)
                elif len(parts) >= 2:  # Handle lines with at least two columns
                    token = parts[0]
                    tag = parts[-1]
                    sentence.append(token)
                    label.append(tag)
                else:
                    print(f"Skipping invalid line: {line}")
        if sentence:  # Add the last sentence if the file doesn't end with a blank line
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

train_sentences, train_labels = load_conll_data("train.conll")
test_sentences, test_labels = load_conll_data("test.conll")

## Train a CRF Baseline
Use the CRF model with token-level features.

In [12]:
import sklearn_crfsuite
from sklearn_crfsuite import CRF, metrics
from sklearn.model_selection import train_test_split
import pandas as pd

# Define CRF features
def extract_features(sentence, index):
    word = sentence[index]
    features = {
        "word.lower()": word.lower(),
        "word[-3:]": word[-3:],
        "word[-2:]": word[-2:],
        "word.isupper()": word.isupper(),
        "word.istitle()": word.istitle(),
        "word.isdigit()": word.isdigit(),
    }
    if index > 0:
        features.update({
            "-1:word.lower()": sentence[index - 1].lower(),
            "-1:word.istitle()": sentence[index - 1].istitle(),
            "-1:word.isupper()": sentence[index - 1].isupper(),
        })
    else:
        features["BOS"] = True  # Beginning of sentence

    if index < len(sentence) - 1:
        features.update({
            "+1:word.lower()": sentence[index + 1].lower(),
            "+1:word.istitle()": sentence[index + 1].istitle(),
            "+1:word.isupper()": sentence[index + 1].isupper(),
        })
    else:
        features["EOS"] = True  # End of sentence
    return features

def sentence_to_features(sentence):
    return [extract_features(sentence, i) for i in range(len(sentence))]

# Convert training and testing sentences to feature format
X_train = [sentence_to_features(s) for s in train_sentences]
y_train = train_labels
X_test = [sentence_to_features(s) for s in test_sentences]
y_test = test_labels

# Ensure consistent lengths of X_test and y_test
min_len = min(len(X_test), len(y_test))
X_test = X_test[:min_len]
y_test = y_test[:min_len]

# Train the CRF model
crf = CRF(algorithm="lbfgs", c1=0.5, c2=0.01, max_iterations=2)

crf.fit(X_train, y_train)

# Predict on the test set
y_pred = [crf.predict([xseq])[0] for xseq in X_test]

# Evaluate the model and print accuracy
report = metrics.flat_classification_report(y_test, y_pred, output_dict=True)
accuracy = report['accuracy']
print(f"Accuracy: {accuracy}")

Accuracy: 0.9947930939983557


## Train a BiLSTM-CRF
Use the Flair library for the BiLSTM-CRF model.

In [16]:
import logging
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings, CharacterEmbeddings, StackedEmbeddings

# Set logging level to suppress output during training
logging.basicConfig(level=logging.ERROR)

# Define column mapping for the NER data
# The first column is the text and the second column contains the NER tags
columns = {0: "text", 1: "ner"}

# Specify the folder where the .conll files are stored
data_folder = "/content/"

# Load the corpus with train, test, and dev files in .conll format
corpus = ColumnCorpus(data_folder, columns,
                      train_file="train.conll", test_file="test.conll")

# Define embeddings (you can use different types depending on your task)
embedding_types = [
    WordEmbeddings("glove"),  # Use 'glove', 'bert', or other embedding types available in Flair
    CharacterEmbeddings(),    # Character-level embeddings
]

# Stack embeddings for the model
embeddings = StackedEmbeddings(embeddings=embedding_types)

# Initialize the BiLSTM-CRF model with appropriate parameters
tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                        tag_dictionary=corpus.make_tag_dictionary(tag_type="ner"),
                        tag_type="ner", use_crf=True)

# Train the model using the training data
trainer = ModelTrainer(tagger, corpus)

# Train the model with suppressed logging output
trainer.train("resources/taggers/bilstm-crf", max_epochs=1)

# Evaluate the model on the test set
result = tagger.evaluate(corpus.test, gold_label_type="ner")

# Print the accuracy
print(f"Accuracy: {result.detailed_results}")

2025-01-05 21:08:48,498 Reading data from /content
2025-01-05 21:08:48,503 Train: /content/train.conll
2025-01-05 21:08:48,509 Dev: None
2025-01-05 21:08:48,511 Test: /content/test.conll
2025-01-05 21:08:51,418 No dev split found. Using 10% (i.e. 572 samples) of the train split as dev data
2025-01-05 21:08:58,249 SequenceTagger predicts: Dictionary with 3 tags: O, <START>, <STOP>
2025-01-05 21:08:58,265 ----------------------------------------------------------------------------------------------------
2025-01-05 21:08:58,266 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=150, out_features=150, bias=True)
  (rnn):

100%|██████████| 9/9 [00:03<00:00,  2.56it/s]

2025-01-05 21:10:16,270 DEV : loss 5.559390459097813e-08 - f1-score (micro avg)  0.0
2025-01-05 21:10:16,311  - 0 epochs without improvement





2025-01-05 21:10:17,148 ----------------------------------------------------------------------------------------------------
2025-01-05 21:10:17,154 Testing using last state of model ...


100%|██████████| 18/18 [00:06<00:00,  2.76it/s]

2025-01-05 21:10:23,721 
Results:
- F-score (micro) 0.0
- F-score (macro) 0.0
- Accuracy 0.0

By class:
              precision    recall  f1-score   support

    10033371     0.0000    0.0000    0.0000      47.0
           0     0.0000    0.0000    0.0000      33.0
    10028411     0.0000    0.0000    0.0000      18.0
    10016256     0.0000    0.0000    0.0000      14.0
    10016766     0.0000    0.0000    0.0000       9.0
    10043890     0.0000    0.0000    0.0000       8.0
    10023614     0.0000    0.0000    0.0000       8.0
    10033473     0.0000    0.0000    0.0000       7.0
    10028294     0.0000    0.0000    0.0000       7.0
    10040617     0.0000    0.0000    0.0000       7.0
    10019211     0.0000    0.0000    0.0000       7.0
    10012378     0.0000    0.0000    0.0000       6.0
    10016974     0.0000    0.0000    0.0000       6.0
    10028350     0.0000    0.0000    0.0000       6.0
    10023477     0.0000    0.0000    0.0000       6.0
    10033432     0.0000    0.00


100%|██████████| 35/35 [00:06<00:00,  5.10it/s]

Accuracy: 
Results:
- F-score (micro) 0.0
- F-score (macro) 0.0
- Accuracy 0.0

By class:
              precision    recall  f1-score   support

    10033371     0.0000    0.0000    0.0000      47.0
           0     0.0000    0.0000    0.0000      33.0
    10028411     0.0000    0.0000    0.0000      18.0
    10016256     0.0000    0.0000    0.0000      14.0
    10016766     0.0000    0.0000    0.0000       9.0
    10043890     0.0000    0.0000    0.0000       8.0
    10023614     0.0000    0.0000    0.0000       8.0
    10033473     0.0000    0.0000    0.0000       7.0
    10028294     0.0000    0.0000    0.0000       7.0
    10040617     0.0000    0.0000    0.0000       7.0
    10019211     0.0000    0.0000    0.0000       7.0
    10012378     0.0000    0.0000    0.0000       6.0
    10016974     0.0000    0.0000    0.0000       6.0
    10028350     0.0000    0.0000    0.0000       6.0
    10023477     0.0000    0.0000    0.0000       6.0
    10033432     0.0000    0.0000    0.0000  




## Train BioBERT
Fine-tune BioBERT using the Hugging Face Transformers library.

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from seqeval.metrics import classification_report

# Tokenize and align labels for BioBERT
def tokenize_and_align_labels(tokenizer, sentences, labels, label2id, max_length=128):
    tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True, max_length=max_length, return_tensors="pt")

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word ids for each token
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Padding token, so we ignore it
            elif word_idx != previous_word_idx:
                if word_idx < len(label):  # Ensure the word index is within the bounds of the label list
                    label_ids.append(label2id.get(label[word_idx], -100))  # Convert label to integer
                else:
                    label_ids.append(-100)  # If the word_idx is out of range for labels
            else:
                label_ids.append(label_ids[-1])  # Continuation of the previous word's label
            previous_word_idx = word_idx

        # Pad label list to match the tokenized sentence length
        while len(label_ids) < len(tokenized_inputs['input_ids'][i]):
            label_ids.append(-100)

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Train the BioBERT model
def train_biobert(train_sentences, train_labels, test_sentences, test_labels, label2id):
    tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

    # Tokenize and align labels
    train_dataset = tokenize_and_align_labels(tokenizer, train_sentences, train_labels, label2id)
    test_dataset = tokenize_and_align_labels(tokenizer, test_sentences, test_labels, label2id)

    # Convert to Hugging Face Dataset format
    train_dataset = Dataset.from_dict(train_dataset)
    test_dataset = Dataset.from_dict(test_dataset)

    # Load the pre-trained BioBERT model for token classification
    model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=len(label2id))

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=0.05,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=10,
        save_total_limit=2,
        report_to="none"
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    # Start training
    trainer.train()
    trainer.save_model("./biobert_model")

    # Evaluate the model after training
    results = trainer.evaluate()

    # Print evaluation results (loss, accuracy, etc.)
    print(f"Evaluation results: {results}")

    # Print accuracy
    accuracy = results.get("eval_accuracy", "Accuracy not available")
    print(f"Accuracy: {accuracy}")

    # Detailed evaluation with classification report
    predictions, true_labels, _ = trainer.predict(test_dataset)
    predictions = [list(p) for p in predictions]
    true_labels = [list(t) for t in true_labels]

    # Convert predictions and true_labels to the appropriate format for seqeval
    predictions = [[str(p) for p in pred] for pred in predictions]
    true_labels = [[str(t) for t in label] for label in true_labels]

    print(classification_report(true_labels, predictions))

# Define the label2id mapping
def create_label2id(train_labels, test_labels):
    unique_labels = set(label for sublist in train_labels + test_labels for label in sublist)
    return {label: i for i, label in enumerate(unique_labels)}

# Load CoNLL data
def load_conll_data(file_path):
    sentences, labels = [], []
    with open(file_path, 'r') as file:
        sentence, label = [], []
        for line in file:
            line = line.strip()
            if line == "":
                if sentence:  # If the sentence is not empty, add it
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.split()
                if len(parts) == 1:  # Handle lines with a single token
                    token = parts[0]
                    tag = "O"  # Assign a default tag
                    sentence.append(token)
                    label.append(tag)
                elif len(parts) >= 2:  # Handle lines with at least two columns
                    token = parts[0]
                    tag = parts[-1]
                    sentence.append(token)
                    label.append(tag)
                else:
                    print(f"Skipping invalid line: {line}")
        if sentence:  # Add the last sentence if the file doesn't end with a blank line
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

train_sentences, train_labels = load_conll_data("train.conll")
test_sentences, test_labels = load_conll_data("test.conll")

label2id = create_label2id(train_labels, test_labels)

# Train the model
train_biobert(train_sentences, train_labels, test_sentences, test_labels, label2id)

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
0,4.5298,2.691654


Evaluation results: {'eval_loss': 2.6916539669036865, 'eval_runtime': 8.2639, 'eval_samples_per_second': 134.076, 'eval_steps_per_second': 8.471, 'epoch': 0.05027932960893855}
Accuracy: Accuracy not available
