In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class CoNLLDataset(Dataset):
    def __init__(self, data, tokenizer, label2id, max_seq_length=128, other_tag="NON_NER"):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_seq_length = max_seq_length
        self.other_tag=other_tag

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        words = row['tokens'].tolist()
        labels = row['ner_tags'].tolist()
        encoding     = self.tokenizer.encode_plus(
            words,
            is_pretokenized=True,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()


        label_ids = [self.label2id[label] for label in
                     labels]
        label_ids = label_ids[:self.max_seq_length]  # truncate to max_len
        label_ids += [self.label2id[self.other_tag]] * (self.max_seq_length - len(label_ids))  # pad with 'O' label

        return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": torch.tensor(label_ids)
            }

In [2]:
%pip install torch transformers seqeval pytorch-crf pytorch-lightning pandas tensorboard


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install lightning

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import pandas as pd
from torchcrf import CRF
from torch.optim import AdamW
import lightning as pl
from seqeval.metrics import  accuracy_score, f1_score, precision_score, recall_score
from seqeval.scheme import IOB2
from torch.utils.data import DataLoader



class BERTCRF(nn.Module):
    def __init__(self, num_labels, bert_model_name='bert-base-uncased'):
        super(BERTCRF, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)


    def forward(self, input_ids, attention_mask, tags=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        emissions = self.classifier(sequence_output)

        if tags is not None:
            loss = -self.crf(emissions, tags, mask=attention_mask.byte())
            return loss
        else:
            return self.crf.decode(emissions, attention_mask.byte())


class NERModel(pl.LightningModule):
    def __init__(self, model, train_dataset, val_dataset, test_dataset, label_map, learning_rate=2e-5):
        super(NERModel, self).__init__()
        self.model = model
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.learning_rate = learning_rate
        self.label_map = label_map
        self.inverse_label_map = {v: k for k, v in label_map.items()}
        self.train_outputs= []
        self.test_outputs = []
        self.val_outputs = []

    def forward(self, input_ids, attention_mask, labels):
        return self.model(input_ids, attention_mask, labels)

    def _step(self, batch):
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        loss = self.model(input_ids, attention_mask, labels)
        predictions = self.model(input_ids, attention_mask)
        # Convert predictions and labels to list of tags
        true_tags = [[self.inverse_label_map[tag] for j, tag in enumerate(label) if (attention_mask[i][j]).item()==1] for i,label in enumerate(labels.cpu().numpy())]
        pred_tags = [[self.inverse_label_map[tag] for tag in prediction] for prediction in predictions]
        return loss, true_tags, pred_tags

    def training_step(self, batch, batch_idx):
        loss, true_tags, pred_tags = self._step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, true_tags, pred_tags = self._step(batch)
        self.log('val_loss', loss, prog_bar=True)
        self.val_outputs.append( {'val_loss': loss, 'true_tags': true_tags, 'pred_tags': pred_tags})

    def on_validation_epoch_end(self):
        outputs = self.val_outputs
        true_tags = [tag for output in outputs for tag in output['true_tags']]
        pred_tags = [tag for output in outputs for tag in output['pred_tags']]
        val_acc = accuracy_score(true_tags, pred_tags)
        val_precision = precision_score(true_tags, pred_tags)
        val_recall = recall_score(true_tags, pred_tags)
        val_f1 = f1_score(true_tags, pred_tags)
        self.log('val_acc', val_acc, prog_bar=True)
        self.log('val_precision', val_precision, prog_bar=True)
        self.log('val_recall', val_recall, prog_bar=True)
        self.log('val_f1', val_f1, prog_bar=True)
        self.val_outputs.clear()

    def test_step(self, batch, batch_idx):
        loss, true_tags, pred_tags = self._step(batch)
        self.log('test_loss', loss, prog_bar=True)
        self.test_outputs.append({'test_loss': loss, 'true_tags': true_tags, 'pred_tags': pred_tags})

    def on_test_epoch_end(self):
        outputs = self.test_outputs
        true_tags = [tag for output in outputs for tag in output['true_tags']]
        pred_tags = [tag for output in outputs for tag in output['pred_tags']]
        val_acc = accuracy_score(true_tags, pred_tags)
        val_precision = precision_score(true_tags, pred_tags, mode='strict', scheme=IOB2)
        val_recall = recall_score(true_tags, pred_tags, mode='strict', scheme=IOB2)
        val_f1 = f1_score(true_tags, pred_tags, mode='strict', scheme=IOB2)
        self.log('test_acc', val_acc, prog_bar=True)
        self.log('test_precision', val_precision, prog_bar=True)
        self.log('test_recall', val_recall, prog_bar=True)
        self.log('test_f1', val_f1, prog_bar=True)
        self.test_outputs.clear()

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.learning_rate)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=128, shuffle=True, num_workers=7)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=128, num_workers=2)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=128)








csv_file = "/home/ttembhre/NER_tagged_data.csv"
data = pd.read_csv(csv_file)
data['ner_tags'] = data['NER_tagged_data'].apply(lambda x: [y[1] for y in ast.literal_eval(x)])
data['tokens'] = data['NER_tagged_data'].apply(lambda x: [y[0] for y in ast.literal_eval(x)])
data = data[['tokens', 'ner_tags']]
data.to_parquet("data.parquet")


In [None]:
import ast
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger

data= pd.read_parquet("data.parquet")
all_tags = pd.unique(data['ner_tags'].explode())
label2id = {s:i for i, s in enumerate(all_tags)}
bert_model_name = "bert-base-cased"

print(data.head())
print(label2id)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    filename='ner-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=True,
    mode='min'
)
tensorboard_logger = TensorBoardLogger('tb_logs', name='ner_model')

callbacks=[checkpoint_callback, early_stopping_callback]

# Instantiate the dataset
dataset = CoNLLDataset(data, tokenizer, label2id, max_seq_length=128)
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.85*len(dataset)), len(dataset) - int(0.85*len(dataset))])
num_labels = len(label2id)
bertcrfmodel = BERTCRF(num_labels, bert_model_name)
ner_model = NERModel(bertcrfmodel, train_dataset, val_dataset, val_dataset, label2id)
trainer = pl.Trainer(max_epochs=10, callbacks=callbacks, logger=tensorboard_logger)
trainer.fit(ner_model)

                                              tokens  \
0  [Valencia, player, ,, diakhaby, did, n't, pose...   
1  [Self, -, proclaimed, antifa, member, &, resea...   
2  [I, can, not, trust, self, -, proclaim, ", ant...   
3  [We, have, seen, you, people, in, action, and,...   
4  [We, do, n't, know, about, others, ,, but, our...   

                                            ner_tags  
0  [PERSON, NON_NER, NON_NER, NON_NER, NON_NER, N...  
1  [NON_NER, NON_NER, NON_NER, ORG, ORG, ORG, ORG...  
2  [NON_NER, NON_NER, NON_NER, NON_NER, NON_NER, ...  
3  [NON_NER, NON_NER, NON_NER, NON_NER, NON_NER, ...  
4  [NON_NER, NON_NER, NON_NER, NON_NER, NON_NER, ...  
{'PERSON': 0, 'NON_NER': 1, 'GPE': 2, 'NORP': 3, 'DATE': 4, 'ORG': 5, 'CARDINAL': 6, 'MONEY': 7, 'PERCENT': 8, 'TIME': 9, 'ORDINAL': 10, 'LOC': 11, 'FAC': 12, 'PRODUCT': 13, 'QUANTITY': 14, 'WORK_OF_ART': 15, 'LAW': 16, 'LANGUAGE': 17, 'EVENT': 18}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type    | Params
----------------------------------
0 | model | BERTCRF | 108 M 
----------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.301   Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 147.792


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 25.784 >= min_delta = 0.0. New best score: 122.008
