In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
from transformers import AutoTokenizer,BertTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch
from transformers import BertTokenizer,BertForTokenClassification, BertConfig
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
TRAIN_PATH='/kaggle/input/conll003-englishversion/train.txt'
TEST_PATH='/kaggle/input/conll003-englishversion/test.txt'
VALID_PATH='/kaggle/input/conll003-englishversion/valid.txt'
LABEL_ARRAY=['O','B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
BATCH_SIZE=16
NUM_EPOCHS=5
MAX_LENGTH=256
LEARNING_RATE=0.000001

In [5]:
def preprocess(data):
    sentences=[]
    labels=[]
    current_sentence = []
    current_labels = []
    for line in data:
        if line == '\n':
            sentences.append(' '.join(current_sentence))
            labels.append(current_labels)
            current_sentence = []
            current_labels = []
        else:
            parts = line.strip().split()
            current_sentence.append(parts[0])
            current_labels.append(parts[-1])
    sentences=sentences[1:]
    labels=labels[1:]
    return sentences,labels

In [6]:
class NERDataset(Dataset):
    def __init__(self,data_path=TRAIN_PATH):
        super(NERDataset, self).__init__()
        with open(data_path,'r') as f:
            data=f.readlines()
            self.data=data[1:]
        self.labencoder=LabelEncoder()
        labelarray = LABEL_ARRAY
        self.labencoder.fit(labelarray)
        self.model_name = "bert-base-cased"
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.sentences,self.labels=preprocess(self.data)
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self,idx):
        sen=self.sentences[idx]
        lab=self.labels[idx]
        lab=['O' if i == 'B-LOC' or i == 'I-LOC' else i for i in lab]
        lab=self.labencoder.transform(lab)
        l=[*lab, *[7]*(MAX_LENGTH-len(lab))]
        self.features=self.tokenizer.encode_plus(sen,add_special_tokens=True,max_length=MAX_LENGTH,pad_to_max_length=True,return_attention_mask=True,return_token_type_ids=False,return_tensors='pt')
#         return torch.tensor(self.features['input_ids'].squeeze()),torch.tensor(self.features['attention_mask'].squeeze()),torch.tensor(l)
        return {
            'input_ids': self.features['input_ids'].squeeze(),
            'attention_mask': self.features['attention_mask'].squeeze(),
#             'token_type_ids': self.features[idx]['token_type_ids'],
            #'labels': self.features[idx]['labels'],
            'labels':torch.tensor(l)
        }

In [7]:
train_dataset=NERDataset()
test_dataset=NERDataset(data_path=TEST_PATH)
valid_dataset=NERDataset(data_path=VALID_PATH)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
model=BertForTokenClassification.from_pretrained('bert-base-cased',num_labels = 8)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import DataCollatorForTokenClassification
import logging
from transformers import Trainer
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=500,
    do_train=True,
    evaluation_strategy='steps',
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


In [12]:
model.to(device)
# optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [12]:
lossfn=nn.CrossEntropyLoss()
total_training_loss=0.0

In [13]:
for i in train_loader:
    ins,a,l=i
    print(ins,ins.shape)
    print(a,a.shape)
    print(l,l.shape)
    break

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tensor([[  101, 12342, 13821,  ...,     0,     0,     0],
        [  101,  8274,  1116,  ...,     0,     0,     0],
        [  101, 13743,  1475,  ...,     0,     0,     0],
        ...,
        [  101,  8007,  1113,  ...,     0,     0,     0],
        [  101, 16752, 12475,  ...,     0,     0,     0],
        [  101,   107,  1284,  ...,     0,     0,     0]]) torch.Size([16, 256])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) torch.Size([16, 256])
tensor([[1, 6, 6,  ..., 7, 7, 7],
        [6, 6, 6,  ..., 7, 7, 7],
        [6, 6, 2,  ..., 7, 7, 7],
        ...,
        [6, 6, 6,  ..., 7, 7, 7],
        [6, 6, 2,  ..., 7, 7, 7],
        [6, 6, 6,  ..., 7, 7, 7]]) torch.Size([16, 256])


In [20]:
training_loss=[]
model.train()
print("Training Model")
for epoch in tqdm(range(NUM_EPOCHS)):
    for i,batch in enumerate(train_loader):
        print(f"batch {i} / {len(train_loader)} training.....")
        inp,attn,l= batch
        optimizer.zero_grad()
        inp=torch.tensor(inp,device=device)
        attn=torch.tensor(attn,device=device)
        l=torch.tensor(l,device=device)
        l = torch.nn.functional.one_hot(l, num_classes=8)
        l=torch.tensor(l,device=device,dtype=float)
        logits=model(inp,attn)
        loss=lossfn(logits.logits,l)
        loss.backward()
        optimizer.step()
        total_training_loss+=loss.item()
    avg_loss = total_training_loss / len(train_loader)
    training_loss.append(avg_loss)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")
print("Training Done")
torch.save(model,'model.pth')
print("Testing Model")

Training Model


  0%|          | 0/5 [00:00<?, ?it/s]

batch 0 / 937 training.....
batch 1 / 937 training.....
batch 2 / 937 training.....
batch 3 / 937 training.....
batch 4 / 937 training.....
batch 5 / 937 training.....
batch 6 / 937 training.....
batch 7 / 937 training.....
batch 8 / 937 training.....
batch 9 / 937 training.....
batch 10 / 937 training.....
batch 11 / 937 training.....
batch 12 / 937 training.....
batch 13 / 937 training.....
batch 14 / 937 training.....
batch 15 / 937 training.....
batch 16 / 937 training.....
batch 17 / 937 training.....
batch 18 / 937 training.....
batch 19 / 937 training.....
batch 20 / 937 training.....
batch 21 / 937 training.....
batch 22 / 937 training.....
batch 23 / 937 training.....
batch 24 / 937 training.....
batch 25 / 937 training.....
batch 26 / 937 training.....
batch 27 / 937 training.....
batch 28 / 937 training.....
batch 29 / 937 training.....
batch 30 / 937 training.....
batch 31 / 937 training.....
batch 32 / 937 training.....
batch 33 / 937 training.....
batch 34 / 937 training.

 20%|██        | 1/5 [06:11<24:44, 371.23s/it]

Epoch 1 - Average Loss: 173.6766
batch 0 / 937 training.....
batch 1 / 937 training.....
batch 2 / 937 training.....
batch 3 / 937 training.....
batch 4 / 937 training.....
batch 5 / 937 training.....
batch 6 / 937 training.....
batch 7 / 937 training.....
batch 8 / 937 training.....
batch 9 / 937 training.....
batch 10 / 937 training.....
batch 11 / 937 training.....
batch 12 / 937 training.....
batch 13 / 937 training.....
batch 14 / 937 training.....
batch 15 / 937 training.....
batch 16 / 937 training.....
batch 17 / 937 training.....
batch 18 / 937 training.....
batch 19 / 937 training.....
batch 20 / 937 training.....
batch 21 / 937 training.....
batch 22 / 937 training.....
batch 23 / 937 training.....
batch 24 / 937 training.....
batch 25 / 937 training.....
batch 26 / 937 training.....
batch 27 / 937 training.....
batch 28 / 937 training.....
batch 29 / 937 training.....
batch 30 / 937 training.....
batch 31 / 937 training.....
batch 32 / 937 training.....
batch 33 / 937 train

 40%|████      | 2/5 [12:22<18:33, 371.24s/it]

Epoch 2 - Average Loss: 345.6105
batch 0 / 937 training.....
batch 1 / 937 training.....
batch 2 / 937 training.....
batch 3 / 937 training.....
batch 4 / 937 training.....
batch 5 / 937 training.....
batch 6 / 937 training.....
batch 7 / 937 training.....
batch 8 / 937 training.....
batch 9 / 937 training.....
batch 10 / 937 training.....
batch 11 / 937 training.....
batch 12 / 937 training.....
batch 13 / 937 training.....
batch 14 / 937 training.....
batch 15 / 937 training.....
batch 16 / 937 training.....
batch 17 / 937 training.....
batch 18 / 937 training.....
batch 19 / 937 training.....
batch 20 / 937 training.....
batch 21 / 937 training.....
batch 22 / 937 training.....
batch 23 / 937 training.....
batch 24 / 937 training.....
batch 25 / 937 training.....
batch 26 / 937 training.....
batch 27 / 937 training.....
batch 28 / 937 training.....
batch 29 / 937 training.....
batch 30 / 937 training.....
batch 31 / 937 training.....
batch 32 / 937 training.....
batch 33 / 937 train

 40%|████      | 2/5 [13:22<20:03, 401.16s/it]


KeyboardInterrupt: 

In [None]:
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for i,batch in enumerate(val_loader):
        inp,attn,l= batch
        inp=torch.tensor(inp,device=device)
        attn=torch.tensor(attn,device=device)
        l=torch.tensor(l,device=device)
        logits = model(inp, attn)
        print(inp.shape)
        total_correct += (logits == l).sum().item()
        total_samples += l.size(0) * l.size(1)

    accuracy = total_correct / total_samples
    print(f"Validation Accuracy: {accuracy}")
print("Testing Done")

In [None]:
plt.plot(np.array(training_loss))
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.title('Training History')
plt.show()