In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
from transformers import AutoTokenizer,BertTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch
from transformers import BertTokenizer,BertModel, BertConfig
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
TRAIN_PATH='/kaggle/input/conll003-englishversion/train.txt'
TEST_PATH='/kaggle/input/conll003-englishversion/test.txt'
VALID_PATH='/kaggle/input/conll003-englishversion/valid.txt'
LABEL_ARRAY=['O','B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
BATCH_SIZE=16
NUM_EPOCHS=50
MAX_LENGTH=512
LEARNING_RATE=0.0001

In [5]:
def preprocess(data):
    sentences=[]
    labels=[]
    current_sentence = []
    current_labels = []
    for line in data:
        if line == '\n':
            sentences.append(' '.join(current_sentence))
            labels.append(current_labels)
            current_sentence = []
            current_labels = []
        else:
            parts = line.strip().split()
            current_sentence.append(parts[0])
            current_labels.append(parts[-1])
    sentences=sentences[1:]
    labels=labels[1:]
    return sentences,labels

In [6]:
class NERDataset(Dataset):
    def __init__(self,data_path=TRAIN_PATH):
        super(NERDataset, self).__init__()
        with open(data_path,'r') as f:
            data=f.readlines()
            self.data=data[1:]
        self.labencoder=LabelEncoder()
        labelarray = LABEL_ARRAY
        self.labencoder.fit(labelarray)
        self.model_name = "bert-base-cased"
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.sentences,self.labels=preprocess(self.data)
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self,idx):
        sen=self.sentences[idx]
        lab=self.labels[idx]
        lab=['O' if i == 'B-LOC' or i == 'I-LOC' else i for i in lab]
        lab=self.labencoder.transform(lab)
        l=[*lab, *[0]*(MAX_LENGTH-len(lab))]
        self.features=self.tokenizer.encode_plus(sen,add_special_tokens=True,max_length=MAX_LENGTH,pad_to_max_length=True,return_attention_mask=True,return_token_type_ids=False,return_tensors='pt')
        return torch.tensor(self.features['input_ids'].squeeze()),torch.tensor(self.features['attention_mask'].squeeze()),torch.tensor(l)

In [7]:
train_dataset=NERDataset()
test_dataset=NERDataset(data_path=TEST_PATH)
valid_dataset=NERDataset(data_path=VALID_PATH)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
class NERModel(nn.Module):
    def __init__(self, num_labels=len(LABEL_ARRAY)):
        super(NERModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(0.1)
        self.linear1 = nn.Linear(self.bert.config.hidden_size, 256)
        self.relu=nn.ReLU()
        self.linear2 = nn.Linear(256, 32)
        self.softmax = nn.Softmax(num_labels)
    def forward(self, i,a):
        outputs= self.bert(i,a)
        sequence_output = outputs.last_hidden_state
#         print(sequence_output.shape)
        logits = self.linear1(sequence_output)
        logits = self.relu(logits)
        logits=self.linear2(logits)
        print(logits.shape)
        logits = self.softmax(logits)
        print(logits.shape)
        return logits

In [10]:
model = NERModel(num_labels=len(LABEL_ARRAY))
model.to(device)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [11]:
lossfn=nn.CrossEntropyLoss()
total_training_loss=0.0

In [15]:
for i in train_loader:
    ins,a,l=i
    print(ins,ins.shape)
    print(a,a.shape)
    print(l,l.shape)
    break

tensor([[  101,  1275,   119,  ...,     0,     0,     0],
        [  101, 14444, 18445,  ...,     0,     0,     0],
        [  101,   157, 10781,  ...,     0,     0,     0],
        ...,
        [  101,   107,   146,  ...,     0,     0,     0],
        [  101,  1252,  1175,  ...,     0,     0,     0],
        [  101,  1124,  1125,  ...,     0,     0,     0]]) torch.Size([16, 512])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) torch.Size([16, 512])
tensor([[6, 2, 5,  ..., 0, 0, 0],
        [1, 4, 6,  ..., 0, 0, 0],
        [1, 4, 6,  ..., 0, 0, 0],
        ...,
        [6, 6, 6,  ..., 0, 0, 0],
        [6, 6, 6,  ..., 0, 0, 0],
        [6, 6, 6,  ..., 0, 0, 0]]) torch.Size([16, 512])


In [12]:
training_loss=[]
model.train()
print("Training Model")
for epoch in tqdm(range(NUM_EPOCHS)):
    for i,batch in enumerate(train_loader):
#         print(f"batch {i} / {len(train_loader)} training.....")
        inp,attn,l= batch
        optimizer.zero_grad()
        inp=torch.tensor(inp,device=device)
        attn=torch.tensor(attn,device=device)
        l=torch.tensor(l,device=device)
        logits=model(inp,attn)
        loss=lossfn(logits,l[:,:len(LABEL_ARRAY)])
        loss.backward()
        optimizer.step()
        total_training_loss+=loss.item()
    avg_loss = total_training_loss / len(train_loader)
    training_loss.append(avg_loss)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")
print("Training Done")
torch.save(model,'model.pth')
print("Testing Model")

Training Model


  0%|          | 0/50 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|          | 0/50 [00:01<?, ?it/s]

torch.Size([16, 512, 32])





IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 7)

In [None]:
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for i,batch in enumerate(val_loader):
        inp,attn,l= batch
        inp=torch.tensor(inp,device=device)
        attn=torch.tensor(attn,device=device)
        l=torch.tensor(l,device=device)
        logits = model(inp, attn)
        print(inp.shape)
        total_correct += (logits == l).sum().item()
        total_samples += l.size(0) * l.size(1)

    accuracy = total_correct / total_samples
    print(f"Validation Accuracy: {accuracy}")
print("Testing Done")

In [None]:
plt.plot(np.array(training_loss))
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.title('Training History')
plt.show()