In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [2]:
!pip install transformers seqeval[gpu]



In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
data = pd.read_csv("/kaggle/input/ner-dataset/ner_datasetreference.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
data['Tag'].value_counts()

Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [6]:
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [7]:
data["sentence"] = data.groupby("Sentence #")["Word"].transform(lambda x: ' | '.join(x))
data["word_labels"] = data.groupby("Sentence #")["Tag"].transform(lambda x: ' | '.join(x))

In [8]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,Thousands,NNS,O,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
1,Sentence: 1,of,IN,O,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
2,Sentence: 1,demonstrators,NNS,O,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
3,Sentence: 1,have,VBP,O,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
4,Sentence: 1,marched,VBN,O,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...


In [28]:
data['Tag'].unique()
label2id = {}
id2label = {}
i = 0
for x in data['Tag'].unique():
    label2id[x] = i
    id2label[i] = x
    i = i+1
label2id, id2label

({'O': 0,
  'B-geo': 1,
  'B-gpe': 2,
  'B-per': 3,
  'I-geo': 4,
  'B-org': 5,
  'I-org': 6,
  'B-tim': 7,
  'B-art': 8,
  'I-art': 9,
  'I-per': 10,
  'I-gpe': 11,
  'I-tim': 12,
  'B-nat': 13,
  'B-eve': 14,
  'I-eve': 15,
  'I-nat': 16},
 {0: 'O',
  1: 'B-geo',
  2: 'B-gpe',
  3: 'B-per',
  4: 'I-geo',
  5: 'B-org',
  6: 'I-org',
  7: 'B-tim',
  8: 'B-art',
  9: 'I-art',
  10: 'I-per',
  11: 'I-gpe',
  12: 'I-tim',
  13: 'B-nat',
  14: 'B-eve',
  15: 'I-eve',
  16: 'I-nat'})

In [10]:
df = data.drop(columns=['Word', 'POS', 'Tag','Sentence #'])
df.head()

Unnamed: 0,sentence,word_labels
0,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
1,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
2,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
3,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
4,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...


In [11]:
df = df.drop_duplicates()
df.head()

Unnamed: 0,sentence,word_labels
0,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
24,Families | of | soldiers | killed | in | the |...,O | O | O | O | O | O | O | O | O | O | O | O ...
54,They | marched | from | the | Houses | of | Pa...,O | O | O | O | O | O | O | O | O | O | O | B-...
68,Police | put | the | number | of | marchers | ...,O | O | O | O | O | O | O | O | O | O | O | O ...
83,The | protest | comes | on | the | eve | of | ...,O | O | O | O | O | O | O | O | O | O | O | B-...


In [12]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,sentence,word_labels
0,Thousands | of | demonstrators | have | marche...,O | O | O | O | O | O | B-geo | O | O | O | O ...
1,Families | of | soldiers | killed | in | the |...,O | O | O | O | O | O | O | O | O | O | O | O ...
2,They | marched | from | the | Houses | of | Pa...,O | O | O | O | O | O | O | O | O | O | O | B-...
3,Police | put | the | number | of | marchers | ...,O | O | O | O | O | O | O | O | O | O | O | O ...
4,The | protest | comes | on | the | eve | of | ...,O | O | O | O | O | O | O | O | O | O | O | B-...


In [42]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
TEST_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()
    text_labels = text_labels.strip()

    for word, label in zip(sentence.split(' | '), text_labels.split(" | ")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * len(tokenized_word))

    return tokenized_sentence, labels

In [15]:
['0'] + ["xwdw"]

['0', 'xwdw']

In [31]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, label2id):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        
    def __getitem__(self, index):
        sentence = self.data['sentence'].iloc[index]
        word_labels = self.data['word_labels'].iloc[index]

        # Tokenize and align labels
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # Add special tokens
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]  # 'O' for outside tokens

        # Truncate if exceeding max length
        if len(tokenized_sentence) > self.max_len:
            tokenized_sentence = tokenized_sentence[:self.max_len - 1] + ["[SEP]"]
            labels = labels[:self.max_len - 1] + ["O"]

        # Pad sequences if needed
        while len(tokenized_sentence) < self.max_len:
            tokenized_sentence.append("[PAD]")
            labels.append("O")

        # Attention mask (1 for real tokens, 0 for padding)
        attn_mask = [1 if token != "[PAD]" else 0 for token in tokenized_sentence]

        # Convert tokens and labels to IDs
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [self.label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return len(self.data)


In [129]:
print(tokenizer.convert_tokens_to_ids("[PAD]"))
print(tokenizer.convert_tokens_to_ids("[SEP]"))
print(tokenizer.convert_tokens_to_ids("[CLS]"))

0
102
101


In [32]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [33]:
train_df

Unnamed: 0,sentence,word_labels
0,Parliament | was | due | to | open | Monday | ...,B-org | O | O | O | O | B-tim | O | O | O | O ...
1,Iraqi | legislators | have | been | grappling ...,B-gpe | O | O | O | O | O | O | O | O | O | O ...
2,The | singer | arrived | in | the | southern |...,O | O | O | O | O | O | B-gpe | O | O | B-tim ...
3,The | director | of | the | Liberia | Electric...,O | O | O | O | B-geo | I-geo | I-geo | O | B-...
4,Israel | has | assassinated | Hamas | founder ...,B-geo | O | O | B-org | O | O | O | O | O | O ...
...,...,...
38083,""" | You | do | me | a | grave | injustice | , ...",O | O | O | O | O | O | O | O | O | O | O | O ...
38084,The | U.S. | State | Department | says | one |...,O | B-org | I-org | I-org | O | O | O | O | O ...
38085,"The | committee | is | to | select | 2,000 | p...",O | O | O | O | O | O | O | O | O | B-org | I-...
38086,Those | groups | were | shut | down | by | U.S...,O | O | O | O | O | O | B-org | O | O | O | B-...


In [30]:
training_set = CustomDataset(train_df, tokenizer, MAX_LEN, label2id)
testing_set = CustomDataset(test_df, tokenizer, MAX_LEN, label2id)

In [35]:
training_set[0]

{'ids': tensor([  101,  3323,  2001,  2349,  2000,  2330,  6928,  1010,  2021,  2008,
          5219,  2001,  2404,  2125,  2004, 11895,  1005,  2009,  2229,  1010,
         18883, 14560,  1998, 13970, 17811,  2699,  2000,  5993,  2006,  2040,
          2097,  3710,  2004,  3539,  2704,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [43]:
training_loader = DataLoader(training_set, batch_size = TRAIN_BATCH_SIZE, shuffle = True, num_workers = 0)
testing_loader = DataLoader(testing_set, batch_size = TEST_BATCH_SIZE, shuffle = False, num_workers = 0)

In [44]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [67]:
loss = 0
count = 0

with torch.no_grad():  # Prevents computation graph buildup
    for x in testing_set:
        ids = x["ids"].unsqueeze(0).to(device)
        mask = x["mask"].unsqueeze(0).to(device)
        targets = x["targets"].unsqueeze(0).to(device)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss += outputs.loss.item()  # Convert loss tensor to scalar

        count += 1

        # Free memory
        del ids, mask, targets, outputs
        torch.cuda.empty_cache()

print(loss / count if count > 0 else 0)  # Avoid division by zero

2.8942694085607306


In [69]:
import math

result = -math.log(1/17)
print(result)

2.833213344056216


In [70]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
outputs

TokenClassifierOutput(loss=tensor(2.7202, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.1914, -0.5881, -0.1467,  ...,  0.1348, -0.3043, -0.0886],
         [ 0.1020, -0.2817,  0.1577,  ...,  0.1842,  0.0858, -0.1644],
         [-0.0664, -0.4145, -0.0142,  ..., -0.2610,  0.0591,  0.4420],
         ...,
         [ 0.0926, -0.1537, -0.2456,  ..., -0.2243, -0.1140, -0.1150],
         [ 0.0819, -0.1092, -0.2511,  ..., -0.1300, -0.1716, -0.0267],
         [ 0.0378, -0.1661, -0.1913,  ..., -0.2972, -0.1454, -0.1949]]],
       device='cuda:0', grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [71]:
outputs[0]

tensor(2.7202, device='cuda:0', grad_fn=<NllLossBackward0>)

In [72]:
outputs[1].shape

torch.Size([1, 128, 17])

In [74]:
outputs[1][0].

torch.Size([128, 17])

In [100]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [99]:
import torch
from sklearn.metrics import accuracy_score

def train(model, training_loader, optimizer, device, max_grad_norm=1.0):
    model.train()  # Set model to training mode

    total_loss, total_accuracy = 0, 0
    num_steps, num_examples = 0, 0
    all_preds, all_labels = [], []

    for step, batch in enumerate(training_loader):
        # Move data to device
        input_ids = batch['ids'].to(device, dtype=torch.long)
        attention_mask = batch['mask'].to(device, dtype=torch.long)
        target_labels = batch['targets'].to(device, dtype=torch.long)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=target_labels)
        loss, logits = outputs.loss, outputs.logits
        total_loss += loss.item()

        # Step-wise accuracy calculation
        flat_labels = target_labels.view(-1)
        active_logits = logits.view(-1, model.num_labels)
        predictions = torch.argmax(active_logits, axis=1)
        valid_accuracy = attention_mask.view(-1) == 1
        masked_labels = torch.masked_select(flat_labels, valid_accuracy)
        masked_preds = torch.masked_select(predictions, valid_accuracy)

        all_labels.extend(masked_labels)
        all_preds.extend(masked_preds)

        batch_accuracy = accuracy_score(masked_labels.cpu().numpy(), masked_preds.cpu().numpy())
        total_accuracy += batch_accuracy

        # Backward pass with gradient clipping
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()

        num_steps += 1
        num_examples += target_labels.size(0)

        if step % 100 == 0:
            print(f"Training Loss per 100 steps: {total_loss / num_steps}")

    avg_loss = total_loss / num_steps
    avg_accuracy = total_accuracy / num_steps
    print(f"Training Loss: {avg_loss}")
    print(f"Training Accuracy: {avg_accuracy}")

    return all_labels, all_preds

def validate(model, validation_loader, device):
    model.eval()  # Set model to evaluation mode

    total_loss, total_accuracy = 0, 0
    num_steps, num_examples = 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for step, batch in enumerate(validation_loader):
            # Move data to device
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            target_labels = batch['targets'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=target_labels)
            loss, logits = outputs.loss, outputs.logits
            total_loss += loss.item()

            # Step-wise accuracy calculation
            flat_labels = target_labels.view(-1)
            active_logits = logits.view(-1, model.num_labels)
            predictions = torch.argmax(active_logits, axis=1)
            valid_accuracy = attention_mask.view(-1) == 1
            masked_labels = torch.masked_select(flat_labels, valid_accuracy)
            masked_preds = torch.masked_select(predictions, valid_accuracy)

            all_labels.extend(masked_labels)
            all_preds.extend(masked_preds)

            batch_accuracy = accuracy_score(masked_labels.cpu().numpy(), masked_preds.cpu().numpy())
            total_accuracy += batch_accuracy

            num_steps += 1
            num_examples += target_labels.size(0)

            if step % 100 == 0:
                print(f"Validation Loss per 100 steps: {total_loss / num_steps}")

    avg_loss = total_loss / num_steps
    avg_accuracy = total_accuracy / num_steps
    print(f"Validation Loss: {avg_loss}")
    print(f"Validation Accuracy: {avg_accuracy}")

    return all_labels, all_preds

In [101]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train_labels, train_preds = train(model, training_loader, optimizer, device)
    val_labels, val_preds = validate(model, testing_loader, device)

Training epoch: 1
Training Loss per 100 steps: 2.8713955879211426
Training Loss per 100 steps: 0.4922046587608828
Training Loss per 100 steps: 0.3004158263219826
Training Loss per 100 steps: 0.22456399126108303
Training Loss per 100 steps: 0.18549804672357298
Training Loss per 100 steps: 0.16066191021256343
Training Loss per 100 steps: 0.1429838609886348
Training Loss per 100 steps: 0.12959711419779824
Training Loss per 100 steps: 0.11984057831560516
Training Loss per 100 steps: 0.11212057309156981
Training Loss per 100 steps: 0.10590815999043676
Training Loss per 100 steps: 0.09984094703817008
Training Loss per 100 steps: 0.09503537321309687
Training Loss per 100 steps: 0.09072772476164986
Training Loss per 100 steps: 0.08728892435396168
Training Loss per 100 steps: 0.0841294562155263
Training Loss per 100 steps: 0.08119479804035093
Training Loss per 100 steps: 0.07862273861625632
Training Loss per 100 steps: 0.07641862449830895
Training Loss per 100 steps: 0.07440255344970283
Trainin

In [125]:
from sklearn.metrics import classification_report

val_labels_str = [id2label[label.item()] for label in val_labels]
val_preds_str = [id2label[label.item()] for label in val_preds]

# Print the classification report with string labels
print(classification_report(val_labels_str, val_preds_str))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.50      0.02      0.03       113
       B-eve       0.83      0.27      0.41        70
       B-geo       0.87      0.86      0.86     11575
       B-gpe       0.95      0.91      0.93      3437
       B-nat       0.55      0.19      0.28        64
       B-org       0.75      0.67      0.71      6823
       B-per       0.81      0.87      0.84      5220
       B-tim       0.93      0.83      0.88      4298
       I-art       0.00      0.00      0.00        46
       I-eve       1.00      0.09      0.16        46
       I-geo       0.85      0.67      0.75      1772
       I-gpe       1.00      0.55      0.71        53
       I-nat       0.00      0.00      0.00        10
       I-org       0.76      0.61      0.68      4237
       I-per       0.81      0.95      0.88      6432
       I-tim       0.88      0.72      0.79      1301
           O       0.99      0.99      0.99    206918

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))


In [130]:
print(tokenizer.convert_tokens_to_ids("[PAD]"))
print(tokenizer.convert_tokens_to_ids("[SEP]"))
print(tokenizer.convert_tokens_to_ids("[CLS]"))

0
102
101


In [133]:
sentence = "India has a capital called Mumbai. On wednesday, the president will give a presentation.India has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentationIndia has a capital called Mumbai. On wednesday, the president will give a presentation"

tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

{'input_ids': tensor([[  101,  2634,  2038,  1037,  3007,  2170,  8955,  1012,  2006,  9317,
          1010,  1996,  2343,  2097,  2507,  1037,  8312,  1012,  2634,  2038,
          1037,  3007,  2170,  8955,  1012,  2006,  9317,  1010,  1996,  2343,
          2097,  2507,  1037,  8312, 22254,  2401,  2038,  1037,  3007,  2170,
          8955,  1012,  2006,  9317,  1010,  1996,  2343,  2097,  2507,  1037,
          8312, 22254,  2401,  2038,  1037,  3007,  2170,  8955,  1012,  2006,
          9317,  1010,  1996,  2343,  2097,  2507,  1037,  8312, 22254,  2401,
          2038,  1037,  3007,  2170,  8955,  1012,  2006,  9317,  1010,  1996,
          2343,  2097,  2507,  1037,  8312, 22254,  2401,  2038,  1037,  3007,
          2170,  8955,  1012,  2006,  9317,  1010,  1996,  2343,  2097,  2507,
          1037,  8312, 22254,  2401,  2038,  1037,  3007,  2170,  8955,  1012,
          2006,  9317,  1010,  1996,  2343,  2097,  2507,  1037,  8312, 22254,
          2401,  2038,  1037,  3007,  

In [145]:
inputs['input_ids'][0].shape

torch.Size([128])

In [150]:
sentence = "India has a capital called Mumbai. On wednesday, the president will give a presentation"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

india has a capital called mumbai . on wednesday , the president will give a presentation
['B-geo', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [160]:
!sudo apt-get install git-lfs
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [165]:
model_name = "bert-finetuned-named-entity-recognition"

# Upload tokenizer to the hub
tokenizer.push_to_hub(
    repo_id="ParitKansal/{}".format(model_name),  # Correct repo_id format
    commit_message="Add tokenizer",
    use_temp_dir=True,
)

# Upload model to the hub
model.push_to_hub(
    repo_id="ParitKansal/{}".format(model_name),  # Correct repo_id format
    commit_message="Add model",
    use_temp_dir=True,
)


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ParitKansal/bert-finetuned-named-entity-recognition/commit/908040e09372f8ca5e696c462dd8478abe29f264', commit_message='Add model', commit_description='', oid='908040e09372f8ca5e696c462dd8478abe29f264', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ParitKansal/bert-finetuned-named-entity-recognition', endpoint='https://huggingface.co', repo_type='model', repo_id='ParitKansal/bert-finetuned-named-entity-recognition'), pr_revision=None, pr_num=None)