This code mounts google drive to the golab notebook. It gives us to get access with google drive and data there.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s /content/gdrive/My\ Drive/ /mydrive

Mounted at /content/gdrive


Going to the path of the project

In [2]:
cd /content/gdrive/MyDrive/NLP_projects

/content/gdrive/MyDrive/NLP_projects


### **Importing initial libraries**

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os

In [4]:
import zipfile
import os

# Specify the name of your ZIP file
zip_file_name = '/content/gdrive/MyDrive/NLP_projects/Named Entity Recognition words Dataset (Uzbek language).zip'

# Create a directory to extract the contents
extract_dir = 'NER_dataset'
os.makedirs(extract_dir, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f'Files have been extracted to: {extract_dir}')

Files have been extracted to: NER_dataset


In [61]:
### Loading dataset

df = pd.read_excel('/content/gdrive/MyDrive/NLP_projects/NER_dataset/Named Entity Recognition words Dataset (Uzbek language)/NER dictionary.xlsx')
df.head()

Unnamed: 0,Sentence,Word,Part of Speech,Entity Type,Unnamed: 4
0,Sentence 1,O‘zbekiston,NOUN,Country,B-BIOES
1,Sentence 1,Respublikasi,NOUN,Country_B,I-BIOES
2,Sentence 1,Prezidentining,,Position,E-BIOES
3,Sentence 1,2023,,Date,B-BIOES
4,Sentence 1,yil,NOUN,,E-BIOES


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18996 entries, 0 to 18995
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sentence        18996 non-null  object
 1   Word            18996 non-null  object
 2   Part of Speech  13122 non-null  object
 3   Entity Type     2022 non-null   object
 4   Unnamed: 4      18996 non-null  object
dtypes: object(5)
memory usage: 742.2+ KB


In [6]:
df['Unnamed: 4'].unique() ### showing unique Tag labels

array(['B-BIOES', 'I-BIOES', 'E-BIOES', 'S-BIOES', 'O-BIOES', 'BB-BIOES',
       ' B-BIOES', 'I-BIOE'], dtype=object)

### **Import main Libraries**

In [7]:
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [8]:
df.head()

Unnamed: 0,Sentence,Word,Part of Speech,Entity Type,Unnamed: 4
0,Sentence 1,O‘zbekiston,NOUN,Country,B-BIOES
1,Sentence 1,Respublikasi,NOUN,Country_B,I-BIOES
2,Sentence 1,Prezidentining,,Position,E-BIOES
3,Sentence 1,2023,,Date,B-BIOES
4,Sentence 1,yil,NOUN,,E-BIOES


In [9]:
df.count()

Sentence          18996
Word              18996
Part of Speech    13122
Entity Type        2022
Unnamed: 4        18996
dtype: int64

In [62]:
len(df.Sentence.unique())

1043

As we can see, there are more then 1000 sentences in the dataset, comprising almost 20000 words and tags. This corresponds to approximately 20 words per sentence.

Let's have a look at the different NER tags, and their frequency:

In [10]:
print("Number of tags: {}".format(len(df['Unnamed: 4'].unique())))
frequencies = df['Unnamed: 4'].value_counts()
frequencies

Number of tags: 8


Unnamed: 4
O-BIOES     15972
S-BIOES      1034
B-BIOES       701
E-BIOES       698
I-BIOES       586
I-BIOE          3
BB-BIOES        1
 B-BIOES        1
Name: count, dtype: int64

In [11]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('BIO', 18994), ('-BI', 2)]


Let's remove "I-BIOE", "BB-BIOES", and "B-BIOES" named entities, as performance on them will probably be not comparable to the other named entities.

In [12]:
entities_to_remove = ["I-BIOE", "BB-BIOES", "B-BIOES"]
df = df[~df['Unnamed: 4'].isin(entities_to_remove)]
df.head()


Unnamed: 0,Sentence,Word,Part of Speech,Entity Type,Unnamed: 4
1,Sentence 1,Respublikasi,NOUN,Country_B,I-BIOES
2,Sentence 1,Prezidentining,,Position,E-BIOES
4,Sentence 1,yil,NOUN,,E-BIOES
6,Sentence 1,yanvardagi,,Date,E-BIOES
7,Sentence 1,Respublika,NOUN,Country_B,S-BIOES


In [18]:
data = df
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
# Ensure all entries in the 'Word' column are strings
data['Word'] = data['Word'].astype(str)
data.head()

Unnamed: 0,Sentence,Word,Part of Speech,Entity Type,Unnamed: 4
1,Sentence 1,Respublikasi,NOUN,Country_B,I-BIOES
2,Sentence 1,Prezidentining,NOUN,Position,E-BIOES
4,Sentence 1,yil,NOUN,Position,E-BIOES
6,Sentence 1,yanvardagi,NOUN,Date,E-BIOES
7,Sentence 1,Respublika,NOUN,Country_B,S-BIOES


In [19]:
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['Sentence','Word','Unnamed: 4']].groupby(['Sentence'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['Sentence','Word','Unnamed: 4']].groupby(['Sentence'])['Unnamed: 4'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence,Word,Part of Speech,Entity Type,Unnamed: 4,sentence,word_labels
1,Sentence 1,Respublikasi,NOUN,Country_B,I-BIOES,Respublikasi Prezidentining yil yanvardagi Res...,"I-BIOES,E-BIOES,E-BIOES,E-BIOES,S-BIOES,O-BIOE..."
2,Sentence 1,Prezidentining,NOUN,Position,E-BIOES,Respublikasi Prezidentining yil yanvardagi Res...,"I-BIOES,E-BIOES,E-BIOES,E-BIOES,S-BIOES,O-BIOE..."
4,Sentence 1,yil,NOUN,Position,E-BIOES,Respublikasi Prezidentining yil yanvardagi Res...,"I-BIOES,E-BIOES,E-BIOES,E-BIOES,S-BIOES,O-BIOE..."
6,Sentence 1,yanvardagi,NOUN,Date,E-BIOES,Respublikasi Prezidentining yil yanvardagi Res...,"I-BIOES,E-BIOES,E-BIOES,E-BIOES,S-BIOES,O-BIOE..."
7,Sentence 1,Respublika,NOUN,Country_B,S-BIOES,Respublikasi Prezidentining yil yanvardagi Res...,"I-BIOES,E-BIOES,E-BIOES,E-BIOES,S-BIOES,O-BIOE..."


In [20]:
# Update label2id dictionary to include 'O' label
label2id = {
    'B-BIOES': 0,
    'I-BIOES': 1,
    'E-BIOES': 2,
    'S-BIOES': 3,
    'O-BIOES': 4,
    'BB-BIOES': 5,
    ' B-BIOES': 6,
    'I-BIOE': 7,
    'O': 8  # Add 'O' label here
}

# Update id2label dictionary accordingly
id2label = {v: k for k, v in label2id.items()}
label2id

{'B-BIOES': 0,
 'I-BIOES': 1,
 'E-BIOES': 2,
 'S-BIOES': 3,
 'O-BIOES': 4,
 'BB-BIOES': 5,
 ' B-BIOES': 6,
 'I-BIOE': 7,
 'O': 8}

In [21]:
# Let's only keep the "sentence" and "word_labels" columns, and drop duplicates:

data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Respublikasi Prezidentining yil yanvardagi Res...,"I-BIOES,E-BIOES,E-BIOES,E-BIOES,S-BIOES,O-BIOE..."
1,ta’lim fan va innovatsiyalar vazirligi tizimid...,"I-BIOES,I-BIOES,I-BIOES,I-BIOES,E-BIOES,O-BIOE..."
2,ta’lim fan va innovatsiyalar vazirligining tas...,"I-BIOES,I-BIOES,I-BIOES,I-BIOES,E-BIOES,O-BIOE..."
3,rivojlanish agentligining tashkiliy tuzilmasi ...,"I-BIOES,E-BIOES,O-BIOES,S-BIOES,O-BIOES,O-BIOE..."
4,Oliy ta’lim fan va innovatsiyalar sohasini tra...,"O-BIOES,O-BIOES,O-BIOES,O-BIOES,O-BIOES,O-BIOE..."


In [22]:
len(data)

856

In [23]:
# let's see a random sentence sample
data.iloc[41].sentence

'Ilmiy loyihalarning oraliq va yakuniy monitoringi yuritiladi'

In [24]:
# let's see corresponding tags
data.iloc[41].word_labels

'O-BIOES,O-BIOES,O-BIOES,O-BIOES,O-BIOES,O-BIOES,O-BIOES'

In [39]:
### Setting up hyperparameters of the model.

MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') ## Loading pre-trained bert model

In [40]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [41]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [42]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (856, 2)
TRAIN Dataset: (685, 2)
TEST Dataset: (171, 2)


In [43]:
training_set[0]

{'ids': tensor([  101,  5003,  2480, 18569,  9152,  6844,  2213,  2316,  8017,  2072,
         12436, 24501, 14289, 16558,  7556,  5332,  2003, 11039,  9711,  2243,
          5428,  5332,  8945,  1520,  1048,  5289,  1060,  2389, 19062,  3217,
         21146,  5339,  3630,  9067,  2906,  5582,  2316,  8017,  2072,  1051,
          1520, 19387, 21369,  2850,  1062,  3593,  4305, 26139,  9805,  4143,
          3654, 17710, 27887,  8943, 24501, 14289, 16558,  7556, 11493,  2075,
          1060,  2389, 19062,  3217, 21146,  5339,  3630,  9067,  8486, 25933,
          2140, 18816, 27266,  2072,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [44]:
training_set[0]["ids"]

tensor([  101,  5003,  2480, 18569,  9152,  6844,  2213,  2316,  8017,  2072,
        12436, 24501, 14289, 16558,  7556,  5332,  2003, 11039,  9711,  2243,
         5428,  5332,  8945,  1520,  1048,  5289,  1060,  2389, 19062,  3217,
        21146,  5339,  3630,  9067,  2906,  5582,  2316,  8017,  2072,  1051,
         1520, 19387, 21369,  2850,  1062,  3593,  4305, 26139,  9805,  4143,
         3654, 17710, 27887,  8943, 24501, 14289, 16558,  7556, 11493,  2075,
         1060,  2389, 19062,  3217, 21146,  5339,  3630,  9067,  8486, 25933,
         2140, 18816, 27266,  2072,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [45]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
ma          O-BIOES
##z         O-BIOES
##kur       O-BIOES
ni          O-BIOES
##zo        O-BIOES
##m         O-BIOES
band        O-BIOES
##lar       O-BIOES
##i         O-BIOES
va          O-BIOES
res         E-BIOES
##pu        E-BIOES
##bl        E-BIOES
##ika       E-BIOES
##si        E-BIOES
is          O-BIOES
##ht        O-BIOES
##iro       O-BIOES
##k         O-BIOES
##chi       O-BIOES
##si        O-BIOES
bo          O-BIOES
‘           O-BIOES
l           O-BIOES
##gan       O-BIOES
x           O-BIOES
##al        O-BIOES
##qa        O-BIOES
##ro        O-BIOES


In [46]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### **Defining the model**
Here we define the model, BertForTokenClassification, and load it with the pretrained weights of "bert-base-uncased". The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head).

Note that only the base layers are initialized with the pretrained weights. The token classification head of top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. This is also printed as a warning when you run the code cell below.

In [47]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Training Model**

In [48]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.2439, grad_fn=<NllLossBackward0>)

In [49]:
### This looks good. Let's also verify that the logits of the neural network have a shape of (batch_size, sequence_length, num_labels):

tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 9])

Next, we define the optimizer. Here, we are just going to use Adam with a default learning rate. One can also decide to use more advanced ones such as AdamW (Adam with weight decay fix), which is included in the Transformers repository, and a learning rate scheduler, but we are not going to do that here.

In [50]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [51]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids']
        mask = batch['mask']
        targets = batch['targets']

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [52]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.2029948234558105
Training loss per 100 training steps: 0.5976336967236925
Training loss epoch: 0.4506086937563364
Training accuracy epoch: 0.7919282123956989
Training epoch: 2
Training loss per 100 training steps: 0.2526845932006836
Training loss per 100 training steps: 0.19584930345120996
Training loss epoch: 0.17175440207600248
Training accuracy epoch: 0.8943063724036414
Training epoch: 3
Training loss per 100 training steps: 0.1251387894153595
Training loss per 100 training steps: 0.13297030400706106
Training loss epoch: 0.12259888384209643
Training accuracy epoch: 0.9178332700607901


### **Evaluating the model**
Now that we've trained our model, we can evaluate its performance on the held-out test set (which is 20% of the data). Note that here, no gradient updates are performed, the model just outputs its logits.

In [55]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids']
            mask = batch['mask']
            targets = batch['targets']

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [56]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.020955262705683708
Validation Loss: 0.11736704644118977
Validation Accuracy: 0.9189707308128539


As we can see the model performance on validation dataset is more then 90%.


What is important is looking at the precision, recall and f1-score of the individual tags. For this, we use the seqeval Python library:

In [59]:
# !pip install seqeval

from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

       BIOES       0.68      0.70      0.69      1035

   micro avg       0.68      0.70      0.69      1035
   macro avg       0.68      0.70      0.69      1035
weighted avg       0.68      0.70      0.69      1035



In [60]:
sentence = "Ilmiy loyihalarning oraliq va yakuniy monitoringi yuritiladi"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"]
mask = inputs["attention_mask"]
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

ilmiy loyihalarning oraliq va yakuniy monitoringi yuritiladi
['O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O-BIOES', 'O']


# **Thank you for your attention**