In [None]:
!pip install transformers

In [None]:
!pip install onnxruntime
!pip install onnx

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
import torch.optim as optim

from transformers import XLMRobertaForTokenClassification, AutoConfig

from torch.nn import KLDivLoss
import torch.nn.functional as F

In [2]:
df_ukraine = pd.read_csv('/content/drive/MyDrive/iasa_nlp/uk_geo_dataset.csv')

df_ukraine.drop_duplicates(inplace = True)

In [3]:
tokenizer = AutoTokenizer.from_pretrained('ukr-models/uk-ner')

# fine-tuned XLMRoberta on pretty small sample of uk_geo_dataset
model_ukr = torch.load('/content/drive/MyDrive/iasa_nlp/ukr_model.pt', map_location=torch.device('cpu'))

# text processing

In [4]:
def get_iob_tags(text, locs, orgs, pers):
    tokens = tokenizer.tokenize(text)
    tokenized_text = tokenizer.encode(text, add_special_tokens=False)
    iob_tags = ['O'] * len(tokens)

    locs = eval(locs)
    orgs = eval(orgs)
    pers = eval(pers)

    def update_tags(tags, markers, label):
        for marker in markers:
            start_char, end_char = marker
            start_token, end_token = None, None

            char_count = 0
            found_start = False
            for i, token in enumerate(tokens):

                if char_count >= start_char and not found_start:
                    start_token = i
                    found_start = True

                char_count += len(token)
                if char_count >= end_char:
                    end_token = i
                    break

            # This handles situations where markers might not align with token boundaries.
            if start_token is None or end_token is None:
                continue

            # Check if the start_token is the space token
            if tokens[start_token] == '▁' and (start_token+1) < len(tokens):
                tags[start_token+1] = "B-" + label
                for j in range(start_token+2, end_token+1):
                    tags[j] = "I-" + label
            else:
                tags[start_token] = "B-" + label
                for j in range(start_token+1, end_token+1):
                    tags[j] = "I-" + label

    update_tags(iob_tags, locs, "LOC")
    update_tags(iob_tags, orgs, "ORG")
    update_tags(iob_tags, pers, "PER")

    return iob_tags

In [5]:
df_eval = df_ukraine.iloc[500:600]

In [6]:
df_ukraine = df_ukraine.iloc[:500]

Creating rags based on the labels given in the dataset

In [7]:
df_ukraine['IOB_tags'] = df_ukraine.apply(lambda row: get_iob_tags(row['text'], row['loc_markers'], row['org_markers'], row['per_markers']), axis=1)

In [8]:
df_eval['IOB_tags'] = df_eval.apply(lambda row: get_iob_tags(row['text'], row['loc_markers'], row['org_markers'], row['per_markers']), axis=1)

Tokenizing my text

In [9]:
# Ukranian
uk_train_tokenized_text = df_ukraine['text'].apply(lambda x: tokenizer.tokenize(x)).tolist()
uk_train_labels =  df_ukraine['IOB_tags'].tolist()

In [10]:
# Eval
uk_eval_tokenized_texts = df_eval['text'].apply(lambda x: tokenizer.tokenize(x)).tolist()
uk_eval_true_labels =  df_eval['IOB_tags'].tolist()

Dataset creation

In [65]:
# Mapiing tags
config = AutoConfig.from_pretrained("ukr-models/uk-ner")

text_label_to_model_label = {
    "O": "LABEL_0",
    "B-PER": "LABEL_1",
    "I-PER": "LABEL_2",
    "B-ORG": "LABEL_3",
    "I-ORG": "LABEL_4",
    "B-LOC": "LABEL_5",
    "I-LOC": "LABEL_6"
}

label2idx = config.label2id

tag2idx = {text_label: label2idx[model_label] for text_label, model_label in text_label_to_model_label.items()}

tag2idx["-100"] = -100


In [66]:
class NERDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Convert tokens and tags to their respective IDs
        input_ids = tokenizer.convert_tokens_to_ids(text)
        tag_ids = [tag2idx.get(l) for l in label]

        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * len(input_ids)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'tag_ids': torch.tensor(tag_ids, dtype=torch.long)
        }


In [67]:
from torch.nn.utils.rnn import pad_sequence

def pad_collate_fn(batch):
    all_input_ids = [item['input_ids'] for item in batch]
    all_attention_masks = [item['attention_mask'] for item in batch]
    all_tag_ids = [item['tag_ids'] for item in batch]

    # Pad the sequences
    padded_input_ids = pad_sequence(all_input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_attention_masks = pad_sequence(all_attention_masks, batch_first=True, padding_value=0)
    padded_tag_ids = pad_sequence(all_tag_ids, batch_first=True, padding_value=tag2idx["-100"])

    return {
        'input_ids': padded_input_ids,
        'attention_mask': padded_attention_masks,
        'tag_ids': padded_tag_ids
    }


In [68]:
train_dataset = NERDataset(uk_train_tokenized_text, uk_train_labels)
val_dataset = NERDataset(uk_eval_tokenized_texts, uk_eval_true_labels)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=pad_collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, collate_fn=pad_collate_fn)

# Model optimization

In [69]:
# Defining the teacher model
teacher_model = model_ukr

Creating the student model that is half the size the original  

In [70]:
def create_student_model(teacher_model, teacher_config):

    student_config = teacher_config.num_hidden_layers//2

    student_model = AutoModelForTokenClassification(student_config)

    # Distill weights from teacher to student
    distill_xlm_roberta_weights(teacher=teacher_model.roberta, student=student_model.roberta)

    return student_model

In [71]:
def create_student_model(teacher_model):
    # Load and modify the configuration from the teacher model
    teacher_config = teacher_model.config
    student_config = AutoConfig.from_pretrained(teacher_model.config.name_or_path)
    student_config.num_hidden_layers //= 2

    # Create the student model using the modified configuration
    student_model = AutoModelForTokenClassification.from_config(student_config)

    # Distill weights from teacher to student
    distill_xlm_roberta_weights(teacher=teacher_model, student=student_model)

    return student_model

def distill_xlm_roberta_weights(teacher, student):
    if isinstance(teacher, XLMRobertaForTokenClassification) and isinstance(student, XLMRobertaForTokenClassification):
        teacher_encoder = teacher.roberta
        student_encoder = student.roberta
        distill_encoder_weights(teacher_encoder, student_encoder)
        student.classifier.load_state_dict(teacher.classifier.state_dict())

def distill_encoder_weights(teacher, student):
    teacher_layers = list(teacher.encoder.layer)
    student_layers = list(student.encoder.layer)
    for i in range(len(student_layers)):
        student_layers[i].load_state_dict(teacher_layers[2 * i].state_dict())


In [72]:
student_model = create_student_model(teacher_model)

In [73]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 3
optimizer = optim.AdamW(teacher_model.parameters(), lr=5e-5)

 training a smaller "student" model to get the weights of a larger "teacher" model.

In [74]:
teacher_model = teacher_model.to(device)
teacher_model.eval()  # Teacher model is always in evaluation

criterion = torch.nn.CrossEntropyLoss(ignore_index=tag2idx["-100"]).to(device)

temperature = 2.0  # Temperature for softmax

for epoch in range(EPOCHS):
    student_model.train()
    total_loss = 0

    for i, batch in enumerate(train_dataloader):
        # Get input and target tensors from the batch and move them to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tag_ids = batch['tag_ids'].to(device)

        optimizer.zero_grad()

        student_outputs = student_model(input_ids, attention_mask=attention_mask).logits
        student_logits = student_outputs.view(-1, student_outputs.shape[-1])

        # Forward pass for the teacher model
        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask).logits
            teacher_logits = teacher_outputs.view(-1, teacher_outputs.shape[-1])

        # Soften the logits and calculate the distillation loss
        loss_soft = KLDivLoss(reduction='batchmean')(F.log_softmax(student_logits / temperature, dim=-1),
                                                     F.softmax(teacher_logits / temperature, dim=-1))

        # Calculate the hard loss, which is the usual Cross Entropy loss with true labels
        loss_hard = criterion(student_logits, tag_ids.view(-1))

        loss = loss_soft + loss_hard

        total_loss += loss.item()

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        if (i + 1) % 10 == 0:  # Print loss every 10 batches
            print(f"Epoch [{epoch + 1}/{EPOCHS}], Step [{i + 1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss for epoch {epoch + 1}: {avg_train_loss:.4f}")

Epoch [1/3], Step [10/32], Loss: 1.9676
Epoch [1/3], Step [20/32], Loss: 1.8149
Epoch [1/3], Step [30/32], Loss: 1.9854
Average training loss for epoch 1: 1.9635
Epoch [2/3], Step [10/32], Loss: 1.9393
Epoch [2/3], Step [20/32], Loss: 1.9928
Epoch [2/3], Step [30/32], Loss: 1.9743
Average training loss for epoch 2: 1.9586
Epoch [3/3], Step [10/32], Loss: 2.0235
Epoch [3/3], Step [20/32], Loss: 2.0700
Epoch [3/3], Step [30/32], Loss: 1.9686
Average training loss for epoch 3: 1.9592


Converting model to ONNX format

In [75]:
example_text = df_ukraine.iloc[0].text
inputs = tokenizer(example_text, return_tensors="pt")

dummy_input = inputs["input_ids"]

student_model.eval()
student_model.cpu()

# Export the model
output_onnx_file = "/content/drive/MyDrive/iasa_nlp/student_model.onnx"
torch.onnx.export(student_model,
                  dummy_input,
                  output_onnx_file,
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=['input_ids'],
                  output_names=['output'],
                  dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                                'output': {0: 'batch_size', 1: 'sequence'}})

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

model_path = "/content/drive/MyDrive/iasa_nlp/student_model.onnx"
quantized_model_path = "/content/drive/MyDrive/iasa_nlp/student_model_quantized.onnx"


quantize_dynamic(model_path,
                 quantized_model_path,
                 weight_type=QuantType.QUInt8)


# Evaluation

Evaluation for teacher model

In [88]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import torch

teacher_model.eval()
teacher_predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['tag_ids'].to(device)

        # Forward pass
        outputs = teacher_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        teacher_predictions.extend(predictions.view(-1).cpu().numpy())
        true_labels.extend(labels.view(-1).cpu().numpy())

teacher_predictions = [pred for pred, label in zip(teacher_predictions, true_labels) if label != -100]
true_labels = [label for label in true_labels if label != -100]


precision, recall, f1, _ = precision_recall_fscore_support(true_labels, teacher_predictions, average='weighted')


In [89]:
f1

0.9478666932284122

Distilled, quantized and converted to ONNX model

In [91]:
import onnxruntime as ort

# Load the quantized ONNX model
quantized_model_path = "/content/drive/MyDrive/iasa_nlp/student_model_quantized.onnx"
ort_session = ort.InferenceSession(quantized_model_path)

distilled_predictions = []

for batch in val_dataloader:
    input_ids = batch['input_ids'].numpy()
    # Run ONNX inference
    ort_inputs = {ort_session.get_inputs()[0].name: input_ids}
    ort_outputs = ort_session.run(None, ort_inputs)
    logits = ort_outputs[0]
    batch_predictions = np.argmax(logits, axis=-1)

    distilled_predictions.extend(batch_predictions.reshape(-1))

# Filter and calculate metrics as before
distilled_predictions = [pred for pred, label in zip(distilled_predictions, true_labels) if label != -100]

distilled_precision, distilled_recall, distilled_f1, _ = precision_recall_fscore_support(true_labels, distilled_predictions, average='weighted')

distilled_f1

  _warn_prf(average, modifier, msg_start, len(result))


0.8667483818041759

The F1 score went down for the optimizied model

### Time measurement

In [92]:
import time

# Measure inference time for the teacher model
teacher_model.eval()
start_time = time.time()

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = teacher_model(input_ids, attention_mask=attention_mask)

teacher_inference_time = time.time() - start_time

In [93]:
start_time = time.time()

for batch in val_dataloader:
    input_ids = batch['input_ids'].numpy()

    ort_inputs = {ort_session.get_inputs()[0].name: input_ids}
    ort_outputs = ort_session.run(None, ort_inputs)

distilled_inference_time = time.time() - start_time

In [94]:
print(f"Teacher Model Inference Time: {teacher_inference_time} seconds")
print(f"Quantized Distilled Model Inference Time: {distilled_inference_time} seconds")

Teacher Model Inference Time: 6.094416379928589 seconds
Quantized Distilled Model Inference Time: 2.5610992908477783 seconds


Optimized model beats the original one in inference time

CONCLUSION: I compared the original model an the model that was optimized, and the F1 score for optimized went down but can be better. Possible steps to improve the optimization:
- check each step separately to understand at which point the accuracy decreased dramatically.
- I tried converting to ONNX and then quantizing the model. Perhaps it would make sense to first apply PyTorch quantization and then convert to ONNX
- Play with quantization, I simply tried convert to int8