# Install necessary library

In [3]:
! pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=3511e5a59adb7a6757666f9e1c2bd2e5e0db0bd636bfbb2a16dc71c8fbebf5bf
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


# Load dataset

In [None]:
dataset = load_dataset("risqaliyevds/uzbek_ner", split="train")

# Find best learning rate

In [2]:
# import necessary packages
import torch
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import numpy as np
from seqeval.metrics import classification_report

# Step 1: Load dataset from Hugging Face
def load_hf_dataset():
    dataset = load_dataset("risqaliyevds/uzbek_ner", split="train")
    return dataset

# Step 2: Preprocess the dataset for NER (convert to BIO format with specific labels)
def preprocess_dataset(dataset):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
    allowed_entities = {'PERSON', 'DATE', 'LOC', 'ORG', 'LAW'}
    label_set = set(['O'])

    def process_example(example):
        text = example['text']
        ner = example['ner']

        tokens = tokenizer(
            text,
            truncation=True,
            max_length=512,
            return_offsets_mapping=True
        )
        token_labels = ['O'] * len(tokens['input_ids'])

        if ner is None:
            pass
        else:
            for entity_type, entities in ner.items():
                if entity_type not in allowed_entities:
                    continue
                    
                label_set.add(f'B-{entity_type}')
                label_set.add(f'I-{entity_type}')

                if entities is None or not isinstance(entities, (list, tuple)):
                    continue

                for entity in entities:
                    if not isinstance(entity, str):
                        continue
                    start = text.find(entity)
                    if start == -1:
                        continue
                    end = start + len(entity)

                    for i, (offset_start, offset_end) in enumerate(tokens['offset_mapping']):
                        if offset_start >= start and offset_end <= end:
                            if offset_start == start:
                                token_labels[i] = f'B-{entity_type}'
                            else:
                                token_labels[i] = f'I-{entity_type}'

        return {
            'input_ids': tokens['input_ids'],
            'attention_mask': tokens['attention_mask'],
            'labels': token_labels
        }

    processed_dataset = dataset.map(process_example, remove_columns=['text', 'ner'])
    label_list = sorted(list(label_set))
    label2id = {label: idx for idx, label in enumerate(label_list)}

    def convert_labels(example):
        try:
            labels = [label2id[label] for label in example['labels']]
            padded_labels = labels + [-100] * (512 - len(labels))
            example['labels'] = padded_labels
        except KeyError as e:
            raise
        return example

    processed_dataset = processed_dataset.map(convert_labels)
    return processed_dataset, label_list, label2id, tokenizer

# Step 3: Fine-tune XLM-RoBERTa
def fine_tune_model(dataset, label_list, label2id, tokenizer):
    model = XLMRobertaForTokenClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=len(label_list),
        id2label={i: label for i, label in enumerate(label_list)},
        label2id=label2id
    )

    learning_rates = [1e-5, 2e-5, 3e-5, 5e-5]
    best_trainer = None
    best_model = None
    best_lr = None
    best_eval_results = None

    def data_collator(features):
        batch = tokenizer.pad(
            features,
            padding=True,
            return_tensors="pt"
        )
        max_len = batch['input_ids'].shape[1]
        batch['labels'] = torch.tensor(
            [f['labels'][:max_len] + [-100] * (max_len - len(f['labels'][:max_len])) for f in features],
            dtype=torch.long
        )
        return batch

    train_test_split = dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']

    for lr in learning_rates:
        print(f"Testing learning rate: {lr}")
        training_args = TrainingArguments(
            output_dir=f'./results_lr_{lr}',
            num_train_epochs=1,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            per_device_eval_batch_size=4,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=f'./logs_lr_{lr}',
            logging_steps=100,
            fp16=True,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            report_to="none",
            learning_rate=lr,
            lr_scheduler_type="linear"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            compute_metrics=lambda p: compute_metrics(p, label_list)
        )

        trainer.train()
        eval_results = trainer.evaluate()
        print(f"Learning rate {lr} - Evaluation results: {eval_results}")

        # Track the best performing model based on F1 score
        if best_eval_results is None or eval_results['eval_f1'] > best_eval_results['eval_f1']:
            best_trainer = trainer
            best_model = model
            best_lr = lr
            best_eval_results = eval_results

    return best_trainer, best_model, best_lr, best_eval_results

# Step 4: Compute metrics
def compute_metrics(pred, label_list):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    results = classification_report(true_labels, pred_labels, output_dict=True)
    return {
        "precision": results["micro avg"]["precision"],
        "recall": results["micro avg"]["recall"],
        "f1": results["micro avg"]["f1-score"],
    }

# Main execution
if __name__ == "__main__":
    dataset = load_hf_dataset()
    processed_dataset, label_list, label2id, tokenizer = preprocess_dataset(dataset)
    trainer, model, best_lr, eval_results = fine_tune_model(processed_dataset, label_list, label2id, tokenizer)
    print(f"Best learning rate {best_lr} - Final evaluation results: {eval_results}")
    model.save_pretrained("./ner_model")
    tokenizer.save_pretrained("./ner_model")

README.md:   0%|          | 0.00/3.05k [00:00<?, ?B/s]

uzbek_ner.json:   0%|          | 0.00/24.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19609 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/19609 [00:00<?, ? examples/s]

Map:   0%|          | 0/19609 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing learning rate: 1e-05


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1445,0.138469,0.500501,0.482579,0.491376


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Learning rate 1e-05 - Evaluation results: {'eval_loss': 0.1384693831205368, 'eval_precision': 0.5005005005005005, 'eval_recall': 0.48257890165041983, 'eval_f1': 0.49137634514274486, 'eval_runtime': 23.8043, 'eval_samples_per_second': 82.38, 'eval_steps_per_second': 20.627, 'epoch': 1.0}
Testing learning rate: 2e-05




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.131,0.126477,0.549881,0.624554,0.584843


Learning rate 2e-05 - Evaluation results: {'eval_loss': 0.12647707760334015, 'eval_precision': 0.5498810333106731, 'eval_recall': 0.6245536145159734, 'eval_f1': 0.5848434181390936, 'eval_runtime': 23.6741, 'eval_samples_per_second': 82.833, 'eval_steps_per_second': 20.74, 'epoch': 1.0}
Testing learning rate: 3e-05




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.13,0.126789,0.565236,0.631792,0.596664


Learning rate 3e-05 - Evaluation results: {'eval_loss': 0.1267893761396408, 'eval_precision': 0.5652361626802521, 'eval_recall': 0.6317922980407297, 'eval_f1': 0.5966639321848509, 'eval_runtime': 23.5914, 'eval_samples_per_second': 83.123, 'eval_steps_per_second': 20.813, 'epoch': 1.0}
Testing learning rate: 5e-05




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1286,0.127545,0.577286,0.626484,0.600879


Learning rate 5e-05 - Evaluation results: {'eval_loss': 0.1275448054075241, 'eval_precision': 0.5772856634649591, 'eval_recall': 0.6264839301225751, 'eval_f1': 0.6008794260587828, 'eval_runtime': 23.6476, 'eval_samples_per_second': 82.926, 'eval_steps_per_second': 20.763, 'epoch': 1.0}
Best learning rate 5e-05 - Final evaluation results: {'eval_loss': 0.1275448054075241, 'eval_precision': 0.5772856634649591, 'eval_recall': 0.6264839301225751, 'eval_f1': 0.6008794260587828, 'eval_runtime': 23.6476, 'eval_samples_per_second': 82.926, 'eval_steps_per_second': 20.763, 'epoch': 1.0}


### Best learning rate 5e-05

### **default learning rate 5e-5**

# Find best **scheduler** and **weight decays**

In [5]:
import torch
from transformers import TrainingArguments, Trainer, XLMRobertaTokenizerFast, XLMRobertaForTokenClassification
from datasets import load_dataset
import itertools
import numpy as np
from seqeval.metrics import classification_report

# Parametrlar ro'yxati
lr_scheduler_types = ["linear", "cosine", "constant"]
weight_decays = [0.01, 0.1, 0.001]

# Datasetni yuklash
def load_hf_dataset():
    dataset = load_dataset("risqaliyevds/uzbek_ner", split="train")
    return dataset

# Datasetni tayyorlash
def preprocess_dataset(dataset):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
    allowed_entities = {'PERSON', 'DATE', 'LOC', 'ORG', 'LAW'}
    label_set = set(['O'])

    def process_example(example):
        text = example['text']
        ner = example['ner']
        tokens = tokenizer(text, truncation=True, max_length=512, return_offsets_mapping=True)
        token_labels = ['O'] * len(tokens['input_ids'])
        
        if ner:
            for entity_type, entities in ner.items():
                if entities and entity_type in allowed_entities:
                    label_set.add(f'B-{entity_type}')
                    label_set.add(f'I-{entity_type}')
                    for entity in entities:
                        start = text.find(entity)
                        end = start + len(entity)
                        for i, (offset_start, offset_end) in enumerate(tokens['offset_mapping']):
                            if offset_start >= start and offset_end <= end:
                                token_labels[i] = f'B-{entity_type}' if offset_start == start else f'I-{entity_type}'
        
        return {'input_ids': tokens['input_ids'], 'attention_mask': tokens['attention_mask'], 'labels': token_labels}
    
    processed_dataset = dataset.map(process_example, remove_columns=['text', 'ner'])
    label_list = sorted(list(label_set))
    label2id = {label: idx for idx, label in enumerate(label_list)}
    
    def convert_labels(example):
        example['labels'] = [label2id[label] for label in example['labels']] + [-100] * (512 - len(example['labels']))
        return example
    
    processed_dataset = processed_dataset.map(convert_labels)
    return processed_dataset, label_list, label2id, tokenizer

# Modelni fine-tune qilish
def fine_tune_model(dataset, label_list, label2id, tokenizer, scheduler, wd):
    model = XLMRobertaForTokenClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=len(label_list),
        id2label={i: label for i, label in enumerate(label_list)},
        label2id=label2id
    )
    
    def data_collator(features):
        batch = tokenizer.pad(features, padding=True, return_tensors="pt")
        max_len = batch['input_ids'].shape[1]
        batch['labels'] = torch.tensor(
            [f['labels'][:max_len] + [-100] * (max_len - len(f['labels'][:max_len])) for f in features], dtype=torch.long
        )
        return batch
    
    train_test_split = dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
    
    training_args = TrainingArguments(
        output_dir="./results_temp",  # Natijalarni saqlamaslik
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=wd,
        logging_dir=None,  # Loglarni o‘chirish
        logging_steps=500,
        fp16=True,
        evaluation_strategy="epoch",
        save_strategy="no",  # Modelni saqlamaslik
        load_best_model_at_end=False,
        report_to="none",
        lr_scheduler_type=scheduler
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=lambda p: compute_metrics(p, label_list)
    )
    
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_f1"]

# Baholash metrikasi
def compute_metrics(pred, label_list):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    results = classification_report(true_labels, pred_labels, output_dict=True)
    return {"f1": results["micro avg"]["f1-score"]}

# Hyperparameter tuning
if __name__ == "__main__":
    dataset = load_hf_dataset()
    processed_dataset, label_list, label2id, tokenizer = preprocess_dataset(dataset)
    
    best_f1 = 0
    best_params = None
    
    for idx, (scheduler, wd) in enumerate(itertools.product(lr_scheduler_types, weight_decays)):
        print(f"Testing combination {idx+1}/{len(lr_scheduler_types) * len(weight_decays)}: Scheduler={scheduler}, WD={wd}")
        f1_score = fine_tune_model(processed_dataset, label_list, label2id, tokenizer, scheduler, wd)
        print(f"Combination {idx+1} - F1 Score: {f1_score}")
        
        if f1_score > best_f1:
            best_f1 = f1_score
            best_params = (scheduler, wd)
    
    print(f"\nBest combination: Scheduler={best_params[0]}, Weight Decay={best_params[1]}")
    print(f"Best F1 Score: {best_f1}")


Map:   0%|          | 0/19609 [00:00<?, ? examples/s]

Map:   0%|          | 0/19609 [00:00<?, ? examples/s]

Testing combination 1/9: Scheduler=linear, WD=0.01


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.1601,0.146766,0.549754


Combination 1 - F1 Score: 0.5497542739530391
Testing combination 2/9: Scheduler=linear, WD=0.1


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1589,0.15247,0.545524


Combination 2 - F1 Score: 0.5455235524817611
Testing combination 3/9: Scheduler=linear, WD=0.001


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1576,0.152044,0.546296


Combination 3 - F1 Score: 0.5462962962962964
Testing combination 4/9: Scheduler=cosine, WD=0.01


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1561,0.153646,0.54423


Combination 4 - F1 Score: 0.5442298674169757
Testing combination 5/9: Scheduler=cosine, WD=0.1


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1559,0.153202,0.537439


Combination 5 - F1 Score: 0.5374389466978127
Testing combination 6/9: Scheduler=cosine, WD=0.001


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1561,0.153378,0.54378


  _warn_prf(average, modifier, msg_start, len(result))


Combination 6 - F1 Score: 0.5437799753830483
Testing combination 7/9: Scheduler=constant, WD=0.01


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1655,0.161418,0.533405


  _warn_prf(average, modifier, msg_start, len(result))


Combination 7 - F1 Score: 0.5334048168921147
Testing combination 8/9: Scheduler=constant, WD=0.1


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1653,0.161431,0.532701


  _warn_prf(average, modifier, msg_start, len(result))


Combination 8 - F1 Score: 0.5327014218009479
Testing combination 9/9: Scheduler=constant, WD=0.001


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1657,0.161938,0.526644


  _warn_prf(average, modifier, msg_start, len(result))


Combination 9 - F1 Score: 0.5266436848444573

Best combination: Scheduler=linear, Weight Decay=0.01
Best F1 Score: 0.5497542739530391


### Best combination: Scheduler=linear, Weight Decay=0.01
### Best F1 Score: 0.5497542739530391