In [1]:
! pip install seqeval
!pip install unidecode

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=de20e46ee3b9c55dc375b7985225b53aaaf4a15e5069856ba48b01fa7f4717f9
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pa

In [2]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import re

In [4]:
import pandas as pd

df = pd.read_json("hf://datasets/risqaliyevds/uzbek_ner/uzbek_ner.json")

### Convert Roman numerals and numbers replace to words.

In [10]:
# 1. Roman digits convert to words.
def roman2digit(s):
    roman = {'I':1,'V':5,'X':10,'L':50,'C':100,'D':500,'M':1000,'IV':4,'IX':9,'XL':40,'XC':90,'CD':400,'CM':900}
    i = 0
    num = 0
    while i < len(s):
        if i+1<len(s) and s[i:i+2] in roman:
            num+=roman[s[i:i+2]]
            i+=2
        else:
            num+=roman[s[i]]
            i+=1
    return num

def replace_roman(match):
    matched = match.group(2)
    # IIV is not roman digit. It means "Ichki ishlar vazirligi"
    if matched == 'IIV':
        return match.group(1)+'iiv'+match.group(3)
    number = roman2digit(matched)
    word = num2word(number)
    if word[-1] == 'i' or word[-1] == 'a':
        word += 'nchi'
    else:
        word += 'inchi'
    if match.group(1)=='-' or match.group(3)=='-':
        return ' '+word+' '
    else:
        return match.group(1)+word+match.group(3)


# 2. Numbers to words
def three_digit(a):
    yuz = a // 100
    on = a // 10 % 10
    bir = a % 10
    word = ''
    # yuzlar xonasi

    if yuz == 1:
        word +="bir"
    elif yuz == 2:
        word += "ikki"
    elif yuz == 3:
        word += "uch"
    elif yuz == 4:
        word += "toʻrt"
    elif yuz == 5:
        word += "besh"
    elif yuz == 6:
        word += "olti"
    elif yuz == 7:
        word += "yetti"
    elif yuz == 8:
        word += "sakkiz"
    elif yuz == 9:
        word += "toʻqqiz"
    if yuz != 0:
        word += " yuz"

    # o'nlar xonasi

    if on == 1:
        word += " oʻn"
    elif on == 2:
        word += " yigirma"
    elif on == 3:
        word += " oʻttiz"
    elif on == 4:
        word += " qirq"
    elif on == 5:
        word += " ellik"
    elif on == 6:
        word += " oltmish"
    elif on == 7:
        word += " yetmish"
    elif on == 8:
        word += " sakson"
    elif on == 9:
        word += " toʻqson"

    # birlar xonasi

    if bir == 1:
        word +=" bir"
    elif bir == 2:
        word += " ikki"
    elif bir == 3:
        word += " uch"
    elif bir == 4:
        word += " toʻrt"
    elif bir == 5:
        word += " besh"
    elif bir == 6:
        word += " olti"
    elif bir == 7:
        word += " yetti"
    elif bir == 8:
        word += " sakkiz"
    elif bir == 9:
        word += " toʻqqiz"
    
    return word

def num2word(n):
    if n == 0:
        return 'nol'
    names = ["", "ming", "million", "milliard", "trillion", "kvadrillion", "kvintillion", "sekstillion", "septillion", "oktalon", "nonalon", "dekalon", "endekalon", "dodekalon"]
    digit = 0
    word = ''
    d = n
    while d > 0:
        d //= 10
        digit += 1

    if digit % 3 == 0:
        x = 0
    else:
        x = 1
    while n > 0:
        if x:
            k = n // 10 ** (digit - digit % 3)
            n %= 10 ** (digit - digit % 3)
        else:
            k = n // 10 ** (digit-3)
            n %= 10 ** (digit-3)
        word += three_digit(k)+' '
        if x:
            word += names[digit//3]+' '
        else:
            word += names[digit//3-1]+' '
        if x:
            digit -= digit % 3
            x = 0
        else:
            digit -= 3

    return word.strip()

def float_num2word(n):
    tens = [' oʻndan ', 'yuzdan ', 'mingdan ', ' oʻn mingdan ', ' yuz mingdan ', ' milliondan ']
    whole = n.split('.')[0]
    frac = n.split('.')[1]
    if frac == '0':
        return num2word(int(whole))
    return num2word(int(whole)) + ' butun ' + tens[len(frac) - 1]+num2word(int(frac))

### List of characters that should remain in the dataset, convert to lowercase

In [11]:

def replace_digit(match):
    if len(match.groups()) == 3:
        # I. ':' orqali ajratilgan raqamlar -> "u" yoki "yu" qo‘shish sharti bilan.
        first = num2word(int(match.group(1)))
        second = num2word(int(match.group(3)))

        if match.group(3)[-1] in "134579":  # Agar oxirgi raqam 1, 3, 4, 5, 7, 9 bo'lsa
            return first + "u " + second
        else:
            return first + "yu " + second

    elif len(match.groups()) == 2:
        # II. Son va '-' bo‘lsa uni tartib raqam shakliga o‘tkazish
        num = num2word(int(match.group(1)))
        if num[-1] in "ai":  # Agar oxirgi harf 'a' yoki 'i' bo‘lsa
            return num + "nchi "
        else:
            return num + "inchi "
    
    else:
        # III. Barcha boshqa sonlarni oddiy so‘z shakliga o‘tkazish
        return num2word(int(match.group()))


# Tozalash funksiyasi
def clean_text(df):
    characters = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 

        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 
        'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 
        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 

        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 
        'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 
        's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 

        ' ', "ʻ" ,"′", "ʼ", "’", "'", "‘", '\n', '\-',':','>'
    ]

    for i in tqdm(df.index):
        # Raqamli yozuvlarni almashtirish
        if re.search(r"\d+\:\d+", df.loc[i, 'text']):
            df.loc[i, 'text'] = re.sub(r"(\d+)(\:)(\d+)", replace_digit, df.loc[i, 'text'])
        if re.search(r"\d(\.|,)\d", df.loc[i, 'text']):
            df.loc[i, 'text'] = re.sub(r"(\d+)(\.|,)(\d+)", replace_digit, df.loc[i, 'text'])
        if re.search(r"\d+-", df.loc[i, 'text']):
            df.loc[i, 'text'] = re.sub(r"(\d+)(-)", replace_digit, df.loc[i, 'text'])
        if re.search(r"\d+", df.loc[i, 'text']):
            df.loc[i, 'text'] = re.sub(r"\d+", replace_digit, df.loc[i, 'text'])

        # Belgilarni tozalash
        df.loc[i, 'text'] = re.sub(f"[^{''.join(characters)}]", r" ", df.loc[i, 'text'])

        df.loc[i, 'text'] = re.sub(r'-',' ', df.loc[i, 'text'])
        df.loc[i, 'text'] = df.loc[i, 'text'].lower().strip()

    return df  # Tozalangan DataFrame`ni qaytarish


### Remove unnecessary characters

In [12]:
clean_text(df)

100%|██████████| 19609/19609 [00:10<00:00, 1948.09it/s]


Unnamed: 0,text,ner
0,shvetsiya hukumati stokholmdagi asosiy piyodal...,"{'GPE': ['Shvetsiya', 'O‘zbekiston', 'Shvetsiy..."
1,turkiya prezidenti rajab toyyib erdo‘g‘an aqsh...,"{'GPE': ['O‘zbekiston', 'Suriya', 'AQSh', 'Vas..."
2,stokholm markazida yuk mashinasi orqali sodir ...,"{'LOC': ['Stokholm', 'Stokgolm'], 'GPE': ['O‘z..."
3,vest hem bosh murabbiyi slaven bilich o‘z vaz...,"{'GPE': ['O‘zbekiston', 'Angliya'], 'ORG': ['V..."
4,aqsh prezidenti donald trampning nabirasi be...,"{'PERSON': ['Donald Tramp', 'Ivanka Tramp', 'S..."
...,...,...
19604,ikki ming yigirma ikkinchi ikki ming yigirma...,"{'GPE': ['O‘zbekiston', 'Qoraqalpog‘iston Resp..."
19605,o‘zbekistonda erkaklar o‘rtacha yigirma oltiu ...,"{'GPE': ['O‘zbekiston', 'Qoraqalpog‘iston', 'A..."
19606,konstitutsion islohotlar muhokamasiga oid yig‘...,"{'LOC': ['Toshkent', 'O‘zbekiston'], 'ORG': ['..."
19607,toshkent shahrida issiq suv ta’minoti vaqtinch...,"{'LOC': ['Toshkent', 'Mirobod', 'Yakkasaroy', ..."


**Create empty dataframe**

In [23]:
df_ner = pd.DataFrame()

### Clean the values of the NER column

In [24]:
df_list=[]
for i in df['ner']:
    list_set = list(i.values())
    for j in list_set:
        df_list.append('>'.join(j))

df_ner['text'] = df_list

### Remove unnecessary characters

In [27]:
clean_text(df_ner)

100%|██████████| 86569/86569 [00:26<00:00, 3277.68it/s]


Unnamed: 0,text
0,shvetsiya>o‘zbekiston>shvetsiya bosh vaziri st...
1,drottninggatanda>stokholmdagi
2,spendrups kompaniyasi
3,shvetsiya bosh vaziri stefan lyoven
4,o‘zbekiston>suriya>aqsh>vashington
...,...
86564,toshiem
86565,o‘zbekiston>hindiston>assam>megxalaya
86566,hindiston shimoli sharqi>shimoli sharqiy assam...
86567,o‘zbekiston prezidenti>hindiston prezidenti>hi...


### Restore the NER column to its original structure

In [30]:
index=0

for i in tqdm(df.index):
    for j in df['ner'][i].keys():
        ner_item = df_ner.text[index].split('>')
        df['ner'][i][j] = ner_item
        index += 1


100%|██████████| 19609/19609 [00:01<00:00, 16965.88it/s]


#### DataFrame to hugging face dataset format

In [32]:
dataset = Dataset.from_pandas(df)

### Fine tuning

The model will be fine-tuned to recognize PERSON, DATE, LOC, ORG, and GPE entities. Use best hyperparametrs

In [34]:
# import necessary packages
import torch
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import numpy as np
from seqeval.metrics import classification_report

# Preprocess the dataset for NER (convert to BIO format with specific labels)
def preprocess_dataset(dataset):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
    # Define the allowed entity types
    allowed_entities = {'PERSON', 'DATE', 'LOC', 'ORG', 'GPE'}
    label_set = set(['O'])  # Start with 'O' label

    def process_example(example):
        text = example['text']
        ner = example['ner']

        # Tokenize with padding and truncation enabled
        tokens = tokenizer(
            text,
            truncation=True,
            max_length=512,
            return_offsets_mapping=True
        )
        token_labels = ['O'] * len(tokens['input_ids'])  # Default all to 'O'

        if ner is None:
            pass
        else:
            for entity_type, entities in ner.items():
                # Only process allowed entity types
                if entity_type not in allowed_entities:
                    continue
                    
                label_set.add(f'B-{entity_type}')
                label_set.add(f'I-{entity_type}')

                if entities is None or not isinstance(entities, (list, tuple)):
                    continue

                for entity in entities:
                    if not isinstance(entity, str):
                        continue
                    start = text.find(entity)
                    if start == -1:
                        continue
                    end = start + len(entity)

                    for i, (offset_start, offset_end) in enumerate(tokens['offset_mapping']):
                        if offset_start >= start and offset_end <= end:
                            if offset_start == start:
                                token_labels[i] = f'B-{entity_type}'
                            else:
                                token_labels[i] = f'I-{entity_type}'

        return {
            'input_ids': tokens['input_ids'],
            'attention_mask': tokens['attention_mask'],
            'labels': token_labels
        }

    # Apply preprocessing
    processed_dataset = dataset.map(process_example, remove_columns=['text', 'ner'])
    label_list = sorted(list(label_set))
    label2id = {label: idx for idx, label in enumerate(label_list)}

    # Convert labels to IDs and pad with -100
    def convert_labels(example):
        try:
            labels = [label2id[label] for label in example['labels']]
            # Pad labels to match max_length (512) with -100
            padded_labels = labels + [-100] * (512 - len(labels))
            example['labels'] = padded_labels
        except KeyError as e:
            raise
        return example

    processed_dataset = processed_dataset.map(convert_labels)
    return processed_dataset, label_list, label2id, tokenizer

# Step 3: Fine-tune XLM-RoBERTa
def fine_tune_model(dataset, label_list, label2id, tokenizer):
    model = XLMRobertaForTokenClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=len(label_list),
        id2label={i: label for i, label in enumerate(label_list)},
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=500,
        fp16=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none"
    )

    # Use the tokenizer's padding directly in the collator
    def data_collator(features):
        batch = tokenizer.pad(
            features,
            padding=True,  # Dynamic padding to longest in batch
            return_tensors="pt"
        )
        # Ensure labels are padded to match input_ids length
        max_len = batch['input_ids'].shape[1]
        batch['labels'] = torch.tensor(
            [f['labels'][:max_len] + [-100] * (max_len - len(f['labels'][:max_len])) for f in features],
            dtype=torch.long
        )
        return batch

    train_test_split = dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=lambda p: compute_metrics(p, label_list)
    )

    trainer.train()
    return trainer, model

# Step 4: Compute metrics
def compute_metrics(pred, label_list):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    results = classification_report(true_labels, pred_labels, output_dict=True)
    return {
        "precision": results["micro avg"]["precision"],
        "recall": results["micro avg"]["recall"],
        "f1": results["micro avg"]["f1-score"],
    }

# Main execution
if __name__ == "__main__":
    processed_dataset, label_list, label2id, tokenizer = preprocess_dataset(dataset)
    trainer, model = fine_tune_model(processed_dataset, label_list, label2id, tokenizer)
    eval_results = trainer.evaluate()
    print("Evaluation results:", eval_results)
    model.save_pretrained("./ner_model")
    tokenizer.save_pretrained("./ner_model")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/19609 [00:00<?, ? examples/s]

Map:   0%|          | 0/19609 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1949,0.178818,0.531551,0.539191,0.535344
2,0.1669,0.167024,0.535358,0.59957,0.565647
3,0.1361,0.166734,0.571025,0.613795,0.591638


Evaluation results: {'eval_loss': 0.16673429310321808, 'eval_precision': 0.5710248811082776, 'eval_recall': 0.6137946362424063, 'eval_f1': 0.5916378048345057, 'eval_runtime': 24.0253, 'eval_samples_per_second': 81.622, 'eval_steps_per_second': 20.437, 'epoch': 3.0}


In [57]:
from transformers import pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

text = "'Samarqanddanmikan yoki Toshkentdanmikan anig'ini bilmadim' dedi Shohruh umirov bilan bordi"
ner = nlp(text)

for entity in ner:
    print(entity)

Device set to use cuda:0


{'entity': 'B-LOC', 'score': 0.74778676, 'index': 2, 'word': 'Sam', 'start': 1, 'end': 4}
{'entity': 'I-LOC', 'score': 0.76890993, 'index': 3, 'word': 'ar', 'start': 4, 'end': 6}
{'entity': 'I-LOC', 'score': 0.77585006, 'index': 4, 'word': 'qan', 'start': 6, 'end': 9}
{'entity': 'I-LOC', 'score': 0.7489506, 'index': 5, 'word': 'd', 'start': 9, 'end': 10}
{'entity': 'B-LOC', 'score': 0.76504, 'index': 10, 'word': '▁Toshkent', 'start': 24, 'end': 32}
{'entity': 'B-PERSON', 'score': 0.9199773, 'index': 23, 'word': '▁Sho', 'start': 65, 'end': 68}
{'entity': 'I-PERSON', 'score': 0.925115, 'index': 24, 'word': 'h', 'start': 68, 'end': 69}
{'entity': 'I-PERSON', 'score': 0.926187, 'index': 25, 'word': 'ruh', 'start': 69, 'end': 72}
{'entity': 'I-PERSON', 'score': 0.93724865, 'index': 26, 'word': '▁um', 'start': 73, 'end': 75}
{'entity': 'I-PERSON', 'score': 0.9402853, 'index': 27, 'word': 'i', 'start': 75, 'end': 76}
{'entity': 'I-PERSON', 'score': 0.9250853, 'index': 28, 'word': 'rov', 'star

### login hugging face

In [60]:
from huggingface_hub import login

login(token="TOKEN")

### Upload model to hugging face

In [63]:
from huggingface_hub import HfApi

username = "tukhtashevshohruh"  # Hugging Face username
repo_name = "xlm-roberta-base-lowercase-high-accuracy"  # Hugging Face-dagi yangi repo nomi
full_repo_name = f"{username}/{repo_name}"

# Hugging Face-da yangi repo yaratish (agar yo'q bo'lsa)
api = HfApi()
api.create_repo(repo_id=full_repo_name, private=False)

# Model va tokenizatorni Hugging Face-ga yuklash
api.upload_folder(
    folder_path="./ner_model",  # Saqlangan model papkasi
    repo_id=full_repo_name,
    commit_message="Fine-tuned model uploaded 11.03.2025"
)


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tukhtashevshohruh/xlm-roberta-base-lowercase-high-accuracy/commit/8d5eb45576a8fc35cff8484e1134d6050507b159', commit_message='Fine-tuned model uploaded 11.03.2025', commit_description='', oid='8d5eb45576a8fc35cff8484e1134d6050507b159', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tukhtashevshohruh/xlm-roberta-base-lowercase-high-accuracy', endpoint='https://huggingface.co', repo_type='model', repo_id='tukhtashevshohruh/xlm-roberta-base-lowercase-high-accuracy'), pr_revision=None, pr_num=None)