<a href="https://colab.research.google.com/github/Tencho0/NLP/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
!nvidia-smi


Tue Dec  9 16:03:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P0             29W /   70W |    3442MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [60]:
# ===============================
# Инсталация
# ===============================
!pip install -q transformers datasets evaluate seqeval sentencepiece torch accelerate

In [61]:
# ===============================
# Импорти
# ===============================
import json
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
from seqeval.metrics import classification_report
import re


In [63]:
# ===============================
# Зареждане на JSON
# ===============================
file_path = "/content/generated_addresses_1000.json"

with open(file_path, "r", encoding="utf-8") as f:
    real_addresses = json.load(f)


In [64]:
# ===============================
# Rule-based parser
# ===============================
def normalize_address(addr):
    s = addr.strip()
    s = s.replace(",,", ",")
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"[\,\.\;]+", ",", s)
    s = re.sub(r"\s+", " ", s)
    s = s.strip(", ").lower()
    s = re.sub(r"\bул\.?\b", "улица", s)
    s = re.sub(r"\bбул\.?\b", "булевард", s)
    s = re.sub(r"\bж\.?к\.?\b|\bжк\b", "жилищен комплекс", s)
    s = re.sub(r"\bкв\.?\b", "квартал", s)
    s = re.sub(r"\bгр\.?\b", "град", s)
    s = re.sub(r"\bап\.?\b|\bапт\b", "апартамент", s)
    s = re.sub(r"\bвх\.?\b", "вход", s)
    return s.strip()

pattern = re.compile(
    r"(?P<type>улица|булевард|жилищен комплекс|квартал)\s*"
    r"(?P<name>[\w\-\.\s]+?)\s*"
    r"(?P<number>\d+[A-Za-z\-]?)?"
    r"(?:[,\s]+(?:бл|блок)\s*(?P<block>\d+))?"
    r"(?:[,\s]+(?:вход|вх)\s*(?P<entrance>[А-ЯA-Za-z]))?"
    r"(?:[,\s]+(?:апартамент|ап)\s*(?P<apt>\d+))?.*?"
    r"(?:град\s*)?(?P<city>[А-Яа-я\s]+)?",
    re.IGNORECASE
)

def rule_parse(addr):
    s = normalize_address(addr)
    m = pattern.search(s)
    if not m:
        return {}
    res = {k:(m.group(k) or "").strip() for k in ["type","name","number","block","entrance","apt","city"]}
    return {
        "street_type": res["type"],
        "street_name": res["name"],
        "number": res["number"],
        "block": res["block"],
        "entrance": res["entrance"],
        "apartment": res["apt"],
        "city": res["city"]
    }



In [65]:
# ===============================
# Hugging Face Dataset
# ===============================
train_dataset = Dataset.from_list(real_addresses)
val_dataset = Dataset.from_list(real_addresses[:20])


In [66]:
# ===============================
# Tokenizer + mapping
# ===============================
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Mapping labels
unique_labels = list({l for item in real_addresses for l in item["labels"]})
label2id = {l:i for i,l in enumerate(unique_labels)}
id2label = {i:l for l,i in label2id.items()}

# Tokenization + alignment
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=32,
        is_split_into_words=True
    )
    labels_batch = []
    for i, label_seq in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label_seq[word_idx]])
            else:
                label_ids.append(label2id[label_seq[word_idx]] if label_seq[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx
        labels_batch.append(label_ids)
    tokenized_inputs["labels"] = labels_batch
    return tokenized_inputs

train_tok = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tok = val_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [67]:
# ===============================
# Модел
# ===============================
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
# ===============================
# Проверка на версията на Transformers библиотеката
# ===============================
print(transformers.__version__)


4.57.3


In [69]:
# ===============================
# Обновяване на Transformers библиотеката
# ===============================
!pip install --upgrade transformers




In [72]:
# ===============================
# Trainer
# ===============================
training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [73]:
# ===============================
# Тренировка
# ===============================
trainer.train()

Step,Training Loss


TrainOutput(global_step=315, training_loss=0.06688247559562562, metrics={'train_runtime': 241.2869, 'train_samples_per_second': 20.722, 'train_steps_per_second': 1.305, 'total_flos': 81658927680000.0, 'train_loss': 0.06688247559562562, 'epoch': 5.0})

In [75]:

# ===============================
# Hybrid parse за всички адреси
# ===============================
def hybrid_parse(addr_tokens):
    res = rule_parse(" ".join(addr_tokens))
    if res.get("street_name") and res.get("number"):
        res["source"] = "rule"
        return res
    # ML fallback
    device = next(model.parameters()).device
    tokens = tokenizer(addr_tokens, is_split_into_words=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**tokens)
        predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()
    if isinstance(predictions, int):
        predictions = [predictions]
    labels = [id2label[p] for p in predictions]
    res.update({"ml_labels": labels, "source": "ml"})
    return res

# Приложение на всички адреси
hybrid_results = []
for addr_item in real_addresses:
    res = hybrid_parse(addr_item["tokens"])
    hybrid_results.append(res)

# Принтиране на първите 5 резултата
for res in hybrid_results[:5]:
    print(res)


{'street_type': 'жилищен комплекс', 'street_name': 'в', 'number': '', 'block': '', 'entrance': '', 'apartment': '', 'city': 'асил левски', 'ml_labels': ['B-CITY', 'B-STREET_TYPE', 'B-STREET_TYPE', 'B-STREET_TYPE', 'B-STREET_NAME', 'I-STREET_TYPE', 'I-STREET_TYPE', 'B-NUMBER', 'B-ENTRANCE', 'B-CITY', 'B-CITY'], 'source': 'ml'}
{'street_type': 'улица', 'street_name': 'б', 'number': '', 'block': '', 'entrance': '', 'apartment': '', 'city': 'отев', 'ml_labels': ['B-CITY', 'B-STREET_TYPE', 'B-STREET_NAME', 'I-STREET_NAME', 'I-STREET_NAME', 'B-NUMBER', 'B-CITY', 'B-CITY'], 'source': 'ml'}
{'street_type': 'улица', 'street_name': 'б', 'number': '', 'block': '', 'entrance': '', 'apartment': '', 'city': 'отев', 'ml_labels': ['B-NUMBER', 'B-STREET_TYPE', 'B-STREET_NAME', 'B-STREET_NAME', 'I-STREET_NAME', 'B-NUMBER', 'B-CITY', 'B-CITY'], 'source': 'ml'}
{'street_type': 'булевард', 'street_name': 'н', 'number': '', 'block': '', 'entrance': '', 'apartment': '', 'city': 'езависимост', 'ml_labels': ['