In [2]:
import re

In [1]:
def get_words_and_columns(s: str) -> list[str]:
    """Split ``s`` into alphabetic words and comma tokens, in order of appearance.

    A word is a maximal run of characters for which ``str.isalpha()`` is true.
    Every comma becomes its own ``','`` token. All other characters (digits,
    spaces, punctuation) only act as separators and are dropped.

    Args:
        s: Text to split.

    Returns:
        The list of words and ``','`` tokens; empty when ``s`` has neither.
    """
    tokens: list[str] = []
    pending = ''
    for ch in s:
        if ch.isalpha():
            pending += ch
            continue
        # Any non-letter ends the word collected so far.
        if pending:
            tokens.append(pending)
            pending = ''
        if ch == ',':
            tokens.append(',')

    # Flush a word that runs to the end of the string.
    if pending:
        tokens.append(pending)

    return tokens
            


In [None]:
import json
import random

import docx
import nltk
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, BertForMaskedLM, BertForSequenceClassification, BertTokenizer

In [None]:
# Synthetic training data: template-generated "correct" addresses.
correct_addresses = []
republics = ['Чувашская', 'Чеченская', 'Кабардино-Балкарская']
names_rep = ['республика', 'область']
city = ['Москва', 'Санкт-Петербург', 'Чебоксары']
street_name = ['Моховая', 'Тверская', 'Невская']


def generate_address(count: int = 1000) -> None:
    """Append ``count`` randomly assembled addresses to ``correct_addresses``.

    Each address has the form
    ``"<republic> <unit>, город <city>, улица <street>"`` with every part drawn
    independently via ``random.choice`` from the module-level lists.

    The function only fills the list when it is called; calling it again
    appends further addresses to the existing ones.

    Args:
        count: Number of addresses to generate (default 1000).
    """
    for _ in range(count):
        new_str = (
            f"{random.choice(republics)} {random.choice(names_rep)}, "
            f"город {random.choice(city)}, улица {random.choice(street_name)}"
        )
        # Append inside the loop so that every generated address is kept,
        # not just the last one.
        correct_addresses.append(new_str)


In [None]:
# Helper that hides a word behind the mask token
def masking(s, word):
    """Return ``s`` with every occurrence of ``word`` replaced by ``'[MASK]'``.

    The match is an exact, case-sensitive substring match (``str.replace``);
    ``s`` is returned unchanged when ``word`` does not occur in it.
    """
    return s.replace(word, '[MASK]')

In [None]:
# Neural network used to process the addresses

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def tokenize(addresses):
    """Encode a list of address strings with the module-level ``tokenizer``.

    The batch is padded, truncated, and returned as PyTorch tensors
    (``padding=True, truncation=True, return_tensors="pt"``).
    """
    encoded = tokenizer(
        addresses,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded

# On a fresh kernel nothing has filled the corpus yet; generate it here so the
# notebook survives Restart & Run All (tokenizing an empty list would fail).
if not correct_addresses:
    generate_address()

# Show only a small sample instead of dumping the whole corpus.
for original in correct_addresses[:5]:
    print(f"Original: {original}")
    print()

# The generated addresses contain the lowercase word 'республика' (see
# names_rep), so that is the spelling that has to be masked; the capitalised
# form never occurs and would leave the inputs identical to the labels.
inputs = tokenize([masking(i, 'республика') for i in correct_addresses])
labels = tokenize(correct_addresses)
# NOTE(review): '[MASK]' replaces a whole word, so a masked input may tokenize
# to a different length than its label sequence — confirm the two tensors line
# up token-for-token before they are used as model input and target.

class AddressCorrector(nn.Module):
    """Thin wrapper around multilingual BERT with a masked-language-model head.

    The notebook trains on token-id sequences as labels and decodes an argmax
    over the last logits dimension back into text, i.e. it needs one
    vocabulary distribution per token. ``BertForMaskedLM`` provides exactly
    that (logits per position over the vocabulary); a sequence-classification
    head yields a single label per sequence and cannot consume per-token
    labels.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output object unchanged.

        Args:
            input_ids: Token ids of the (masked) addresses.
            attention_mask: Attention mask matching ``input_ids``.
            labels: Optional target token ids; when given, the returned output
                also carries ``.loss``.

        Returns:
            The Hugging Face model output (``.logits``, and ``.loss`` when
            ``labels`` is supplied).
        """
        return self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# Each sample pairs the input token ids and attention mask from `inputs` with
# the token ids from `labels`.
dataset = TensorDataset(
    inputs['input_ids'],
    inputs['attention_mask'],
    labels['input_ids'],
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)

# Model and its optimizer.
model = AddressCorrector()
optimizer = AdamW(model.parameters(), lr=5e-5)

def train(model, dataloader, optimizer, epochs=3):
    """Run a plain training loop and print the loss after every batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes ``.loss``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, label_ids)``
            batches.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``dataloader`` (default 3).
    """
    model.train()
    for epoch in range(epochs):
        for input_ids, attention_mask, label_ids in dataloader:
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            loss = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=label_ids,
            ).loss
            loss.backward()
            optimizer.step()
            print(f"Epoch {epoch}, Loss: {loss.item()}")

# Fine-tune the model on the prepared batches using train()'s default of 3 epochs.
train(model, dataloader, optimizer)

def predict(model, address):
    """Run ``model`` on a single address string and decode its prediction.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        address: The address text to process (typically containing ``[MASK]``).

    Returns:
        The decoded text of the argmax token ids for the first (only) sequence,
        with special tokens such as ``[CLS]``/``[SEP]``/``[PAD]`` removed so the
        result can be compared directly with a plain address string.
    """
    model.eval()
    encoded = tokenize([address])
    with torch.no_grad():
        outputs = model(input_ids=encoded['input_ids'], attention_mask=encoded['attention_mask'])
    predicted_ids = torch.argmax(outputs.logits, dim=-1)
    # Without skip_special_tokens the decoded string keeps the special-token
    # markers and can never equal the original address.
    return tokenizer.decode(predicted_ids[0], skip_special_tokens=True)


In [None]:
# Load the evaluation addresses: the 'Адрес' column of the test spreadsheet.
curr_addresses = pd.read_excel('source_addresses_test.xlsx')['Адрес']

In [None]:
# Mask 'республика' in every test address, let the model restore it, write the
# corrections to corrected.txt and count exact matches with the original.
good_amount = 0
# Explicit UTF-8: the addresses are Cyrillic, and the platform's default
# encoding is not guaranteed to be able to represent them.
with open('corrected.txt', 'w', encoding='utf-8') as file_to:
    for test_address in curr_addresses:
        # Reuse the shared helper so masking is done the same way everywhere.
        new_test_address = masking(test_address, 'республика')
        corrected_address = predict(model, new_test_address)
        if corrected_address == test_address:
            good_amount += 1
        print(f"Original: {new_test_address}")
        print(f"Corrected: {corrected_address}", file=file_to)

# Report the counter; otherwise the computed score is never shown.
print(f"Exact matches: {good_amount} of {len(curr_addresses)}")


In [None]:
# Second processing variant, shown as an example: any word whose edit distance
# to 'Республика' is between 1 and 3 is replaced by the correct spelling.

string = '428023 Чувашская Република, город Чебоксары, бульвар Миттова, '
words = string.split()
for i, raw_word in enumerate(words):
    new_word = raw_word.strip(',')
    dist = nltk.edit_distance('Республика', new_word)
    if not 1 <= dist <= 3:
        continue
    # Preserve a trailing comma of the word being replaced.
    words[i] = 'Республика' + ',' if raw_word.endswith(',') else 'Республика'

' '.join(words)