In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell copies the task archive from there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the fastText Russian Wikipedia vectors archive (large: roughly 7.8 GB).
# NOTE(review): re-running this cell downloads the archive again; consider `wget -c` to resume/skip.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.zip

--2021-04-30 16:01:16--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8344906764 (7.8G) [application/zip]
Saving to: ‘wiki.ru.zip’


2021-04-30 16:04:36 (39.7 MB/s) - ‘wiki.ru.zip’ saved [8344906764/8344906764]



In [None]:
# Extract the downloaded archive; the model-loading cell reads wiki.ru.bin from it.
!unzip wiki.ru.zip

Archive:  wiki.ru.zip
  inflating: wiki.ru.vec             
  inflating: wiki.ru.bin             


In [2]:
import re
import shutil

import torch
import numpy as np
import pandas as pd

from torch import nn, optim
from gensim.models import FastText
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm

In [None]:
# Load the pretrained fastText binary; `model.wv` is used later to embed tokens.
# NOTE(review): `load_fasttext_format` looks deprecated in newer gensim releases — confirm the gensim version this notebook targets.
model = FastText.load_fasttext_format('wiki.ru.bin')

In [3]:
# Copy the task archive from the mounted Drive into the working directory.
shutil.copy('/content/drive/MyDrive/Contur/kontur_srs_internship_test_task.zip', './')

'./kontur_srs_internship_test_task.zip'

In [4]:
# Unpack the task archive; train.txt and test.txt are read from the working directory below.
!unzip kontur_srs_internship_test_task.zip

Archive:  kontur_srs_internship_test_task.zip
  inflating: train.txt               
  inflating: __MACOSX/._train.txt    
  inflating: test.txt                
  inflating: __MACOSX/._test.txt     


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def read_file(path_to_file, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path_to_file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        list[str]: One entry per line of the file, each passed through
        ``str.strip()`` (so blank lines become empty strings).
    """
    with open(path_to_file, encoding=encoding) as f:
        return [line.strip() for line in f]

In [6]:
# Load both corpora: one line of text per row, in a single `text` column.
train = pd.DataFrame(read_file('.//train.txt'), columns=['text'])
test = pd.DataFrame(read_file('.//test.txt'), columns=['text'])

In [None]:
# Hold out 10% of the training rows for validation. A fixed `random_state`
# makes the split (and every number derived from it) reproducible.
# NOTE: this rebinds `train`, so re-running only this cell splits the already
# reduced frame again; re-run the loading cell first.
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [None]:
# Report the number of rows (shape[0]) in each split.
print('Train shape:     ', train.shape[0],
      '\nValidation shape:', val.shape[0])

Train shape:      4709229 
Validation shape: 523248


In [None]:
def simple_tokenizer(text):
    """Split ``text`` into its maximal runs of regex word characters.

    Anything that is not a word character (spaces, quotes, punctuation) acts
    as a separator and is dropped from the result.
    """
    word_pattern = re.compile(r'[\w]+')
    return word_pattern.findall(text)

In [None]:
def get_labels(tokens):
    """Map every token to its casing class.

    Classes: 0 when ``str.islower()`` holds, 2 when ``str.isupper()`` holds,
    and 1 for everything else (mixed case, or no cased characters at all).
    """
    def casing_class(word):
        if word.islower():
            return 0
        return 2 if word.isupper() else 1

    return [casing_class(word) for word in tokens]

In [None]:
# Human-readable names for the class ids: 0-2 are the casing
# transformations, 3 is the label used for padded positions.
dict_transfrom = dict(enumerate(('lower', 'capitalize', 'upper', 'pad')))

In [None]:
def get_vec_and_labels(text, max_size):
    """Tokenize ``text`` and build per-token embeddings and casing labels.

    Every token is embedded from its lower-cased form via ``model.wv``, so the
    embedding carries no casing information; the original casing is captured
    by the labels from ``get_labels``.

    Args:
        text: Input string.
        max_size: ``None`` to keep the natural token count; otherwise the exact
            length the outputs are padded (label 3, ``'<pad>'`` vector) or
            truncated to.

    Returns:
        ``(vecs, tokens, labels)`` when ``max_size`` is ``None``;
        ``(vecs, labels)``, both of length ``max_size``, otherwise.
    """
    tokens = simple_tokenizer(text)
    labels = get_labels(tokens)
    vecs = []
    for token in tokens:
        # The lookup is always done on the lower-cased token, so no separate
        # vocabulary check on the raw token is needed.
        try:
            vecs.append(model.wv[token.lower()])
        except KeyError:
            # Only a failed lookup falls back to '<unk>'; any other error
            # (e.g. an API mismatch) should surface instead of being hidden.
            vecs.append(model.wv['<unk>'])
    if max_size is None:
        return vecs, tokens, labels
    n_missing = max_size - len(labels)
    if n_missing > 0:
        pad_vec = model.wv['<pad>']  # looked up once, reused for every pad slot
        labels.extend([3] * n_missing)
        vecs.extend([pad_vec] * n_missing)
    else:
        labels = labels[:max_size]
        vecs = vecs[:max_size]
    return vecs, labels

In [None]:
class RegDataset(Dataset):
    """Map-style dataset yielding ``(embeddings, labels)`` tensors per text row.

    Args:
        text: pandas DataFrame whose first column holds the raw strings
            (rows are read with ``text.iloc[ind, 0]``).
        text2vec_label: Callable ``(text, max_size) -> (vecs, labels)``.
        max_size: Forwarded to ``text2vec_label`` as the sequence length.
    """

    def __init__(self, text, text2vec_label, max_size=60):
        self.text = text
        self.transform = text2vec_label
        self.maxsize = max_size

    def __len__(self):
        return len(self.text)

    def __getitem__(self, ind):
        """Return ``(float32 embeddings, int64 labels)`` for row ``ind``."""
        vecs, labels = self.transform(self.text.iloc[ind, 0], self.maxsize)
        # Stack into a single ndarray first: building a tensor straight from a
        # Python list of ndarrays converts element by element and is very slow.
        features = torch.tensor(np.asarray(vecs, dtype=np.float32))
        return features, torch.tensor(labels, dtype=torch.long)

In [None]:
# Training and validation sequences are padded/truncated to 20 tokens.
# NOTE(review): `test_dataset` is built from `val`, not `test` — confirm this is
# intended (it reads like an evaluation of the validation rows at length 60).
train_dataset = RegDataset(train, get_vec_and_labels, 20)
val_dataset = RegDataset(val, get_vec_and_labels, 20)
test_dataset = RegDataset(val, get_vec_and_labels, 60)

In [None]:
# Batches of 1000 rows; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1000)
test_loader = DataLoader(test_dataset, batch_size=1000)

In [None]:
class RNN_Reg(nn.Module):
    """LSTM sequence tagger that emits class logits for every token.

    Args:
        in_size: Dimensionality of each input token vector.
        hid_dim: LSTM hidden size per direction.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and inside the
            classification head.
        bidirectional: Whether the LSTM runs in both directions.
        n_classes: Number of output classes per token (default 4).
    """

    def __init__(self, in_size, hid_dim, n_layers, dropout, bidirectional, n_classes=4):
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=in_size,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=bidirectional
        )
        self.lin = nn.Sequential(
            nn.Linear(hid_dim * (2 if bidirectional else 1), 100),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(100, n_classes)
        )

    def forward(self, x):
        """Compute logits for a batch.

        Args:
            x: Tensor of shape ``(batch, seq, in_size)``.

        Returns:
            Tensor of shape ``(batch, n_classes, seq)`` — the class dimension
            is second, which is the layout ``nn.CrossEntropyLoss`` expects for
            sequence targets of shape ``(batch, seq)``.
        """
        # The LSTM is not batch_first, so feed it (seq, batch, in_size).
        x = x.permute(1, 0, 2)
        output, _ = self.rnn(x)  # (seq, batch, hid_dim * num_directions)
        # The head acts on the last dimension only, so one call covers every
        # timestep; no per-step Python loop or preallocated buffer is needed.
        logits = self.lin(output)  # (seq, batch, n_classes)
        return logits.permute(1, 2, 0).contiguous()

In [None]:
# Positional args: in_size=300, hid_dim=200, n_layers=2, dropout=0.5, bidirectional=True.
# NOTE(review): 300 presumably matches the fastText vector size — confirm.
NN = RNN_Reg(300, 200, 2, 0.5, True).to(device)

In [None]:
# Class 3 is the padding label; its near-zero weight makes padded positions
# contribute almost nothing to the loss.
loss_func = nn.CrossEntropyLoss(weight=torch.tensor([1, 1, 1, 1e-10]).to(device))
opt = optim.Adam(NN.parameters(), lr=3e-4)

In [None]:
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Upper bound on the number of training epochs.
max_epochs = 1
# Number of non-improving validation epochs tolerated before training stops.
patience = 3

In [None]:
# Early-stopping state: best validation loss seen so far and the number of
# consecutive epochs without improvement.
min_loss = np.inf
cur_patience = 0


def _snapshot_weights(module):
    """Return detached copies of ``module``'s parameters and buffers."""
    # state_dict() hands out references to the live tensors, which keep
    # changing as training continues; clone them to freeze the values.
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


# Start from the current weights so `best_model` is always defined, even if no
# epoch ever improves on `min_loss` (e.g. when the loss is NaN).
best_model = _snapshot_weights(NN)

for epoch in range(1, max_epochs + 1):
    # ---- training pass ----
    train_loss = 0.0
    NN.train()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        with torch.set_grad_enabled(True):
            opt.zero_grad()
            data = batch[0].to(device)
            label = batch[1].to(device)

            output = NN(data)

            loss = loss_func(output, label)
            loss.backward()
            opt.step()

            train_loss += loss.item()

    train_loss /= len(train_loader)

    # ---- validation pass ----
    val_loss = 0.0
    NN.eval()

    pbar = tqdm(enumerate(val_loader), total=len(val_loader), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        with torch.no_grad():
            data = batch[0].to(device)
            label = batch[1].to(device)

            output = NN(data)

            loss = loss_func(output, label)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    # Log before the early-stopping decision so the final epoch is reported too.
    print(f'Epoch: {epoch}, Training Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}')

    if val_loss < min_loss:
        min_loss = val_loss
        cur_patience = 0  # an improvement restarts the patience window
        best_model = _snapshot_weights(NN)
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break

# Restore the weights of the best validation epoch.
NN.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=4710.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=524.0), HTML(value='')))

Epoch: 1, Training Loss: 0.249313, Validation Loss: 0.208511


HBox(children=(FloatProgress(value=0.0, max=4710.0), HTML(value='')))

KeyboardInterrupt: ignored

In [None]:
def change_register_corpus(text):
    """Return ``text`` with the casing of every word predicted by ``NN``.

    The text is tokenized and embedded with ``get_vec_and_labels``, the model
    predicts one casing class per token, ``change_register`` applies those
    classes, and the re-cased tokens are written back over the original word
    spans. All non-word characters are kept exactly as they were.

    Args:
        text: Input string.

    Returns:
        str: The re-cased string; ``text`` itself when it contains no tokens.
    """
    vecs, tokens, _ = get_vec_and_labels(text, None)
    if not tokens:
        # Nothing to tag, and the model cannot process an empty sequence.
        return text

    # Inference must run with dropout disabled, whatever mode the model was
    # left in (e.g. after an interrupted training epoch).
    NN.eval()
    with torch.no_grad():
        features = torch.tensor(np.asarray(vecs, dtype=np.float32)).unsqueeze(0).to(device)
        predicted = torch.argmax(NN(features), dim=1).squeeze(0).cpu().tolist()

    recased = iter(change_register(predicted, tokens))
    # Replace each word span with its re-cased token in one pass. The same
    # pattern produced the tokens, so the counts match; and because no
    # placeholder character is used, literal '/' in the text stays untouched.
    return re.sub(r'[\w]+', lambda match: next(recased), text)

In [None]:
def change_register(transform, tokens):
    """Apply the casing class ``transform[i]`` to ``tokens[i]`` in place.

    Classes 0 and 3 lower-case the token, 1 capitalizes it and 2 upper-cases
    it; any other value leaves the token as it is. The (mutated) ``tokens``
    list itself is returned.
    """
    method_by_class = {0: 'lower', 1: 'capitalize', 2: 'upper', 3: 'lower'}
    for position, word in enumerate(tokens):
        method_name = method_by_class.get(transform[position])
        if method_name is not None:
            tokens[position] = getattr(word, method_name)()
    return tokens

In [None]:
def change_tokens(tokens, text):
    """Fill the ``'/'`` placeholders in ``text`` with ``tokens``, in order.

    The i-th ``'/'`` is replaced by the i-th token. Tokens are inserted
    literally: backslash sequences in a token are not interpreted, and a
    ``'/'`` inside an inserted token is never treated as a placeholder.
    Placeholders beyond the available tokens are left unchanged; surplus
    tokens are ignored.

    Args:
        tokens: Iterable of replacement strings.
        text: Template string containing ``'/'`` placeholders.

    Returns:
        str: ``text`` with the placeholders substituted.
    """
    remaining = iter(tokens)
    # One left-to-right pass; the callable returns each token verbatim.
    return re.sub(r'/', lambda match: next(remaining, match.group(0)), text)

In [None]:
# Predict the casing of every test line; the model is called once per row.
test['predicted'] = test.text.apply(change_register_corpus)

In [None]:
# Persist the frame as CSV; the DataFrame index is written as the first column.
test.to_csv('result.csv')

In [None]:
# Write one predicted line per row. The column is addressed by name rather
# than by position, and the encoding is fixed so the Cyrillic output does not
# depend on the platform default.
with open('test_right.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test['predicted'])

In [8]:
# Load result lines from disk into a single `text` column.
# NOTE(review): no cell shown here writes `result.txt` (the notebook writes
# `result.csv` and `test_right.txt`) — confirm where this file comes from.
result = pd.DataFrame(read_file('.//result.txt'), columns=['text'])

In [9]:
# Attach the loaded lines as a `result` column (pandas aligns them on the row index).
test['result'] = result['text']

In [10]:
# Preview the first ten rows: input text next to the restored-case result.
test.head(10)

Unnamed: 0,text,result
0,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ РАДИО...,Общество с ограниченной ответственностью Радио...
1,"ГБУ ""АЗНАКАЙЛЕС""","ГБУ ""Азнакайлес"""
2,"ОАО ""АЗНАКАЕВСКОЕ ПАТП""","ОАО ""Азнакаевское патп"""
3,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АЗНА...","Общество с ограниченной ответственностью ""Азна..."
4,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АЛНАЗ""","Общество с ограниченной ответственностью ""Алназ"""
5,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""МАГИ...","Общество с ограниченной ответственностью ""Маги..."
6,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЭНТЕР""","Общество с ограниченной ответственностью ""Энтер"""
7,"ЗАКРЫТОЕ АКЦИОНЕРНОЕ ОБЩЕСТВО ""АЗГЕОФИЗИКА""","Закрытое акционерное общество ""Азгеофизика"""
8,ГИАГИНСКАЯ РАЙОННАЯ ОБЩЕСТВЕННАЯ ОРГАНИЗАЦИЯ А...,Гиагинская районная общественная организация А...
9,"РО ПП ""КОПРФ"" В РА","РО ПП ""Копрф"" в ра"
