In [2]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from scripts.custom_dataset import CustomDataset
from scripts.model import MHAModel
from scripts.tokenizer import SeparatorTokenizer
from scripts.vectorizer import Vectorizer
from scripts.vocabulary import Vocabulary
import pandas as pd
import numpy as np
import json
import os

In [3]:
# --- Dataset split proportions ---
TEST_PROPORTION = 0.0
EVAL_PROPORTION = 0.0

# --- Batching / optimisation ---
SHUFFLE = True
DROP_LAST = True
EPOCHS = 0
LEARNING_RATE = 0.00001

# --- Learning-rate scheduler settings ---
LR_SCHEDULER_FACTOR = 0.5
LR_SCHEDULER_PATIENCE = 2

# When True, the model and the metric histories are loaded from disk instead of
# being created from scratch.
USE_PRETRAINED = False

# --- Model hyper-parameters ---
BATCH_SIZE = 16
BIAS = True
EMBEDDING_DIM = 128
ATTENTION_DIM = 512
NUM_HEAD = 8
NUM_ENCODER_LAYERS = 4
ENCODER_FC_HIDDEN_DIM = ATTENTION_DIM*4 # As in the classic Transformer
CLASSIFIER_FC_HIDDEN_DIM = ATTENTION_DIM*2
DROPOUT = 0.1
TEMPERATURE = 0.7
BATCH_FIRST = True

# --- Paths ---
MODEL_SAVE_FILEPATH = 'data/model_params.pt'
# The dataset location is machine-specific, so it can be overridden with the
# SYNTAGRUS_DATASET_PATH environment variable; the previous hard-coded path
# remains the default so existing setups keep working.
DATASET_PATH = os.environ.get('SYNTAGRUS_DATASET_PATH', 'D:/Files/Datasets/UD_Russian-SynTagRus-master')

# NOTE(review): RANDOM_STATE is not referenced by any cell visible here —
# confirm whether a seed is actually applied anywhere.
RANDOM_STATE = 42

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
def find_max_source_len(dataframe:pd.DataFrame)->int:
    '''Return the length of the longest sequence in the ``source_tokens`` column.

    The column values are iterated directly instead of being looked up with
    ``.loc[i, ...]`` for ``i in range(len(dataframe))``; the label-based lookup
    only works when the index happens to be the default 0..n-1 range and raises
    ``KeyError`` for filtered, sliced or re-indexed frames.

    Args:
        dataframe: frame with a ``source_tokens`` column whose cells support ``len()``.

    Returns:
        The maximum cell length, or 0 when the frame has no rows.
    '''
    return max((len(tokens) for tokens in dataframe['source_tokens']), default=0)

In [5]:
def generate_batches(dataset:CustomDataset, batch_size:int, shuffle:bool=True, drop_last:bool=True, device='cpu'):
    '''Yield batches built by a ``DataLoader``, with every entry moved to ``device``.

    Each item produced by the loader is treated as a mapping; a new dict with
    the same keys is yielded in which every value is the result of calling
    ``.to(device)`` on the loader's value.
    '''
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

In [6]:
def save_results_to_file(model, model_filepath:str, train_states:list=None, validation_states:list=None,
                         train_states_filepath:str="data/train_states.json",
                         validation_states_filepath:str="data/validation_states.json"):
    '''Save the model object and, optionally, the training/validation metric histories.

    The model is written with ``torch.save(model, ...)``, i.e. the whole object is
    serialised (not only a ``state_dict``). Metric histories are dumped as UTF-8
    JSON. Parent directories of every output path are created when missing so a
    fresh checkout without a ``data/`` folder does not fail at save time.

    Args:
        model: object to serialise with ``torch.save``.
        model_filepath: destination of the serialised model.
        train_states: JSON-serialisable training history; skipped when ``None``.
        validation_states: JSON-serialisable validation history; skipped when ``None``.
        train_states_filepath: destination of the training history (defaults to
            the previously hard-coded path).
        validation_states_filepath: destination of the validation history
            (defaults to the previously hard-coded path).
    '''
    def _ensure_parent_dir(filepath):
        # Create the containing directory; a bare filename has no parent to create.
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)

    _ensure_parent_dir(model_filepath)
    torch.save(model, model_filepath)

    outputs = (
        (train_states, train_states_filepath),
        (validation_states, validation_states_filepath),
    )
    for states, filepath in outputs:
        if states is None:
            continue
        _ensure_parent_dir(filepath)
        with open(filepath, "w", encoding="utf-8") as file:
            json.dump(states, file, indent=4, ensure_ascii=False)

In [7]:
def preprocess_df(df:pd.DataFrame, source_column_name:str):
    '''Lower-case every token of ``source_column_name`` in place.

    Each cell of the column is expected to be a mutable sequence of strings.
    The sequences stored in the frame are modified directly (no new containers
    are created and nothing is returned). Every cell is fetched once and walked
    with ``enumerate`` instead of performing chained ``df[col].iloc[row][i]``
    lookups for every single token.

    Args:
        df: frame that owns the token sequences.
        source_column_name: name of the column holding the token sequences.
    '''
    for tokens in df[source_column_name]:
        for position, token in enumerate(tokens):
            tokens[position] = token.lower()

In [22]:
def normalize_sizes(prediction, target, target_names):
    '''Flatten per-token predictions and targets so they line up row by row.

    For every name in ``target_names``:
      * a 3-D prediction tensor is reshaped to ``(-1, last_dim)``;
      * a 2-D target tensor is reshaped to ``(-1,)``.
    Tensors of any other rank are left untouched. Both dicts are updated in
    place and also returned.

    The target is flattened with ``reshape(-1)``; reshaping it to
    ``size(-1)`` only works when the leading dimension is 1 and raises for any
    larger batch. ``target_names`` is iterated directly, so it may be a list of
    names or a dict keyed by name.

    Args:
        prediction: dict mapping target name to a prediction tensor.
        target: dict mapping target name to a target tensor.
        target_names: iterable of the keys to process.

    Returns:
        The ``(prediction, target)`` pair (the same dict objects that were passed in).
    '''
    for key in target_names:
        if prediction[key].dim() == 3:
            prediction[key] = prediction[key].reshape(-1, prediction[key].size(-1))
        if target[key].dim() == 2:
            target[key] = target[key].reshape(-1)
    return prediction, target

In [None]:
def compute_loss(prediction:dict[str, torch.Tensor], target:dict[str, torch.Tensor], ignore_index:int=-100):
    '''Return the summed cross-entropy loss over all prediction heads.

    For every key in ``prediction`` the logits are flattened to
    ``(-1, num_classes)`` and the matching entry of ``target`` is flattened to
    ``(-1,)`` before ``torch.nn.functional.cross_entropy`` is applied; the
    per-head losses are added together.

    Args:
        prediction: dict mapping head name to a logits tensor whose last
            dimension is the class dimension.
        target: dict with the same keys holding integer class indices (a tensor
            or anything ``torch.as_tensor`` accepts).
        ignore_index: target value excluded from the loss; the default is the
            default of ``cross_entropy``.
            NOTE(review): pass the padding index of the target vocabulary here
            if padded positions must not contribute — confirm against the vectorizer.

    Returns:
        A scalar tensor with the total loss.

    Raises:
        ValueError: if ``prediction`` is empty.
    '''
    total_loss = None
    for key, logits in prediction.items():
        labels = torch.as_tensor(target[key], device=logits.device).long()
        head_loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            ignore_index=ignore_index,
        )
        total_loss = head_loss if total_loss is None else total_loss + head_loss
    if total_loss is None:
        raise ValueError('prediction must contain at least one output head')
    return total_loss

In [8]:
# Load the training split from the parquet file located under DATASET_PATH.
train_df = pd.read_parquet(os.path.join(DATASET_PATH, 'ru_syntagrus-ud-train.parquet'))

In [9]:
MAX_SOURCE_LENGTH = find_max_source_len(train_df) + 2 # Add 2 to account for the extra BOS and EOS tokens
# Names of the columns used as prediction targets and as the model input.
target_names = ['upos']
source_name = 'source_tokens'
# Lower-cases the source tokens; train_df is modified in place.
preprocess_df(train_df, source_name)

In [10]:
# Display the frame after preprocessing (source tokens are lower-cased at this point).
train_df

Unnamed: 0,source_tokens,lemmas,upos,xpos,feats,head,deprel,misc
0,"[анкета, .]","[анкета, .]","[NOUN, PUNCT]","[None, None]","[{'Animacy': 'Inan', 'Case': 'Nom', 'Gender': ...","[0, 1]","[root, punct]","[{'SpaceAfter': 'No'}, None]"
1,"[начальник, областного, управления, связи, сем...","[начальник, областной, управление, связь, Семе...","[NOUN, ADJ, NOUN, NOUN, PROPN, PROPN, AUX, NOU...","[None, None, None, None, None, None, None, Non...","[{'Animacy': 'Anim', 'Case': 'Nom', 'Gender': ...","[8, 3, 1, 3, 1, 5, 8, 0, 8, 11, 8, 13, 11, 11,...","[nsubj, amod, nmod, nmod, appos, flat:name, co...","[None, None, None, None, None, None, None, Non..."
2,"[в, приемной, его, с, утра, ожидали, посетител...","[в, приемная, он, с, утро, ожидать, посетитель...","[ADP, NOUN, PRON, ADP, NOUN, VERB, NOUN, PUNCT...","[None, None, None, None, None, None, None, Non...","[None, {'Animacy': 'Inan', 'Case': 'Loc', 'Gen...","[2, 6, 6, 5, 6, 0, 6, 13, 13, 13, 13, 13, 7, 1...","[case, obl, obj, case, obl, root, nsubj, punct...","[None, None, None, None, None, None, {'SpaceAf..."
3,"[однако, стиль, работы, семена, еремеевича, за...","[однако, стиль, работа, Семен, Еремеевич, закл...","[ADV, NOUN, NOUN, PROPN, PROPN, VERB, ADP, PRO...","[None, None, None, None, None, None, None, Non...","[{'Degree': 'Pos'}, {'Animacy': 'Inan', 'Case'...","[6, 6, 2, 3, 4, 0, 8, 6, 11, 11, 8, 13, 11, 16...","[advmod, nsubj, nmod, nmod, flat:name, root, c...","[None, None, None, None, None, None, None, {'S..."
4,"[приемная, была, обставлена, просто, ,, но, по...","[приемная, быть, обставить, просто, ,, но, по-...","[NOUN, AUX, VERB, ADV, PUNCT, CCONJ, ADV, PUNCT]","[None, None, None, None, None, None, None, None]","[{'Animacy': 'Inan', 'Case': 'Nom', 'Gender': ...","[3, 3, 0, 3, 7, 7, 4, 3]","[nsubj:pass, aux:pass, root, advmod, punct, cc...","[None, None, None, {'SpaceAfter': 'No'}, None,..."
...,...,...,...,...,...,...,...,...
69626,"[благодаря, расширению, контактов, ,, возможно...","[благодаря, расширение, контакт, ,, возможност...","[ADP, NOUN, NOUN, PUNCT, NOUN, PUNCT, VERB, AD...","[None, None, None, None, None, None, None, Non...","[None, {'Animacy': 'Inan', 'Case': 'Dat', 'Gen...","[2, 32, 2, 5, 2, 7, 5, 10, 10, 7, 10, 13, 2, 1...","[case, obl, nmod, punct, conj, punct, acl, amo...","[None, None, {'SpaceAfter': 'No'}, None, {'Spa..."
69627,"[на, современном, этапе, итальянско, -, россий...","[на, современный, этап, итальянско, -, российс...","[ADP, ADJ, NOUN, ADJ, PUNCT, ADJ, NOUN, ADP, N...","[None, None, None, None, None, None, None, Non...","[None, {'Case': 'Loc', 'Degree': 'Pos', 'Gende...","[3, 3, 16, 6, 4, 7, 16, 9, 7, 12, 10, 13, 9, 1...","[case, amod, obl, compound, punct, amod, nsubj...","[None, None, None, {'SpaceAfter': 'No'}, {'Spa..."
69628,"[этому, способствуют, и, известная, отстраненн...","[это, способствовать, и, известный, отстраненн...","[PRON, VERB, PART, ADJ, NOUN, PUNCT, VERB, ADJ...","[None, None, None, None, None, None, None, Non...","[{'Animacy': 'Inan', 'Case': 'Dat', 'Gender': ...","[2, 0, 5, 5, 2, 7, 5, 9, 7, 13, 12, 13, 7, 13,...","[iobj, root, advmod, amod, nsubj, punct, acl, ...","[None, None, None, None, {'SpaceAfter': 'No'},..."
69629,"[в, xxi, столетии, итальянская, и, российская,...","[в, XXI, столетие, итальянский, и, российский,...","[ADP, ADJ, NOUN, ADJ, CCONJ, ADJ, NOUN, PUNCT,...","[None, None, None, None, None, None, None, Non...","[None, {'NumForm': 'Roman', 'NumType': 'Ord'},...","[3, 3, 11, 7, 6, 4, 11, 9, 11, 9, 0, 11, 12, 1...","[case, amod, obl, amod, cc, conj, nsubj:pass, ...","[None, None, None, None, None, None, {'SpaceAf..."


In [11]:
# One vocabulary for the input tokens and one per target column; the target
# vocabularies are created with add_bos_eos_tokens=False.
source_vocab = Vocabulary()
target_vocabs = {target_name: Vocabulary(add_bos_eos_tokens=False) for target_name in target_names}
# Feed every row of the training frame into the vocabularies.
for i in range(len(train_df)):
    source_vocab.add_tokens(train_df[source_name].iloc[i])
    for target_name in target_names:
        target_vocabs[target_name].add_tokens(train_df[target_name].iloc[i])

# Values derived from the vocabularies that are passed to the model/vectorizer below.
mask_index = source_vocab.mask_idx
source_vocab_len = len(source_vocab)
# Maps each target name to the size of its vocabulary.
cls_names_params = {key:len(target_vocabs[key]) for key in target_names}

In [12]:
# Print the size of the source vocabulary, then the size of each target vocabulary.
print(len(source_vocab))
for el in target_names:
    print(len(target_vocabs[el]))

121667
20


In [15]:
# Either restore a previous run (metric histories + model) or start from scratch.
if USE_PRETRAINED:
    with open("data/train_states.json", "r", encoding="utf-8") as file:
        train_states = json.load(file)

    with open("data/validation_states.json", "r", encoding="utf-8") as file:
        validation_states = json.load(file)
    
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load
    # checkpoint files that come from a trusted source.
    model = torch.load(MODEL_SAVE_FILEPATH, weights_only=False)
else:
    # Fresh run: empty metric histories and a newly constructed model.
    train_states = []
    validation_states = []
    model = MHAModel(MAX_SOURCE_LENGTH, source_vocab_len, EMBEDDING_DIM, ATTENTION_DIM, NUM_HEAD, NUM_ENCODER_LAYERS, CLASSIFIER_FC_HIDDEN_DIM, ENCODER_FC_HIDDEN_DIM,\
                     cls_names_params, DROPOUT, TEMPERATURE, BATCH_FIRST, BIAS, mask_index)

In [None]:
# Build the vectorizer and the dataset over the training frame.
vectorizer = Vectorizer(source_vocab, target_vocabs, MAX_SOURCE_LENGTH, mask_index)
dataset = CustomDataset(train_df, vectorizer, target_names)

# Move the model to the selected device and create an Adam optimizer over its parameters.
model = model.to(device=DEVICE)
optimizer = optim.Adam(model.parameters(), LEARNING_RATE)

In [None]:
# Training loop; a KeyboardInterrupt (manual stop) is caught so the cell exits cleanly.
try:
    for epoch in range(EPOCHS):
        dataset.set_dataframe_split('train')
        batch_generator = generate_batches(dataset, BATCH_SIZE, SHUFFLE, DROP_LAST, DEVICE)
        # Per-epoch accumulators; they are initialised here but not updated in this cell.
        epoch_sum_train_loss = 0.0
        epoch_running_train_loss = 0.0
        epoch_train_acc = 0.0
        model.train()
        for batch_idx, batch_dict in enumerate(batch_generator):
            optimizer.zero_grad()
            prediction = model(batch_dict['source_x'])
            # TODO: only the forward pass is performed — no loss is computed and
            # neither backward() nor optimizer.step() is called in this cell.



except KeyboardInterrupt:
    print('Принудительная остановка')

IndentationError: expected an indented block after 'for' statement on line 2 (2958534976.py, line 4)