# Обучение модели

In [None]:
!pip install transformers torch pandas datasets

import pandas as pd
from transformers import MarianTokenizer, MarianMTModel
from datasets import Dataset
import torch

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Используемое устройство: {device}")

Используемое устройство: cpu


In [None]:
# Read the semicolon-separated parallel corpus and keep only the rows in
# which both the Russian and the Aleut cell are present.
df = pd.read_csv('russian_aleut_dataset.csv', sep=';', encoding='utf-8')
parallel_columns = df[['Russian', 'Aleut']]
translation_pairs = parallel_columns.dropna().to_dict('records')

print("Первые 3 записи в translation_pairs:", translation_pairs[:3])

# Re-key every pair to the generic source/target names that the
# preprocessing step reads, then wrap the records in a `datasets.Dataset`.
source_target_records = [
    {"source": record['Russian'], "target": record['Aleut']}
    for record in translation_pairs
]
dataset = Dataset.from_list(source_target_records)
print("Первые 3 записи в dataset:", dataset[:3])


Первые 3 записи в translation_pairs: [{'Russian': 'Бабушка', 'Aleut': 'ana-ẍ ana-ẍ'}, {'Russian': 'Белая птица сидит на дереве', 'Aleut': 'sisu-ẍ chngii-ẍ qaada-ẍ ku-ga-n ungut-iku-ẍ'}, {'Russian': 'Белые хлопья покрывают гору', 'Aleut': 'sisu-ẍ usa-ku-ẍ qugana-ẍ ku-ga-n'}]
Первые 3 записи в dataset: {'source': ['Бабушка', 'Белая птица сидит на дереве', 'Белые хлопья покрывают гору'], 'target': ['ana-ẍ ana-ẍ', 'sisu-ẍ chngii-ẍ qaada-ẍ ku-ga-n ungut-iku-ẍ', 'sisu-ẍ usa-ku-ẍ qugana-ẍ ku-ga-n']}


In [None]:
# Start from the pretrained Helsinki-NLP Marian checkpoint and place the
# model on the selected device.
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)

# Register "ẍ" as an extra token and resize the embedding matrix so the
# model has a row for the new vocabulary entry.
tokenizer.add_tokens(["ẍ"])
model.resize_token_embeddings(len(tokenizer))

# Round-trip sanity check: encode a sample sentence and decode it back.
sample_text = "qana-ẍ angali-ẍ ula-ẍ a-ku-ẍ"
encoded = tokenizer(sample_text, return_tensors="pt")
first_sequence_ids = encoded["input_ids"][0]
decoded = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)
print("Токенизировано и декодировано:", decoded)

In [None]:
def preprocess_function(examples, max_length=128, tok=None):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with a 'source' list (input texts) and a
            'target' list (reference translations), as passed by
            `Dataset.map(..., batched=True)`.
        max_length: Length every input and label sequence is truncated and
            padded to.
        tok: Tokenizer to use. Defaults to the notebook-level `tokenizer`.

    Returns:
        The tokenizer output for the sources, with a "labels" entry holding
        the target token ids in which every padding id is replaced by -100.
    """
    tok = tokenizer if tok is None else tok

    # `text_target=` tokenizes the references in target mode and stores their
    # ids under "labels"; it replaces the deprecated `as_target_tokenizer()`.
    model_inputs = tok(
        examples['source'],
        text_target=examples['target'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded to a fixed length, so most positions are padding.
    # -100 is the index the loss ignores; without this masking the model is
    # trained (and evaluated) mainly on predicting pad tokens.
    pad_id = tok.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize the whole corpus in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 10% of the pairs for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so validation losses stay comparable
# between runs.
train_test = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test['train']
eval_dataset = train_test['test']

from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate and checkpoint once per epoch and
# restore the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./mbart_aleut",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    # 2 samples per step x 2 accumulation steps = 4 samples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    # Must match `evaluation_strategy` for `load_best_model_at_end` to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Mixed-precision training needs a CUDA device; requesting it on a
    # CPU-only runtime makes TrainingArguments raise, so enable it only
    # when a GPU is actually available.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

Map:   0%|          | 0/1142 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning with the Trainer configured earlier in the notebook; the
# returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0726,0.061021
2,0.0416,0.030191
3,0.0183,0.018155
4,0.0101,0.015203
5,0.0053,0.014458


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=1285, training_loss=0.05965882285211801, metrics={'train_runtime': 9672.4047, 'train_samples_per_second': 0.531, 'train_steps_per_second': 0.133, 'total_flos': 174068103905280.0, 'train_loss': 0.05965882285211801, 'epoch': 5.0})

In [None]:
import shutil

# Persist the fine-tuned weights and the extended tokenizer side by side so
# the directory can be reloaded with `from_pretrained`.
model.save_pretrained("Marian_aleut_model")
tokenizer.save_pretrained("Marian_aleut_model")

# Build Marian_aleut_model.zip with the standard library rather than the
# external `zip` binary, so the cell also works where that tool is missing.
# The archive contains the directory itself, like `zip -r` did.
shutil.make_archive("Marian_aleut_model", "zip", root_dir=".", base_dir="Marian_aleut_model")

# The browser download helper exists only on Google Colab; elsewhere the
# archive simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    print("Архив сохранён локально: Marian_aleut_model.zip")
else:
    files.download("Marian_aleut_model.zip")

# Тест модели

In [None]:
# Reload the exported artifacts from disk so the checks below exercise the
# saved model rather than the in-memory training objects.
tokenizer = MarianTokenizer.from_pretrained("Marian_aleut_model")
model = MarianMTModel.from_pretrained("Marian_aleut_model")
model = model.to(device)

In [None]:
def translate(text, max_length=128, num_beams=5):
    """Translate a text with the notebook-level `model` and `tokenizer`.

    Args:
        text: Source text to translate. Only the first generated sequence is
            decoded, so a single string is the intended input.
        max_length: Maximum length passed to `model.generate`.
        num_beams: Beam width passed to `model.generate`.

    Returns:
        The decoded translation with special tokens removed and any space
        directly before "ẍ" dropped.
    """
    # truncation=True caps over-long inputs at the tokenizer's configured
    # maximum instead of feeding the model more positions than it supports.
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    translated_tokens = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    result = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    # Re-attach "ẍ" to the preceding text; presumably decoding emits the added
    # token as a separate, space-delimited piece — TODO confirm.
    return result.replace(" ẍ", "ẍ")

# Smoke-test the model on two Russian questions.
for phrase in ("Где большой дом?", "Кто видит реку?"):
    print(translate(phrase))

qana-ẍ angali-ẍ ula-ẍ a-ku-ẍ
kiin chigana-ẍ tugu-ku-ẍ
