In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from tqdm import tqdm

# Загрузка данных
train_path = "./data/train"
test_no_reference_path = "./data/test_no_reference"

# Чтение JSONL-файлов
train_data = pd.read_json(train_path, lines=True)
test_data = pd.read_json(test_no_reference_path, lines=True)

# Инициализация модели и токенайзера
model_name = "t5-small"  # Можно заменить на "t5-base" или другую модель
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained('./results/checkpoint-10000')





  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# Предобработка данных
def preprocess_data(data, tokenizer, max_input_length=128, max_target_length=128):
    inputs = ["translate: " + src for src in data["src"]]
    targets = data["dst"] if "dst" in data else [""] * len(data)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Преобразование данных для Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data)
train_dataset = train_dataset.map(
    lambda batch: preprocess_data(batch, tokenizer), batched=True, remove_columns=["src", "dst"]
)

Map: 100%|█████████████████████████████████████████████████████████████| 300000/300000 [00:34<00:00, 8753.57 examples/s]


In [3]:
# Настройка аргументов для обучения
training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=50,
    num_train_epochs=10,
    weight_decay=0.01,
    save_steps=5_000,
    save_total_limit=2,
    fp16=True,  # Использование FP16 для ускорения на GPU
    logging_dir="./logs",
)

# Создание объекта Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Обучение модели
trainer.train()




Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.3213
1000,0.3178
1500,0.3138
2000,0.3175
2500,0.3151
3000,0.3154
3500,0.317
4000,0.3182
4500,0.3174
5000,0.3175


TrainOutput(global_step=60000, training_loss=0.3091086146036784, metrics={'train_runtime': 9155.9098, 'train_samples_per_second': 327.657, 'train_steps_per_second': 6.553, 'total_flos': 1.01506351104e+17, 'train_loss': 0.3091086146036784, 'epoch': 10.0})

In [4]:
#tokenizer = T5Tokenizer.from_pretrained(model_name)
#model = T5ForConditionalGeneration.from_pretrained('./results/checkpoint-60000')

In [4]:
from tqdm import tqdm

# Генерация переводов для данных test_no_reference с tqdm
def generate_translations(data, tokenizer, model, max_input_length=128, max_target_length=128):
    translations = []
    
    # Iterate over data with tqdm
    for src in tqdm(data["src"], desc="Translating"):
        # Prepare input for the model
        input_text = "translate: " + src
        inputs_tokenized = tokenizer(
            input_text,
            max_length=max_input_length,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        inputs_tokenized = {key: val.to(model.device) for key, val in inputs_tokenized.items()}
        
        # Generate translation
        outputs = model.generate(**inputs_tokenized, max_length=max_target_length)
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translations.append(translation)
    
    return translations

# Генерация переводов
test_data["dst"] = generate_translations(test_data, tokenizer, model)

# Сохранение результатов в JSONL
output_path = "./data/answ5.jsonl"
test_data.to_json(output_path, orient="records", lines=True, force_ascii=False)
print(f"Результаты сохранены в {output_path}.")


Translating: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [18:10<00:00,  1.09s/it]

Результаты сохранены в ./data/answ5.jsonl.



