In [None]:
!pip install transformers




In [None]:
import time
from transformers import MarianMTModel, MarianTokenizer
import torch
import datetime
import os

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained MarianMT checkpoint; the tokenizer and the model are loaded from
# the same checkpoint name, and the model is then moved onto `device`.
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)

def translate_batch(texts, indices):
    """Translate a batch of texts and tag each translation with its index.

    Args:
        texts: List of strings to translate in a single model call.
        indices: Identifiers paired positionally with ``texts``; each one is
            appended to its translation as `` [[idx]]``.

    Returns:
        A list of strings of the form ``"<translation> [[<idx>]]"``, one per
        (text, index) pair. An empty list is returned for an empty batch.
    """
    # An empty batch has nothing to tokenize or generate; return early
    # instead of handing the tokenizer and the model an empty list.
    if not texts:
        return []

    # Pad to the longest text in the batch and truncate over-long inputs,
    # then move the encoded tensors to the same device as the model.
    tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
    # num_beams=1 requests a single beam (no beam search).
    translated_tokens = model.generate(**tokens, num_beams=1)
    translated_texts = [
        f"{tokenizer.decode(t, skip_special_tokens=True)} [[{idx}]]"
        for t, idx in zip(translated_tokens, indices)
    ]
    return translated_texts

def get_last_translated_line(output_file_path):
    """Return how many lines the output file already contains.

    When the file does not exist yet, a message is printed, an empty file is
    created at ``output_file_path`` and 0 is returned.
    """
    if os.path.exists(output_file_path):
        # Count the lines that are already present in the file.
        with open(output_file_path, 'r', encoding='utf-8') as existing:
            return sum(1 for _ in existing)

    print(f"El archivo {output_file_path} no existe. Creando un nuevo archivo...")
    # Opening in write mode and closing immediately leaves an empty file behind.
    open(output_file_path, 'w', encoding='utf-8').close()
    return 0

# Source text (one sentence per line) and the file that receives the translations.
input_file_path = '/content/english.txt'
output_file_path = '/content/spanish.txt'

# Resume support: the 1-based number of the first input line to translate is
# one past the number of lines already present in the output file.
start_line = get_last_translated_line(output_file_path) + 1

# Keep only the input lines from the resume point onwards.
with open(input_file_path, 'r', encoding='utf-8') as input_file:
    lines = input_file.readlines()[start_line - 1:]

total_lines = len(lines)
batch_size = 100
num_batches = -(-total_lines // batch_size)  # ceiling division

with open(output_file_path, 'a', encoding='utf-8') as output_file:
    start_time = time.time()

    for batch_number, batch_start in enumerate(range(0, total_lines, batch_size), start=1):
        batch_end = min(batch_start + batch_size, total_lines)
        batch_lines = [raw.strip() for raw in lines[batch_start:batch_end]]
        # 1-based input line numbers for the lines in this batch.
        indices = [start_line + offset for offset in range(batch_start, batch_end)]

        translated_lines = translate_batch(batch_lines, indices)
        output_file.write('\n'.join(translated_lines) + '\n')
        # Flush after every batch so finished work is on disk before the next one starts.
        output_file.flush()

        # Progress report: ETA is the mean time per finished batch times the batches left.
        elapsed_time = time.time() - start_time
        eta = (elapsed_time / batch_number) * (num_batches - batch_number)
        elapsed_display = datetime.timedelta(seconds=int(elapsed_time))
        eta_display = datetime.timedelta(seconds=int(eta))
        print(f"Lote {batch_number}/{num_batches} completado. Tiempo transcurrido: {elapsed_display}. ETA: {eta_display}")

    print("¡Traducción completada con éxito!")




Lote 1/3000 completado. Tiempo transcurrido: 0:00:06. ETA: 5:39:21
Lote 2/3000 completado. Tiempo transcurrido: 0:00:09. ETA: 4:02:00
Lote 3/3000 completado. Tiempo transcurrido: 0:00:11. ETA: 3:07:42
Lote 4/3000 completado. Tiempo transcurrido: 0:00:13. ETA: 2:42:29
Lote 5/3000 completado. Tiempo transcurrido: 0:00:13. ETA: 2:15:28
Lote 6/3000 completado. Tiempo transcurrido: 0:00:14. ETA: 1:58:03
Lote 7/3000 completado. Tiempo transcurrido: 0:00:15. ETA: 1:47:16
Lote 8/3000 completado. Tiempo transcurrido: 0:00:16. ETA: 1:42:38
Lote 9/3000 completado. Tiempo transcurrido: 0:00:17. ETA: 1:34:19
Lote 10/3000 completado. Tiempo transcurrido: 0:00:17. ETA: 1:27:07
Lote 11/3000 completado. Tiempo transcurrido: 0:00:18. ETA: 1:23:52
Lote 12/3000 completado. Tiempo transcurrido: 0:00:19. ETA: 1:19:57
Lote 13/3000 completado. Tiempo transcurrido: 0:00:21. ETA: 1:20:34
Lote 14/3000 completado. Tiempo transcurrido: 0:00:22. ETA: 1:18:17
Lote 15/3000 completado. Tiempo transcurrido: 0:00:22. ET