  Celda 1: Importaciones y Configuraci√≥n Inicial

In [1]:

  # ============================================================================
  # SISTEMA DE TRADUCCI√ìN NEURONAL H√çBRIDO MEJORADO - NLLB200 + ByT5
  # Versi√≥n Optimizada y Corregida
  # ============================================================================

  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from torch.utils.data import Dataset, DataLoader
  from transformers import (
      AutoTokenizer, AutoModelForSeq2SeqLM,
      T5ForConditionalGeneration, ByT5Tokenizer,
      get_linear_schedule_with_warmup
  )
  import numpy as np
  import pandas as pd
  from typing import List, Dict, Tuple, Optional, Union, Any
  import json
  import os
  from tqdm.auto import tqdm
  import gc
  import warnings
  import time
  from dataclasses import dataclass
  from datetime import datetime
  import matplotlib.pyplot as plt
  import seaborn as sns
  from pathlib import Path

  # Suprimir warnings innecesarios
  warnings.filterwarnings("ignore", category=UserWarning)
  warnings.filterwarnings("ignore", category=FutureWarning)

  print("‚úÖ Importaciones completadas")
  print(f"üî• PyTorch: {torch.__version__}")
  print(f"üíæ CUDA disponible: {torch.cuda.is_available()}")
  if torch.cuda.is_available():
      print(f"üöÄ GPU: {torch.cuda.get_device_name()}")
      print(f"üíΩ Memoria GPU: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


‚úÖ Importaciones completadas
üî• PyTorch: 2.8.0+cu126
üíæ CUDA disponible: True
üöÄ GPU: Tesla T4
üíΩ Memoria GPU: 14.7 GB


  Celda 2: Configuraci√≥n Optimizada del Sistema

In [2]:
  # ============================================================================
  # CONFIGURACI√ìN OPTIMIZADA - SIN RALENTIZADORES
  # ============================================================================

  print("‚öôÔ∏è Configurando entorno optimizado...")

  # ‚ùå ELIMINADO: Configuraciones que ralentizan
  # os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # ‚ùå RALENTIZA MUCH√çSIMO
  # os.environ["TORCH_USE_CUDA_DSA"] = "1"    # ‚ùå SOLO PARA DEBUG
  # torch.cuda.set_per_process_memory_fraction(0.8)  # ‚ùå FRAGMENTA MEMORIA

  # ‚úÖ Configuraci√≥n optimizada para velocidad
  if torch.cuda.is_available():
      device = torch.device("cuda")

      # Optimizaciones que S√ç mejoran rendimiento
      torch.backends.cudnn.benchmark = True  # ‚úÖ Optimiza para tama√±os fijos
      torch.backends.cuda.matmul.allow_tf32 = True  # ‚úÖ M√°s r√°pido en GPUs modernas
      torch.backends.cudnn.allow_tf32 = True

      print(f"‚úÖ GPU optimizada: {torch.cuda.get_device_name()}")
      print(f"üíæ Memoria GPU total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

      # Solo limpiar memoria al inicio, NO en loops
      torch.cuda.empty_cache()

  else:
      device = torch.device("cpu")
      print("‚ö†Ô∏è Usando CPU - entrenamiento ser√° lento")

  # Configuraci√≥n de precisi√≥n mixta optimizada seg√∫n GPU
  def setup_mixed_precision():
      """Configurar precisi√≥n mixta seg√∫n la GPU disponible"""
      if torch.cuda.is_available():
          gpu_name = torch.cuda.get_device_name().lower()

          if 't4' in gpu_name:
              print("üéØ GPU T4 detectada: usando FP16")
              return True, torch.float16
          elif 'a100' in gpu_name or 'v100' in gpu_name:
              print("üöÄ GPU moderna detectada: usando BF16")
              return True, torch.bfloat16
          else:
              print("üîß GPU gen√©rica: usando FP16")
              return True, torch.float16

      return False, torch.float32

  USE_AMP, AMP_DTYPE = setup_mixed_precision()

  print(f"‚úÖ Configuraci√≥n completada:")
  print(f"  üî• Device: {device}")
  print(f"  ‚ö° Mixed precision: {USE_AMP}")
  print(f"  üìä AMP dtype: {AMP_DTYPE}")

  # Detectar entorno
  try:
      import google.colab
      IN_COLAB = True
      print("üì± Entorno: Google Colab")
  except ImportError:
      IN_COLAB = False
      print("üíª Entorno: Local/Servidor")


‚öôÔ∏è Configurando entorno optimizado...
‚úÖ GPU optimizada: Tesla T4
üíæ Memoria GPU total: 14.7 GB
üéØ GPU T4 detectada: usando FP16
‚úÖ Configuraci√≥n completada:
  üî• Device: cuda
  ‚ö° Mixed precision: True
  üìä AMP dtype: torch.float16
üì± Entorno: Google Colab


Celda 2.5 Conexion con Google Drive:

In [3]:

  # ============================================================================
  # MONTAJE DE GOOGLE DRIVE - CELDA QUE FALTABA
  # ============================================================================

  print("üìÅ Configurando acceso a Google Drive...")

  try:
      from google.colab import drive

      # Montar Google Drive
      print("üîó Montando Google Drive...")
      drive.mount('/content/drive')

      # Verificar acceso
      drive_path = "/content/drive/MyDrive"
      if os.path.exists(drive_path):
          print("‚úÖ Google Drive montado exitosamente")
          print(f"üìÅ Ruta base: {drive_path}")

          # Crear directorios necesarios
          directories_to_create = [
              "/content/drive/MyDrive/model_checkpoints",
              "/content/drive/MyDrive/translation_data",
              "/content/drive/MyDrive/final_models",
              "/content/drive/MyDrive/training_logs"
          ]

          for directory in directories_to_create:
              os.makedirs(directory, exist_ok=True)
              print(f"üìÇ Directorio asegurado: {directory}")

          DRIVE_MOUNTED = True
          DRIVE_BASE_PATH = drive_path

      else:
          print("‚ùå Error: No se puede acceder a Google Drive")
          DRIVE_MOUNTED = False
          DRIVE_BASE_PATH = None

  except ImportError:
      print("‚ö†Ô∏è No est√° en Google Colab")
      DRIVE_MOUNTED = False
      DRIVE_BASE_PATH = None
  except Exception as e:
      print(f"‚ùå Error montando Google Drive: {e}")
      DRIVE_MOUNTED = False
      DRIVE_BASE_PATH = None

  print(f"‚úÖ Configuraci√≥n Drive completada:")
  print(f"  DRIVE_MOUNTED = {DRIVE_MOUNTED}")
  print(f"  DRIVE_BASE_PATH = {DRIVE_BASE_PATH}")


üìÅ Configurando acceso a Google Drive...
üîó Montando Google Drive...
Mounted at /content/drive
‚úÖ Google Drive montado exitosamente
üìÅ Ruta base: /content/drive/MyDrive
üìÇ Directorio asegurado: /content/drive/MyDrive/model_checkpoints
üìÇ Directorio asegurado: /content/drive/MyDrive/translation_data
üìÇ Directorio asegurado: /content/drive/MyDrive/final_models
üìÇ Directorio asegurado: /content/drive/MyDrive/training_logs
‚úÖ Configuraci√≥n Drive completada:
  DRIVE_MOUNTED = True
  DRIVE_BASE_PATH = /content/drive/MyDrive


  Celda 3: Configuraci√≥n del Modelo H√≠brido Corregida

In [4]:

  # ============================================================================
  # CONFIGURACI√ìN H√çBRIDA REALISTA Y OPTIMIZADA
  # ============================================================================

  class HybridTranslationConfig:
      def __init__(self):
          # Modelos base
          self.nllb_model_name = "facebook/nllb-200-distilled-600M"
          self.byt5_model_name = "google/byt5-small"

          # üîß CORRECCI√ìN CR√çTICA: Uso realista de modelos
          self.use_nllb_in_training = False  # ‚ùå DESACTIVADO para entrenamiento (muy costoso)
          self.use_nllb_for_inference = True  # ‚úÖ Solo para inferencia/fallback
          self.hybrid_mode = "byt5_primary"  # Modo: ByT5 principal, NLLB auxiliar

          # Configuraci√≥n de entrenamiento
          self.num_epochs = 10
          self.batch_size = 8 if not IN_COLAB else 4  # Menor batch en Colab
          self.learning_rate = 1e-5
          self.weight_decay = 0.01
          self.max_samples_per_epoch = 50000
          self.max_length = 128

          # Optimizaciones cr√≠ticas
          self.gradient_accumulation_steps = 4
          self.use_mixed_precision = USE_AMP
          self.max_grad_norm = 1.0

          # Early stopping
          self.early_stopping_patience = 3
          self.early_stopping_min_delta = 0.001

          # Warmup y scheduler
          self.warmup_steps = 1000
          self.scheduler_type = "linear"

          print("‚ö†Ô∏è CONFIGURACI√ìN H√çBRIDA:")
          print(f"  üéØ NLLB en entrenamiento: {'‚ùå NO' if not self.use_nllb_in_training else '‚úÖ S√ç'}")
          print(f"  üåç NLLB para inferencia: {'‚úÖ S√ç' if self.use_nllb_for_inference else '‚ùå NO'}")
          print(f"  üî§ Modelo principal: ByT5 (optimizado)")
          print(f"  üìä Batch size: {self.batch_size}")
          print(f"  ‚ö° Mixed precision: {'‚úÖ S√ç' if self.use_mixed_precision else '‚ùå NO'}")

  # Crear configuraci√≥n
  config = HybridTranslationConfig()

  # Idiomas soportados (mantener tu lista original)

  SUPPORTED_LANGUAGES = {
      # ===== IDIOMAS PRINCIPALES =====
      'es': 'spa_Latn',    # Espa√±ol (500M hablantes)
      'en': 'eng_Latn',    # Ingl√©s (1.5B hablantes)
      'fr': 'fra_Latn',    # Franc√©s (280M hablantes)
      'pt': 'por_Latn',    # Portugu√©s (260M hablantes)
      'ar': 'ara_Arab',    # √Årabe (422M hablantes)
      'ru': 'rus_Cyrl',    # Ruso (258M hablantes)
      'zh': 'zho_Hans',    # Chino simplificado (918M)
      'hi': 'hin_Deva',    # Hindi (602M hablantes)

      # ===== EUROPA (pa√≠ses en desarrollo del este) =====
      'uk': 'ukr_Cyrl',    # Ucraniano (37M) - pa√≠s en reconstrucci√≥n
      'bg': 'bul_Cyrl',    # B√∫lgaro (7M)
      'hr': 'hrv_Latn',    # Croata (5M)
      'sr': 'srp_Cyrl',    # Serbio (12M)
      'mk': 'mkd_Cyrl',    # Macedonio (2M)
      'sq': 'sqi_Latn',    # Alban√©s (6M)
      'ro': 'ron_Latn',    # Rumano (22M)
      'hu': 'hun_Latn',    # H√∫ngaro (13M)
      'pl': 'pol_Latn',    # Polaco (45M)
      'cs': 'ces_Latn',    # Checo (10M)
      'sk': 'slk_Latn',    # Eslovaco (5M)
      'et': 'est_Latn',    # Estonio (1M)
      'lv': 'lav_Latn',    # Let√≥n (2M)
      'lt': 'lit_Latn',    # Lituano (3M)

      # ===== √ÅFRICA (AMPLIADO) =====
      # √Åfrica Occidental
      'sw': 'swa_Latn',    # Swahili (200M) - lingua franca √Åfrica Oriental
      'ha': 'hau_Latn',    # Hausa (80M) - Nigeria, N√≠ger, Ghana
      'yo': 'yor_Latn',    # Yoruba (45M) - Nigeria, Ben√≠n
      'ig': 'ibo_Latn',    # Igbo (45M) - Nigeria
      'wo': 'wol_Latn',    # Wolof (12M) - Senegal, Gambia
      'ff': 'fuv_Latn',    # Fulah (65M) - Sahel africano
      'bm': 'bam_Latn',    # Bambara (15M) - Mali
      'tw': 'twi_Latn',    # Twi (17M) - Ghana
      'ak': 'aka_Latn',    # Akan (11M) - Ghana, Costa de Marfil
      'ee': 'ewe_Latn',    # Ewe (6M) - Ghana, Togo
      'gaa': 'gaa_Latn',   # Ga (3M) - Ghana
      'kr': 'kau_Latn',    # Kanuri (10M) - Nigeria, Chad

      # √Åfrica Oriental y Meridional
      'am': 'amh_Ethi',    # Amh√°rico (57M) - Etiop√≠a
      'om': 'orm_Latn',    # Oromo (37M) - Etiop√≠a (m√°s hablado)
      'ti': 'tir_Ethi',    # Tigri√±a (9M) - Etiop√≠a, Eritrea
      'so': 'som_Latn',    # Somal√≠ (21M) - Somalia, Etiop√≠a, Kenia
      'zu': 'zul_Latn',    # Zulu (27M) - Sud√°frica
      'xh': 'xho_Latn',    # Xhosa (19M) - Sud√°frica
      'af': 'afr_Latn',    # Afrik√°ans (16M) - Sud√°frica
      'st': 'sot_Latn',    # Sesotho (7M) - Lesoto, Sud√°frica
      'tn': 'tsn_Latn',    # Tswana (8M) - Botswana, Sud√°frica
      'ss': 'ssw_Latn',    # Siswati (2M) - Esuatini, Sud√°frica
      've': 'ven_Latn',    # Venda (1M) - Sud√°frica
      'ts': 'tso_Latn',    # Tsonga (7M) - Sud√°frica, Mozambique
      'nr': 'nbl_Latn',    # Ndebele (2M) - Sud√°frica
      'ny': 'nya_Latn',    # Chichewa (17M) - Malawi, Zambia
      'sn': 'sna_Latn',    # Shona (15M) - Zimbabue
      'rw': 'kin_Latn',    # Kinyarwanda (25M) - Ruanda
      'rn': 'run_Latn',    # Kirundi (13M) - Burundi
      'kg': 'kon_Latn',    # Kikongo (10M) - RD Congo, Angola
      'ln': 'lin_Latn',    # Lingala (15M) - RD Congo
      'lua': 'luo_Latn',   # Luo (4M) - Kenia, Tanzania
      'mg': 'mlg_Latn',    # Malgache (25M) - Madagascar

      # √Åfrica del Norte (Bereber)
      'ber': 'ber_Tfng',   # Bereber/Amazigh (30M) - Marruecos, Argelia
      'kab': 'kab_Latn',   # Cabila (7M) - Argelia

      # ===== ASIA (PA√çSES EN DESARROLLO) =====
      # Asia Meridional
      'bn': 'ben_Beng',    # Bengal√≠ (300M) - Bangladesh, India
      'ur': 'urd_Arab',    # Urdu (230M) - Pakist√°n, India
      'pa': 'pan_Guru',    # Panyab√≠ (130M) - India, Pakist√°n
      'gu': 'guj_Gujr',    # Gujarati (60M) - India
      'or': 'ory_Orya',    # Oriya (45M) - India
      'as': 'asm_Beng',    # Asam√©s (15M) - India
      'ml': 'mal_Mlym',    # Malayalam (38M) - India
      'kn': 'kan_Knda',    # Canar√©s (44M) - India
      'te': 'tel_Telu',    # Telugu (95M) - India
      'ta': 'tam_Taml',    # Tamil (78M) - India, Sri Lanka
      'si': 'sin_Sinh',    # Cingal√©s (16M) - Sri Lanka
      'ne': 'nep_Deva',    # Nepal√≠ (16M) - Nepal
      'my': 'mya_Mymr',    # Birmano (33M) - Myanmar
      'km': 'khm_Khmr',    # Jemer (16M) - Camboya
      'lo': 'lao_Laoo',    # Lao (30M) - Laos

      # Asia Central (A√ëADIDO Uzbeko y otros)
      'uz': 'uzn_Latn',    # Uzbeko (34M) - Uzbekist√°n ‚úÖ TU SOLICITADO
      'kk': 'kaz_Cyrl',    # Kazajo (15M) - Kazajst√°n
      'ky': 'kir_Cyrl',    # Kirgu√≠s (5M) - Kirguist√°n
      'tg': 'tgk_Cyrl',    # Tayiko (9M) - Tayikist√°n
      'tk': 'tuk_Latn',    # Turkmeno (6M) - Turkmenist√°n
      'az': 'aze_Latn',    # Azerbaiyano (23M) - Azerbaiy√°n
      'hy': 'hye_Armn',    # Armenio (7M) - Armenia
      'ka': 'kat_Geor',    # Georgiano (4M) - Georgia

      # Asia Oriental ‚úÖ YA TIENES JAPON√âS
      'ja': 'jpn_Jpan',    # Japon√©s (125M)
      'ko': 'kor_Hang',    # Coreano (77M)
      'mn': 'khk_Cyrl',    # Mongol (6M) - Mongolia

      # Asia Sudoriental
      'th': 'tha_Thai',    # Tailand√©s (60M) - Tailandia
      'vi': 'vie_Latn',    # Vietnamita (95M) - Vietnam
      'id': 'ind_Latn',    # Indonesio (280M) - Indonesia
      'ms': 'msa_Latn',    # Malayo (60M) - Malasia, Brun√©i
      'tl': 'tgl_Latn',    # Filipino/Tagalo (45M) - Filipinas
      'ceb': 'ceb_Latn',   # Cebuano (25M) - Filipinas
      'war': 'war_Latn',   # Waray (3M) - Filipinas
      'ilo': 'ilo_Latn',   # Ilocano (10M) - Filipinas

      # ===== AM√âRICA (PA√çSES EN DESARROLLO) =====
      # Am√©rica Central y Caribe
      'ht': 'hat_Latn',    # Criollo haitiano (12M) - Hait√≠
      'gn': 'grn_Latn',    # Guaran√≠ (12M) - Paraguay
      'qu': 'quy_Latn',    # Quechua (10M) - Per√∫, Bolivia, Ecuador
      'ay': 'aym_Latn',    # Aimara (3M) - Bolivia, Per√∫

      # Idiomas ind√≠genas importantes
      'nah': 'nah_Latn',   # N√°huatl (1.5M) - M√©xico
      'yua': 'yua_Latn',   # Maya yucateco (800K) - M√©xico
      'bzd': 'bzd_Latn',   # Belize Kriol (200K) - Belice

      # ===== OCEAN√çA =====
      'fj': 'fij_Latn',    # Fiyiano (350K) - Fiyi
      'sm': 'smo_Latn',    # Samoano (510K) - Samoa
      'to': 'ton_Latn',    # Tongano (200K) - Tonga
      'bi': 'bis_Latn',    # Bislama (200K) - Vanuatu
      'ty': 'tah_Latn',    # Tahitiano (280K) - Polinesia Francesa

      # ===== OTROS IDIOMAS RELEVANTES =====
      'mt': 'mlt_Latn',    # Malt√©s (520K) - Malta
      'is': 'isl_Latn',    # Island√©s (350K) - Islandia
      'ga': 'gle_Latn',    # Irland√©s (170K) - Irlanda
      'cy': 'cym_Latn',    # Gal√©s (880K) - Gales
      'eu': 'eus_Latn',    # Euskera (750K) - Espa√±a, Francia
      'ca': 'cat_Latn',    # Catal√°n (10M) - Espa√±a
      'gl': 'glg_Latn',    # Gallego (2.4M) - Espa√±a

      # Europa menos desarrollada
      'be': 'bel_Cyrl',    # Bielorruso (5M) - Bielorrusia
      'lv': 'lav_Latn',    # Let√≥n (1.9M) - Letonia
      'sl': 'slv_Latn',    # Esloveno (2.1M) - Eslovenia
      'fi': 'fin_Latn',    # Fin√©s (5.5M) - Finlandia
      'da': 'dan_Latn',    # Dan√©s (6M) - Dinamarca
      'sv': 'swe_Latn',    # Sueco (10M) - Suecia
      'no': 'nor_Latn',    # Noruego (5M) - Noruega
      'nl': 'nld_Latn',    # Neerland√©s (24M) - Pa√≠ses Bajos
      'de': 'deu_Latn',    # Alem√°n (95M) - Alemania
      'it': 'ita_Latn',    # Italiano (65M) - Italia
      'tr': 'tur_Latn',    # Turco (88M) - Turqu√≠a
      'he': 'heb_Hebr',    # Hebreo (9M) - Israel
      'fa': 'pes_Arab',    # Persa/Farsi (70M) - Ir√°n
      'ps': 'pbt_Arab',    # Past√∫n (60M) - Afganist√°n, Pakist√°n
  }

  print(f"üåç TOTAL IDIOMAS SOPORTADOS: {len(SUPPORTED_LANGUAGES)}")
  print(f"üìä Cobertura estimada: ~4.5 mil millones de hablantes")
  print(f"üéØ Enfoque: Pa√≠ses en desarrollo y idiomas con pocos recursos")
  print(f"üåç Idiomas soportados: {len(SUPPORTED_LANGUAGES)}")


‚ö†Ô∏è CONFIGURACI√ìN H√çBRIDA:
  üéØ NLLB en entrenamiento: ‚ùå NO
  üåç NLLB para inferencia: ‚úÖ S√ç
  üî§ Modelo principal: ByT5 (optimizado)
  üìä Batch size: 4
  ‚ö° Mixed precision: ‚úÖ S√ç
üåç TOTAL IDIOMAS SOPORTADOS: 123
üìä Cobertura estimada: ~4.5 mil millones de hablantes
üéØ Enfoque: Pa√≠ses en desarrollo y idiomas con pocos recursos
üåç Idiomas soportados: 123


  Celda 4: Modelo H√≠brido Corregido NLLB + ByT5

In [5]:

  # ============================================================================
  # MODELO H√çBRIDO CORREGIDO - SIN ERRORES DE TOKENIZACI√ìN
  # ============================================================================

  class HybridNLLBByT5Model(nn.Module):
      def __init__(self, config):
          super().__init__()
          self.config = config

          # üîß CORRECCI√ìN CR√çTICA: Solo cargar NLLB si realmente se va a usar
          if config.use_nllb_for_inference:
              print("üåç Cargando NLLB para inferencia...")
              try:
                  self.nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
                      config.nllb_model_name,
                      torch_dtype=torch.float32,
                      low_cpu_mem_usage=True
                  )
                  self.nllb_tokenizer = AutoTokenizer.from_pretrained(config.nllb_model_name)
                  print(f"‚úÖ NLLB cargado para inferencia")
              except Exception as e:
                  print(f"‚ö†Ô∏è Error cargando NLLB: {e}")
                  self.nllb_model = None
                  self.nllb_tokenizer = None
          else:
              print("üö´ NLLB completamente desactivado")
              self.nllb_model = None
              self.nllb_tokenizer = None

          # ByT5: modelo principal y √öNICO para entrenamiento
          print("üî§ Cargando ByT5 (modelo principal)...")
          self.byt5_model = T5ForConditionalGeneration.from_pretrained(
              config.byt5_model_name,
              low_cpu_mem_usage=True
          )
          self.byt5_tokenizer = ByT5Tokenizer.from_pretrained(config.byt5_model_name)

          # Verificar vocabulario
          print(f"  üìä ByT5 vocab size: {self.byt5_tokenizer.vocab_size}")
          print(f"  üî§ ByT5 pad token: {self.byt5_tokenizer.pad_token_id}")
          print(f"  üî§ ByT5 eos token: {self.byt5_tokenizer.eos_token_id}")

          # üîß CORRECCI√ìN: Capas de mejora simplificadas
          byt5_hidden_size = self.byt5_model.config.d_model

          self.enhancement_layer = nn.Sequential(
              nn.Linear(byt5_hidden_size, byt5_hidden_size),
              nn.LayerNorm(byt5_hidden_size),
              nn.Dropout(0.1)
          )

          print(f"‚úÖ Modelo h√≠brido inicializado")
          print(f"  üî§ ByT5 hidden size: {byt5_hidden_size}")
          print(f"  ‚öôÔ∏è Solo ByT5 ser√° entrenado")

      def forward(self, input_ids, attention_mask, labels=None, use_nllb=False, **kwargs):
          """Forward pass CORREGIDO - solo ByT5 durante entrenamiento"""

          # üîß FORZAR ByT5 durante entrenamiento
          if self.training:
              use_nllb = False

          # üîß CORRECCI√ìN CR√çTICA: Solo usar ByT5 (vocabulario consistente)
          try:
              # Validaci√≥n de entrada
              batch_size, seq_len = input_ids.shape
              device = input_ids.device

              # üîß CR√çTICO: Asegurar que input_ids est√° en rango v√°lido para ByT5
              vocab_size = self.byt5_model.config.vocab_size
              input_ids = torch.clamp(input_ids, 0, vocab_size - 1)

              if labels is not None:
                  # üîß CR√çTICO: Manejar labels fuera de rango
                  labels = torch.clamp(labels, -100, vocab_size - 1)
                  # Reemplazar tokens fuera de rango con -100 (ignorar en loss)
                  labels = torch.where(
                      (labels >= vocab_size) | (labels < 0),
                      torch.tensor(-100, device=device, dtype=labels.dtype),
                      labels
                  )

              # Forward pass SOLO con ByT5
              outputs = self.byt5_model(
                  input_ids=input_ids,
                  attention_mask=attention_mask,
                  labels=labels,
                  **kwargs
              )

              return outputs

          except Exception as e:
              print(f"‚ùå Error en forward: {e}")

              # üîß FALLBACK seguro
              batch_size, seq_len = input_ids.shape
              device = input_ids.device
              vocab_size = self.byt5_model.config.vocab_size

              from transformers.modeling_outputs import Seq2SeqLMOutput
              return Seq2SeqLMOutput(
                  loss=torch.tensor(1.0, device=device, requires_grad=True) if labels is not None else None,
                  logits=torch.randn((batch_size, seq_len, vocab_size), device=device, requires_grad=True)
              )

      def generate(self, input_ids, attention_mask=None, **kwargs):
          """Generaci√≥n simplificada - solo ByT5"""
          try:
              return self.byt5_model.generate(
                  input_ids=input_ids,
                  attention_mask=attention_mask,
                  pad_token_id=self.byt5_tokenizer.pad_token_id,
                  eos_token_id=self.byt5_tokenizer.eos_token_id,
                  **kwargs
              )
          except Exception as e:
              print(f"‚ö†Ô∏è Error en generaci√≥n: {e}")
              # Fallback: devolver input_ids
              return input_ids

  print("‚úÖ Clase HybridNLLBByT5Model CORREGIDA definida")


‚úÖ Clase HybridNLLBByT5Model CORREGIDA definida


  Celda 5: Dataset y DataLoader Optimizados

In [6]:

  # ============================================================================
  # DATASET CORREGIDO - SIN ERRORES DE TOKENIZACI√ìN
  # ============================================================================

  class MultilingualTranslationDataset(Dataset):
      def __init__(self, translation_pairs, tokenizer, max_length=128):
          self.data = translation_pairs
          self.tokenizer = tokenizer
          self.max_length = max_length

          # üîß CORRECCI√ìN: Verificar tokenizer
          print(f"  üìä Dataset con {len(translation_pairs)} pares")
          print(f"  üî§ Tokenizer: {type(tokenizer).__name__}")
          print(f"  üìè Max length: {max_length}")

      def __len__(self):
          return len(self.data)

      def __getitem__(self, idx):
          item = self.data[idx]

          # Obtener textos con validaci√≥n
          source_text = str(item.get('source', ''))[:200]  # Limitar longitud
          target_text = str(item.get('target', ''))[:200]

          # üîß CORRECCI√ìN CR√çTICA: Validar que el texto no est√© vac√≠o
          if not source_text.strip():
              source_text = "Hello"  # Fallback
          if not target_text.strip():
              target_text = "Hola"   # Fallback

          try:
              # üîß TOKENIZACI√ìN SEGURA
              source_encoding = self.tokenizer(
                  source_text,
                  truncation=True,
                  padding=False,
                  max_length=self.max_length,
                  return_tensors=None,
                  add_special_tokens=True  # üîß IMPORTANTE: A√±adir tokens especiales
              )

              target_encoding = self.tokenizer(
                  target_text,
                  truncation=True,
                  padding=False,
                  max_length=self.max_length,
                  return_tensors=None,
                  add_special_tokens=True  # üîß IMPORTANTE
              )

              # üîß VALIDACI√ìN: Asegurar que tenemos datos v√°lidos
              if not source_encoding['input_ids'] or not target_encoding['input_ids']:
                  raise ValueError("Tokenizaci√≥n vac√≠a")

              return {
                  'input_ids': source_encoding['input_ids'],
                  'target_ids': target_encoding['input_ids'],
                  'source_lang': item.get('source_lang', 'en'),
                  'target_lang': item.get('target_lang', 'es')
              }

          except Exception as e:
              print(f"‚ö†Ô∏è Error tokenizando item {idx}: {e}")
              # üîß FALLBACK SEGURO
              fallback_ids = [self.tokenizer.pad_token_id] * 10
              return {
                  'input_ids': fallback_ids,
                  'target_ids': fallback_ids,
                  'source_lang': 'en',
                  'target_lang': 'es'
              }

  def optimized_collate_fn(batch):
      """Funci√≥n de collation CORREGIDA"""

      # Extraer datos
      input_ids = [item['input_ids'] for item in batch]
      target_ids = [item['target_ids'] for item in batch]

      # üîß CORRECCI√ìN: pad_token_id correcto para ByT5
      pad_token_id = 0  # ByT5 usa 0 como pad_token_id

      # Verificar que tenemos datos v√°lidos
      if not input_ids or not target_ids:
          print("‚ö†Ô∏è Batch vac√≠o detectado")
          # Crear batch de fallback
          input_ids = [[pad_token_id] * 10 for _ in range(len(batch))]
          target_ids = [[pad_token_id] * 10 for _ in range(len(batch))]

      # Calcular longitudes m√°ximas
      max_input_len = min(128, max(len(seq) for seq in input_ids))  # üîß Limitar a 128
      max_target_len = min(128, max(len(seq) for seq in target_ids))

      # Pad sequences y crear m√°scaras
      padded_inputs = []
      padded_targets = []
      input_masks = []

      for inp, tgt in zip(input_ids, target_ids):
          # üîß CORRECCI√ìN: Validar longitudes
          inp = inp[:max_input_len]  # Truncar si es necesario
          tgt = tgt[:max_target_len]

          # Input padding
          inp_len = len(inp)
          inp_padded = inp + [pad_token_id] * (max_input_len - inp_len)
          inp_mask = [1] * inp_len + [0] * (max_input_len - inp_len)

          # Target padding
          tgt_len = len(tgt)
          tgt_padded = tgt + [pad_token_id] * (max_target_len - tgt_len)

          padded_inputs.append(inp_padded)
          padded_targets.append(tgt_padded)
          input_masks.append(inp_mask)

      # üîß CONVERSI√ìN SEGURA A TENSORES
      try:
          return {
              'input_ids': torch.tensor(padded_inputs, dtype=torch.long),
              'attention_mask': torch.tensor(input_masks, dtype=torch.long),
              'labels': torch.tensor(padded_targets, dtype=torch.long),
          }
      except Exception as e:
          print(f"‚ùå Error creando tensores: {e}")
          # Fallback batch
          batch_size = len(batch)
          return {
              'input_ids': torch.zeros((batch_size, 10), dtype=torch.long),
              'attention_mask': torch.ones((batch_size, 10), dtype=torch.long),
              'labels': torch.zeros((batch_size, 10), dtype=torch.long),
          }

  def create_optimized_dataloader(dataset, batch_size, is_training=True):
      """Crear DataLoader CORREGIDO"""

      if IN_COLAB:
          dataloader_config = {
              'batch_size': batch_size,
              'shuffle': is_training,
              'num_workers': 0,
              'pin_memory': True,
              'drop_last': is_training,
              'collate_fn': optimized_collate_fn
          }
          print("üîß DataLoader configurado para Colab")
      else:
          dataloader_config = {
              'batch_size': batch_size,
              'shuffle': is_training,
              'num_workers': 2,
              'pin_memory': True,
              'persistent_workers': True,
              'prefetch_factor': 2,
              'drop_last': is_training,
              'collate_fn': optimized_collate_fn
          }
          print("üöÄ DataLoader optimizado para entorno local")

      return DataLoader(dataset, **dataloader_config)

  print("‚úÖ Dataset y DataLoader CORREGIDOS definidos")



‚úÖ Dataset y DataLoader CORREGIDOS definidos


In [7]:
  # ============================================================================
  # DEBUGGING - VERIFICAR RUTAS Y ARCHIVOS EN GOOGLE DRIVE
  # ============================================================================

  import os
  from pathlib import Path

  print("üîç DEBUGGING: Verificando rutas en Google Drive...")

  # Rutas a verificar
  potential_paths = [
      "/content/drive/My Drive/GlobalTranslator/NMT/Dataset",
      "/content/drive/MyDrive/GlobalTranslator/NMT/Dataset",
      "/content/drive/My Drive/GlobalTranslatorApp/Codigo/NMT/Dataset",
      "/content/drive/MyDrive/GlobalTranslatorApp/Codigo/NMT/Dataset"
  ]

  for path in potential_paths:
      print(f"\nüìÇ Verificando: {path}")
      if os.path.exists(path):
          print(f"   ‚úÖ Existe")
          try:
              files = os.listdir(path)
              csv_files = [f for f in files if f.endswith('.csv')]
              nmt_files = [f for f in csv_files if f.startswith('NMT_')]

              print(f"   üìÑ Total archivos: {len(files)}")
              print(f"   üìä Archivos CSV: {len(csv_files)}")
              print(f"   üéØ Archivos NMT_*: {len(nmt_files)}")

              if nmt_files:
                  print(f"   üìã Primeros archivos NMT:")
                  for f in nmt_files[:5]:
                      print(f"      - {f}")

              if csv_files:
                  print(f"   üìã Todos los CSV:")
                  for f in csv_files[:10]:
                      print(f"      - {f}")

          except Exception as e:
              print(f"   ‚ùå Error listando: {e}")
      else:
          print(f"   ‚ùå No existe")

  # Verificar estructura desde ra√≠z
  print(f"\nüå≥ Explorando estructura desde /content/drive/:")
  try:
      for root in ["/content/drive/My Drive", "/content/drive/MyDrive"]:
          if os.path.exists(root):
              print(f"\nüìÇ {root}/")
              for item in os.listdir(root)[:10]:
                  item_path = os.path.join(root, item)
                  if os.path.isdir(item_path):
                      print(f"   üìÅ {item}/")
                      # Buscar GlobalTranslator
                      if 'global' in item.lower():
                          print(f"      üéØ Posible match: {item}")
                          try:
                              sub_items = os.listdir(item_path)[:5]
                              for sub in sub_items:
                                  print(f"         üìÅ {sub}")
                          except:
                              pass
  except Exception as e:
      print(f"‚ùå Error explorando: {e}")


üîç DEBUGGING: Verificando rutas en Google Drive...

üìÇ Verificando: /content/drive/My Drive/GlobalTranslator/NMT/Dataset
   ‚úÖ Existe
   üìÑ Total archivos: 3
   üìä Archivos CSV: 3
   üéØ Archivos NMT_*: 3
   üìã Primeros archivos NMT:
      - NMT_train17.csv
      - NMT_train18.csv
      - NMT_val3.csv
   üìã Todos los CSV:
      - NMT_train17.csv
      - NMT_train18.csv
      - NMT_val3.csv

üìÇ Verificando: /content/drive/MyDrive/GlobalTranslator/NMT/Dataset
   ‚úÖ Existe
   üìÑ Total archivos: 3
   üìä Archivos CSV: 3
   üéØ Archivos NMT_*: 3
   üìã Primeros archivos NMT:
      - NMT_train17.csv
      - NMT_train18.csv
      - NMT_val3.csv
   üìã Todos los CSV:
      - NMT_train17.csv
      - NMT_train18.csv
      - NMT_val3.csv

üìÇ Verificando: /content/drive/My Drive/GlobalTranslatorApp/Codigo/NMT/Dataset
   ‚ùå No existe

üìÇ Verificando: /content/drive/MyDrive/GlobalTranslatorApp/Codigo/NMT/Dataset
   ‚ùå No existe

üå≥ Explorando estructura desde /content/

  Celda 6: Carga de Datos Robusta



In [8]:
# ============================================================================
# Celda 6: Drive + Config + Carga robusta de datasets pre-tokenizados (ByT5)
#           con filtrado de ejemplos inv√°lidos (previene NaN)
# ============================================================================
import os, re, ast, gc
from pathlib import Path
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

# ----------------------------- Drive ----------------------------------------
try:
    DRIVE_MOUNTED
    DRIVE_BASE_PATH
    print(f"‚úÖ Variables de Drive ya definidas: DRIVE_MOUNTED={DRIVE_MOUNTED}, BASE={DRIVE_BASE_PATH}")
except NameError:
    print("‚ö†Ô∏è Variables de Drive no definidas, configurando...")
    try:
        from google.colab import drive
        print("üîó Montando Google Drive...")
        drive.mount('/content/drive')
        DRIVE_BASE_PATH = "/content/drive/My Drive"
        if os.path.exists(DRIVE_BASE_PATH):
            DRIVE_MOUNTED = True
            print(f"‚úÖ Google Drive montado: {DRIVE_BASE_PATH}")
            for d in [f"{DRIVE_BASE_PATH}/GlobalTranslator/NMT/Models",
                      f"{DRIVE_BASE_PATH}/GlobalTranslator/NMT/Dataset"]:
                os.makedirs(d, exist_ok=True)
        else:
            DRIVE_MOUNTED = False
            DRIVE_BASE_PATH = None
            print("‚ùå No se puede acceder a Google Drive")
    except Exception as e:
        print(f"‚ùå Error montando Drive: {e}")
        DRIVE_MOUNTED = False
        DRIVE_BASE_PATH = None

print(f"üéõÔ∏è Estado final: DRIVE_MOUNTED={DRIVE_MOUNTED}, DRIVE_BASE_PATH={DRIVE_BASE_PATH}")

# ----------------------------- Config ---------------------------------------
try:
    current_config = enhanced_config
    print("‚úÖ Usando enhanced_config")
except NameError:
    try:
        current_config = config
        print("‚úÖ Usando config")
    except NameError:
        print("‚ö†Ô∏è Creando configuraci√≥n b√°sica (fallback)...")
        from dataclasses import dataclass
        @dataclass
        class BasicConfig:
            # longitudes y padding
            max_length: int = 128
            pad_token_id: int = 0
            label_pad_id: int = -100
            # entrenamiento (por si los usa el trainer)
            batch_size: int = 4
            learning_rate: float = 1e-4
            num_epochs: int = 3
            gradient_accumulation_steps: int = 4
            clip_norm: float = 1.0
            patience: int = 3
            warmup_steps: int = 100
            lr_scheduler_type: str = "linear"
            min_lr: float = 1e-6
            # modelo
            byt5_model_name: str = "google/byt5-small"
            # rutas
            model_save_path: str = "/content/drive/My Drive/GlobalTranslator/NMT/Models"
            checkpoint_dir: str = "/content/drive/My Drive/GlobalTranslator/NMT/Models"
        current_config = BasicConfig()
        print("‚úÖ Configuraci√≥n b√°sica creada")

MODEL_SAVE_PATH = f"{DRIVE_BASE_PATH}/GlobalTranslator/NMT/Models" if DRIVE_BASE_PATH else "/content"
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
current_config.model_save_path = MODEL_SAVE_PATH
current_config.checkpoint_dir = MODEL_SAVE_PATH
if 'enhanced_config' not in globals():
    enhanced_config = current_config
    print("‚úÖ enhanced_config configurado globalmente")

# ---------------------- Parseo y normalizaci√≥n ------------------------------
def _safe_list_eval(val) -> List[int]:
    if isinstance(val, (list, tuple, np.ndarray)):
        return [int(x) for x in val]
    if isinstance(val, str):
        s = val.strip()
        try:
            out = ast.literal_eval(s)
            if isinstance(out, (list, tuple, np.ndarray)):
                return [int(x) for x in out]
        except Exception:
            pass
        s = s.strip("[]")
        if not s:
            return []
        return [int(tok) for tok in s.split(",") if tok.strip() != ""]
    return []

def _pad_or_trim(seq: List[int], L: int, pad_value: int) -> List[int]:
    if len(seq) > L:
        return seq[:L]
    if len(seq) < L:
        return seq + [pad_value] * (L - len(seq))
    return seq

def _normalize_example(ex: Dict, L: int, pad_id: int, label_pad_id: int) -> Dict:
    inp = _pad_or_trim(ex['input_ids'], L, pad_id)
    msk = _pad_or_trim(ex['attention_mask'], L, 0)
    tgt = _pad_or_trim(ex['labels'], L, pad_id)
    tgt_m = _pad_or_trim(ex['target_attention_mask'], L, 0)
    lbl = [label_pad_id if tok == pad_id else tok for tok in tgt]
    return {
        'input_ids': inp,
        'attention_mask': msk,
        'labels': lbl,
        'target_attention_mask': tgt_m,
        'src_lang': ex.get('src_lang'),
        'tgt_lang': ex.get('tgt_lang'),
    }

# ------------------- B√∫squeda y carga streaming -----------------------------
def _find_files(base_path: str, prefix: str) -> List[Path]:
    base = Path(base_path)
    if not base.exists():
        print(f"‚ùå Ruta no encontrada: {base}")
        return []
    files = list(base.glob(f"{prefix}*.csv"))
    def extract_num(p: Path):
        nums = re.findall(r'\d+', p.stem)
        return int(nums[-1]) if nums else -1
    files.sort(key=extract_num)
    if not files:
        print(f"‚ùå No se encontraron archivos con patr√≥n {prefix}*.csv")
    else:
        print(f"üìÅ {prefix}: {len(files)} archivos -> {[f.name for f in files]}")
    return files

def load_pretokenized_byT5(
    base_path: str,
    prefix: str,
    max_files: Optional[int] = None,
    max_rows_per_file: Optional[int] = None,
    L: Optional[int] = None,
    pad_id: int = 0,
    label_pad_id: int = -100,
    min_valid_target_tokens: int = 1
) -> List[Dict]:
    """
    Carga datos con columnas exactas:
      input_ids, input_attention_mask, target_ids, target_attention_mask, input_label, target_label
    Filtra ejemplos cuyo target tenga < min_valid_target_tokens tokens v√°lidos (evita NaN).
    """
    files = _find_files(base_path, prefix)
    if not files:
        return []

    if max_files is not None:
        files = files[:max_files]
        print(f"üìä Limitando a {max_files} archivos para {prefix}")

    cols = [
        'input_ids',
        'input_attention_mask',
        'target_ids',
        'target_attention_mask',
        'input_label',
        'target_label'
    ]
    chunksize = 50_000
    out: List[Dict] = []
    kept = dropped_empty = dropped_short = 0

    for fp in files:
        try:
            print(f"üì• Leyendo {fp.name} ...")
            n_rows = 0
            for chunk in pd.read_csv(fp, usecols=cols, chunksize=chunksize):
                for _, row in chunk.iterrows():
                    n_rows += 1
                    try:
                        inp_ids = _safe_list_eval(row['input_ids'])
                        inp_msk = _safe_list_eval(row['input_attention_mask'])
                        tgt_ids = _safe_list_eval(row['target_ids'])
                        tgt_msk = _safe_list_eval(row['target_attention_mask'])

                        # Corrige m√°scaras si no coinciden
                        if len(inp_msk) != len(inp_ids):
                            inp_msk = [0 if tok == pad_id else 1 for tok in inp_ids]
                        if len(tgt_msk) != len(tgt_ids):
                            tgt_msk = [0 if tok == pad_id else 1 for tok in tgt_ids]

                        # Filtro clave: al menos N tokens target v√°lidos (!= pad_id)
                        valid_target_tokens = sum(1 for t in tgt_ids if t != pad_id)
                        if valid_target_tokens == 0:
                            dropped_empty += 1
                            continue
                        if valid_target_tokens < min_valid_target_tokens:
                            dropped_short += 1
                            continue

                        ex = {
                            'input_ids': inp_ids,
                            'attention_mask': inp_msk,
                            'labels': tgt_ids,
                            'target_attention_mask': tgt_msk,
                            'src_lang': str(row['input_label']).strip(),
                            'tgt_lang': str(row['target_label']).strip()
                        }

                        if L is not None:
                            ex = _normalize_example(ex, L=L, pad_id=pad_id, label_pad_id=label_pad_id)

                        out.append(ex)
                        kept += 1

                        if max_rows_per_file and (kept % max_rows_per_file == 0):
                            break
                    except Exception:
                        continue  # ignora fila corrupta

                if max_rows_per_file and (kept % max_rows_per_file == 0):
                    break
                gc.collect()

            print(f"   ‚úÖ {fp.name}: le√≠das={n_rows} | guardadas(acum)={kept} | vac√≠as={dropped_empty} | cortas={dropped_short}")
        except Exception as e:
            print(f"‚ùå Error en {fp.name}: {e}")

    print(f"üìä TOTAL {prefix}: guardadas={kept} | descartadas(vac√≠as)={dropped_empty} | descartadas(cortas)={dropped_short}")
    return out

# ------------------------ Dataset Torch -------------------------------------
class PreTokenizedByT5Dataset(Dataset):
    def __init__(self, data: List[Dict]):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        it = self.data[idx]
        return {
            'input_ids': torch.tensor(it['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(it['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(it['labels'], dtype=torch.long),
            'target_attention_mask': torch.tensor(it['target_attention_mask'], dtype=torch.long),
            'src_lang': it.get('src_lang'),
            'tgt_lang': it.get('tgt_lang'),
        }

# ----------------------- Carga principal ------------------------------------
def load_translation_data_byT5():
    base = f"{DRIVE_BASE_PATH}/GlobalTranslator/NMT/Dataset" if DRIVE_BASE_PATH else "/content"
    print("üîÑ Cargando datos pre-tokenizados ByT5...")
    print("üìÇ Carpeta:", base)

    L = int(getattr(current_config, "max_length", 128))
    pad_id = int(getattr(current_config, "pad_token_id", 0))
    label_pad_id = int(getattr(current_config, "label_pad_id", -100))

    train_pairs = load_pretokenized_byT5(base, "NMT_train", L=L, pad_id=pad_id, label_pad_id=label_pad_id)
    val_pairs   = load_pretokenized_byT5(base, "NMT_val",   L=L, pad_id=pad_id, label_pad_id=label_pad_id)
    return train_pairs, val_pairs

print("üìä Iniciando carga de datos pre-tokenizados ByT5...")
train_pairs, val_pairs = load_translation_data_byT5()

print("üîÑ Creando datasets PyTorch...")
TRAIN_DATASET = PreTokenizedByT5Dataset(train_pairs)
VAL_DATASET   = PreTokenizedByT5Dataset(val_pairs)
print(f"‚úÖ Datasets: train={len(TRAIN_DATASET)} | val={len(VAL_DATASET)}")

# Muestra de sanity-check
if len(TRAIN_DATASET) > 0:
    s = TRAIN_DATASET[0]
    print("üìã Sample:")
    print("   input_ids[:10]:", s['input_ids'][:10].tolist())
    print("   labels[:10]:   ", s['labels'][:10].tolist())

# Crea DataLoaders si existe tu utilidad de Celda 5
if 'create_tokenized_dataloaders' in globals():
    try:
        TRAIN_LOADER, VAL_LOADER = create_tokenized_dataloaders(
            train_pairs, val_pairs,
            batch_size=int(getattr(current_config, "batch_size", 4)),
            max_length=int(getattr(current_config, "max_length", 128)),
            pad_token_id=int(getattr(current_config, "pad_token_id", 0)),
            label_pad_id=int(getattr(current_config, "label_pad_id", -100)),
        )
        print("‚úÖ DataLoaders creados (TRAIN_LOADER, VAL_LOADER)")
    except Exception as e:
        print(f"‚ö†Ô∏è create_tokenized_dataloaders fall√≥: {e}")
        TRAIN_LOADER = VAL_LOADER = None
else:
    TRAIN_LOADER = VAL_LOADER = None
    print("‚ÑπÔ∏è No hay 'create_tokenized_dataloaders' (Celda 5).")


‚úÖ Variables de Drive ya definidas: DRIVE_MOUNTED=True, BASE=/content/drive/MyDrive
üéõÔ∏è Estado final: DRIVE_MOUNTED=True, DRIVE_BASE_PATH=/content/drive/MyDrive
‚úÖ Usando config
‚úÖ enhanced_config configurado globalmente
üìä Iniciando carga de datos pre-tokenizados ByT5...
üîÑ Cargando datos pre-tokenizados ByT5...
üìÇ Buscando en: /content/drive/MyDrive/GlobalTranslator/NMT/Dataset
üìÅ Encontrados 2 archivos NMT_train*.csv: ['NMT_train17.csv', 'NMT_train18.csv']
üì• Cargando NMT_train17.csv ...
   ‚úÖ 100000 filas v√°lidas de NMT_train17.csv
üì• Cargando NMT_train18.csv ...
   ‚úÖ 100000 filas v√°lidas de NMT_train18.csv
üìä TOTAL cargado para 'NMT_train': 200000 ejemplos
üìÅ Encontrados 1 archivos NMT_val*.csv: ['NMT_val3.csv']
üì• Cargando NMT_val3.csv ...
   ‚úÖ 100000 filas v√°lidas de NMT_val3.csv
üìä TOTAL cargado para 'NMT_val': 100000 ejemplos
üîÑ Creando datasets PyTorch...
‚úÖ Datasets creados:
   üìà Entrenamiento: 200000 muestras
   üìä Validaci√≥n:    

  Celda 7: Inicializaci√≥n del Modelo

In [9]:

  # ============================================================================
  # INICIALIZACI√ìN DEL MODELO H√çBRIDO
  # ============================================================================

  print("üöÄ Inicializando modelo h√≠brido...")

  # Crear modelo
  model = HybridNLLBByT5Model(enhanced_config)
  model.to(device)

  print(f"‚úÖ Modelo cargado en: {device}")

  # Contar par√°metros
  total_params = sum(p.numel() for p in model.parameters())
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

  print(f"üìä Par√°metros del modelo:")
  print(f"  üî¢ Total: {total_params:,}")
  print(f"  üéØ Entrenables: {trainable_params:,}")
  print(f"  üìà Porcentaje entrenable: {100 * trainable_params / total_params:.1f}%")

  # Verificar memoria GPU
  if torch.cuda.is_available():
      torch.cuda.empty_cache()  # Limpiar antes de medir
      memory_allocated = torch.cuda.memory_allocated() / 1024**3
      memory_reserved = torch.cuda.memory_reserved() / 1024**3

      print(f"üíæ Memoria GPU:")
      print(f"  üìä Asignada: {memory_allocated:.2f} GB")
      print(f"  üîí Reservada: {memory_reserved:.2f} GB")

  # Configurar rutas de guardado espec√≠ficas
  enhanced_config.best_model_path = os.path.join(MODEL_SAVE_PATH, "best_hybrid_model.pt")
  enhanced_config.final_model_path = os.path.join(MODEL_SAVE_PATH, "final_hybrid_model.pt")

  print(f"üìÅ Configuraci√≥n de guardado:")
  print(f"  üèÜ Mejor modelo: {enhanced_config.best_model_path}")
  print(f"  üéØ Modelo final: {enhanced_config.final_model_path}")

  print("‚úÖ Modelo h√≠brido inicializado correctamente")


üöÄ Inicializando modelo h√≠brido...
üåç Cargando NLLB para inferencia...


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

‚úÖ NLLB cargado para inferencia
üî§ Cargando ByT5 (modelo principal)...


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

  üìä ByT5 vocab size: 256
  üî§ ByT5 pad token: 0
  üî§ ByT5 eos token: 1
‚úÖ Modelo h√≠brido inicializado
  üî§ ByT5 hidden size: 1472
  ‚öôÔ∏è Solo ByT5 ser√° entrenado
‚úÖ Modelo cargado en: cuda
üìä Par√°metros del modelo:
  üî¢ Total: 916,882,752
  üéØ Entrenables: 916,882,752
  üìà Porcentaje entrenable: 100.0%
üíæ Memoria GPU:
  üìä Asignada: 3.43 GB
  üîí Reservada: 3.54 GB
üìÅ Configuraci√≥n de guardado:
  üèÜ Mejor modelo: /content/drive/MyDrive/GlobalTranslator/NMT/Models/best_hybrid_model.pt
  üéØ Modelo final: /content/drive/MyDrive/GlobalTranslator/NMT/Models/final_hybrid_model.pt
‚úÖ Modelo h√≠brido inicializado correctamente


In [None]:
from tqdm.auto import tqdm
import torch
from torch.cuda.amp import autocast, GradScaler

def _count_valid_targets(labels):
    # cuenta tokens != -100
    return int((labels != -100).sum().item())

def train_epoch(model, dataloader, optimizer, scheduler, device, epoch, config):
    model.train()
    scaler = GradScaler(enabled=torch.cuda.is_available())
    total_loss = 0.0
    skipped = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

    for batch_idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        # ‚ö†Ô∏è SALTO si no hay ni un token v√°lido
        if _count_valid_targets(labels) == 0:
            skipped += 1
            continue

        with autocast(enabled=torch.cuda.is_available()):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs['loss'] / config.gradient_accumulation_steps

        if torch.isnan(loss):
            skipped += 1
            continue

        scaler.scale(loss).backward()

        if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), getattr(config, "clip_norm", 1.0))
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        total_loss += loss.item() * config.gradient_accumulation_steps

        progress_bar.set_postfix({
            'loss': f"{(total_loss / max(1, (batch_idx + 1))):.4f}",
            'lr': scheduler.get_last_lr()[0] if hasattr(scheduler, "get_last_lr") else None,
            'skip': skipped
        })

        if torch.cuda.is_available() and (batch_idx % 200 == 0):
            torch.cuda.empty_cache()

    if skipped > 0:
        print(f"‚ö†Ô∏è Batches saltados por labels vac√≠os/NaN: {skipped}")
    return total_loss / max(1, len(dataloader))

def validate(model, dataloader, device, config):
    model.eval()
    total_loss = 0.0
    skipped = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['labels'].to(device, non_blocking=True)

            if _count_valid_targets(labels) == 0:
                skipped += 1
                continue

            with autocast(enabled=torch.cuda.is_available()):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs['loss']

            if torch.isnan(loss):
                skipped += 1
                continue

            total_loss += loss.item()

    if skipped > 0:
        print(f"‚ö†Ô∏è Val batches saltados por labels vac√≠os/NaN: {skipped}")
    return total_loss / max(1, len(dataloader))


  Celda 8: Configuraci√≥n de Optimizador y Scheduler

In [10]:

  # ============================================================================
  # CONFIGURACI√ìN DE OPTIMIZADOR Y SCHEDULER
  # ============================================================================

  print("‚öôÔ∏è Configurando optimizador y scheduler...")

  # Optimizador
  optimizer = torch.optim.AdamW(
      model.parameters(),
      lr=config.learning_rate,
      weight_decay=config.weight_decay,
      eps=1e-8,
      betas=(0.9, 0.999)
  )

  # Calcular pasos totales para el scheduler
  total_steps = (len(train_pairs) // config.batch_size) * config.num_epochs
  warmup_steps = min(config.warmup_steps, total_steps // 10)  # M√°ximo 10% de warmup

  # Scheduler
  if config.scheduler_type == "linear":
      scheduler = get_linear_schedule_with_warmup(
          optimizer,
          num_warmup_steps=warmup_steps,
          num_training_steps=total_steps
      )
  else:
      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
          optimizer,
          T_max=total_steps,
          eta_min=config.learning_rate * 0.01
      )

  # Configurar AMP scaler si est√° habilitado
  if USE_AMP:
      scaler = torch.cuda.amp.GradScaler()
      print("‚úÖ AMP Scaler configurado")
  else:
      scaler = None

  print(f"‚úÖ Configuraci√≥n completada:")
  print(f"  üéØ Optimizador: AdamW")
  print(f"  üìà Learning rate: {config.learning_rate}")
  print(f"  üî• Scheduler: {config.scheduler_type}")
  print(f"  üìä Total steps: {total_steps:,}")
  print(f"  üîÑ Warmup steps: {warmup_steps:,}")
  print(f"  ‚ö° Mixed precision: {'‚úÖ S√ç' if USE_AMP else '‚ùå NO'}")


‚öôÔ∏è Configurando optimizador y scheduler...
‚úÖ AMP Scaler configurado
‚úÖ Configuraci√≥n completada:
  üéØ Optimizador: AdamW
  üìà Learning rate: 1e-05
  üî• Scheduler: linear
  üìä Total steps: 500,000
  üîÑ Warmup steps: 1,000
  ‚ö° Mixed precision: ‚úÖ S√ç


  Celda 9: Creaci√≥n de Datasets y DataLoaders


In [11]:

  # ============================================================================
  # CREACI√ìN DE DATASETS Y DATALOADERS
  # ============================================================================

  print("üìä Creando datasets y dataloaders...")

  # Obtener tokenizer principal (ByT5)
  tokenizer = model.byt5_tokenizer

  # Crear datasets
  train_dataset = MultilingualTranslationDataset(
      train_pairs,
      tokenizer,
      max_length=config.max_length
  )

  val_dataset = MultilingualTranslationDataset(
      val_pairs,
      tokenizer,
      max_length=config.max_length
  )

  # Crear dataloaders optimizados
  train_loader = create_optimized_dataloader(
      train_dataset,
      config.batch_size,
      is_training=True
  )

  val_loader = create_optimized_dataloader(
      val_dataset,
      config.batch_size,
      is_training=False
  )

  print(f"‚úÖ Datasets y DataLoaders creados:")
  print(f"  üèãÔ∏è Train dataset: {len(train_dataset)} samples")
  print(f"  üìä Val dataset: {len(val_dataset)} samples")
  print(f"  üì¶ Train batches: {len(train_loader)}")
  print(f"  üì¶ Val batches: {len(val_loader)}")

  # Probar un batch
  print("\nüß™ Probando primer batch...")
  try:
      sample_batch = next(iter(train_loader))
      print(f"  ‚úÖ Batch shape: {sample_batch['input_ids'].shape}")
      print(f"  ‚úÖ Labels shape: {sample_batch['labels'].shape}")
      print(f"  ‚úÖ Attention mask shape: {sample_batch['attention_mask'].shape}")
  except Exception as e:
      print(f"  ‚ùå Error en batch de prueba: {e}")


üìä Creando datasets y dataloaders...
  üìä Dataset con 200000 pares
  üî§ Tokenizer: ByT5Tokenizer
  üìè Max length: 128
  üìä Dataset con 100000 pares
  üî§ Tokenizer: ByT5Tokenizer
  üìè Max length: 128
üîß DataLoader configurado para Colab
üîß DataLoader configurado para Colab
‚úÖ Datasets y DataLoaders creados:
  üèãÔ∏è Train dataset: 200000 samples
  üìä Val dataset: 100000 samples
  üì¶ Train batches: 50000
  üì¶ Val batches: 25000

üß™ Probando primer batch...
  ‚úÖ Batch shape: torch.Size([4, 6])
  ‚úÖ Labels shape: torch.Size([4, 5])
  ‚úÖ Attention mask shape: torch.Size([4, 6])


  Celda 10: Funciones de Entrenamiento Optimizadas

In [12]:

  # ============================================================================
  # FUNCIONES DE ENTRENAMIENTO COMPLETAMENTE OPTIMIZADAS
  # ============================================================================

  def train_epoch_optimized(model, dataloader, optimizer, scheduler, device, epoch, config, scaler=None):
      """√âpoca de entrenamiento completamente optimizada"""

      model.train()
      total_loss = 0.0
      num_batches = 0

      # Progress bar
      progress_bar = tqdm(dataloader, desc=f"√âpoca {epoch+1}")

      accumulation_steps = config.gradient_accumulation_steps

      for batch_idx, batch in enumerate(progress_bar):
          # Mover a device
          input_ids = batch['input_ids'].to(device, non_blocking=True)
          attention_mask = batch['attention_mask'].to(device, non_blocking=True)
          labels = batch['labels'].to(device, non_blocking=True)

          # Zero gradients solo al inicio del accumulation
          if batch_idx % accumulation_steps == 0:
              optimizer.zero_grad()

          # Forward pass con AMP - SOLO BYT5 DURANTE ENTRENAMIENTO
          if USE_AMP and scaler is not None:
              with torch.cuda.amp.autocast(dtype=AMP_DTYPE):
                  outputs = model(
                      input_ids=input_ids,
                      attention_mask=attention_mask,
                      labels=labels,
                      use_nllb=False  # üîß FORZAR ByT5 durante entrenamiento
                  )
                  loss = outputs.loss / accumulation_steps
          else:
              outputs = model(
                  input_ids=input_ids,
                  attention_mask=attention_mask,
                  labels=labels,
                  use_nllb=False  # üîß FORZAR ByT5 durante entrenamiento
              )
              loss = outputs.loss / accumulation_steps

          # Backward pass
          if USE_AMP and scaler is not None:
              scaler.scale(loss).backward()
          else:
              loss.backward()

          # Optimization step cada accumulation_steps
          if (batch_idx + 1) % accumulation_steps == 0:
              if USE_AMP and scaler is not None:
                  scaler.unscale_(optimizer)
                  torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                  scaler.step(optimizer)
                  scaler.update()
              else:
                  torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                  optimizer.step()

              if scheduler:
                  scheduler.step()

          # Acumular loss
          total_loss += loss.item() * accumulation_steps
          num_batches += 1

          # Actualizar progress bar
          current_lr = optimizer.param_groups[0]['lr']
          progress_bar.set_postfix({
              'Loss': f"{loss.item() * accumulation_steps:.4f}",
              'Avg': f"{total_loss/num_batches:.4f}",
              'LR': f"{current_lr:.2e}",
              'Model': 'ByT5'  # Indicar que usa ByT5
          })

          # Limpieza de memoria peri√≥dica (cada 100 batches, no cada batch)
          if batch_idx > 0 and batch_idx % 100 == 0:
              torch.cuda.empty_cache()

      progress_bar.close()

      # Limpiar memoria al final de la √©poca
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      gc.collect()

      avg_loss = total_loss / max(num_batches, 1)
      return avg_loss

  def validate_model(model, dataloader, device):
      """Validaci√≥n del modelo - TAMBI√âN SOLO ByT5"""

      model.eval()
      total_loss = 0.0
      num_batches = 0

      with torch.no_grad():
          progress_bar = tqdm(dataloader, desc="Validando")

          for batch in progress_bar:
              input_ids = batch['input_ids'].to(device, non_blocking=True)
              attention_mask = batch['attention_mask'].to(device, non_blocking=True)
              labels = batch['labels'].to(device, non_blocking=True)

              # Forward pass - SOLO ByT5 durante validaci√≥n tambi√©n
              if USE_AMP:
                  with torch.cuda.amp.autocast(dtype=AMP_DTYPE):
                      outputs = model(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          use_nllb=False  # üîß ByT5 tambi√©n en validaci√≥n
                      )
              else:
                  outputs = model(
                      input_ids=input_ids,
                      attention_mask=attention_mask,
                      labels=labels,
                      use_nllb=False  # üîß ByT5 tambi√©n en validaci√≥n
                  )

              total_loss += outputs.loss.item()
              num_batches += 1

              progress_bar.set_postfix({
                  'Val Loss': f"{outputs.loss.item():.4f}",
                  'Model': 'ByT5'
              })

      progress_bar.close()
      avg_loss = total_loss / max(num_batches, 1)
      return avg_loss

  def save_checkpoint(model, optimizer, scheduler, epoch, loss, checkpoint_dir="./checkpoints"):
      """Guardar checkpoint - MEJORADO con Google Drive autom√°tico"""

      # Crear carpetas
      os.makedirs(checkpoint_dir, exist_ok=True)

      # üìÅ GUARDAR TAMBI√âN EN GOOGLE DRIVE autom√°ticamente
      drive_checkpoint_dir = "/content/drive/MyDrive/model_checkpoints"
      try:
          os.makedirs(drive_checkpoint_dir, exist_ok=True)
          save_to_drive = True
      except:
          save_to_drive = False
          print("‚ö†Ô∏è No se puede acceder a Google Drive")

      checkpoint = {
          'epoch': epoch,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
          'loss': loss,
          'config': config.__dict__,
          'timestamp': datetime.now().isoformat(),
          'model_type': 'HybridNLLBByT5Model',
          'training_mode': 'byt5_only',  # Indicar que se entren√≥ solo ByT5
          'pytorch_version': torch.__version__
      }

      # Guardar LOCAL
      checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
      latest_path = os.path.join(checkpoint_dir, "latest_model.pt")

      torch.save(checkpoint, checkpoint_path)
      torch.save(checkpoint, latest_path)

      print(f"üíæ Checkpoint guardado LOCAL: {checkpoint_path}")

      # ‚úÖ GUARDAR EN GOOGLE DRIVE si es posible
      if save_to_drive:
          try:
              drive_checkpoint_path = os.path.join(drive_checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
              drive_latest_path = os.path.join(drive_checkpoint_dir, "latest_model.pt")

              torch.save(checkpoint, drive_checkpoint_path)
              torch.save(checkpoint, drive_latest_path)

              print(f"‚òÅÔ∏è Checkpoint guardado DRIVE: {drive_checkpoint_path}")

          except Exception as e:
              print(f"‚ö†Ô∏è Error guardando en Drive: {e}")

      return checkpoint_path

  def test_hybrid_inference(model, tokenizer, test_text="Hello, how are you?"):
      """
      Probar INFERENCIA H√çBRIDA - aqu√≠ s√≠ se usa la selecci√≥n inteligente
      """
      print(f"\nüß™ Probando inferencia h√≠brida con: '{test_text}'")

      model.eval()

      with torch.no_grad():
          # Tokenizar
          inputs = tokenizer(
              test_text,
              return_tensors="pt",
              padding=True,
              truncation=True,
              max_length=128
          ).to(device)

          # üåç PRUEBA 1: Con NLLB (si est√° disponible)
          if hasattr(model, 'generate_hybrid') and model.nllb_model is not None:
              print("üåç Probando con NLLB (idiomas bien soportados)...")
              try:
                  nllb_output = model.generate_hybrid(
                      input_ids=inputs['input_ids'],
                      attention_mask=inputs['attention_mask'],
                      source_lang='en',
                      target_lang='es',  # Idioma bien soportado por NLLB
                      max_length=128,
                      num_beams=4,
                      early_stopping=True
                  )

                  nllb_translation = tokenizer.decode(nllb_output[0], skip_special_tokens=True)
                  print(f"  üåç NLLB: {nllb_translation}")

              except Exception as e:
                  print(f"  ‚ùå NLLB fall√≥: {e}")

          # üî§ PRUEBA 2: Con ByT5 (modelo entrenado)
          print("üî§ Probando con ByT5 (modelo entrenado)...")
          try:
              byt5_output = model.byt5_model.generate(
                  input_ids=inputs['input_ids'],
                  attention_mask=inputs['attention_mask'],
                  max_length=128,
                  num_beams=4,
                  early_stopping=True
              )

              byt5_translation = tokenizer.decode(byt5_output[0], skip_special_tokens=True)
              print(f"  üî§ ByT5: {byt5_translation}")

          except Exception as e:
              print(f"  ‚ùå ByT5 fall√≥: {e}")

  print("‚úÖ Funciones de entrenamiento optimizadas definidas")
  print("üîß Modo: ByT5 para entrenamiento, h√≠brido para inferencia")


‚úÖ Funciones de entrenamiento optimizadas definidas
üîß Modo: ByT5 para entrenamiento, h√≠brido para inferencia


  Celda 11: Funci√≥n Principal de Entrenamiento

*   Elemento de lista
*   Elemento de lista



In [13]:

  # ============================================================================
  # FUNCI√ìN PRINCIPAL DE ENTRENAMIENTO - CORREGIDA
  # ============================================================================

  def train_with_optimization(model, train_pairs, val_pairs, config, device):
      """Entrenamiento optimizado principal - FUNCI√ìN CORREGIDA"""

      print("üéØ Iniciando entrenamiento optimizado...")

      # Crear datasets y loaders (ya creados anteriormente, pero por consistencia)
      tokenizer = model.byt5_tokenizer
      train_dataset = MultilingualTranslationDataset(train_pairs, tokenizer, config.max_length)
      val_dataset = MultilingualTranslationDataset(val_pairs, tokenizer, config.max_length)

      train_loader = create_optimized_dataloader(train_dataset, config.batch_size, True)
      val_loader = create_optimized_dataloader(val_dataset, config.batch_size, False)

      # Variables de seguimiento
      best_val_loss = float('inf')
      patience_counter = 0
      training_history = []

      print(f"üìä Configuraci√≥n de entrenamiento:")
      print(f"  üìà √âpocas: {config.num_epochs}")
      print(f"  üéØ Batch size: {config.batch_size}")
      print(f"  üìä Gradient accumulation: {config.gradient_accumulation_steps}")
      print(f"  ‚ö° Mixed precision: {'‚úÖ' if USE_AMP else '‚ùå'}")

      # Loop de entrenamiento
      for epoch in range(config.num_epochs):
          print(f"\n{'='*60}")
          print(f"üîÑ √âPOCA {epoch + 1}/{config.num_epochs}")
          print(f"{'='*60}")

          # Entrenar √©poca
          epoch_start_time = time.time()
          train_loss = train_epoch_optimized(
              model, train_loader, optimizer, scheduler, device, epoch, config, scaler
          )
          epoch_time = time.time() - epoch_start_time

          # Validar
          print("\nüìä Ejecutando validaci√≥n...")
          val_loss = validate_model(model, val_loader, device)

          # Guardar m√©tricas
          epoch_metrics = {
              'epoch': epoch + 1,
              'train_loss': train_loss,
              'val_loss': val_loss,
              'epoch_time': epoch_time,
              'learning_rate': optimizer.param_groups[0]['lr']
          }
          training_history.append(epoch_metrics)

          # Mostrar resultados
          print(f"\nüìà RESULTADOS √âPOCA {epoch + 1}:")
          print(f"  üèãÔ∏è Train Loss: {train_loss:.4f}")
          print(f"  üìä Val Loss: {val_loss:.4f}")
          print(f"  ‚è±Ô∏è Tiempo: {epoch_time/60:.2f} min")
          print(f"  üìà Learning Rate: {optimizer.param_groups[0]['lr']:.2e}")

          # Early stopping
          if val_loss < best_val_loss:
              best_val_loss = val_loss
              patience_counter = 0
              print(f"  üèÜ ¬°Nuevo mejor modelo! (Val Loss: {best_val_loss:.4f})")

              # Guardar mejor modelo
              save_checkpoint(model, optimizer, scheduler, epoch, val_loss, "./checkpoints")

          else:
              patience_counter += 1
              print(f"  ‚è≥ Paciencia: {patience_counter}/{config.early_stopping_patience}")

          if patience_counter >= config.early_stopping_patience:
              print(f"\nüõë Early stopping activado (sin mejora en {patience_counter} √©pocas)")
              break

          # Mostrar memoria GPU
          if torch.cuda.is_available():
              memory_used = torch.cuda.memory_allocated() / 1024**3
              memory_reserved = torch.cuda.memory_reserved() / 1024**3
              print(f"  üíæ GPU Memory: {memory_used:.2f}GB usado, {memory_reserved:.2f}GB reservada")

      print(f"\nüéâ ¬°ENTRENAMIENTO COMPLETADO!")
      print(f"üèÜ Mejor Val Loss: {best_val_loss:.4f}")
      print(f"üìä √âpocas entrenadas: {len(training_history)}")

      return training_history

  # üîß CORRECCI√ìN CR√çTICA: Funci√≥n que estaba faltante
  def train_hybrid_model(model, train_pairs, val_pairs, config, device):
      """
      Funci√≥n principal que estaba faltante - CORRECCI√ìN CR√çTICA
      """
      print("üöÄ Ejecutando train_hybrid_model...")
      return train_with_optimization(model, train_pairs, val_pairs, config, device)

  print("‚úÖ Funci√≥n de entrenamiento principal definida y CORREGIDA")


‚úÖ Funci√≥n de entrenamiento principal definida y CORREGIDA


  Celda 12: Reanudaci√≥n de Entrenamiento (Eliminar celdas duplicadas 15 y 17)


In [14]:

  # ============================================================================
  # REANUDACI√ìN DE ENTRENAMIENTO - VERSI√ìN MEJORADA CON GOOGLE DRIVE
  # ============================================================================

  def setup_training_resumption_enhanced(model, optimizer, scheduler):
      """
      Configurar reanudaci√≥n con b√∫squeda inteligente en m√∫ltiples ubicaciones
      """

      print("üîç Buscando checkpoints existentes...")

      # Buscar checkpoints en orden de prioridad
      checkpoint_paths = [
          "/content/drive/MyDrive/model_checkpoints/latest_model.pt",  # Drive - m√°s reciente
          "./checkpoints/latest_model.pt",  # Local - backup
          "/content/drive/MyDrive/model_checkpoints/checkpoint_epoch_9.pt",  # Espec√≠ficos recientes
          "/content/drive/MyDrive/model_checkpoints/checkpoint_epoch_8.pt",
          "/content/drive/MyDrive/model_checkpoints/checkpoint_epoch_7.pt",
          "/content/drive/MyDrive/model_checkpoints/checkpoint_epoch_6.pt",
          "./checkpoints/checkpoint_epoch_9.pt",  # Local espec√≠ficos
          "./checkpoints/checkpoint_epoch_8.pt",
      ]

      for i, checkpoint_path in enumerate(checkpoint_paths):
          if os.path.exists(checkpoint_path):
              try:
                  print(f"üìÇ ENCONTRADO checkpoint ({i+1}): {checkpoint_path}")

                  # Cargar checkpoint
                  checkpoint = torch.load(checkpoint_path, map_location=device)

                  # Validar que el checkpoint es compatible
                  required_keys = ['model_state_dict', 'optimizer_state_dict', 'epoch']
                  if not all(key in checkpoint for key in required_keys):
                      print(f"  ‚ö†Ô∏è Checkpoint incompleto, probando siguiente...")
                      continue

                  # Cargar estados
                  model.load_state_dict(checkpoint['model_state_dict'])
                  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

                  if scheduler and 'scheduler_state_dict' in checkpoint and checkpoint['scheduler_state_dict']:
                      scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
                      print("  ‚úÖ Scheduler cargado")

                  # Extraer informaci√≥n
                  start_epoch = checkpoint.get('epoch', 0) + 1
                  best_loss = checkpoint.get('loss', float('inf'))
                  training_history = checkpoint.get('history', [])

                  # Informaci√≥n del checkpoint
                  timestamp = checkpoint.get('timestamp', 'unknown')
                  training_mode = checkpoint.get('training_mode', 'unknown')

                  print(f"  ‚úÖ REANUDACI√ìN EXITOSA:")
                  print(f"    üîÑ Desde √©poca: {start_epoch}")
                  print(f"    üéØ Mejor loss: {best_loss:.4f}")
                  print(f"    üìä Historial: {len(training_history)} √©pocas")
                  print(f"    üïê Guardado: {timestamp}")
                  print(f"    ü§ñ Modo entrenamiento: {training_mode}")

                  return start_epoch, best_loss, training_history

              except Exception as e:
                  print(f"  ‚ùå Error cargando {checkpoint_path}: {e}")
                  print(f"  üîÑ Probando siguiente ubicaci√≥n...")
                  continue

      print("üìù No se encontraron checkpoints v√°lidos")
      print("üÜï Iniciando entrenamiento desde cero")
      return 0, float('inf'), []

  def verify_model_compatibility(model, checkpoint_path):
      """Verificar que el modelo cargado es compatible"""

      try:
          checkpoint = torch.load(checkpoint_path, map_location='cpu')

          # Verificar configuraci√≥n
          if 'config' in checkpoint:
              saved_config = checkpoint['config']
              current_config = config.__dict__

              # Verificar compatibilidad de modelos base
              if (saved_config.get('byt5_model_name') != current_config.get('byt5_model_name') or
                  saved_config.get('nllb_model_name') != current_config.get('nllb_model_name')):

                  print("‚ö†Ô∏è ADVERTENCIA: Configuraci√≥n de modelos base diferente")
                  print(f"  Guardado: ByT5={saved_config.get('byt5_model_name')}, NLLB={saved_config.get('nllb_model_name')}")
                  print(f"  Actual:   ByT5={current_config.get('byt5_model_name')}, NLLB={current_config.get('nllb_model_name')}")

                  response = input("¬øContinuar de todos modos? (y/n): ")
                  if response.lower() != 'y':
                      return False

          return True

      except Exception as e:
          print(f"‚ùå Error verificando compatibilidad: {e}")
          return False

  def cleanup_old_checkpoints(max_checkpoints=5):
      """Limpiar checkpoints antiguos para ahorrar espacio"""

      directories = [
          "./checkpoints",
          "/content/drive/MyDrive/model_checkpoints"
      ]

      for checkpoint_dir in directories:
          if not os.path.exists(checkpoint_dir):
              continue

          try:
              # Obtener todos los checkpoints de √©poca
              epoch_files = glob.glob(os.path.join(checkpoint_dir, "checkpoint_epoch_*.pt"))

              if len(epoch_files) > max_checkpoints:
                  # Ordenar por n√∫mero de √©poca (extraer del nombre)
                  def extract_epoch_num(filepath):
                      try:
                          filename = os.path.basename(filepath)
                          return int(filename.split('epoch_')[1].split('.')[0])
                      except:
                          return 0

                  epoch_files.sort(key=extract_epoch_num)

                  # Eliminar los m√°s antiguos
                  files_to_delete = epoch_files[:-max_checkpoints]

                  for file_path in files_to_delete:
                      try:
                          os.remove(file_path)
                          epoch_num = extract_epoch_num(file_path)
                          print(f"üóëÔ∏è Eliminado checkpoint antiguo: √©poca {epoch_num}")
                      except Exception as e:
                          print(f"‚ö†Ô∏è Error eliminando {file_path}: {e}")

          except Exception as e:
              print(f"‚ö†Ô∏è Error limpiando {checkpoint_dir}: {e}")

  # ============================================================================
  # EJECUTAR CONFIGURACI√ìN DE REANUDACI√ìN
  # ============================================================================

  print("üöÄ Configurando reanudaci√≥n de entrenamiento...")

  # Buscar y cargar checkpoint autom√°ticamente
  start_epoch, best_loss, training_history = setup_training_resumption_enhanced(
      model, optimizer, scheduler
  )

  # Limpiar checkpoints antiguos para ahorrar espacio
  cleanup_old_checkpoints(max_checkpoints=3)

  # Mostrar estado actual
  print(f"\nüìä ESTADO ACTUAL DEL ENTRENAMIENTO:")
  print(f"  üéØ √âpoca inicial: {start_epoch}")
  print(f"  üèÜ Mejor loss hasta ahora: {best_loss:.4f}")
  print(f"  üìà √âpocas en historial: {len(training_history)}")

  # Configurar variables globales
  current_epoch = start_epoch
  current_best_loss = best_loss
  current_history = training_history

  # Probar inferencia h√≠brida si el modelo fue cargado
  if start_epoch > 0:
      print(f"\nüß™ Probando modelo cargado...")
      test_hybrid_inference(model, tokenizer, "Hello, how are you today?")

  print("‚úÖ Configuraci√≥n de reanudaci√≥n completada")



üöÄ Configurando reanudaci√≥n de entrenamiento...
üîç Buscando checkpoints existentes...
üìù No se encontraron checkpoints v√°lidos
üÜï Iniciando entrenamiento desde cero
‚ö†Ô∏è Error limpiando /content/drive/MyDrive/model_checkpoints: name 'glob' is not defined

üìä ESTADO ACTUAL DEL ENTRENAMIENTO:
  üéØ √âpoca inicial: 0
  üèÜ Mejor loss hasta ahora: inf
  üìà √âpocas en historial: 0
‚úÖ Configuraci√≥n de reanudaci√≥n completada


In [None]:
# Auditor√≠a r√°pida de targets
def audit_pairs(pairs, sample=10000, pad_id=0):
    n = min(sample, len(pairs))
    ok, empty, oob = 0, 0, 0
    max_lab = 0
    for i in range(n):
        lbl = pairs[i]['labels']
        max_lab = max(max_lab, max(lbl) if len(lbl)>0 else 0)
        valid = sum(1 for t in lbl if t != pad_id)
        if valid == 0:
            empty += 1
        else:
            ok += 1
    print(f"Audit: n={n} | ok={ok} | empty={empty} | max_label_id={max_lab}")

audit_pairs(train_pairs, pad_id=int(getattr(current_config, "pad_token_id", 0)))
audit_pairs(val_pairs,   pad_id=int(getattr(current_config, "pad_token_id", 0)))


  Celda 13: EJECUCI√ìN DEL ENTRENAMIENTO

In [15]:

  # ============================================================================
  # üéØ EJECUCI√ìN PRINCIPAL DEL ENTRENAMIENTO
  # ============================================================================

  print("üöÄ INICIANDO ENTRENAMIENTO DEL MODELO H√çBRIDO OPTIMIZADO")
  print("=" * 70)

  try:
      # Verificar que todo est√° listo
      print("üîç Verificaci√≥n pre-entrenamiento:")
      print(f"  ‚úÖ Modelo: {type(model).__name__}")
      print(f"  ‚úÖ Device: {device}")
      print(f"  ‚úÖ Datos train: {len(train_pairs)}")
      print(f"  ‚úÖ Datos val: {len(val_pairs)}")
      print(f"  ‚úÖ Optimizador: {type(optimizer).__name__}")
      print(f"  ‚úÖ Scheduler: {type(scheduler).__name__}")

      # üîß EJECUTAR FUNCI√ìN CORREGIDA
      print(f"\nüéØ Iniciando entrenamiento...")
      history = train_hybrid_model(model, train_pairs, val_pairs, config, device)

      print("\n" + "=" * 70)
      print("üéâ ¬°ENTRENAMIENTO COMPLETADO EXITOSAMENTE!")
      print("=" * 70)

      # Mostrar resumen final
      if history:
          final_metrics = history[-1]
          print(f"\nüìä M√âTRICAS FINALES:")
          print(f"  üèãÔ∏è Train Loss final: {final_metrics['train_loss']:.4f}")
          print(f"  üìä Val Loss final: {final_metrics['val_loss']:.4f}")
          print(f"  üìà √âpocas completadas: {len(history)}")
          print(f"  ‚è±Ô∏è Tiempo total: {sum(h['epoch_time'] for h in history)/60:.2f} min")

          # Encontrar mejor √©poca
          best_epoch = min(history, key=lambda x: x['val_loss'])
          print(f"  üèÜ Mejor √©poca: {best_epoch['epoch']} (Val Loss: {best_epoch['val_loss']:.4f})")

      # Guardar historial
      with open('./training_history.json', 'w') as f:
          json.dump(history, f, indent=2)
      print(f"üìÑ Historial guardado en: ./training_history.json")

  except KeyboardInterrupt:
      print("\n‚èπÔ∏è Entrenamiento interrumpido por el usuario")

  except Exception as e:
      print(f"\n‚ùå Error durante el entrenamiento: {e}")
      import traceback
      traceback.print_exc()

  finally:
      # Limpieza final
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      gc.collect()
      print("üßπ Limpieza de memoria completada")


üöÄ INICIANDO ENTRENAMIENTO DEL MODELO H√çBRIDO OPTIMIZADO
üîç Verificaci√≥n pre-entrenamiento:
  ‚úÖ Modelo: HybridNLLBByT5Model
  ‚úÖ Device: cuda
  ‚úÖ Datos train: 200000
  ‚úÖ Datos val: 100000
  ‚úÖ Optimizador: AdamW
  ‚úÖ Scheduler: LambdaLR

üéØ Iniciando entrenamiento...
üöÄ Ejecutando train_hybrid_model...
üéØ Iniciando entrenamiento optimizado...
  üìä Dataset con 200000 pares
  üî§ Tokenizer: ByT5Tokenizer
  üìè Max length: 128
  üìä Dataset con 100000 pares
  üî§ Tokenizer: ByT5Tokenizer
  üìè Max length: 128
üîß DataLoader configurado para Colab
üîß DataLoader configurado para Colab
üìä Configuraci√≥n de entrenamiento:
  üìà √âpocas: 10
  üéØ Batch size: 4
  üìä Gradient accumulation: 4
  ‚ö° Mixed precision: ‚úÖ

üîÑ √âPOCA 1/10


√âpoca 1:   0%|          | 0/50000 [00:00<?, ?it/s]


üìä Ejecutando validaci√≥n...


Validando:   0%|          | 0/25000 [00:00<?, ?it/s]


üìà RESULTADOS √âPOCA 1:
  üèãÔ∏è Train Loss: nan
  üìä Val Loss: nan
  ‚è±Ô∏è Tiempo: 91.07 min
  üìà Learning Rate: 9.77e-06
  ‚è≥ Paciencia: 1/3
  üíæ GPU Memory: 6.79GB usado, 7.82GB reservada

üîÑ √âPOCA 2/10


√âpoca 2:   0%|          | 0/50000 [00:00<?, ?it/s]


üìä Ejecutando validaci√≥n...


Validando:   0%|          | 0/25000 [00:00<?, ?it/s]


üìà RESULTADOS √âPOCA 2:
  üèãÔ∏è Train Loss: nan
  üìä Val Loss: nan
  ‚è±Ô∏è Tiempo: 88.75 min
  üìà Learning Rate: 9.52e-06
  ‚è≥ Paciencia: 2/3
  üíæ GPU Memory: 6.79GB usado, 7.82GB reservada

üîÑ √âPOCA 3/10


√âpoca 3:   0%|          | 0/50000 [00:00<?, ?it/s]


üìä Ejecutando validaci√≥n...


Validando:   0%|          | 0/25000 [00:00<?, ?it/s]


üìà RESULTADOS √âPOCA 3:
  üèãÔ∏è Train Loss: nan
  üìä Val Loss: nan
  ‚è±Ô∏è Tiempo: 88.63 min
  üìà Learning Rate: 9.27e-06
  ‚è≥ Paciencia: 3/3

üõë Early stopping activado (sin mejora en 3 √©pocas)

üéâ ¬°ENTRENAMIENTO COMPLETADO!
üèÜ Mejor Val Loss: inf
üìä √âpocas entrenadas: 3

üéâ ¬°ENTRENAMIENTO COMPLETADO EXITOSAMENTE!

üìä M√âTRICAS FINALES:
  üèãÔ∏è Train Loss final: nan
  üìä Val Loss final: nan
  üìà √âpocas completadas: 3
  ‚è±Ô∏è Tiempo total: 268.45 min
  üèÜ Mejor √©poca: 1 (Val Loss: nan)
üìÑ Historial guardado en: ./training_history.json
üßπ Limpieza de memoria completada


 Celda 14: Pruebas y Evaluaci√≥n del Modelo

In [15]:

  # ============================================================================
  # PRUEBAS Y EVALUACI√ìN DEL MODELO ENTRENADO
  # ============================================================================

  def test_model_translations(model, tokenizer, test_sentences=None):
      """Probar el modelo con oraciones de ejemplo"""

      if test_sentences is None:
          test_sentences = [
              "Hello, how are you today?",
              "The weather is beautiful.",
              "I love machine learning.",
              "Good morning!",
              "Thank you very much.",
              "Where is the library?",
              "I need help with this problem.",
              "What time is it?",
              "How much does this cost?",
              "I don't understand."
          ]

      model.eval()
      print("üß™ Probando traducciones del modelo...")
      print("-" * 60)

      with torch.no_grad():
          for i, source_text in enumerate(test_sentences, 1):
              try:
                  # Tokenizar entrada
                  inputs = tokenizer(
                      source_text,
                      return_tensors="pt",
                      padding=True,
                      truncation=True,
                      max_length=config.max_length
                  ).to(device)

                  # Generar traducci√≥n
                  if hasattr(model, 'generate_hybrid'):
                      # Usar generaci√≥n h√≠brida si est√° disponible
                      outputs = model.generate_hybrid(
                          input_ids=inputs['input_ids'],
                          attention_mask=inputs['attention_mask'],
                          max_length=config.max_length,
                          num_beams=4,
                          temperature=0.7,
                          do_sample=True,
                          early_stopping=True,
                          source_lang='en',
                          target_lang='es'
                      )
                  else:
                      # Usar generaci√≥n est√°ndar
                      outputs = model.byt5_model.generate(
                          input_ids=inputs['input_ids'],
                          attention_mask=inputs['attention_mask'],
                          max_length=config.max_length,
                          num_beams=2,
                          early_stopping=True
                      )

                  # Decodificar resultado
                  translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

                  print(f"{i:2d}. EN: {source_text}")
                  print(f"    ES: {translation}")
                  print()

              except Exception as e:
                  print(f"{i:2d}. ‚ùå Error traduciendo '{source_text}': {e}")
                  print()

  def calculate_model_size(model):
      """Calcular tama√±o del modelo en MB"""
      param_size = 0
      buffer_size = 0

      for param in model.parameters():
          param_size += param.nelement() * param.element_size()

      for buffer in model.buffers():
          buffer_size += buffer.nelement() * buffer.element_size()

      size_mb = (param_size + buffer_size) / 1024**2
      return size_mb

  # Ejecutar pruebas si el entrenamiento fue exitoso
  if 'history' in locals() and history:
      print("üéØ Ejecutando pruebas del modelo entrenado...")

      # Informaci√≥n del modelo
      model_size_mb = calculate_model_size(model)
      print(f"üìä Tama√±o del modelo: {model_size_mb:.2f} MB")

      # Pruebas de traducci√≥n
      test_model_translations(model, tokenizer)

      # Cargar mejor checkpoint si existe
      best_checkpoint_path = "./checkpoints/latest_model.pt"
      if os.path.exists(best_checkpoint_path):
          print("üìÇ Cargando mejor checkpoint para pruebas...")
          try:
              checkpoint = torch.load(best_checkpoint_path, map_location=device)
              model.load_state_dict(checkpoint['model_state_dict'])
              print("‚úÖ Mejor checkpoint cargado")

              print("\nüèÜ Traducciones con el MEJOR modelo:")
              test_model_translations(model, tokenizer)

          except Exception as e:
              print(f"‚ö†Ô∏è Error cargando checkpoint: {e}")

  else:
      print("‚ö†Ô∏è No se puede probar el modelo - entrenamiento no completado")


üéØ Ejecutando pruebas del modelo entrenado...
üìä Tama√±o del modelo: 3532.61 MB
üß™ Probando traducciones del modelo...
------------------------------------------------------------
üåç Generando con NLLB (en->es)
 1. ‚ùå Error traduciendo 'Hello, how are you today?': bytes must be in range(0, 256)

üåç Generando con NLLB (en->es)
 2. ‚ùå Error traduciendo 'The weather is beautiful.': bytes must be in range(0, 256)

üåç Generando con NLLB (en->es)
 3. EN: I love machine learning.
    ES:     ll  lloo  llooov    lloooooooooooooooovvv                           lll      lllloooooooooooooooooooooooooooooooooooooooovv

üåç Generando con NLLB (en->es)
 4. ‚ùå Error traduciendo 'Good morning!': bytes must be in range(0, 256)

üåç Generando con NLLB (en->es)
 5. ‚ùå Error traduciendo 'Thank you very much.': bytes must be in range(0, 256)

üåç Generando con NLLB (en->es)
 6. ‚ùå Error traduciendo 'Where is the library?': bytes must be in range(0, 256)

üåç Generando con NLLB (en->es)

  Celda 15: Guardar Modelo Final

In [None]:

  # ============================================================================
  # GUARDAR MODELO FINAL PARA PRODUCCI√ìN
  # ============================================================================

  def save_final_model(model, tokenizer, config, save_dir="./final_model"):
      """Guardar modelo final optimizado para producci√≥n"""

      print(f"üíæ Guardando modelo final en: {save_dir}")
      os.makedirs(save_dir, exist_ok=True)

      try:
          # Guardar estado del modelo
          model_path = os.path.join(save_dir, "model.pt")
          torch.save({
              'model_state_dict': model.state_dict(),
              'config': config.__dict__,
              'model_type': 'HybridNLLBByT5Model',
              'pytorch_version': torch.__version__,
              'save_timestamp': datetime.now().isoformat()
          }, model_path)

          # Guardar tokenizer
          tokenizer.save_pretrained(os.path.join(save_dir, "tokenizer"))

          # Guardar configuraci√≥n como JSON
          config_path = os.path.join(save_dir, "config.json")
          with open(config_path, 'w') as f:
              json.dump(config.__dict__, f, indent=2)

          # Guardar informaci√≥n del modelo
          model_info = {
              'model_name': 'HybridNLLBByT5Model',
              'base_models': {
                  'primary': config.byt5_model_name,
                  'secondary': config.nllb_model_name if config.use_nllb_for_inference else None
              },
              'training_info': {
                  'epochs_trained': len(history) if 'history' in locals() else 0,
                  'final_train_loss': history[-1]['train_loss'] if 'history' in locals() and history else None,
                  'final_val_loss': history[-1]['val_loss'] if 'history' in locals() and history else None,
                  'best_val_loss': min(h['val_loss'] for h in history) if 'history' in locals() and history else None
              },
              'model_size_mb': calculate_model_size(model),
              'supported_languages': list(SUPPORTED_LANGUAGES.keys()),
              'optimizations': {
                  'mixed_precision': config.use_mixed_precision,
                  'gradient_accumulation': config.gradient_accumulation_steps,
                  'nllb_in_training': config.use_nllb_in_training,
                  'nllb_for_inference': config.use_nllb_for_inference
              }
          }

          info_path = os.path.join(save_dir, "model_info.json")
          with open(info_path, 'w') as f:
              json.dump(model_info, f, indent=2)

          # Listar archivos guardados
          saved_files = os.listdir(save_dir)
          print("üìÑ Archivos guardados:")
          for file in saved_files:
              file_path = os.path.join(save_dir, file)
              if os.path.isfile(file_path):
                  size_mb = os.path.getsize(file_path) / (1024**2)
                  print(f"  üìÑ {file}: {size_mb:.2f} MB")

          print(f"‚úÖ Modelo final guardado exitosamente en: {save_dir}")
          return save_dir

      except Exception as e:
          print(f"‚ùå Error guardando modelo final: {e}")
          return None

  # Guardar modelo final si el entrenamiento fue exitoso
  if 'model' in locals() and model is not None:
      final_model_path = save_final_model(model, tokenizer, config)

      if final_model_path:
          print(f"\nüéâ ¬°MODELO GUARDADO EXITOSAMENTE!")
          print(f"üìÅ Ubicaci√≥n: {final_model_path}")
          print("\nüìã Para usar el modelo:")
          print("1. Carga el modelo: torch.load('final_model/model.pt')")
          print("2. Carga el tokenizer desde: 'final_model/tokenizer'")
          print("3. Revisa la configuraci√≥n en: 'final_model/config.json'")

  else:
      print("‚ö†Ô∏è No hay modelo para guardar")


üíæ Guardando modelo final en: ./final_model


## **USER GUIDE**


In [None]:
print("""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë          HYBRID NLLB-ByT5 TRANSLATION MODEL - USER GUIDE         ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

üìö C√ìMO USAR ESTE NOTEBOOK:

1Ô∏è‚É£ PREPARACI√ìN:
   - Aseg√∫rate de tener GPU disponible (recomendado)
   - Instala todas las dependencias (Celda 1)

2Ô∏è‚É£ DATOS DE ENTRENAMIENTO:
   - Modifica la funci√≥n prepare_training_data() en Celda 6
   - Formato: (texto_origen, texto_destino, idioma_origen, idioma_destino)
   - Para NLLB, usa c√≥digos como: 'eng_Latn', 'spa_Latn', etc.

3Ô∏è‚É£ CONFIGURACI√ìN:
   - Ajusta hiperpar√°metros en HybridTranslationConfig (Celda 3)
   - Batch size, learning rate, epochs, etc.

4Ô∏è‚É£ ENTRENAMIENTO:
   - Ejecuta main_training_pipeline() (Celda 20) para proceso completo
   - O entrena paso a paso ejecutando celdas individuales

5Ô∏è‚É£ AGREGAR NUEVOS IDIOMAS:
   - Usa add_new_language_support() (Celda 12)
   - Proporciona pares de entrenamiento para el nuevo idioma

6Ô∏è‚É£ INFERENCIA:
   - Usa model.generate_translation() para traducir
   - O usa interactive_translation() para interfaz interactiva

üìä IDIOMAS SOPORTADOS POR NLLB-200:
   - 200+ idiomas incluyendo:
     ‚Ä¢ Principales: Ingl√©s, Espa√±ol, Franc√©s, Alem√°n, Chino, etc.
     ‚Ä¢ Regionales: Catal√°n, Gallego, Euskera, etc.
     ‚Ä¢ Minoritarios: Muchos idiomas con pocos recursos

   Lista completa: https://github.com/facebookresearch/fairseq/tree/nllb

‚öôÔ∏è PERSONALIZACI√ìN:
   - Puedes cambiar el modelo NLLB base por versiones m√°s grandes
   - Ajusta el modelo ByT5 (small, base, large)
   - Modifica las capas de fusi√≥n seg√∫n tus necesidades

‚ö†Ô∏è CONSIDERACIONES:
   - El modelo distilled NLLB-200 usa ~2.4GB de VRAM
   - ByT5-small usa ~1.2GB adicionales
   - Con batch_size=4, necesitas ~8GB de VRAM m√≠nimo
   - Para modelos m√°s grandes, ajusta batch_size y gradient_accumulation

üí° PR√ìXIMOS PASOS:
   1. Entrenar con m√°s datos (m√≠nimo 10k pares por idioma)
   2. Fine-tuning espec√≠fico por dominio
   3. Implementar beam search para mejor calidad
   4. Agregar post-procesamiento espec√≠fico por idioma
   5. Crear API REST para servir el modelo

üìß SOPORTE:
   Si tienes problemas, verifica:
   - Versiones de librer√≠as
   - Memoria GPU disponible
   - Formato correcto de datos
   - C√≥digos de idioma v√°lidos

¬°Buena suerte con tu modelo de traducci√≥n multiling√ºe! üåçüöÄ
""")

# Verificar estado final
print("\n" + "="*60)
print("ESTADO DEL SISTEMA:")
print(f"  ‚Ä¢ GPU disponible: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  ‚Ä¢ GPU: {torch.cuda.get_device_name(0)}")
    print(f"  ‚Ä¢ VRAM libre: {torch.cuda.mem_get_info()[0]/1e9:.2f} GB")
print(f"  ‚Ä¢ Modelos cargados: ‚úì")
print(f"  ‚Ä¢ Listo para entrenamiento: ‚úì")
print("="*60)