In [None]:
                
import warnings
warnings.filterwarnings("ignore")

import os
import wandb
from math import ceil
from transformers import (
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

# wandb_dir = os.path.join(os.getcwd(), 'wandb_runs')
# os.makedirs(wandb_dir, exist_ok=True)

# Import our custom modules
from config import Config
from metrics import create_compute_metrics
from data_utils import set_seeds, load_and_prepare_dataset, preprocess_dataset
from utils import clear_memory, create_directories, safe_training_check, save_model_safe
from train import setup_model, create_training_args, create_callbacks

def main():
    """Run the Nepali GEC fine-tuning pipeline end to end.

    In order, this:

    1. builds a ``Config`` and prints a short banner (model id, LoRA flag,
       sample count),
    2. calls ``set_seeds``, ``clear_memory`` and ``create_directories``,
    3. closes any W&B run that is still open and starts a new one,
    4. loads the dataset, builds the model/tokenizer and preprocesses the data,
    5. builds training arguments, data collator, metrics and callbacks and
       wires them into a ``Seq2SeqTrainer``,
    6. runs ``safe_training_check`` and aborts if it reports failure,
    7. trains (resuming from the latest checkpoint when
       ``config.resume_from_checkpoint`` is truthy),
    8. saves the model and tokenizer under ``<output_dir>/best_model``.

    The W&B run is always closed, whatever path leaves the function. It is
    marked successful (exit code 0) only when the model has been saved;
    every other exit (failed safety check, training error, unexpected
    exception) marks the run as failed.

    Returns:
        None. A failed safety check or a training exception is reported on
        stdout and the function returns early instead of raising. Exceptions
        raised by the setup steps or by saving still propagate.
    """
    # Local import: only needed on the training-failure path.
    import traceback

    # Load config
    config = Config()

    banner = "=" * 60
    print(banner)
    print("🚀 Nepali Grammar Error Correction Training")
    print(banner)
    print(f"Model: {config.model_id}")
    print(f"LoRA: {config.use_lora}")
    print(f"Samples: {config.num_samples or 'Full dataset'}")
    print(banner)

    # Setup
    set_seeds(config.seed)
    clear_memory()
    create_directories(config.output_dir)

    # Initialize wandb. finish() first so a run left open by a previous
    # execution (e.g. re-running this cell) does not get reused.
    wandb.finish()
    wandb.init(
        project=config.wandb_project,
        config=vars(config),
        # dir=wandb_dir
    )
    run_id = wandb.run.id

    # Assume failure until the model has actually been saved, so that every
    # early exit below reports a failed run to W&B.
    exit_code = 1
    try:
        # Load data
        dataset = load_and_prepare_dataset(config)

        # Setup model
        model, tokenizer = setup_model(config)

        # Preprocess
        dataset_encoded = preprocess_dataset(dataset, tokenizer, config)

        # Create training args
        training_args = create_training_args(config, dataset_encoded, run_id)

        # Data collator
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            padding=True
        )

        # Create metrics
        compute_metrics = create_compute_metrics(tokenizer, config)

        # Create callbacks (these receive the raw, un-encoded dataset)
        callbacks = create_callbacks(config, tokenizer, dataset)

        # Create trainer
        # NOTE(review): recent transformers releases deprecate `tokenizer=`
        # in favour of `processing_class=` - confirm against the pinned version.
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset_encoded["train"],
            eval_dataset=dataset_encoded["valid"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
        )

        # Safety check
        if not safe_training_check(trainer):
            print("\n❌ Safety checks failed! Fix issues before training.")
            return

        # Train!
        print("\n" + banner)
        print("🏋️  Starting training...")
        print(banner)

        try:
            if config.resume_from_checkpoint:
                print("continuing training from latest checkpoint.....")
                trainer.train(resume_from_checkpoint=True)
            else:
                trainer.train()
            print("\n✅ Training complete!")
        except Exception as e:
            # Top-level boundary: report and bail out rather than crash, but
            # keep the traceback so the failure can actually be diagnosed.
            print(f"\n❌ Training failed: {e}")
            traceback.print_exc()
            return

        # Save model
        best_model_path = os.path.join(config.output_dir, "best_model")
        save_model_safe(model, tokenizer, best_model_path, use_lora=config.use_lora)

        print(f"\n🎉 All done! Model saved to {best_model_path}")
        exit_code = 0
    finally:
        # Always close the run; a non-zero exit code marks it as failed.
        wandb.finish(exit_code=exit_code)

# Entry point: run the training pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


🚀 Nepali Grammar Error Correction Training
Model: google/mt5-small
LoRA: True
Samples: 15
✅ Seeds set to 42
Memory cleared
Directories created in c:\Users\Lenovo\Desktop\Nepali_GEC\nepali_gec\outputs


[34m[1mwandb[0m: Currently logged in as: [33mlsumit008[0m ([33mlsumit008-khwopa-college-of-engineering[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python313\Lib\logging\__init__.py", line 1153, in emit
    stream.write(msg + self.terminator)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python313\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode characters in position 187-191: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Lenovo\Desktop\Nepali_GEC\nepali_gec\myenv\Lib\site-packages\i


📚 Loading dataset: sumitaryal/nepali_grammatical_error_correction
  Using 15 samples
  Train: 13 samples
  Valid: 2 samples

 Lodaing model: google/mt5-small
 Using LoRA + 8-bit quantization
trainable params: 1,376,256 || all params: 301,553,024 || trainable%: 0.4564

⚙️  Preprocessing dataset...


Map: 100%|██████████| 13/13 [00:00<00:00, 277.89 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 89.28 examples/s]

  ✅ Preprocessing complete

📊 Training plan:
  Steps per epoch: 1
  Total steps: 3
  Warmup steps: 0
Enabled metrics:bleu, chrf, gleu, correction_accuracy





✅ Added EarlyStoppingCallback (patience=3)
✅ Added SamplePredictionCallback (num_samples=5)
✅ Added MemoryCleanupCallback

 Running pre-training safety checks...
Model device: cuda:0
Train dataset size: 13
Eval dataset size: 2
 Data loading works
 Performing evaluation check...


🔍 Sample - Pred: '<extra_id_0>...' | Ref: 'फर्कने हो उतै?...' | Match: False
📊 Logged 5 sample predictions to W&B
 Evaluation successful
Initial metrics: {'eval_loss': 10.18242073059082, 'eval_model_preparation_time': 0.0025, 'eval_bleu': 0.0, 'eval_gleu': 0.0, 'eval_chrf': 0.0, 'eval_correction_accuracy': 0.0, 'eval_runtime': 6.2777, 'eval_samples_per_second': 0.319, 'eval_steps_per_second': 0.159}

🏋️  Starting training...


Epoch,Training Loss,Validation Loss,Model Preparation Time,Bleu,Gleu,Chrf,Correction Accuracy
1,21.0346,9.093679,0.0025,0.0,0.0,0.0,0.0
2,24.705,8.171597,0.0025,0.0,0.0,0.0,0.0
3,25.7782,8.344097,0.0025,0.0,0.0,0.0,0.0


🔍 Sample - Pred: '<extra_id_0>...' | Ref: 'फर्कने हो उतै?...' | Match: False
📊 Logged 5 sample predictions to W&B
🔍 Sample - Pred: '<extra_id_0>...' | Ref: 'फर्कने हो उतै?...' | Match: False
📊 Logged 5 sample predictions to W&B
🔍 Sample - Pred: '<extra_id_0>...' | Ref: 'फर्कने हो उतै?...' | Match: False
📊 Logged 5 sample predictions to W&B

✅ Training complete!

💾 Saving model to c:\Users\Lenovo\Desktop\Nepali_GEC\nepali_gec\outputs/best_model...


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


  ✅ LoRA adapter saved
  ✅ Tokenizer saved

🎉 All done! Model saved to c:\Users\Lenovo\Desktop\Nepali_GEC\nepali_gec\outputs/best_model


0,1
epoch,▁▅█
eval/bleu,▁▁▁▁
eval/chrf,▁▁▁▁
eval/correction_accuracy,▁▁▁▁
eval/gleu,▁▁▁▁
eval/loss,█▄▁▂
eval/model_preparation_time,▁▁▁▁
eval/runtime,▁▂█▅
eval/samples_per_second,█▇▁▄
eval/steps_per_second,█▇▁▄

0,1
epoch,3
eval/bleu,0
eval/chrf,0
eval/correction_accuracy,0
eval/gleu,0
eval/loss,8.3441
eval/model_preparation_time,0.0025
eval/runtime,6.8666
eval/samples_per_second,0.291
eval/steps_per_second,0.146
