Fine-tuning the mT5 model on Sumit Aryal's `nepali_grammatical_error_correction` dataset from the Hugging Face Hub.

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import MT5Tokenizer
import torch
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import random
# Seed Python's `random`, NumPy and PyTorch (CPU generator plus all CUDA devices)
# with the same value, for reproducibility.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(100)

In [None]:

# Load the Nepali grammatical-error-correction dataset; the result is bound to `ds`.
ds = load_dataset("sumitaryal/nepali_grammatical_error_correction")

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 7723971
    })
    valid: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 406525
    })
})

In [None]:
# The dataset ships only `train` and `valid` splits, so hold out 5% of `train` as a
# test split. This gives train, test and valid splits, with test and valid being
# approximately the same size. The split seed is fixed so the partition is repeatable.

dataset = ds["train"].train_test_split(test_size=0.05, seed=42)

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 7337772
    })
    test: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 386199
    })
})

In [None]:
# Attach the provided validation split so `dataset` holds train, test and valid.
dataset["valid"] = ds["valid"]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 7337772
    })
    test: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 386199
    })
    valid: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 406525
    })
})

In [None]:
# # For each split, keep a 0.1% subsample (test_size=0.001) using train_test_split

# from datasets import DatasetDict

# dataset = DatasetDict({
#     'train': dataset['train'].train_test_split(test_size=0.001, seed=42)['test'],
#     'test': dataset['test'].train_test_split(test_size=0.001, seed=42)['test'],
#     'valid': dataset['valid'].train_test_split(test_size=0.001, seed=42)['test']
# })

# print(dataset)

In [None]:
dataset["train"].features

{'incorrect_sentence': Value('string'), 'correct_sentence': Value('string')}

Dataset Exploration

In [None]:
# set dataset to dataframe because we can use high level apis for data visualization
# dataset.set_format(type="pandas")
# df = dataset["train"][:]
# df.head(10)

In [None]:
# Count ALL duplicate occurrences (including first)
# print(f"All duplicate rows: {df['correct_sentence'].duplicated(keep=False).sum()}")

# # Count the duplicate copies (every occurrence after the first in each duplicate group)
# unique_duplicates = df['correct_sentence'].duplicated().sum()
# print(f"Duplicate copies: {unique_duplicates}")

# # See which values are duplicated
# duplicate_values = df[df['correct_sentence'].duplicated(keep=False)]['correct_sentence'].unique()
# print(f"Number of unique sentences that have duplicates: {len(duplicate_values)}")

# unique_correct_sentences = df['correct_sentence'].unique()
# print(f"Number of unique correct sentences: {len(unique_correct_sentences)}")

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd

# # Length distribution of incorrect vs correct sentences
# incorrect_lengths = [len(sent.split()) for sent in df['incorrect_sentence']]
# correct_lengths = [len(sent.split()) for sent in df['correct_sentence']]

# plt.figure(figsize=(12, 4))

# # Use histplot or kdeplot instead of barplot for distribution visualization
# plt.subplot(1, 2, 1)
# sns.histplot(incorrect_lengths, label='Incorrect', alpha=0.7, kde=True, color='red')
# sns.histplot(correct_lengths, label='Correct', alpha=0.7, kde=True, color='blue')
# plt.legend()
# plt.title('Sentence Length Distribution')
# plt.xlabel('Number of Words')
# plt.ylabel('Frequency')

# plt.subplot(1, 2, 2)
# sns.boxplot(data=[incorrect_lengths, correct_lengths])
# plt.xticks([0, 1], ['Incorrect', 'Correct'])
# plt.title('Sentence Length Boxplot')
# plt.ylabel('Number of Words')

# plt.tight_layout()
# plt.show()

In [None]:
# resetting dataset format since we don't need df format anymore
# dataset.reset_format()

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Define tokenizer and model

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint name shared by the tokenizer and the seq2seq model.
model_ckpt = "google/mt5-small"

# Tokenizer for the checkpoint (the fast implementation is requested).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

# Load the pretrained weights, then move the model onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model = model.to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
text = "‡§§‡•ç‡§Ø‡§∏‡•à‡§≤‡•á ‡§â‡§π‡§æ‡§Å ‡§Ø‡•ã ‡§Æ‡•Å‡§¶‡•ç‡§¶‡§æ‡§Æ‡§æ ‡§á‡§ú‡§≤‡§æ‡§∏‡§¨‡§æ‡§ü ‡§π‡§ü‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ ‡•§"
# Encode the sample sentence and show the raw encoding (input ids + attention mask).
encoded_text = tokenizer(text)
print(encoded_text)
# Map the ids back to sub-word tokens to see how the sentence was segmented.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)
# Join the tokens back into a single string.
print(tokenizer.convert_tokens_to_string(tokens))
# Last expression of the cell: displays the tokenizer's vocabulary size.
tokenizer.vocab_size

{'input_ids': [12278, 43508, 1048, 2139, 13345, 19650, 39688, 4096, 1195, 259, 145610, 144308, 9941, 259, 68409, 157242, 259, 378, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['‚ñÅ‡§§‡•ç‡§Ø', '‡§∏‡•à', '‡§≤‡•á', '‚ñÅ‡§â', '‡§π‡§æ‡§Å', '‚ñÅ‡§Ø‡•ã', '‚ñÅ‡§Æ‡•Å‡§¶‡•ç', '‡§¶‡§æ', '‡§Æ‡§æ', '‚ñÅ', '‡§á‡§ú', '‡§≤‡§æ‡§∏', '‡§¨‡§æ‡§ü', '‚ñÅ', '‡§π‡§ü', '‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ', '‚ñÅ', '‡•§', '</s>']
‡§§‡•ç‡§Ø‡§∏‡•à‡§≤‡•á ‡§â‡§π‡§æ‡§Å ‡§Ø‡•ã ‡§Æ‡•Å‡§¶‡•ç‡§¶‡§æ‡§Æ‡§æ ‡§á‡§ú‡§≤‡§æ‡§∏‡§¨‡§æ‡§ü ‡§π‡§ü‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ ‡•§</s>


250100

In [None]:
dataset["train"][:2]


{'incorrect_sentence': ['‡§Ü‡§´‡•ç‡§®‡•ã ‡§®‡§ø‡§∑‡•ç‡§†‡§æ ‡§®‡§õ‡•ã‡§°‡•Ä ‡•§',
  '‡§´‡•à‡§≤‡§ø‡§Å‡§¶‡•ã ‡§ö‡§ø‡§Ø‡§æ ‡§ñ‡•á‡§§‡•Ä ‡§∞ ‡§ò‡§ü‡•ç‡§¶‡•ã ‡§ï‡§æ‡§Æ‡§¶‡§æ‡§∞‡§ï‡§æ ‡§ï‡§æ‡§∞‡§£ ‡§ñ‡•á‡§§‡•Ä‡§µ‡§æ‡§≤‡§æ‡§≤‡•á ‡§®‡•à‡§∏‡§¨‡•à ‡§ï‡§æ‡§Æ ‡§ó‡§∞‡•ç‡§® ‡§≠‡§è‡§ï‡§æ‡§≤‡•á ‡§ï‡•É‡§∑‡§ï‡§ï‡•ã ‡§µ‡•ç‡§Ø‡§∏‡•ç‡§§‡§§‡§æ ‡•ß‡•® ‡§Æ‡§π‡§ø‡§®‡§æ ‡§®‡•à ‡§â‡§∏‡•ç‡§§‡•à ‡§õ ‡•§'],
 'correct_sentence': ['‡§Ü‡§´‡•ç‡§®‡•ã ‡§®‡§ø‡§∑‡•ç‡§†‡§æ ‡§®‡§õ‡•ã‡§°‡•Ä ‡§¨‡§∏‡•ç‡§®‡•Å‡§≠‡§Ø‡•ã ‡•§',
  '‡§´‡•à‡§≤‡§ø‡§Å‡§¶‡•ã ‡§ö‡§ø‡§Ø‡§æ ‡§ñ‡•á‡§§‡•Ä ‡§∞ ‡§ò‡§ü‡•ç‡§¶‡•ã ‡§ï‡§æ‡§Æ‡§¶‡§æ‡§∞‡§ï‡§æ ‡§ï‡§æ‡§∞‡§£ ‡§ñ‡•á‡§§‡•Ä‡§µ‡§æ‡§≤‡§æ‡§≤‡•á ‡§®‡•à‡§∏‡§¨‡•à ‡§ï‡§æ‡§Æ ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§®‡•á ‡§≠‡§è‡§ï‡§æ‡§≤‡•á ‡§ï‡•É‡§∑‡§ï‡§ï‡•ã ‡§µ‡•ç‡§Ø‡§∏‡•ç‡§§‡§§‡§æ ‡•ß‡•® ‡§Æ‡§π‡§ø‡§®‡§æ ‡§®‡•à ‡§â‡§∏‡•ç‡§§‡•à ‡§õ ‡•§']}

In [None]:

def tokenize(batch, max_length=128):
    """Tokenize a batch of (incorrect, correct) sentence pairs for seq2seq training.

    Args:
        batch: Mapping with the keys "incorrect_sentence" (model inputs) and
            "correct_sentence" (targets), each holding a list of strings.
        max_length: Maximum number of tokens kept per sentence. It has to be
            given explicitly: with ``truncation=True`` alone the tokenizer has
            no limit to truncate to and silently falls back to no truncation.

    Returns:
        The encoding of the incorrect sentences, with an extra "labels" entry
        holding the token ids of the corresponding correct sentences.
    """
    # No padding at tokenization time: sentences keep their natural length.
    input_encodings = tokenizer(
        batch["incorrect_sentence"],
        padding=False,
        truncation=True,
        max_length=max_length,
    )
    target_encodings = tokenizer(
        batch["correct_sentence"],
        padding=False,
        truncation=True,
        max_length=max_length,
    )

    # For seq2seq models the "labels" are the token ids of the target sequence.
    input_encodings["labels"] = target_encodings["input_ids"]

    return input_encodings


# Smoke-test the tokenize function on the first two training rows.
print(tokenize(dataset["train"][:2]))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[943, 10430, 14656, 259, 145937, 33541, 259, 863, 225461, 11085, 259, 378, 1], [6389, 4439, 23717, 7929, 53069, 29847, 2075, 55208, 2376, 259, 996, 24981, 99474, 9780, 8520, 18413, 1770, 259, 13576, 55208, 2376, 38961, 1048, 259, 9357, 2312, 47161, 8520, 11435, 259, 15246, 68448, 67611, 111238, 91753, 13378, 1114, 259, 67522, 59051, 1437, 259, 9357, 2139, 62909, 2237, 259, 378, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[943, 10430, 14656, 259, 145937, 33541, 259, 863, 225461, 11085, 259, 208662, 68782, 259, 378, 1], [6389, 4439, 23717, 7929, 53069, 29847, 2075, 55208, 2376, 259, 996, 24981, 99474, 9780, 8520, 18413, 1770, 259, 13576, 55208, 2376, 38961, 1048, 259, 9357, 2312, 47161, 8520, 2661, 121003, 259, 15246, 68448, 67611, 111238, 91753, 13378, 1114, 259, 67522, 59051, 1437, 259, 9357,

In [None]:
# Apply the tokenize function to every split of the corpus, 1000 rows per call.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=1000)

Map:   0%|          | 0/7337772 [00:00<?, ? examples/s]

Map:   0%|          | 0/386199 [00:00<?, ? examples/s]

Map:   0%|          | 0/406525 [00:00<?, ? examples/s]

In [None]:
dataset_encoded["train"].column_names

['incorrect_sentence',
 'correct_sentence',
 'input_ids',
 'attention_mask',
 'labels']

In [None]:
# PyTorch expects its inputs as tensors, so expose only the model columns in torch format.
# Tensors enable parallel computation on the GPU, optimized storage and operations,
# and automatic differentiation.
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# # ! pip install wandb
# import wandb
# wandb.login()

In [None]:

# ! pip install evaluate
# ! pip install sacrebleu
# ! pip install rouge_score

In [None]:
import numpy as np
import evaluate

# Load the metric objects once, at module level, so compute_metrics() reuses them
# on every evaluation instead of loading them again.
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
gleu_metric = evaluate.load("google_bleu")


def _token_accuracy(predictions, labels, pad_id):
    """Return the fraction of non-padding label positions predicted correctly.

    Predictions are cut or right-padded (with ``pad_id``) to the label length,
    so label tokens the model never produced count as errors instead of being
    dropped from the denominator.
    """
    aligned = np.full(labels.shape, pad_id, dtype=predictions.dtype)
    keep = min(predictions.shape[1], labels.shape[1])
    aligned[:, :keep] = predictions[:, :keep]

    mask = labels != pad_id
    total_tokens = int(np.sum(mask))
    if total_tokens == 0:
        return 0.0
    return float(np.sum((aligned == labels) & mask)) / total_tokens


def compute_metrics(eval_pred):
    """Compute token accuracy, BLEU, GLEU and ROUGE-L for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either a
            2-D array of token ids or a 3-D array of logits (optionally wrapped
            in a tuple); ``labels`` is a 2-D array of token ids in which -100
            marks ignored positions.

    Returns:
        Dict with the keys "eval_token_accuracy", "eval_BLEU", "eval_GLEU" and
        "eval_ROUGE-L". A text metric is reported as 0.0 when it fails or when
        there is no non-empty prediction/label pair.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return (logits, extra outputs...); keep the first element.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    pad_id = tokenizer.pad_token_id

    # 1. Convert logits to predicted token ids (only when logits were provided).
    from_logits = predictions.ndim == 3
    if from_logits:
        predictions = np.argmax(predictions, axis=-1)

    # 2. Replace -100 with the pad token id so the sequences can be decoded.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # 3. Generated sequences begin with the decoder start token, which shifts them
    #    one position to the right of the labels. Drop that column so positions
    #    line up. NOTE(review): assumes the start token is the pad token (the
    #    T5/mT5 convention) - confirm if the checkpoint changes.
    if not from_logits and predictions.shape[1] > 0 and np.all(predictions[:, 0] == pad_id):
        predictions = predictions[:, 1:]

    # 4. Decode the FULL sequences; cutting both sides to a common length here
    #    would truncate the text that the text metrics are computed on.
    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # DEBUG: show a few sample outputs.
    print("Sample predictions and labels:")
    for i in range(min(3, len(pred_texts))):
        print(f"Pred {i}: '{pred_texts[i]}'")
        print(f"Label {i}: '{label_texts[i]}'")
        print(f"Pred length: {len(pred_texts[i])}, Label length: {len(label_texts[i])}")
        print("---")

    # 5. Token accuracy over the non-padding label positions.
    token_accuracy = _token_accuracy(predictions, labels, pad_id)

    # Every key is always present so downstream logging sees a stable schema.
    metrics = {
        "eval_token_accuracy": token_accuracy,
        "eval_BLEU": 0.0,
        "eval_GLEU": 0.0,
        "eval_ROUGE-L": 0.0,
    }

    # 6. Drop pairs where either side decoded to an empty string.
    non_empty_pairs = [
        (p.strip(), l.strip())
        for p, l in zip(pred_texts, label_texts)
        if p.strip() and l.strip()
    ]
    if not non_empty_pairs:
        return metrics

    preds = [p for p, _ in non_empty_pairs]
    refs = [r for _, r in non_empty_pairs]
    nested_refs = [[r] for r in refs]

    # Each metric is best-effort: a failure is reported and scored as 0.0 so one
    # broken metric does not abort the evaluation.
    try:
        bleu = bleu_metric.compute(predictions=preds, references=nested_refs)
        metrics["eval_BLEU"] = bleu["score"]
    except Exception as e:
        print(f"BLEU error: {e}")
        metrics["eval_BLEU"] = 0.0

    try:
        gleu = gleu_metric.compute(predictions=preds, references=nested_refs)
        metrics["eval_GLEU"] = gleu["google_bleu"]
    except Exception as e:
        print(f"GLEU error: {e}")
        metrics["eval_GLEU"] = 0.0

    try:
        # The default ROUGE tokenizer keeps only ASCII letters and digits, which
        # discards Devanagari text entirely; split on whitespace instead.
        rouge = rouge_metric.compute(predictions=preds, references=refs, tokenizer=str.split)
        metrics["eval_ROUGE-L"] = rouge["rougeL"]
    except Exception as e:
        print(f"ROUGE error: {e}")
        metrics["eval_ROUGE-L"] = 0.0

    return metrics


In [None]:
#  When a batch is prepared, the decoder inputs are built by shifting the labels to
#  the right by one, and the padding tokens in the labels are set to -100 so that the
#  loss function ignores them. We don't have to do this manually, though:
#  DataCollatorForSeq2Seq comes to the rescue and takes care of all these steps
#  for us.
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True) # dynamic padding to longest in batch
# No need to pad during tokenization; it would only waste memory.

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, TrainerCallback
import os
import wandb

# Start the Weights & Biases run that the trainer reports to (see `report_to` below).
wandb.init(project="nepali-grammar-correction", name="mt5-nepali")

# --- Hyper-parameters -------------------------------------------------------
batch_size = 16
num_train_epochs = 5
gradient_accumulation_steps = 2
learning_rate = 5e-5
weight_decay = 0.01
lr_scheduler_type = "linear"

# Optimizer steps per epoch: one step consumes
# batch_size * gradient_accumulation_steps examples.
steps_per_epoch = len(dataset_encoded["train"]) // (batch_size * gradient_accumulation_steps)
logging_steps = max(1, steps_per_epoch // 20)            # log about 20 times per epoch
num_training_steps = steps_per_epoch * num_train_epochs
warmup_steps = int(0.05 * num_training_steps)            # warm up over the first 5% of training

model_name = f"{model_ckpt}-finetuned-gec"

# Create the output directories up front.
os.makedirs(f"../outputs/checkpoints/{model_name}", exist_ok=True)
os.makedirs("../outputs/best_model", exist_ok=True)
os.makedirs("../outputs/logs", exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=f"../outputs/checkpoints/{model_name}",
    num_train_epochs=num_train_epochs,

    # Memory optimization:
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,  # effective batch = 16 * 2 = 32 per device
    # Mixed precision stays disabled.
    # NOTE(review): T5-family checkpoints are reported to be numerically
    # unstable in fp16 - confirm before enabling.
    fp16=False,

    # Logging & saving:
    logging_dir="../outputs/logs",
    logging_steps=logging_steps,     # use the value computed above instead of a hard-coded 10
    eval_strategy="epoch",           # evaluate once per epoch ("epochs" is not a valid value)
    # eval_steps=10000,
    save_strategy="epoch",           # checkpoint once per epoch; must match eval_strategy for best-model loading
    # save_steps=230000,
    save_total_limit=3,              # keep at most 3 checkpoints
    overwrite_output_dir=True,       # overwrite previous runs

    # Best-model selection:
    load_best_model_at_end=True,         # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",   # rank checkpoints by eval_loss
    greater_is_better=False,             # lower eval_loss is better

    # Optimization:
    warmup_steps=warmup_steps,           # gradually increase the LR at the start
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,

    # Seq2seq specific:
    predict_with_generate=True,      # evaluate on generated sequences rather than raw logits
    generation_max_length=128,       # maximum generated length
    generation_num_beams=1,          # 1 = greedy decoding; more beams is slower

    report_to="wandb",               # send logs to Weights & Biases
    run_name="mt5-nepali",
    push_to_hub=False,               # do not upload the model to the Hugging Face Hub
)





In [None]:
import json

class CustomLoggingCallback(TrainerCallback):
    """Trainer callback that logs to a CSV file and snapshots the best model.

    Every training log that carries a 'loss' value and every evaluation result is
    appended as one row to ``logs_file``. Whenever ``eval_loss`` improves on the
    best value seen so far, the model (and tokenizer, when available) is saved
    to ``best_model_path`` together with a ``best_model_info.json`` summary.

    Args:
        logs_file: Path of the CSV log. Created with a header row if missing.
        best_model_path: Directory that receives the best model snapshot.
    """

    # Column order of the CSV log; every appended row follows it.
    CSV_COLUMNS = [
        'step', 'epoch', 'train_loss', 'eval_loss',
        'token_accuracy', 'BLEU', 'GLEU', 'ROUGE-L'
    ]

    def __init__(self, logs_file="../outputs/training_logs.csv",
                 best_model_path="../outputs/best_model"):
        self.best_metric = float('inf')  # lowest eval_loss seen so far
        self.logs_file = logs_file
        self.best_model_path = best_model_path
        self._create_logs_file()

    def _create_logs_file(self):
        """Create the CSV file with its header row if it does not exist yet."""
        logs_dir = os.path.dirname(self.logs_file)
        if logs_dir:
            # Make sure the parent directory exists before writing.
            os.makedirs(logs_dir, exist_ok=True)
        if not os.path.exists(self.logs_file):
            pd.DataFrame(columns=self.CSV_COLUMNS).to_csv(self.logs_file, index=False)

    def _append_to_csv(self, data):
        """Append one row (a dict keyed by the CSV column names) to the log."""
        row = pd.DataFrame([data], columns=self.CSV_COLUMNS)
        row.to_csv(self.logs_file, mode='a', header=False, index=False)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the training loss whenever a log entry contains one."""
        if logs and 'loss' in logs and state.epoch is not None:
            self._append_to_csv({
                'step': state.global_step,
                'epoch': state.epoch,
                'train_loss': logs.get('loss'),
                'eval_loss': None,  # evaluation values are written by on_evaluate
                'token_accuracy': None,
                'BLEU': None,
                'GLEU': None,
                'ROUGE-L': None
            })

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record evaluation metrics and save the model when eval_loss improves."""
        if not metrics:
            return

        self._append_to_csv({
            'step': state.global_step,
            'epoch': state.epoch,
            'train_loss': None,
            'eval_loss': metrics.get('eval_loss'),
            'token_accuracy': metrics.get('eval_token_accuracy'),
            'BLEU': metrics.get('eval_BLEU'),
            'GLEU': metrics.get('eval_GLEU'),
            'ROUGE-L': metrics.get('eval_ROUGE-L')
        })

        # Only a strictly lower eval_loss counts as an improvement; a missing
        # or NaN loss never does.
        current_eval_loss = metrics.get('eval_loss', float('inf'))
        if current_eval_loss is None or not (current_eval_loss < self.best_metric):
            return

        self.best_metric = current_eval_loss
        print(f"üéâ New best model! Eval loss: {current_eval_loss:.4f} at step {state.global_step}")

        # Use the objects handed to the callback instead of a module-level
        # `trainer`, which may not exist yet when this class is defined or used.
        model = kwargs.get("model")
        processing = kwargs.get("processing_class")
        if processing is None:
            processing = kwargs.get("tokenizer")  # name used by older versions
        if model is None:
            print("No model was passed to the callback; skipping best-model save.")
            return

        os.makedirs(self.best_model_path, exist_ok=True)
        model.save_pretrained(self.best_model_path)
        if processing is not None:
            processing.save_pretrained(self.best_model_path)

        # Save a small summary next to the weights.
        best_model_info = {
            "best_eval_loss": current_eval_loss,
            "step": state.global_step,
            "epoch": state.epoch,
            "all_metrics": metrics
        }
        with open(os.path.join(self.best_model_path, "best_model_info.json"), "w") as f:
            # default=float converts numeric scalar types json cannot encode natively.
            json.dump(best_model_info, f, indent=2, default=float)

In [None]:
# Wire the model, training arguments, data splits, collator, metrics and callbacks
# into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["valid"],
    compute_metrics=compute_metrics,
    callbacks=[
        CustomLoggingCallback(),
        # Stop if there is no improvement for 3 evaluations.
        EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

In [None]:
# # Instead of waiting hours into training to discover issues, test everything first with evaluate()


# # Complete safety check
# def safe_training_check(trainer):
#     """Comprehensive pre-training safety check"""
#     print(" Running pre-training safety checks...")

#     # 1. Check model is on correct device
#     print(f"Model device: {next(trainer.model.parameters()).device}")

#     # 2. Check dataset sizes
#     print(f"Train dataset size: {len(trainer.train_dataset)}")
#     print(f"Eval dataset size: {len(trainer.eval_dataset)}")

#     # 3. Test data loading
#     try:
#         sample_batch = next(iter(trainer.get_train_dataloader()))
#         print(" Data loading works")
#         # print(f"Batch keys: {sample_batch.keys()}")
#     except Exception as e:
#         print(f" Data loading failed: {e}")
#         return False

#     # 4. Test evaluation
#     try:
#         trainer.model.eval()    # Set to evaluation mode
#         print(" Performing evaluation check...")
#         # eval_results = trainer.evaluate()
#         print(" Evaluation successful")
#         print(f"Initial metrics: {eval_results}")
#         return True
#     except Exception as e:
#         print(f" Evaluation failed: {e}")
#         return False

# # Usage
# if safe_training_check(trainer):
#     print(" All checks passed! Starting training...")
#     trainer.train()
# else:
#     print(" Fix issues before training!")

# Start fine-tuning with the trainer configured above.
trainer.train()


Step,Training Loss,Validation Loss
