In [74]:
import sys
sys.prefix
from pprint import pprint

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Nov 14 12:05:54 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1080 Ti     On  | 00000000:06:00.0 Off |                  N/A |
| 34%   47C    P8              17W / 250W |     27MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce GTX 1080 Ti     On  | 00000000:0A:00.0 Off |  

In [3]:
# ensure enough memory present so that training does not stop
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 67.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [12]:
# # Install required libraries
# !pip install datasets==2.10.1
# !pip install transformers==4.28.0
# !pip install accelerate
# !pip install jiwer
# !pip install huggingface_hub

Collecting datasets==2.10.1
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 1.18.3
    Uninstalling datasets-1.18.3:
      Successfully uninstalled datasets-1.18.3
Successfully installed datasets-2.10.1


In [5]:
# Import libraries
import torch
from transformers import BartTokenizerFast, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from jiwer import wer
from huggingface_hub import notebook_login

In [6]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

map columns for noisy data

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the CSV file into a DataFrame
df = pd.read_csv('data/sentences_noise_data_sample.csv')

# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create a DataFrame for the training set
train_subset = pd.DataFrame({
    'Sentence': train_df['Sentence'],
    'noisy_sentence': train_df.apply(lambda row: row[row['Best_Alternative']], axis=1)
})

# Create a DataFrame for the testing set
test_subset = pd.DataFrame({
    'Sentence': test_df['Sentence'],
    'noisy_sentence': test_df.apply(lambda row: row[row['Best_Alternative']], axis=1)
})

# Save the training and testing sets to new CSV files
train_subset.to_csv('data/train_sentences_noise_data.csv', index=False)
test_subset.to_csv('data/test_sentences_noise_data.csv', index=False)


convert to json

In [8]:
import pandas as pd

def convert_csv_to_json(csv_path, json_output_path):

    df = pd.read_csv(csv_path)

    json_data = df.to_json(orient='records')


    with open(json_output_path, 'w') as json_file:
        json_file.write(json_data)

csv_file_path = 'data/train_sentences_noise_data.csv'
json_output_file_path = 'data/train_output.json'
convert_csv_to_json(csv_file_path, json_output_file_path)

In [9]:
csv_file_path = 'data/test_sentences_noise_data.csv'
json_output_file_path = 'data/test_output.json'
convert_csv_to_json(csv_file_path, json_output_file_path)

In [10]:
# Load the dataset from the JSON files
dataset = load_dataset('json', data_files={'train': 'data/train_output.json',
                                           'test': 'data/test_output.json'})

Downloading and preparing dataset json/default to /home/jinda/.cache/huggingface/datasets/json/default-1f260b0f7599e2c4/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/jinda/.cache/huggingface/datasets/json/default-1f260b0f7599e2c4/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'noisy_sentence'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['Sentence', 'noisy_sentence'],
        num_rows: 2000
    })
})


In [12]:
print(dataset['train'][:5])
print(dataset['test'][:5])

{'Sentence': ["It's not easy to win.", "I don't want you to tell Tom about what happened here.", "She's in the National Theatre Company.", 'You should distinguish between right and wrong.', 'I knew that Tom was a coward.'], 'noisy_sentence': ['itznot  e azyt ow in', 'idom twamdy oud o telltom  ap outwhat  happenedhe re', 's hesi nthenatiomar d heatrec onpa my', 'distinku is hbedwee mligh dandwro mg  youwer eb robab', 'ikm ow dhaddom i s outraged ikn ew dhad tomw asac o']}
{'Sentence': ['Tom sat in the dentist’s chair.', "He didn't say what kind of bike he has.", "I'd be happy to come in.", 'Good afternoon! How are you?', "I told you I didn't want this."], 'noisy_sentence': ['dond oogou this false t eeth', 'cy cre heditn t s ayw h adki mdov bikehe haz', 'i t be happ ytocom eim', 'hello a r eyous dilldhele', 'idolty ou  ititn   dwamddhis']}


In [13]:
# Split the dataset into train and validation sets
train_dataset, val_dataset = train_test_split(dataset['train'], test_size=0.1, random_state=42)

In [14]:
from datasets import Dataset

train_data = Dataset.from_dict(train_dataset)  # Convert the train data to a dataset
val_data = Dataset.from_dict(val_dataset)      # Convert the validation data to a dataset

In [15]:
print(type(train_data))
print(type(val_data))

<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


In [16]:
# Check column names in train dataset
print(train_data.column_names)

# Check column names in validation dataset
print(val_data.column_names)

['Sentence', 'noisy_sentence']
['Sentence', 'noisy_sentence']


In [17]:
print(train_data['Sentence'][:5])
print(train_data['noisy_sentence'][:5])
#print(val_data[:5])

["Do you know Tom's full name?", "Today I went to the dentist's.", "You'll ask Tom for permission to do that, won't you?", 'Morocco got into debt.', "The reason that Tom isn't here is because he's sick in bed."]
['do youk now to msvur  lnam e', 'yiwe mt tothete ndists  inee dtoothbas de', 'i n su red omd hatw  ouldtod h ati vyouazgethim to', 'mor o c cogodintod ebt', 'tom izi nbe dbecaus ehezn odfeerin g werl thereaz omthatt om iz mdherei zpe  cau z ehessi ']


In [18]:
# Define the source and target language columns
source_lang = 'noisy_sentence'
target_lang = 'Sentence'

In [19]:
print(source_lang)

noisy_sentence


In [20]:
# Define the max_length for padding and truncation
max_length = 512

In [21]:
# Initialize the tokenizer
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

# Tokenize the data
# The preprocess_function function is defined to preprocess the data by tokenizing the inputs and labels
def preprocess_function(examples):
    inputs = [f'{source_lang}: {text}' for text in examples[source_lang]]
    targets = examples[target_lang]
    encoding = tokenizer(inputs, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
    model_inputs = {
        'input_ids': encoding['input_ids'].squeeze(),
        'attention_mask': encoding['attention_mask'].squeeze(),
        'labels': tokenizer(targets, padding=True, truncation=True, return_tensors='pt')['input_ids'].squeeze()
    }
    return model_inputs


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [22]:
# Select a random data point from the train dataset
sample_data = train_data[0]

# Call the preprocess function on the sample data
processed_data = preprocess_function(sample_data)

# Inspect the output
print(processed_data)

{'input_ids': tensor([[    0,  2362, 26351,  1215, 19530,  4086,    35,   385,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1021,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1437,  1437,     2],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1423,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1021,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1717,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,   449,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1437,  1437,     2],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,   295,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1021,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,   885,     2,     1],
        [    0,  2362, 26351,  1215, 19530,  4086,    35,  1437,  1437,     2],
        [    0,  2362, 263

In [23]:
#train_data = preprocess_function(train_data)
#val_data = preprocess_function(val_data)

# Apply preprocess_function to train_data and val_data
train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [24]:
# Access a few samples from train_data
for i in range(5):
    sample_input_ids = train_data['input_ids'][i]
    sample_attention_mask = train_data['attention_mask'][i]
    sample_labels = train_data['labels'][i]

    print(f"Sample {i+1}:")
    print("Input IDs:", sample_input_ids)
    print("Attention Mask:", sample_attention_mask)
    print("Labels:", sample_labels)
    print()

Sample 1:
Input IDs: [0, 2362, 26351, 1215, 19530, 4086, 35, 109, 47, 330, 122, 7, 43601, 705, 710, 1437, 784, 8697, 364, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels: [0, 8275, 47, 216, 1560, 18, 455, 766, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [25]:
# Prepare DataLoader for training and validation
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=16, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=16)

In [26]:
# define a data_collator function for batch processing
def data_collator(features):
    batch = {}
    # Pad input_ids and attention_mask to the maximum length within the batch
    max_length = max(len(feature['input_ids']) for feature in features)
    batch['input_ids'] = torch.stack([torch.tensor(feature['input_ids'] + [tokenizer.pad_token_id] * (max_length - len(feature['input_ids']))) for feature in features])
    batch['attention_mask'] = torch.stack([torch.tensor(feature['attention_mask'] + [0] * (max_length - len(feature['attention_mask']))) for feature in features])
    batch['labels'] = torch.stack([torch.tensor(feature['labels'] + [-100] * (max_length - len(feature['labels']))) for feature in features])
    return batch

In [36]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="spell_correction_tatoeba",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
# verify passing the correct inputs to the trainer
print("Train Dataset:", train_data)
print("Validation Dataset:", val_data)
print("Tokenizer:", tokenizer)
print("Training Arguments:", training_args)

Train Dataset: Dataset({
    features: ['Sentence', 'noisy_sentence', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 7200
})
Validation Dataset: Dataset({
    features: ['Sentence', 'noisy_sentence', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})
Tokenizer: BartTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})
Training Arguments: Seq2SeqTrainingArguments(
_n_gpu=3,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find

In [38]:
# model is initialized with the BARTForConditionalGeneration class and moved to the GPU if available.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

loading configuration file config.json from cache at /home/jinda/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "L

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [39]:
torch.cuda.empty_cache()

In [40]:
# The Seq2SeqTrainer is created with the defined model, training arguments, datasets, tokenizer, and data_collator
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

/home/jinda/Desktop/torgo/torgo_inference/Machine Translation/spell_correction_tatoeba is already a clone of https://huggingface.co/matrixcc/spell_correction_tatoeba. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7200
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 9000
  Number of trainable parameters = 139420416


  0%|          | 0/9000 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.5821688771247864, 'eval_runtime': 8.3946, 'eval_samples_per_second': 95.299, 'eval_steps_per_second': 4.05, 'epoch': 1.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-500
Configuration saved in spell_correction_tatoeba/checkpoint-500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-500/generation_config.json


{'loss': 2.4322, 'learning_rate': 9.444444444444445e-06, 'epoch': 1.67}


Model weights saved in spell_correction_tatoeba/checkpoint-500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-500/special_tokens_map.json
tokenizer config file saved in spell_correction_tatoeba/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.37821945548057556, 'eval_runtime': 8.4014, 'eval_samples_per_second': 95.223, 'eval_steps_per_second': 4.047, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3624381124973297, 'eval_runtime': 8.4337, 'eval_samples_per_second': 94.858, 'eval_steps_per_second': 4.031, 'epoch': 3.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-1000
Configuration saved in spell_correction_tatoeba/checkpoint-1000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-1000/generation_config.json


{'loss': 0.2996, 'learning_rate': 8.888888888888888e-06, 'epoch': 3.33}


Model weights saved in spell_correction_tatoeba/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-1000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.35417410731315613, 'eval_runtime': 8.42, 'eval_samples_per_second': 95.012, 'eval_steps_per_second': 4.038, 'epoch': 4.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-1500
Configuration saved in spell_correction_tatoeba/checkpoint-1500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-1500/generation_config.json


{'loss': 0.2663, 'learning_rate': 8.333333333333334e-06, 'epoch': 5.0}


Model weights saved in spell_correction_tatoeba/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-1500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-1500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34975579380989075, 'eval_runtime': 8.4071, 'eval_samples_per_second': 95.158, 'eval_steps_per_second': 4.044, 'epoch': 5.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3478945791721344, 'eval_runtime': 8.4068, 'eval_samples_per_second': 95.161, 'eval_steps_per_second': 4.044, 'epoch': 6.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-2000
Configuration saved in spell_correction_tatoeba/checkpoint-2000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-2000/generation_config.json


{'loss': 0.2401, 'learning_rate': 7.77777777777778e-06, 'epoch': 6.67}


Model weights saved in spell_correction_tatoeba/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-2000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34580177068710327, 'eval_runtime': 8.3945, 'eval_samples_per_second': 95.301, 'eval_steps_per_second': 4.05, 'epoch': 7.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34300926327705383, 'eval_runtime': 8.4914, 'eval_samples_per_second': 94.213, 'eval_steps_per_second': 4.004, 'epoch': 8.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-2500
Configuration saved in spell_correction_tatoeba/checkpoint-2500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-2500/generation_config.json


{'loss': 0.2246, 'learning_rate': 7.222222222222223e-06, 'epoch': 8.33}


Model weights saved in spell_correction_tatoeba/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-2500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-2500/special_tokens_map.json
tokenizer config file saved in spell_correction_tatoeba/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-1000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3413137197494507, 'eval_runtime': 8.4808, 'eval_samples_per_second': 94.331, 'eval_steps_per_second': 4.009, 'epoch': 9.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-3000
Configuration saved in spell_correction_tatoeba/checkpoint-3000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-3000/generation_config.json


{'loss': 0.2122, 'learning_rate': 6.666666666666667e-06, 'epoch': 10.0}


Model weights saved in spell_correction_tatoeba/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-3000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-3000/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-1500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34103479981422424, 'eval_runtime': 8.513, 'eval_samples_per_second': 93.974, 'eval_steps_per_second': 3.994, 'epoch': 10.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3396484851837158, 'eval_runtime': 8.4837, 'eval_samples_per_second': 94.299, 'eval_steps_per_second': 4.008, 'epoch': 11.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-3500
Configuration saved in spell_correction_tatoeba/checkpoint-3500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-3500/generation_config.json


{'loss': 0.198, 'learning_rate': 6.111111111111112e-06, 'epoch': 11.67}


Model weights saved in spell_correction_tatoeba/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-3500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-3500/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-2000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3385607898235321, 'eval_runtime': 8.4153, 'eval_samples_per_second': 95.064, 'eval_steps_per_second': 4.04, 'epoch': 12.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34031543135643005, 'eval_runtime': 8.4061, 'eval_samples_per_second': 95.169, 'eval_steps_per_second': 4.045, 'epoch': 13.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-4000
Configuration saved in spell_correction_tatoeba/checkpoint-4000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-4000/generation_config.json


{'loss': 0.1898, 'learning_rate': 5.555555555555557e-06, 'epoch': 13.33}


Model weights saved in spell_correction_tatoeba/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-4000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-4000/special_tokens_map.json
tokenizer config file saved in spell_correction_tatoeba/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/special_tokens_map.json
Several commits (2) will be pushed upstream.
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-2500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.33926472067832947, 'eval_runtime': 8.4089, 'eval_samples_per_second': 95.138, 'eval_steps_per_second': 4.043, 'epoch': 14.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-4500
Configuration saved in spell_correction_tatoeba/checkpoint-4500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-4500/generation_config.json


{'loss': 0.179, 'learning_rate': 5e-06, 'epoch': 15.0}


Model weights saved in spell_correction_tatoeba/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-4500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-4500/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-3000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3398318886756897, 'eval_runtime': 8.4188, 'eval_samples_per_second': 95.025, 'eval_steps_per_second': 4.039, 'epoch': 15.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.33935120701789856, 'eval_runtime': 8.4036, 'eval_samples_per_second': 95.197, 'eval_steps_per_second': 4.046, 'epoch': 16.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-5000
Configuration saved in spell_correction_tatoeba/checkpoint-5000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-5000/generation_config.json


{'loss': 0.1717, 'learning_rate': 4.444444444444444e-06, 'epoch': 16.67}


Model weights saved in spell_correction_tatoeba/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-5000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-5000/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-3500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3393841087818146, 'eval_runtime': 8.4787, 'eval_samples_per_second': 94.354, 'eval_steps_per_second': 4.01, 'epoch': 17.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3392324447631836, 'eval_runtime': 8.4051, 'eval_samples_per_second': 95.18, 'eval_steps_per_second': 4.045, 'epoch': 18.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-5500
Configuration saved in spell_correction_tatoeba/checkpoint-5500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-5500/generation_config.json


{'loss': 0.1626, 'learning_rate': 3.88888888888889e-06, 'epoch': 18.33}


Model weights saved in spell_correction_tatoeba/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-5500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-5500/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-4000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.340829461812973, 'eval_runtime': 8.5002, 'eval_samples_per_second': 94.115, 'eval_steps_per_second': 4.0, 'epoch': 19.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-6000
Configuration saved in spell_correction_tatoeba/checkpoint-6000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-6000/generation_config.json


{'loss': 0.1593, 'learning_rate': 3.3333333333333333e-06, 'epoch': 20.0}


Model weights saved in spell_correction_tatoeba/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-6000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-6000/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-4500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3408750891685486, 'eval_runtime': 8.5264, 'eval_samples_per_second': 93.826, 'eval_steps_per_second': 3.988, 'epoch': 20.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3409087061882019, 'eval_runtime': 8.3781, 'eval_samples_per_second': 95.488, 'eval_steps_per_second': 4.058, 'epoch': 21.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-6500
Configuration saved in spell_correction_tatoeba/checkpoint-6500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-6500/generation_config.json


{'loss': 0.1539, 'learning_rate': 2.7777777777777783e-06, 'epoch': 21.67}


Model weights saved in spell_correction_tatoeba/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-6500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-6500/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-5000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.341418981552124, 'eval_runtime': 8.3927, 'eval_samples_per_second': 95.321, 'eval_steps_per_second': 4.051, 'epoch': 22.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34244948625564575, 'eval_runtime': 8.3815, 'eval_samples_per_second': 95.448, 'eval_steps_per_second': 4.057, 'epoch': 23.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-7000
Configuration saved in spell_correction_tatoeba/checkpoint-7000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-7000/generation_config.json


{'loss': 0.1494, 'learning_rate': 2.222222222222222e-06, 'epoch': 23.33}


Model weights saved in spell_correction_tatoeba/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-7000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-7000/special_tokens_map.json
tokenizer config file saved in spell_correction_tatoeba/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-5500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34297892451286316, 'eval_runtime': 8.5101, 'eval_samples_per_second': 94.006, 'eval_steps_per_second': 3.995, 'epoch': 24.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-7500
Configuration saved in spell_correction_tatoeba/checkpoint-7500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-7500/generation_config.json


{'loss': 0.1458, 'learning_rate': 1.6666666666666667e-06, 'epoch': 25.0}


Model weights saved in spell_correction_tatoeba/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-7500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-7500/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-6000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34193241596221924, 'eval_runtime': 8.3908, 'eval_samples_per_second': 95.343, 'eval_steps_per_second': 4.052, 'epoch': 25.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34295132756233215, 'eval_runtime': 8.4095, 'eval_samples_per_second': 95.131, 'eval_steps_per_second': 4.043, 'epoch': 26.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-8000
Configuration saved in spell_correction_tatoeba/checkpoint-8000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-8000/generation_config.json


{'loss': 0.1426, 'learning_rate': 1.111111111111111e-06, 'epoch': 26.67}


Model weights saved in spell_correction_tatoeba/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-8000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-8000/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-6500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.34201592206954956, 'eval_runtime': 8.3863, 'eval_samples_per_second': 95.393, 'eval_steps_per_second': 4.054, 'epoch': 27.0}


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3432137966156006, 'eval_runtime': 8.3935, 'eval_samples_per_second': 95.312, 'eval_steps_per_second': 4.051, 'epoch': 28.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-8500
Configuration saved in spell_correction_tatoeba/checkpoint-8500/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-8500/generation_config.json


{'loss': 0.1414, 'learning_rate': 5.555555555555555e-07, 'epoch': 28.33}


Model weights saved in spell_correction_tatoeba/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-8500/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-8500/special_tokens_map.json
tokenizer config file saved in spell_correction_tatoeba/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-7000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3430284261703491, 'eval_runtime': 8.4205, 'eval_samples_per_second': 95.006, 'eval_steps_per_second': 4.038, 'epoch': 29.0}


Saving model checkpoint to spell_correction_tatoeba/checkpoint-9000
Configuration saved in spell_correction_tatoeba/checkpoint-9000/config.json
Configuration saved in spell_correction_tatoeba/checkpoint-9000/generation_config.json


{'loss': 0.1401, 'learning_rate': 0.0, 'epoch': 30.0}


Model weights saved in spell_correction_tatoeba/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in spell_correction_tatoeba/checkpoint-9000/tokenizer_config.json
Special tokens file saved in spell_correction_tatoeba/checkpoint-9000/special_tokens_map.json
Deleting older checkpoint [spell_correction_tatoeba/checkpoint-7500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Sentence, noisy_sentence. If Sentence, noisy_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 24


  0%|          | 0/34 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.34302985668182373, 'eval_runtime': 8.4142, 'eval_samples_per_second': 95.077, 'eval_steps_per_second': 4.041, 'epoch': 30.0}
{'train_runtime': 8761.2214, 'train_samples_per_second': 24.654, 'train_steps_per_second': 1.027, 'train_loss': 0.3115828611585829, 'epoch': 30.0}


TrainOutput(global_step=9000, training_loss=0.3115828611585829, metrics={'train_runtime': 8761.2214, 'train_samples_per_second': 24.654, 'train_steps_per_second': 1.027, 'train_loss': 0.3115828611585829, 'epoch': 30.0})

In [41]:
# Load the trained model from Hugging Face
model = BartForConditionalGeneration.from_pretrained("matrixcc/spell_correction_tatoeba")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/jinda/.cache/huggingface/hub/models--matrixcc--spell_correction_tatoeba/snapshots/dfa215158f3be6b71823ac1fd20a6104d9fd99da/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0":

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /home/jinda/.cache/huggingface/hub/models--matrixcc--spell_correction_tatoeba/snapshots/dfa215158f3be6b71823ac1fd20a6104d9fd99da/pytorch_model.bin
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

All model checkpoint weights were used when initializing BartForConditionalGeneration.

All the weights of BartForConditionalGeneration were initialized from the model checkpoint at matrixcc/spell_correction_tatoeba.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BartForConditionalGeneration for predictions without further training.
Generation config file not found, using a generation config created from the model config.


In [42]:
# Move the model to the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
#model.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [43]:
test_dataset = dataset['test']

In [69]:
pprint(test_dataset['Sentence'][-10:])
pprint(test_dataset['noisy_sentence'][-10:])

['Please just get out of my way.',
 'A set of benefits was once in effect.',
 'If you walk a lot, you will develop calluses on your heel.',
 'The movie was better than I thought it was going to be.',
 'Sami realized Layla was dead.',
 'Tom said that he wished that Mary would eat healthier food.',
 'That is a mistake.',
 "I know it's very difficult to quit smoking.",
 'Ziri gave a talk about the Berber language.',
 "It's common in the East."]
['z dopgan gimku po nme',
 'a s ed ovb enefitzw a sonce inevve ct',
 'ivyouw alg aro tyouwilldefe lopc ar  l usesom yo ulhee r',
 'th efi lmwaz  nodwh  atiha  tthoughd itwas',
 'samirear i zetlayla waste ad',
 'edettoeadhealth ie lvood t omsai dt ha thewishetdhadm a rywoulde ath  eal',
 'ha disam i zdake  tha',
 'i d isimboz sibr eforhim dogif eupzn o king',
 'zir ispe aksper be larltheti ne',
 'thatrarel yhappen z']


In [44]:
print(len(test_dataset))

2000


In [65]:
# Inspect column names
column_names = test_dataset.column_names
print("Column names:", column_names)

# Inspect column data types
for column in column_names:
    column_data = test_dataset[column]
    data_type = type(column_data[0]) if len(column_data) > 0 else "Unknown"
    print(f"Column name: {column}, Data type: {data_type}")

Column names: ['Sentence', 'noisy_sentence']
Column name: Sentence, Data type: <class 'str'>
Column name: noisy_sentence, Data type: <class 'str'>


In [62]:
# verification
# Test the model on speaker dataset
verify_test_dataset = dataset['test']
print(len(verify_test_dataset))
print((verify_test_dataset))

2000
Dataset({
    features: ['Sentence', 'noisy_sentence'],
    num_rows: 2000
})


In [63]:
# Print the samples
cnt=0
for sample in test_dataset:
    print("input:", sample["noisy_sentence"])
    print("actual:", sample["Sentence"])
    
    cnt+=1
    if cnt==20:
        break

input: dond oogou this false t eeth
actual: Tom sat in the dentist’s chair.
input: cy cre heditn t s ayw h adki mdov bikehe haz
actual: He didn't say what kind of bike he has.
input: i t be happ ytocom eim
actual: I'd be happy to come in.
input: hello a r eyous dilldhele
actual: Good afternoon! How are you?
input: idolty ou  ititn   dwamddhis
actual: I told you I didn't want this.
input: otasi n gl ec reatureo ne alth hasmo leol lezs rig ht tobehe re  iv dheleisli febe yom deardh itis prob
actual: Not a single creature on earth has more or less right to be here.
input: wher ed iddomg  otocol
actual: Where did Tom go to college?
input: ay do mhadall ta ytof  imiz hd
actual: Tom had all day to finish doing that.
input: us inga progranma br et hern oz tadyouca madjusd dhet im e syout urnomtheheatingolair co nd ition imkacco rdimgd oaprezedschetule 
actual: Using a programmable thermostat, you can adjust the times you turn on the heating or air-conditioning according to a pre-set schedule.

In [48]:
test_data = test_dataset.map(preprocess_function, batched=True, batch_size=len(test_dataset))
model.eval()

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [49]:
predictions = []
references = []

for example in test_dataset:
    # This creates an input text by combining the values of the 'speaker' and 'prediction' fields from the current example.
    # It assumes that the example is a dictionary-like object with keys 'speaker' and 'prediction'.
    input_text = f"{example['noisy_sentence']}"

    # This creates a context where no gradients are computed, which can improve efficiency during inference.
    with torch.no_grad():

        # This tokenizes the input_text using the tokenizer and converts it into input IDs as a PyTorch tensor.
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # The generate method generates the output sequence based on the provided input.
        outputs = model.generate(input_ids=input_ids, max_length=max_length)

    predicted_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    references.append(example['Sentence'])
    predictions.append(predicted_sentence)

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

In [50]:
# Verify that the number of predictions and references are the same
if len(predictions) == len(references):
    print("Number of predictions and references are the same.")
else:
    print("Mismatch in the number of predictions and references.")

Number of predictions and references are the same.


In [51]:
# Print the number of predictions and references
print("Number of predictions:", len(predictions))
print("Number of references:", len(references))

# print the length of the dataset
print("Number of rows in dataset:", len(test_dataset))

Number of predictions: 2000
Number of references: 2000
Number of rows in dataset: 2000


In [75]:
# verification
print("prediction:", predictions[:10])
print("reference:", references[:10])

AttributeError: 'list' object has no attribute 'write'

In [53]:
# calculate WER

from jiwer import wer

wer_value = wer(predictions, references)
wer_percentage = wer_value * 100

print(f"WER: {wer_percentage:.2f}%")

WER: 89.53%
