In [28]:
import pandas as pd
import argparse
import numpy as np
import json
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, BertConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BertTokenizer, BertForMaskedLM, DataCollatorForTokenClassification
from adapters import AutoAdapterModel
from adapters import AdapterConfig
from adapters import AdapterTrainer
from evaluate import load



In [None]:
#the language tags we are using are: bg ms ne jv mt ug bo si

# Bulgarian, Indonesian, Nepali, Javanese,  Maltese, Uyghur, Tibetan, Sinhala Respectively. 

# for the specific language, replace the tag before "..."/validation , "..."/train, and "..."/test with the correct language tag



In [3]:
"""


def tokenize_adjust_labels(all_samples_per_split):
        tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], padding=True, truncation=True, is_split_into_words=True)
        # tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
        # so the new keys [input_ids, labels (after adjustment)]
        # can be added to the datasets dict for each train test validation split
        total_adjusted_labels = []
        for k in range(0, len(tokenized_samples["input_ids"])):
            prev_wid = -1
            word_ids_list = tokenized_samples.word_ids(batch_index=k)
            existing_label_ids = all_samples_per_split["ner_tags"][k]
            i = -1
            adjusted_label_ids = []
        
            for wid in word_ids_list:
                if(wid is None):
                    adjusted_label_ids.append(-100)
                elif(wid!=prev_wid):
                    i = i + 1
                    adjusted_label_ids.append(existing_label_ids[i])
                    prev_wid = wid
                else:
                    label_name = label_names[existing_label_ids[i]]
                    adjusted_label_ids.append(existing_label_ids[i])
                
            total_adjusted_labels.append(adjusted_label_ids)
        tokenized_samples["labels"] = total_adjusted_labels
        tokenized_samples["labels"] = [list(map(int, x)) for x in tokenized_samples["labels"]]



dataset = load_dataset("wikiann", 'bg')

label_names = dataset["train"].features["ner_tags"].feature.names
print(label_names)
id2label = {id_: label for id_, label in enumerate(label_names)}
label2id = {label: id_ for id_, label in enumerate(label_names)}

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True)

# prepare model
config = AutoConfig.from_pretrained("bert-base-multilingual-cased", id2label=id2label, label2id=label2id)
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased", config=config)


print("final processed data set")
print(tokenized_dataset["train"][0])'

#creating a dataset collator (this is what facilitates the masked language modelling functionality)
"""


seeing what load_dataset looks like
{'tokens': ['–≤–∏–∂', '–î–∂–æ–Ω', '–£–∏–ª—è–º', '–°—Ç—Ä—ä—Ç'], 'ner_tags': [0, 1, 2, 2], 'langs': ['bg', 'bg', 'bg', 'bg'], 'spans': ['PER: –î–∂–æ–Ω –£–∏–ª—è–º –°—Ç—Ä—ä—Ç']}
after load dataset
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 24720.86 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 24564.83 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:00<00:00, 26247.43 examples/s]
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used whe

final processed data set
{'tokens': ['–≤–∏–∂', '–î–∂–æ–Ω', '–£–∏–ª—è–º', '–°—Ç—Ä—ä—Ç'], 'ner_tags': [0, 1, 2, 2], 'langs': ['bg', 'bg', 'bg', 'bg'], 'spans': ['PER: –î–∂–æ–Ω –£–∏–ª—è–º –°—Ç—Ä—ä—Ç']}


In [18]:
#I think this is the MLM pre processing for the data set: 

#load dataset
dataset = load_dataset("wikiann", 'bg')

#extracting "ner_tags" names
label_names = dataset["train"].features["ner_tags"].feature.names
#using tokenize_function. I'm splitting into words because that's what the paper code does


#tokenizing function and pre processing it to feed into mBERT. 
def tokenize_function(examples):
    return tokenizer(examples["tokens"], truncation=True, padding=True,is_split_into_words=True, )

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased", use_fast=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["tokens", "ner_tags", "langs", "spans"])


id2label = {id_: label for id_, label in enumerate(label_names)}
label2id = {label: id_ for id_, label in enumerate(label_names)}

# prepare model
config = AutoConfig.from_pretrained("bert-base-multilingual-cased", id2label=id2label, label2id=label2id)
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased", config=config)


#print(tokenized_dataset["train"].features)

#putting model onto my mac's 'mps' system

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

model = model.to(device)

print(tokenized_datasets["train"].features)
print(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
mps


In [19]:

from accelerate import Accelerator
# uniform way of tracking our metrics: 
# Training arguments
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    weight_decay=0.01,
    output_dir="./training_output",
    overwrite_output_dir=True,
    save_total_limit=1,
    load_best_model_at_end=True,
    save_only_model=True,
)
#trainer values. (I'm using regular trainer, not adaptaTrainer). Data collator necessary for masked language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        #compute_metrics=lambda p: compute_metrics(p, label_names)
    )




trainer.train()
model.save_pretrained("./trained_mbert")
tokenizer.save_pretrained("./trained_mbert")

  2%|‚ñè         | 17/939 [15:51<14:20:27, 55.99s/it]
 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [1:58:43<3:30:39, 20.19s/it]
 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [2:03:11<3:30:39, 20.19s/it]

{'eval_loss': 8.333383560180664, 'eval_runtime': 268.3144, 'eval_samples_per_second': 37.27, 'eval_steps_per_second': 0.585, 'epoch': 1.0}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 500/939 [2:48:21<1:12:19,  9.88s/it]  

{'loss': 7.3594, 'grad_norm': nan, 'learning_rate': 4.6751863684771034e-05, 'epoch': 1.6}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [3:08:34<42:49,  8.21s/it]  
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [3:10:27<42:49,  8.21s/it]

{'eval_loss': nan, 'eval_runtime': 113.2497, 'eval_samples_per_second': 88.3, 'eval_steps_per_second': 1.386, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [4:00:47<00:00,  7.56s/it]  
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [4:02:34<00:00,  7.56s/it]

{'eval_loss': nan, 'eval_runtime': 105.8411, 'eval_samples_per_second': 94.481, 'eval_steps_per_second': 1.483, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [4:02:36<00:00, 15.50s/it]


{'train_runtime': 14556.2767, 'train_samples_per_second': 4.122, 'train_steps_per_second': 0.065, 'train_loss': 3.9187362719648564, 'epoch': 3.0}


('./trained_mbert/tokenizer_config.json',
 './trained_mbert/special_tokens_map.json',
 './trained_mbert/vocab.txt',
 './trained_mbert/added_tokens.json',
 './trained_mbert/tokenizer.json')

In [24]:
def tokenize_adjust_labels(all_samples_per_split):
        tokenized_samples = tokenizerNER.batch_encode_plus(all_samples_per_split["tokens"], padding=True, truncation=True, is_split_into_words=True)
        # tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
        # so the new keys [input_ids, labels (after adjustment)]
        # can be added to the datasets dict for each train test validation split
        total_adjusted_labels = []
        for k in range(0, len(tokenized_samples["input_ids"])):
            prev_wid = -1
            word_ids_list = tokenized_samples.word_ids(batch_index=k)
            existing_label_ids = all_samples_per_split["ner_tags"][k]
            i = -1
            adjusted_label_ids = []
        
            for wid in word_ids_list:
                if(wid is None):
                    adjusted_label_ids.append(-100)
                elif(wid!=prev_wid):
                    i = i + 1
                    adjusted_label_ids.append(existing_label_ids[i])
                    prev_wid = wid
                else:
                    label_name = label_names[existing_label_ids[i]]
                    adjusted_label_ids.append(existing_label_ids[i])
                
            total_adjusted_labels.append(adjusted_label_ids)
        tokenized_samples["labels"] = total_adjusted_labels
        tokenized_samples["labels"] = [list(map(int, x)) for x in tokenized_samples["labels"]]

        return tokenized_samples



datasetNER = load_dataset("wikiann", 'bg')
label_names = datasetNER["train"].features["ner_tags"].feature.names
print(label_names)
id2label = {id_: label for id_, label in enumerate(label_names)}
label2id = {label: id_ for id_, label in enumerate(label_names)}

tokenizerNER = AutoTokenizer.from_pretrained("./trained_mbert")
NER_tokenized_dataset = datasetNER.map(tokenize_adjust_labels, batched=True)

# prepare model


previous_mlm_model = BertForMaskedLM.from_pretrained("./trained_mbert")

# Extract the base BERT model
base_model = previous_mlm_model.bert

# Create a token classification model
NERconfig = BertConfig.from_pretrained("./trained_mbert")
NERconfig.num_labels = 7  # Set the number of labels for token classification

NERconfig.num_labels = len(label_names)  # Automatically set number of labels
NERconfig.id2label = id2label
NERconfig.label2id = label2id

NER_token_classification_model = AutoModelForTokenClassification.from_config(NERconfig)

# Assign the base BERT model to the token classification model
NER_token_classification_model.bert = base_model

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

NER_token_classification_model = NER_token_classification_model.to(device)

print("final processed data set for NER")
print(NER_tokenized_dataset["train"][0])




['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 18312.16 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 15721.40 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:01<00:00, 16491.72 examples/s]


final processed data set for NER
{'tokens': ['–≤–∏–∂', '–î–∂–æ–Ω', '–£–∏–ª—è–º', '–°—Ç—Ä—ä—Ç'], 'ner_tags': [0, 1, 2, 2], 'langs': ['bg', 'bg', 'bg', 'bg'], 'spans': ['PER: –î–∂–æ–Ω –£–∏–ª—è–º –°—Ç—Ä—ä—Ç'], 'input_ids': [101, 88504, 12025, 21499, 528, 13460, 27429, 526, 46672, 13368, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [32]:
#how the paper computes metrics: 
from seqeval.metrics import classification_report

def compute_metrics(p, label_names):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    metric = load("seqeval")
    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
        if k not in flattened_results.keys():
            flattened_results[k+"_f1"] = results[k]["f1"]
    return flattened_results


# Training arguments
ner_training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    weight_decay=0.01,
    output_dir="./NER_training_output",
    overwrite_output_dir=True,
    save_total_limit=1,
    load_best_model_at_end=True,
    save_only_model=True,
)
#trainer values. (I'm using regular trainer, not adaptaTrainer)
ner_trainer = Trainer(
        model=NER_token_classification_model,
        args=ner_training_args,
        train_dataset=NER_tokenized_dataset["train"],
        eval_dataset=NER_tokenized_dataset["validation"],
        
        data_collator=DataCollatorForTokenClassification(tokenizerNER),
        tokenizer=tokenizerNER,
        compute_metrics=lambda p: compute_metrics(p, label_names)
    )




ner_trainer.train()
#saving model and tokenizer
NER_token_classification_model.save_pretrained("./NER_trained_mbert")
tokenizerNER.save_pretrained("./NER_trained_mbert")

 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [23:11<46:23,  4.45s/it]

 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [11:12<19:19,  1.85s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                 
[A                                             
 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [11:56<19:19,  1.85s/it]
[A

{'eval_loss': 0.21806278824806213, 'eval_overall_precision': 0.8976061317572126, 'eval_overall_recall': 0.9165036236876188, 'eval_overall_f1': 0.9069564506012926, 'eval_overall_accuracy': 0.9555929897307979, 'eval_LOC_f1': 0.9378401257709518, 'eval_ORG_f1': 0.8467210697079933, 'eval_PER_f1': 0.9171853041557921, 'eval_runtime': 43.9653, 'eval_samples_per_second': 227.452, 'eval_steps_per_second': 3.571, 'epoch': 1.0}


                                                   
 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 500/939 [18:39<16:00,  2.19s/it]

{'loss': 0.051, 'grad_norm': 2.3674802780151367, 'learning_rate': 4.6751863684771034e-05, 'epoch': 1.6}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [23:11<10:07,  1.94s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                 
[A                                             
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [23:53<10:07,  1.94s/it]
[A

{'eval_loss': 0.16554318368434906, 'eval_overall_precision': 0.914581677673947, 'eval_overall_recall': 0.9196726280691119, 'eval_overall_f1': 0.9171200879362462, 'eval_overall_accuracy': 0.9610612588681201, 'eval_LOC_f1': 0.9437926162540697, 'eval_ORG_f1': 0.8649128868269088, 'eval_PER_f1': 0.9266068834226743, 'eval_runtime': 42.5268, 'eval_samples_per_second': 235.146, 'eval_steps_per_second': 3.692, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [35:18<00:00,  1.82s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                 
[A                                             
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [36:01<00:00,  1.82s/it]
[A

{'eval_loss': 0.19064943492412567, 'eval_overall_precision': 0.9186303688938645, 'eval_overall_recall': 0.9270853426658215, 'eval_overall_f1': 0.9228384902348036, 'eval_overall_accuracy': 0.9630762253393372, 'eval_LOC_f1': 0.9475979850104436, 'eval_ORG_f1': 0.8722650754378305, 'eval_PER_f1': 0.9348123385503724, 'eval_runtime': 42.5562, 'eval_samples_per_second': 234.984, 'eval_steps_per_second': 3.689, 'epoch': 3.0}


                                                 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [36:03<00:00,  2.30s/it]


{'train_runtime': 2163.2549, 'train_samples_per_second': 27.736, 'train_steps_per_second': 0.434, 'train_loss': 0.03983610658980787, 'epoch': 3.0}


('./NER_trained_mbert/tokenizer_config.json',
 './NER_trained_mbert/special_tokens_map.json',
 './NER_trained_mbert/vocab.txt',
 './NER_trained_mbert/added_tokens.json',
 './NER_trained_mbert/tokenizer.json')

In [33]:
# test model
test_results = ner_trainer.evaluate(eval_dataset=NER_tokenized_dataset["test"])
output_file_path = os.path.join("./test_metrics.json")
with open(output_file_path, "w") as f:
    json.dump(test_results, f)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [00:44<00:00,  3.54it/s]


In [41]:
#this is meant to be a training loop for the other languages. It first uses masked language modeling to train on new domain, 
# then attatches a prediction head (a linear classification layer) and trains that too: 

totalLanguages = ['ne' ,'jv', 'mt', 'ug' ,'bo' ,'ms', 'si']

for language in totalLanguages:


    ## initial data collection loop 
    #load dataset
    dataset = load_dataset("wikiann", language)

    #extracting "ner_tags" names
    label_names = dataset["train"].features["ner_tags"].feature.names
    #using tokenize_function. I'm splitting into words because that's what the paper code does

    def tokenize_function(examples):
        return tokenizer(examples["tokens"], truncation=True, padding=True,is_split_into_words=True, )

    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased", use_fast=True)
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["tokens", "ner_tags", "langs", "spans"])


    id2label = {id_: label for id_, label in enumerate(label_names)}
    label2id = {label: id_ for id_, label in enumerate(label_names)}

    # prepare model
    config = AutoConfig.from_pretrained("bert-base-multilingual-cased", id2label=id2label, label2id=label2id)
    model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased", config=config)


    #print(tokenized_dataset["train"].features)

    #putting model onto my mac's 'mps' system

    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

    model = model.to(device)
    training_args = TrainingArguments(
        learning_rate=1e-4,
        num_train_epochs=3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        weight_decay=0.01,
        output_dir="./training_output/" + language,
        overwrite_output_dir=True,
        save_total_limit=1,
        load_best_model_at_end=True,
        save_only_model=True,
    )
    #trainer values. (I'm using regular trainer, not adaptaTrainer)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            data_collator=data_collator,
            tokenizer=tokenizer,
            #compute_metrics=lambda p: compute_metrics(p, label_names)
        )




    trainer.train()
    model.save_pretrained("./trained_mbert/" + language)
    tokenizer.save_pretrained("./trained_mbert/" + language)

    def tokenize_adjust_labels(all_samples_per_split):
            tokenized_samples = tokenizerNER.batch_encode_plus(all_samples_per_split["tokens"], padding=True, truncation=True, is_split_into_words=True)
            # tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
            # so the new keys [input_ids, labels (after adjustment)]
            # can be added to the datasets dict for each train test validation split
            total_adjusted_labels = []
            for k in range(0, len(tokenized_samples["input_ids"])):
                prev_wid = -1
                word_ids_list = tokenized_samples.word_ids(batch_index=k)
                existing_label_ids = all_samples_per_split["ner_tags"][k]
                i = -1
                adjusted_label_ids = []
            
                for wid in word_ids_list:
                    if(wid is None):
                        adjusted_label_ids.append(-100)
                    elif(wid!=prev_wid):
                        i = i + 1
                        adjusted_label_ids.append(existing_label_ids[i])
                        prev_wid = wid
                    else:
                        label_name = label_names[existing_label_ids[i]]
                        adjusted_label_ids.append(existing_label_ids[i])
                    
                total_adjusted_labels.append(adjusted_label_ids)
            tokenized_samples["labels"] = total_adjusted_labels
            tokenized_samples["labels"] = [list(map(int, x)) for x in tokenized_samples["labels"]]

            return tokenized_samples



    datasetNER = load_dataset("wikiann", language)
    label_names = datasetNER["train"].features["ner_tags"].feature.names
    id2label = {id_: label for id_, label in enumerate(label_names)}
    label2id = {label: id_ for id_, label in enumerate(label_names)}

    tokenizerNER = AutoTokenizer.from_pretrained("./trained_mbert/" + language)
    NER_tokenized_dataset = datasetNER.map(tokenize_adjust_labels, batched=True)

    # prepare model


    previous_mlm_model = BertForMaskedLM.from_pretrained("./trained_mbert/" + language)

    # Extract the base BERT model
    base_model = previous_mlm_model.bert

    # Create a token classification model
    NERconfig = BertConfig.from_pretrained("./trained_mbert/" + language)
    NERconfig.num_labels = 7  # Set the number of labels for token classification

    NERconfig.num_labels = len(label_names)  # Automatically set number of labels
    NERconfig.id2label = id2label
    NERconfig.label2id = label2id

    NER_token_classification_model = AutoModelForTokenClassification.from_config(NERconfig)

    # Assign the base BERT model to the token classification model
    NER_token_classification_model.bert = base_model

    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

    NER_token_classification_model = NER_token_classification_model.to(device)




    def compute_metrics(p, label_names):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        metric = load("seqeval")
        results = metric.compute(predictions=true_predictions, references=true_labels)
        flattened_results = {
            "overall_precision": results["overall_precision"],
            "overall_recall": results["overall_recall"],
            "overall_f1": results["overall_f1"],
            "overall_accuracy": results["overall_accuracy"],
        }
        for k in results.keys():
            if k not in flattened_results.keys():
                flattened_results[k+"_f1"] = results[k]["f1"]
        return flattened_results


    # Training arguments
    ner_training_args = TrainingArguments(
        learning_rate=1e-4,
        num_train_epochs=3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        weight_decay=0.01,
        output_dir="./NER_training_output/" + language,
        overwrite_output_dir=True,
        save_total_limit=1,
        load_best_model_at_end=True,
        save_only_model=True,
    )
    #trainer values. (I'm using regular trainer, not adaptaTrainer)
    ner_trainer = Trainer(
            model=NER_token_classification_model,
            args=ner_training_args,
            train_dataset=NER_tokenized_dataset["train"],
            eval_dataset=NER_tokenized_dataset["validation"],
            
            data_collator=DataCollatorForTokenClassification(tokenizerNER),
            tokenizer=tokenizerNER,
            compute_metrics=lambda p: compute_metrics(p, label_names)
        )




    ner_trainer.train()
    NER_token_classification_model.save_pretrained("./NER_trained_mbert/" + language)
    tokenizerNER.save_pretrained("./NER_trained_mbert/" + language)


    test_results = ner_trainer.evaluate(eval_dataset=NER_tokenized_dataset["test"])
    results_path = "./test_metrics_" + language + ".json"
    output_file_path = os.path.join(results_path)
    with open(output_file_path, "w") as f:
        json.dump(test_results, f)



    

    


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 12462.65 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  2%|‚ñè         | 21/939 [27:58:13<1222:42:27, 4794.93s/it]
 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:03<00:06,  1.63s/it]
 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:05<00:06,  1.63s/it]

{'eval_loss': 2.424934148788452, 'eval_runtime': 1.8765, 'eval_samples_per_second': 53.289, 'eval_steps_per_second': 1.066, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:09<00:04,  2.28s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:11<00:04,  2.28s/it]

{'eval_loss': 2.168142318725586, 'eval_runtime': 1.9714, 'eval_samples_per_second': 50.727, 'eval_steps_per_second': 1.015, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:15<00:00,  2.55s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:21<00:00,  2.55s/it]

{'eval_loss': 2.1499619483947754, 'eval_runtime': 5.8801, 'eval_samples_per_second': 17.006, 'eval_steps_per_second': 0.34, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:22<00:00,  3.72s/it]


{'train_runtime': 22.3311, 'train_samples_per_second': 13.434, 'train_steps_per_second': 0.269, 'train_loss': 2.4992359479268393, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:03<00:02,  1.84it/s]

{'eval_loss': 1.3130499124526978, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.5382247306310929, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 1.8432, 'eval_samples_per_second': 54.253, 'eval_steps_per_second': 1.085, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:04<00:02,  1.17s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:05<00:02,  1.17s/it]

{'eval_loss': 1.1081463098526, 'eval_overall_precision': 0.31746031746031744, 'eval_overall_recall': 0.049019607843137254, 'eval_overall_f1': 0.08492569002123143, 'eval_overall_accuracy': 0.6362237044638276, 'eval_LOC_f1': 0.12571428571428572, 'eval_ORG_f1': 0.10843373493975902, 'eval_PER_f1': 0.0, 'eval_runtime': 1.0687, 'eval_samples_per_second': 93.571, 'eval_steps_per_second': 1.871, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:07<00:00,  1.11s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:08<00:00,  1.11s/it]

{'eval_loss': 0.9725674986839294, 'eval_overall_precision': 0.5229885057471264, 'eval_overall_recall': 0.22303921568627452, 'eval_overall_f1': 0.3127147766323024, 'eval_overall_accuracy': 0.689584402257568, 'eval_LOC_f1': 0.5128205128205128, 'eval_ORG_f1': 0.11180124223602485, 'eval_PER_f1': 0.16216216216216217, 'eval_runtime': 0.9299, 'eval_samples_per_second': 107.544, 'eval_steps_per_second': 2.151, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:08<00:00,  1.48s/it]


{'train_runtime': 8.8852, 'train_samples_per_second': 33.764, 'train_steps_per_second': 0.675, 'train_loss': 1.4020217259724934, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  4.04it/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 26781.84 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 45432.24 examples/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 49583.92 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 20135.88 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 17870.15 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 19971.93 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another 

{'eval_loss': 3.4103853702545166, 'eval_runtime': 0.8849, 'eval_samples_per_second': 113.007, 'eval_steps_per_second': 2.26, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:06<00:02,  1.48s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:06<00:02,  1.48s/it]

{'eval_loss': 4.2117600440979, 'eval_runtime': 0.2271, 'eval_samples_per_second': 440.317, 'eval_steps_per_second': 8.806, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:08<00:00,  1.35s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:09<00:00,  1.35s/it]

{'eval_loss': 3.2677245140075684, 'eval_runtime': 0.2859, 'eval_samples_per_second': 349.749, 'eval_steps_per_second': 6.995, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:10<00:00,  1.68s/it]


{'train_runtime': 10.0471, 'train_samples_per_second': 29.859, 'train_steps_per_second': 0.597, 'train_loss': 3.689899126688639, 'epoch': 3.0}


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 15713.12 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 14367.50 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 15098.29 examples/s]
  _warn_prf(average, modifier, msg_start, len(result))

 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:01<00:01,  2.94it/s]

{'eval_loss': 1.253534197807312, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.5974025974025974, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 0.4994, 'eval_samples_per_second': 200.24, 'eval_steps_per_second': 4.005, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:02<00:01,  1.78it/s]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:02<00:01,  1.78it/s]

{'eval_loss': 1.0553373098373413, 'eval_overall_precision': 0.07692307692307693, 'eval_overall_recall': 0.016129032258064516, 'eval_overall_f1': 0.026666666666666665, 'eval_overall_accuracy': 0.6473526473526473, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.04597701149425288, 'eval_PER_f1': 0.037037037037037035, 'eval_runtime': 0.5517, 'eval_samples_per_second': 181.264, 'eval_steps_per_second': 3.625, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:03<00:00,  1.57it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:04<00:00,  1.57it/s]

{'eval_loss': 0.9801709651947021, 'eval_overall_precision': 0.18867924528301888, 'eval_overall_recall': 0.053763440860215055, 'eval_overall_f1': 0.08368200836820085, 'eval_overall_accuracy': 0.6713286713286714, 'eval_LOC_f1': 0.02247191011235955, 'eval_ORG_f1': 0.08888888888888889, 'eval_PER_f1': 0.16666666666666666, 'eval_runtime': 0.5074, 'eval_samples_per_second': 197.075, 'eval_steps_per_second': 3.941, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:05<00:00,  1.14it/s]


{'train_runtime': 5.2476, 'train_samples_per_second': 57.168, 'train_steps_per_second': 1.143, 'train_loss': 1.2298861344655354, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  3.15it/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 25987.01 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 40713.49 examples/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 48099.82 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 15444.08 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 17394.37 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 15356.44 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another 

{'eval_loss': 4.684937953948975, 'eval_runtime': 1.1615, 'eval_samples_per_second': 86.096, 'eval_steps_per_second': 1.722, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:09<00:04,  2.32s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:10<00:04,  2.32s/it]

{'eval_loss': 4.889511585235596, 'eval_runtime': 0.8924, 'eval_samples_per_second': 112.052, 'eval_steps_per_second': 2.241, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:14<00:00,  2.34s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:16<00:00,  2.34s/it]

{'eval_loss': 4.131168365478516, 'eval_runtime': 0.8687, 'eval_samples_per_second': 115.116, 'eval_steps_per_second': 2.302, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:16<00:00,  2.76s/it]


{'train_runtime': 16.5451, 'train_samples_per_second': 18.132, 'train_steps_per_second': 0.363, 'train_loss': 4.775767008463542, 'epoch': 3.0}


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 12227.94 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 12306.15 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 11403.76 examples/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:01<00:02,  1.73it/s]

{'eval_loss': 1.091901421546936, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.6950425638457687, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 0.5638, 'eval_samples_per_second': 177.355, 'eval_steps_per_second': 3.547, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:03<00:01,  1.29it/s]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:03<00:01,  1.29it/s]

{'eval_loss': 0.8646403551101685, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.6970455683525288, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 0.6834, 'eval_samples_per_second': 146.333, 'eval_steps_per_second': 2.927, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:05<00:00,  1.14it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:06<00:00,  1.14it/s]

{'eval_loss': 0.796040952205658, 'eval_overall_precision': 0.041666666666666664, 'eval_overall_recall': 0.00423728813559322, 'eval_overall_f1': 0.007692307692307692, 'eval_overall_accuracy': 0.7095643465197796, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.017857142857142856, 'eval_PER_f1': 0.0, 'eval_runtime': 0.5926, 'eval_samples_per_second': 168.756, 'eval_steps_per_second': 3.375, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:06<00:00,  1.13s/it]


{'train_runtime': 6.7632, 'train_samples_per_second': 44.358, 'train_steps_per_second': 0.887, 'train_loss': 1.0446496804555256, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  3.09it/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 31258.79 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 41813.42 examples/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 49333.15 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 6167.10 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 7151.05 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 12433.09 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another ar

{'eval_loss': 2.4783997535705566, 'eval_runtime': 35.8718, 'eval_samples_per_second': 2.788, 'eval_steps_per_second': 0.056, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [01:28<00:48, 24.24s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [02:19<00:48, 24.24s/it]

{'eval_loss': 2.8094327449798584, 'eval_runtime': 51.1203, 'eval_samples_per_second': 1.956, 'eval_steps_per_second': 0.039, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [03:13<00:00, 36.17s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [03:48<00:00, 36.17s/it]

{'eval_loss': 3.012944221496582, 'eval_runtime': 34.511, 'eval_samples_per_second': 2.898, 'eval_steps_per_second': 0.058, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [03:50<00:00, 38.47s/it]


{'train_runtime': 230.7503, 'train_samples_per_second': 1.3, 'train_steps_per_second': 0.026, 'train_loss': 4.6798702875773115, 'epoch': 3.0}


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 3568.50 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 5178.92 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 9757.83 examples/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:05<00:05,  1.44s/it]

{'eval_loss': 1.2174054384231567, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.7544123169357867, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 2.2098, 'eval_samples_per_second': 45.252, 'eval_steps_per_second': 0.905, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:07<00:03,  1.73s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:11<00:03,  1.73s/it]

{'eval_loss': 1.101224422454834, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.7544123169357867, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 3.7642, 'eval_samples_per_second': 26.566, 'eval_steps_per_second': 0.531, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:13<00:00,  2.25s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:16<00:00,  2.25s/it]

{'eval_loss': 1.0729403495788574, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.7544123169357867, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.0, 'eval_runtime': 2.8296, 'eval_samples_per_second': 35.34, 'eval_steps_per_second': 0.707, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:17<00:00,  2.90s/it]


{'train_runtime': 17.3637, 'train_samples_per_second': 17.277, 'train_steps_per_second': 0.346, 'train_loss': 1.4373068809509277, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.70it/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 20713.64 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 44229.72 examples/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 40685.85 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 6316.06 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 20186.27 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 23207.57 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another a

{'eval_loss': 2.090147018432617, 'eval_runtime': 6.2575, 'eval_samples_per_second': 15.981, 'eval_steps_per_second': 0.32, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:10<00:05,  2.62s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:52<00:05,  2.62s/it]

{'eval_loss': 8.72470474243164, 'eval_runtime': 42.5929, 'eval_samples_per_second': 2.348, 'eval_steps_per_second': 0.047, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:07<00:00, 14.83s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:53<00:00, 14.83s/it]

{'eval_loss': 6.358280181884766, 'eval_runtime': 45.8806, 'eval_samples_per_second': 2.18, 'eval_steps_per_second': 0.044, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:56<00:00, 19.34s/it]


{'train_runtime': 115.9588, 'train_samples_per_second': 2.587, 'train_steps_per_second': 0.052, 'train_loss': 5.752504984537761, 'epoch': 3.0}


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 3053.13 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 13737.85 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 17455.17 examples/s]
  _warn_prf(average, modifier, msg_start, len(result))

 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:03<00:01,  2.50it/s]

{'eval_loss': 2.83341908454895, 'eval_overall_precision': 0.1445012787723785, 'eval_overall_recall': 0.3505687693898656, 'eval_overall_f1': 0.20464835496528824, 'eval_overall_accuracy': 0.14492753623188406, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.0, 'eval_PER_f1': 0.2524199553239017, 'eval_runtime': 2.5687, 'eval_samples_per_second': 38.93, 'eval_steps_per_second': 0.779, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:04<00:02,  1.12s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:06<00:02,  1.12s/it]

{'eval_loss': 1.7586097717285156, 'eval_overall_precision': 0.16112531969309463, 'eval_overall_recall': 0.390899689762151, 'eval_overall_f1': 0.22819197102324176, 'eval_overall_accuracy': 0.16155157715260018, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.3449367088607595, 'eval_PER_f1': 0.22729193071398396, 'eval_runtime': 2.5664, 'eval_samples_per_second': 38.965, 'eval_steps_per_second': 0.779, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:07<00:00,  1.36s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:11<00:00,  1.36s/it]

{'eval_loss': 1.5501452684402466, 'eval_overall_precision': 0.16751918158567775, 'eval_overall_recall': 0.40641158221303, 'eval_overall_f1': 0.23724720796860854, 'eval_overall_accuracy': 0.1679454390451833, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.13153042409342347, 'eval_PER_f1': 0.41690962099125367, 'eval_runtime': 3.4893, 'eval_samples_per_second': 28.659, 'eval_steps_per_second': 0.573, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:12<00:00,  2.11s/it]


{'train_runtime': 12.6383, 'train_samples_per_second': 23.737, 'train_steps_per_second': 0.475, 'train_loss': 1.6229526201883953, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.91it/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 252076.69 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 440717.03 examples/s]
Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:00<00:00, 1424695.65 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 27758.65 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 27489.21 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:00<00:00, 29826.10 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another t

{'eval_loss': 1.8009531497955322, 'eval_runtime': 7.7821, 'eval_samples_per_second': 128.499, 'eval_steps_per_second': 2.056, 'epoch': 1.0}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 500/939 [16:35<14:34,  1.99s/it]

{'loss': 2.1889, 'grad_norm': 11.362300872802734, 'learning_rate': 4.6751863684771034e-05, 'epoch': 1.6}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [20:44<08:47,  1.68s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [20:52<08:47,  1.68s/it]

{'eval_loss': 1.5535551309585571, 'eval_runtime': 7.3463, 'eval_samples_per_second': 136.122, 'eval_steps_per_second': 2.178, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [31:13<00:00,  1.69s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [31:20<00:00,  1.69s/it]

{'eval_loss': 1.5099536180496216, 'eval_runtime': 7.1084, 'eval_samples_per_second': 140.679, 'eval_steps_per_second': 2.251, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [31:21<00:00,  2.00s/it]


{'train_runtime': 1881.3911, 'train_samples_per_second': 31.891, 'train_steps_per_second': 0.499, 'train_loss': 1.8349257724099775, 'epoch': 3.0}


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 21315.23 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 19675.31 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:00<00:00, 22781.50 examples/s]
 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [03:11<05:39,  1.85it/s]
 33%|‚ñà‚ñà‚ñà‚ñé      | 313/939 [03:15<05:39,  1.85it/s]

{'eval_loss': 0.1247803345322609, 'eval_overall_precision': 0.9192448872574724, 'eval_overall_recall': 0.9154046997389034, 'eval_overall_f1': 0.9173207744636317, 'eval_overall_accuracy': 0.9651211801896733, 'eval_LOC_f1': 0.9643835616438357, 'eval_ORG_f1': 0.8879668049792532, 'eval_PER_f1': 0.888504753673293, 'eval_runtime': 3.4526, 'eval_samples_per_second': 289.636, 'eval_steps_per_second': 4.634, 'epoch': 1.0}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 500/939 [05:10<04:30,  1.62it/s]

{'loss': 0.1638, 'grad_norm': 1.1155438423156738, 'learning_rate': 4.6751863684771034e-05, 'epoch': 1.6}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [06:28<02:49,  1.85it/s]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 626/939 [06:31<02:49,  1.85it/s]

{'eval_loss': 0.0987996980547905, 'eval_overall_precision': 0.926954732510288, 'eval_overall_recall': 0.9409921671018276, 'eval_overall_f1': 0.9339207048458149, 'eval_overall_accuracy': 0.9736564805057956, 'eval_LOC_f1': 0.979702300405954, 'eval_ORG_f1': 0.9052631578947369, 'eval_PER_f1': 0.905759162303665, 'eval_runtime': 3.3132, 'eval_samples_per_second': 301.823, 'eval_steps_per_second': 4.829, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [09:44<00:00,  1.85it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [09:48<00:00,  1.85it/s]

{'eval_loss': 0.09813285619020462, 'eval_overall_precision': 0.9441836202399583, 'eval_overall_recall': 0.9451697127937336, 'eval_overall_f1': 0.9446764091858036, 'eval_overall_accuracy': 0.9773445732349842, 'eval_LOC_f1': 0.9782903663500678, 'eval_ORG_f1': 0.9219269102990033, 'eval_PER_f1': 0.9254766031195841, 'eval_runtime': 3.235, 'eval_samples_per_second': 309.114, 'eval_steps_per_second': 4.946, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 939/939 [09:48<00:00,  1.59it/s]


{'train_runtime': 588.8486, 'train_samples_per_second': 101.894, 'train_steps_per_second': 1.595, 'train_loss': 0.10752889168021271, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:02<00:00,  5.47it/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 19638.09 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 21753.56 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 19884.81 examples/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

{'eval_loss': 3.1998937129974365, 'eval_runtime': 0.9826, 'eval_samples_per_second': 101.774, 'eval_steps_per_second': 2.035, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:04<00:02,  1.19s/it]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:05<00:02,  1.19s/it]

{'eval_loss': 2.4259259700775146, 'eval_runtime': 0.2036, 'eval_samples_per_second': 491.248, 'eval_steps_per_second': 9.825, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:07<00:00,  1.05s/it]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:07<00:00,  1.05s/it]

{'eval_loss': 2.734860897064209, 'eval_runtime': 0.2241, 'eval_samples_per_second': 446.311, 'eval_steps_per_second': 8.926, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:08<00:00,  1.37s/it]


{'train_runtime': 8.2086, 'train_samples_per_second': 36.547, 'train_steps_per_second': 0.731, 'train_loss': 3.704561233520508, 'epoch': 3.0}


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 14934.32 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 16005.13 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 15517.79 examples/s]
  _warn_prf(average, modifier, msg_start, len(result))

 33%|‚ñà‚ñà‚ñà‚ñé      | 2/6 [00:01<00:01,  3.50it/s]

{'eval_loss': 1.4743931293487549, 'eval_overall_precision': 0.18085106382978725, 'eval_overall_recall': 0.13821138211382114, 'eval_overall_f1': 0.1566820276497696, 'eval_overall_accuracy': 0.5597122302158274, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.2677165354330709, 'eval_PER_f1': 0.0, 'eval_runtime': 0.5345, 'eval_samples_per_second': 187.086, 'eval_steps_per_second': 3.742, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:01<00:00,  2.04it/s]
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 4/6 [00:02<00:00,  2.04it/s]

{'eval_loss': 1.3270492553710938, 'eval_overall_precision': 0.07954545454545454, 'eval_overall_recall': 0.056910569105691054, 'eval_overall_f1': 0.06635071090047394, 'eval_overall_accuracy': 0.6028776978417266, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.11570247933884296, 'eval_PER_f1': 0.0, 'eval_runtime': 0.6468, 'eval_samples_per_second': 154.596, 'eval_steps_per_second': 3.092, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:03<00:00,  1.72it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:04<00:00,  1.72it/s]

{'eval_loss': 1.2496193647384644, 'eval_overall_precision': 0.08974358974358974, 'eval_overall_recall': 0.056910569105691054, 'eval_overall_f1': 0.06965174129353234, 'eval_overall_accuracy': 0.6143884892086331, 'eval_LOC_f1': 0.0, 'eval_ORG_f1': 0.12612612612612611, 'eval_PER_f1': 0.0, 'eval_runtime': 0.4489, 'eval_samples_per_second': 222.781, 'eval_steps_per_second': 4.456, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:04<00:00,  1.28it/s]


{'train_runtime': 4.6894, 'train_samples_per_second': 63.974, 'train_steps_per_second': 1.279, 'train_loss': 1.4594939549763997, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.82it/s]


In [40]:
language = "./test_metrics_" + "ne" + ".json"
output_file_path = os.path.join(language)
with open(output_file_path, "w") as f:
    json.dump(test_results, f)