In [1]:
from torch.utils.data import Dataset, random_split
import json

max_dataset_size = 220000
train_set_size = 200000
valid_set_size = 20000

class TRANS(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, dafa_file):
        Data = {}
        with open(dafa_file, 'rt', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if idx >= max_dataset_size:
                    break
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

data = TRANS('data/translation2019zh/translation2019zh_train.json')
train_data, valid_data = random_split(data, [train_set_size, valid_set_size])
test_data = TRANS('data/translation2019zh/translation2019zh_valid.json')

print(f"train set size: {len(train_data)}")
print(f"valid set size: {len(valid_data)}")
print(next(iter(train_data)))

train set size: 200000
valid set size: 20000
{'english': 'Again, improved efficiency confers great benefits in the longer term. However, to the extent that firms are able to find further cost-cutting measures as output expands, they may delay hiring.', 'chinese': '提高效率赋予了企业长远而巨大的好处，但是，只要企业在提高产出时能够找到进一步削减成本的措施，他们可能会延迟聘用新员工。'}


In [2]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [3]:
zh_sentence = train_data[0]['chinese']
en_sentence = train_data[0]['english']

inputs = tokenizer(zh_sentence)
targets = tokenizer(text_target=en_sentence)

wrong_targets = tokenizer(en_sentence)
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))
print(tokenizer.convert_ids_to_tokens(targets['input_ids']))
print(tokenizer.convert_ids_to_tokens(wrong_targets['input_ids']))

['▁', '提高效率', '赋予', '了', '企业', '长远', '而', '巨大的', '好处', ',', '但是', ',', '只要', '企业', '在', '提高', '产出', '时', '能够', '找到', '进一步', '削减', '成本', '的措施', ',', '他们', '可能会', '延迟', '聘用', '新', '员工', '。', '</s>']
['▁Again', ',', '▁improved', '▁efficiency', '▁confer', 's', '▁great', '▁benefits', '▁in', '▁the', '▁longer', '▁term', '.', '▁However', ',', '▁to', '▁the', '▁extent', '▁that', '▁firms', '▁are', '▁able', '▁to', '▁find', '▁further', '▁cost', '-', 'cutting', '▁measures', '▁as', '▁output', '▁expand', 's', ',', '▁they', '▁may', '▁delay', '▁hiring', '.', '</s>']
['▁A', 'g', 'ain', ',', '▁', 'im', 'pro', 've', 'd', '▁', 'e', 'ff', 'ic', 'i', 'en', 'cy', '▁con', 'f', 'ers', '▁g', 're', 'at', '▁be', 'ne', 'fi', 'ts', '▁in', '▁the', '▁l', 'ong', 'er', '▁', 'ter', 'm', '.', '▁How', 'ever', ',', '▁to', '▁the', '▁', 'ex', 't', 'ent', '▁that', '▁f', 'ir', 'ms', '▁are', '▁', 'able', '▁to', '▁f', 'ind', '▁f', 'ur', 'ther', '▁c', 'ost', '-', 'c', 'ut', 'ting', '▁me', 'as', 'ure', 's', '▁', 'as', '▁out', 'p', '

In [4]:
import torch 
max_input_length = 128
max_target_length = 128

inputs = [train_data[s_idx]['chinese'] for s_idx in range(4)]
targets = [train_data[s_idx]['english'] for s_idx in range(4)]

model_inputs = tokenizer(
    inputs,
    padding=True,
    max_length=max_input_length,
    truncation=True,
    return_tensors='pt'
)
labels = tokenizer(
    text_target=targets,
    padding=True,
    max_length=max_target_length,
    truncation=True,
    return_tensors='pt'
)['input_ids']

end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
for idx, end_idx in enumerate(end_token_index):
    labels[idx][end_idx + 1:] = -100

print("batch_X shape: ", {k: v.shape for k, v in model_inputs.items()})
print("batch_y.shape: ", labels.shape)
print(model_inputs)
print(labels)


batch_X shape:  {'input_ids': torch.Size([4, 35]), 'attention_mask': torch.Size([4, 35])}
batch_y.shape:  torch.Size([4, 43])
{'input_ids': tensor([[    7, 20138,  4448,    55,  1667, 22949,   166,  8534, 10980,     2,
          1443,     2,  5600,  1667,    36,  1002,  5353,   142,   577,  2314,
           784,  6664,  3861,  2573,     2,   327,  3382, 11453, 12940,   636,
         11747,     9,     0, 65000, 65000],
        [    7,  7395,   452,     2,  1003,   309,  1944, 20318,    13,  1010,
          1107,   155, 19904,  1121, 13961,  2476,     2, 27013, 16144, 45268,
             9,   582,     2,   327,   365, 61772,  4974,   864,  2973,   453,
          1102,  2585,     9,     0, 65000],
        [10814, 11202,    35,  9803,  2532,   999,  1596, 13699,  8792,     2,
           330,   475,    36, 29187,  4663, 30440,     2,  6319, 34909,  3265,
          1164,  3818,     2, 35294, 14842,  6128, 41965,     2,  6128,   785,
         14246,   241,    48,     0, 65000],
        [ 4159

In [5]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM

max_length = 128

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample['chinese'])
        batch_targets.append(sample['english'])
    batch_data = tokenizer(
        batch_inputs,
        text_target=batch_targets,
        padding=True,
        max_length=max_length,
        truncation=True,
        return_tensors='pt'
    )
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(batch_data['labels'])
    end_token_index = torch.where(batch_data['labels'] == tokenizer.eos_token_id)[1]
    for idx, end_idx in enumerate(end_token_index):
        batch_data['labels'][idx][end_idx + 1:] = -100
    return batch_data

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=32, shuffle=False, collate_fn=collote_fn)

Using cpu device


In [6]:
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
batch shape: {'input_ids': torch.Size([32, 54]), 'attention_mask': torch.Size([32, 54]), 'labels': torch.Size([32, 50]), 'decoder_input_ids': torch.Size([32, 50])}
{'input_ids': tensor([[  106,   173,   728,  ..., 65000, 65000, 65000],
        [    7,  8357,  4067,  ..., 65000, 65000, 65000],
        [    7,  3045,  1381,  ..., 65000, 65000, 65000],
        ...,
        [ 4743,    31,    80,  ..., 65000, 65000, 65000],
        [    7,    65,  1098,  ..., 65000, 65000, 65000],
        [    7,  6571,    16,  ..., 65000, 65000, 65000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[   66,     7,     5,  ...,  -100,  -100,  -100],
        [10274,  1552, 60718,  ...,  -100,  -100,  -100],
        [15298,     6,

In [None]:
from torch import nn
from transformers import AutoConfig
from transformers import MarianPreTrainedModel, MarianModel

class MarianForMT(MarianPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = MarianModel(config)
        target_vocab_size = config.decoder_vocab_size
        self.register_buffer("final_logits_bias", torch.zeros((1, target_vocab_size)))
        self.lm_head = nn.Linear(config.d_model, target_vocab_size, bias=False)
        self.post_init()
    
    def forward(self, x):
        output = self.model(**x)
        sequence_output = output.last_hidden_state
        lm_logits = self.lm_head(sequence_output) + self.fina_logits_bias
        return lm_logits
    
    def other_func(self):
        pass

config = AutoConfig.from_pretrained(model_checkpoint)
model = MarianForMT.from_pretrained(model_checkpoint, config=config).to(device)
print(model)

loading configuration file config.json from cache at /Users/zhouke/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-zh-en/snapshots/cf109095479db38d6df799875e34039d4938aaa6/config.json
Model config MarianConfig {
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 65000,
  "decoder_vocab_size": 65001,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 65001,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",

MarianForMT(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, 

Attempting to create safetensors variant
Safetensors PR exists


In [18]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f"loss: {0:>7f}")
    finish_batch_num = epoch * len(dataloader)

    model.train()
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = batch_data.to(device)
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f"loss: {total_loss / (finish_batch_num + batch):>7f}")
        progress_bar.update(1)
    return total_loss

In [10]:
from sacrebleu.metrics import BLEU

predictions =[
    "This plugin lets you translate web pages between several languages automatically."
]
bad_predictions_1 = ["This This This This"]
bad_predictions_2 = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

bleu = BLEU()
print(bleu.corpus_score(predictions, references).score)
print(bleu.corpus_score(bad_predictions_1, references).score)
print(bleu.corpus_score(bad_predictions_2, references).score)

46.750469682990186
1.683602693167689
0.0


In [11]:
from sacrebleu.metrics import BLEU
predictions = [
    "我在苏州大学学习计算机，苏州大学很美丽。"
]

references = [
    [
        "我在环境优美的苏州大学学习计算机。"
    ]
]

bleu = BLEU(tokenize='zh')
print(f'BLEU: {bleu.corpus_score(predictions, references).score}')
bleu = BLEU()
print(f'wrong BLEU: {bleu.corpus_score(predictions, references).score}')

BLEU: 45.340106118883234
wrong BLEU: 0.0


In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

setence = "我叫张三，我住在苏州。"

sentence_inputs = tokenizer(setence, return_tensors='pt').to(device)
setence_generated_tokens = model.generate(
    sentence_inputs["input_ids"],
    attention_mask=sentence_inputs['attention_mask'],
    max_length=128
)

sentence_decoder_pred = tokenizer.decode(setence_generated_tokens[0], skip_special_tokens=True)
print(sentence_decoder_pred)

Using cpu device


loading configuration file config.json from cache at /Users/zhouke/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-zh-en/snapshots/cf109095479db38d6df799875e34039d4938aaa6/config.json
Model config MarianConfig {
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 65000,
  "decoder_vocab_size": 65001,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 65001,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",

My name is Zhang San, and I live in Suzhou.


Safetensors PR exists


In [14]:
from sacrebleu.metrics import BLEU
import numpy as np
bleu = BLEU()


def test_loop(dataloader, model):
    preds, labels = [], []
    model.eval()

    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_lenth=max_length,
            ).cpu().numpy()
            label_tokens = batch_data["labels"].cpu().numpy()

            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

            preds += [pred.strip() for pred in decoded_preds]
            labels += [[label.strip()] for label in decoded_labels]
    return bleu.corpus_score(preds, labels).score

In [19]:
from transformers import get_scheduler
from torch.optim import AdamW

lr = 2e-5
epoch_num = 1

optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num * len(train_dataloader)
)

total_loss = 0.
best_bleu = 0.
for t in range(epoch_num):
    print(f"Epoch {t + 1} / {epoch_num} \n-------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t + 1, total_loss)
    valid_bleu = test_loop(valid_dataloader, model, mode="Valid")
    print(f"BLEU: {valid_bleu:>0.2f}\n")
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        print("Saving new weights ... \n")
        torch.save(model.state_dict(), f"ecpoch_{t + 1}_valid_bleu_{valid_bleu:>0.2f}_model_weights.bin")
print("Done!")

Epoch 1 / 1 
-------------------------



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

TypeError: test_loop() got an unexpected keyword argument 'mode'