In [1]:
import torch
from transformers import XLMRobertaTokenizer

#Data "Source lang" : "target lang"
from datasets import load_dataset

ted = load_dataset('ted_multi', split='train')
ted

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['translations', 'talk_name'],
    num_rows: 258098
})

In [3]:
idx = ted[0]['translations']['language'].index('pl')
source = ted[0]['translations']['translation'][idx]
source

'Wśród problemów z jakimi zmagamy się dziś , mamy na myśli przede wszystkim te finansowe i ekonomiczne . Problemem , który mnie interesuje najbardziej jest deficyt politycznego dialogu , brak umiejętności odniesienia się do współczesnych konfliktów , dotarcia do ich źródła i zrozumienia kim są ich kluczowi gracze i jak z nimi postępować .'

## 1. Przygotowanie danych

In [32]:
from tqdm.auto import tqdm

source = 'ja'
target = 'pl'
train_x = []
id = 0

for row in tqdm(ted):
    if source in row['translations']['language']:
        idx = row['translations']['language'].index(source)
        source_sentence = row['translations']['translation'][idx].strip()
        for i, lang in enumerate(row['translations']['language']):
            if lang == target:
                translation_sentence = row['translations']['translation'][i].strip()
                train_x.append({'id': id, 'translation': {source: source_sentence, target: translation_sentence}})
                id += 1

100%|███████████████████████████████████████████████████████████████████████| 258098/258098 [00:15<00:00, 16143.82it/s]


In [37]:
import pandas as pd
#Zapisz Dane do Jsona
df = pd.DataFrame.from_records(train_x)
df.to_json(f"ted_multi_jp-pl.json", orient="records", lines=True, force_ascii=False)

print(len(train_x))
train_x[0]

165758


{'id': 0,
 'translation': {'ja': '我々が今日直面している 様々な機能不全のなかで — 財政や経済が最初に思いつきますが — 私が一番 憂慮しているのは 政治的対話の欠乏です 我々が 近年の紛争において 状況を把握し その根本原因を探り 中心人物を理解し 彼らと交渉をする能力です',
  'pl': 'Wśród problemów z jakimi zmagamy się dziś , mamy na myśli przede wszystkim te finansowe i ekonomiczne . Problemem , który mnie interesuje najbardziej jest deficyt politycznego dialogu , brak umiejętności odniesienia się do współczesnych konfliktów , dotarcia do ich źródła i zrozumienia kim są ich kluczowi gracze i jak z nimi postępować .'}}

### Zapisanie danych do pliku

In [26]:
import gzip
import os

if not os.path.exists('./data'):
    os.mkdir('./data')

#zapisz do pliku
for lang_pair in train_x.keys():
    with gzip.open(f'./data/ted-train-{lang_pair}.txt', 'wt', encoding='utf-8') as f:
        f.write('\n'.join(train_x[lang_pair]))

TypeError: sequence item 0: expected str instance, dict found

### Tokenizer

In [6]:
from transformers import XLMRobertaTokenizer

sentences = train_x[f'{source}-{target}'][0].split('\t')

xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
print(xlmr_tokenizer.tokenize(sentences[0]))

['▁', '我々', 'が', '今日', '直', '面', 'している', '▁', '様々な', '機能', '不', '全', 'のなかで', '▁—', '▁', '財政', 'や', '経済', 'が', '最初に', '思い', 'つき', 'ますが', '▁—', '▁', '私が', '一番', '▁', '憂', '慮', 'している', 'のは', '▁', '政治', '的', '対', '話', 'の', '欠', '乏', 'です', '▁', '我々', 'が', '▁', '近年', 'の', '紛', '争', 'において', '▁', '状況', 'を', '把握', 'し', '▁その', '根本', '原因', 'を', '探', 'り', '▁', '中心', '人物', 'を理解し', '▁', '彼ら', 'と', '交渉', 'をする', '能力', 'です']


## 2. Transfer learning (teacher-student)

In [7]:
from sentence_transformers import models, SentenceTransformer
xlmr = models.Transformer('xlm-roberta-base')
pooler = models.Pooling(
    xlmr.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True)

student_model = SentenceTransformer(modules=[xlmr, pooler], device='cuda')
student_model

  with safe_open(checkpoint_file, framework="pt") as f:
  return self.fget.__get__(instance, owner)()
  storage = cls(wrap_storage=untyped_storage)
  with safe_open(filename, framework="pt", device=device) as f:
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [8]:
teacher_model = SentenceTransformer('paraphrase-distilroberta-base-v2', device='cuda')
teacher_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [9]:
from sentence_transformers import ParallelSentencesDataset

data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=32, use_embedding_cache=True)

max_sentences_per_language = 500000
train_max_sentence_length = 250

#loading data
train_files = [f for f in os.listdir('./data') if 'train' in f]
for f in train_files:
    print(f)
    data.load_data('./data/'+f, max_sentences=max_sentences_per_language)

ted-train-ja-pl.tsv.gz


In [10]:
data

<sentence_transformers.datasets.ParallelSentencesDataset.ParallelSentencesDataset at 0x20949835180>

### Dataloader

In [11]:
from torch.utils.data import DataLoader

dataLoader = DataLoader(data, shuffle=True, batch_size=32)

### Loss

In [12]:
from sentence_transformers.losses import MSELoss

loss = MSELoss(model=student_model)

## Trening

In [13]:
epochs = 1
warmup_steps = int(len(dataLoader) * epochs * 0.1)

student_model.fit(
    train_objectives=[(dataLoader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./xlmr-ted',
    optimizer_params={'lr': 2e-5, 'eps': 1e-6},
    save_best_model=True,
    show_progress_bar=True
)

Epoch:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]
  labels = torch.tensor(labels)

Iteration:   0%|                                                                    | 1/8024 [00:00<2:01:26,  1.10it/s][A
Iteration:   0%|                                                                    | 2/8024 [00:01<1:21:12,  1.65it/s][A
Iteration:   0%|                                                                    | 3/8024 [00:01<1:03:51,  2.09it/s][A
Iteration:   0%|                                                                    | 4/8024 [00:02<1:20:38,  1.66it/s][A
Iteration:   0%|                                                                    | 5/8024 [00:04<1:52:04,  1.19it/s][A
Iteration:   0%|                                                                   | 6/8024 [00:50<21:02:00,  9.44s/it][A
Iteration:   0%|                                                                   | 7/8024 [00:52<18:10:42, 