In [None]:
import datasets

# Fetch the training split of the 'ted_multi' dataset.
ted = datasets.load_dataset(path='ted_multi', split='train')
ted

# Inspect the first record to see how a row is laid out.
ted[0]



In [None]:
# find where English ('en') sits in the first record's list of language codes
first_languages = ted[0]['translations']['language']
idx = first_languages.index('en')
idx

In [None]:


# the translation stored at the same position as 'en' is the English text
first_texts = ted[0]['translations']['translation']
source = first_texts[idx]
source



In [None]:
# pair the English source with every other translation of the first record;
# position `idx` is skipped because that entry is the English text itself
pairs = [
    (source, translation)
    for i, translation in enumerate(ted[0]['translations']['translation'])
    if i != idx
]

# let's see what we have
pairs[0]


In [None]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar

# target languages that will be paired with English
lang_list = ['it', 'es', 'ar', 'fr', 'de']
# one list of tab-separated "source<TAB>translation" lines per language pair
train_samples = {
    f'en-{lang}': []
    for lang in lang_list
}

# walk every row and collect the pairs for the languages we keep
for row in tqdm(ted):
    languages = row['translations']['language']
    texts = row['translations']['translation']
    # English is the source side of every pair
    idx = languages.index('en')
    source = texts[idx].strip()
    for i, lang in enumerate(languages):
        # ignore languages outside our selection
        if lang not in lang_list:
            continue
        translation = texts[i].strip()
        train_samples[f'en-{lang}'].append(source+'\t'+translation)

In [None]:


# report the number of collected pairs per language pair
for lang_pair, samples in train_samples.items():
    print(f'{lang_pair}: {len(samples)}')



In [None]:


# preview one tab-separated line, built from the `source` and `translation`
# values left over from the last iteration of the sample-building loop
f'{source}\t{translation}'



In [None]:


import gzip
import os  # needed for the directory handling below; it was never imported

# Make sure the output directory exists. `exist_ok=True` makes the cell safe to
# re-run and replaces the separate exists()/mkdir() pair.
os.makedirs('./data', exist_ok=True)

# save to file, sentence transformers reader will expect tsv.gz file
# Each language pair gets its own gzip-compressed, UTF-8 encoded file with one
# "source<TAB>translation" line per sample.
for lang_pair, samples in train_samples.items():
    with gzip.open(f'./data/ted-train-{lang_pair}.tsv.gz', 'wt', encoding='utf-8') as f:
        f.write('\n'.join(samples))



In [None]:
from transformers import BertTokenizer

# load the tokenizer that ships with the 'bert-base-uncased' checkpoint
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [None]:


# a handful of sentences written in different scripts
sentences = [
    'we will include several languages',
    '一些中文单词',
    'το ελληνικό αλφάβητο είναι πολύ ωραίο',
    'ჩვენ გვაქვს ქართული'
]

# show how the BERT tokenizer splits each sentence
for text in sentences:
    tokens = bert_tokenizer.tokenize(text)
    print(tokens)



In [None]:
from transformers import XLMRobertaTokenizer

# load the tokenizer that ships with the 'xlm-roberta-base' checkpoint
xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained(
    'xlm-roberta-base'
)


In [None]:


# show how the XLM-R tokenizer splits the same sentences
for text in sentences:
    tokens = xlmr_tokenizer.tokenize(text)
    print(tokens)



In [None]:
# SentenceTransformer is imported here as well: it is used below, and the only
# other import of it sits in a later cell, which breaks a fresh top-to-bottom run.
from sentence_transformers import SentenceTransformer, models

# transformer module initialised from the 'xlm-roberta-base' checkpoint
xlmr = models.Transformer('xlm-roberta-base')
# pooling module sized to the transformer's word embedding dimension, with
# mean-token pooling enabled
pooler = models.Pooling(
    xlmr.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

# student model = transformer followed by the pooling module
student = SentenceTransformer(modules=[xlmr, pooler])
student

In [None]:


from sentence_transformers import SentenceTransformer

# load the 'all-mpnet-base-v2' model as the teacher and display its modules
teacher = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')
teacher



In [None]:


# rebind `teacher` to the 'paraphrase-distilroberta-base-v2' model and display it
teacher = SentenceTransformer(
    model_name_or_path='paraphrase-distilroberta-base-v2'
)
teacher



In [None]:
from sentence_transformers import ParallelSentencesDataset
# dataset that ties the student and teacher models together; sentence files
# are attached to it afterwards via `load_data`
data = ParallelSentencesDataset(
    student_model=student,
    teacher_model=teacher,
    batch_size=32,
    use_embedding_cache=True,
)

In [None]:
import os  # required by the directory listing below; it was never imported

max_sentences_per_language = 500000
train_max_sentence_length = 250 # max num of characters per sentence

data_dir = './data'
# Keep only file names containing 'train'. Sorting makes the load order
# reproducible, since os.listdir returns entries in arbitrary order.
train_files = sorted(f for f in os.listdir(data_dir) if 'train' in f)
for f in train_files:
    print(f)
    # register the file with the parallel-sentences dataset, capped by the
    # limits defined above
    data.load_data(
        os.path.join(data_dir, f),
        max_sentences=max_sentences_per_language,
        max_sentence_length=train_max_sentence_length
    )

In [None]:


from torch.utils.data import DataLoader

# shuffled loader over the parallel-sentences dataset, 32 items per batch
loader = DataLoader(dataset=data, batch_size=32, shuffle=True)



In [None]:
from sentence_transformers import losses

# mean-squared-error loss attached to the student model
loss = losses.MSELoss(student)

In [None]:


from sentence_transformers import evaluation
import numpy as np

epochs = 1
# use 10% of the total number of training batches as warmup steps
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# NOTE(review): `correct_bias` is presumably only accepted by some optimizer
# implementations -- confirm it matches the optimizer used by the installed
# sentence-transformers version.
optimizer_params = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}

# NOTE(review): no evaluator is passed, so `save_best_model` presumably has no
# effect here -- verify against the library documentation.
student.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    optimizer_params=optimizer_params,
    output_path='./xlmr-ted',
    save_best_model=True,
    show_progress_bar=False,
)



In [None]:
import datasets

# test split of the 'en' configuration of 'stsb_multi_mt'
en = datasets.load_dataset(path='stsb_multi_mt', name='en', split='test')
en

In [None]:
# test split of the 'it' configuration of 'stsb_multi_mt'
it = datasets.load_dataset(path='stsb_multi_mt', name='it', split='test')
it

In [None]:
# first row of the 'en' test split
en[0]

# first row of the 'it' test split, for comparison with the row above
it[0]

In [None]:
def _scale_similarity(example):
    """Return the row's `similarity_score` divided by 5.0."""
    return {'similarity_score': example['similarity_score'] / 5.0}


# apply the same rescaling to both splits
en = en.map(_scale_similarity)
it = it.map(_scale_similarity)

en[0]

In [None]:
from sentence_transformers import InputExample

# three evaluation sets: English-only, Italian-only, and cross-lingual
en_samples, it_samples, en_it_samples = [], [], []

for i in range(len(en)):
    # English sentence pair with the English score
    en_row = en[i]
    en_samples.append(InputExample(
        texts=[en_row['sentence1'], en_row['sentence2']],
        label=en_row['similarity_score'],
    ))
    # Italian sentence pair with the Italian score
    it_row = it[i]
    it_samples.append(InputExample(
        texts=[it_row['sentence1'], it_row['sentence2']],
        label=it_row['similarity_score'],
    ))
    # English first sentence against Italian second sentence, English score
    en_it_samples.append(InputExample(
        texts=[en_row['sentence1'], it_row['sentence2']],
        label=en_row['similarity_score'],
    ))


In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# build one evaluator per sample list, in the order en, it, en-it;
# CSV output is switched off for all of them
en_eval, it_eval, en_it_eval = (
    EmbeddingSimilarityEvaluator.from_input_examples(examples, write_csv=False)
    for examples in (en_samples, it_samples, en_it_samples)
)


In [None]:


from sentence_transformers import SentenceTransformer

# load the model saved under './xlmr-ted'
model = SentenceTransformer(model_name_or_path='./xlmr-ted')

# run the English evaluator on it
en_eval(model)



In [None]:


# run the evaluator built from the Italian samples on the loaded model
it_eval(model)



In [None]:


# run the evaluator built from the mixed English/Italian samples on the loaded model
en_it_eval(model)



In [None]:


from sentence_transformers import models

# build a fresh student: transformer module followed by a pooling module
xlmr = models.Transformer('xlm-roberta-base')
# size the pooling module from the transformer's word embedding dimension
embedding_dim = xlmr.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

student_modules = [xlmr, pooler]
student = SentenceTransformer(modules=student_modules)



In [None]:
from sentence_transformers import SentenceTransformer

# load the 'paraphrase-multilingual-mpnet-base-v2' model
model = SentenceTransformer(
    model_name_or_path='paraphrase-multilingual-mpnet-base-v2'
)

# short alias for the model's bound `encode` method
embed = model.encode
