In [None]:
!pip3 install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
import sentence_transformers.util
from sklearn.neighbors import NearestNeighbors

import os
import logging
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import io
import torch

In [None]:
def cos_sim(x,y):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    The value is the dot product of the inputs divided by the product of
    their norms (as computed by ``np.linalg.norm`` with default arguments).
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product

In [None]:
def download_dataset(dataset_list):
    """Make sure every requested dataset file is present on disk.

    ``dataset_list`` is either a single path or a list of paths.  Each path
    that does not exist yet is fetched with
    ``sentence_transformers.util.http_get`` from
    ``https://sbert.net/datasets/<basename of the path>`` and stored at that
    path.  Existing files are left untouched.
    """
    paths = dataset_list if isinstance(dataset_list, list) else [dataset_list]

    for filepath in paths:
        if os.path.exists(filepath):
            continue
        print(filepath, "does not exist. Downloading dataset from server")
        url = "https://sbert.net/datasets/" + os.path.basename(filepath)
        sentence_transformers.util.http_get(url, filepath)


def make_parallel_sentences_dataset(parallel_sentences_path,train_corpus,source_languages,target_languages,max_dev_sentences = 2000, max_test_sentences = 2000):
    """Split a multilingual TSV corpus into per-language-pair train/dev/test files.

    For every (source, target) language combination three gzip-compressed,
    header-less TSV files ``TED2020-<src>-<trg>-{train,dev,test}.tsv.gz`` are
    expected inside ``parallel_sentences_path``.  If any of the three is
    missing, all three are (re)generated from ``train_corpus``.

    Args:
        parallel_sentences_path: Directory for the generated files (created if
            it does not exist).
        train_corpus: Path of a gzip-compressed TSV file whose header row names
            one column per language code.
        source_languages: Iterable of source language codes (column names).
        target_languages: Iterable of target language codes (column names).
        max_dev_sentences: Upper bound on the pairs written to each dev file.
        max_test_sentences: Upper bound on the pairs written to each test file.

    Returns:
        ``(train_files, dev_files, test_files)``: three lists of file paths,
        one entry per language pair, in iteration order of the inputs.

    Raises:
        OSError: If ``train_corpus`` cannot be opened.  The corpus is opened
            before any output file, so no empty output files are left behind.
        KeyError: If a requested language is not a column of the corpus.
    """
    from contextlib import ExitStack  # local import keeps the block self-contained

    os.makedirs(parallel_sentences_path, exist_ok=True)
    train_files = []
    dev_files = []
    test_files = []
    pairs_to_create = []
    for source_lang in source_languages:
        for target_lang in target_languages:
            output_filename_train = os.path.join(parallel_sentences_path, "TED2020-{}-{}-train.tsv.gz".format(source_lang, target_lang))
            output_filename_dev = os.path.join(parallel_sentences_path, "TED2020-{}-{}-dev.tsv.gz".format(source_lang, target_lang))
            output_filename_test = os.path.join(parallel_sentences_path, "TED2020-{}-{}-test.tsv.gz".format(source_lang, target_lang))
            train_files.append(output_filename_train)
            dev_files.append(output_filename_dev)
            test_files.append(output_filename_test)
            # The test split is part of the returned result, so it must take
            # part in the "does everything exist already?" check as well.
            split_paths = (output_filename_train, output_filename_dev, output_filename_test)
            if not all(os.path.exists(path) for path in split_paths):
                pairs_to_create.append({'src_lang': source_lang, 'trg_lang': target_lang,
                                        'train_path': output_filename_train,
                                        'dev_path': output_filename_dev,
                                        'test_path': output_filename_test})

    if len(pairs_to_create) > 0:
        print("Parallel sentences files {} do not exist. Create these files now".format(", ".join(map(lambda x: x['src_lang']+"-"+x['trg_lang'], pairs_to_create))))
        # ExitStack closes every handle opened below, also when an error occurs.
        with ExitStack() as stack:
            # Open the corpus first: if it is missing we fail before truncating
            # or creating any output file.
            fIn = stack.enter_context(gzip.open(train_corpus, 'rt', encoding='utf8'))
            files_to_create = []
            for pair in pairs_to_create:
                files_to_create.append({'src_lang': pair['src_lang'], 'trg_lang': pair['trg_lang'],
                                        'fTrain': stack.enter_context(gzip.open(pair['train_path'], 'wt', encoding='utf8')),
                                        'fDev': stack.enter_context(gzip.open(pair['dev_path'], 'wt', encoding='utf8')),
                                        'fTest': stack.enter_context(gzip.open(pair['test_path'], 'wt', encoding='utf8')),
                                        'devCount': 0,
                                        'testCount': 0
                                        })

            reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
            for line in tqdm(reader, desc="Sentences"):
                for outfile in files_to_create:
                    # DictReader fills missing trailing columns with None.
                    src_text = (line[outfile['src_lang']] or "").strip()
                    trg_text = (line[outfile['trg_lang']] or "").strip()

                    if src_text != "" and trg_text != "":
                        # Random draw in [0, 100): 0-9 -> dev, 10-19 -> test,
                        # everything else (or a full split) -> train.
                        num = np.random.randint(100)
                        fOut = outfile['fTrain']
                        if outfile['devCount'] < max_dev_sentences and num < 10:
                            fOut = outfile['fDev']
                            outfile['devCount'] += 1
                        elif outfile['testCount'] < max_test_sentences and num >= 10 and num < 20:
                            fOut = outfile['fTest']
                            outfile['testCount'] += 1

                        fOut.write("{}\t{}\n".format(src_text, trg_text))

    return train_files, dev_files, test_files

In [None]:
class SentenceEncoder():
    """Teacher/student pair of sentence-embedding models plus training helpers.

    The instance stores the model names, the training hyper-parameters and the
    directory the trained student model is written to.  Intended call order:
    ``build_model`` -> ``prepare_dataloader_and_evaluators`` ->
    ``train_student_model``.  ``load_student_model`` replaces the student with
    a model loaded from disk.
    """

    def __init__(self,teacher_model_name,student_model_name,
                 source_languages,target_languages,
                 max_seq_length = 128,
                 train_batch_size = 64,
                 inference_batch_size = 64,
                 max_sentences_per_language = 50000,
                 train_max_sentence_length = 500,
                 num_epochs = 5,
                 num_warmup_steps = 10000,
                 num_evaluation_steps = 1000,
                 dev_sentences = 2000,
                 output_path = 'output'):
        """Record model names and hyper-parameters; no model is loaded here.

        The resulting ``self.output_path`` is
        ``<output_path>/Multilingual-Model-for-<codes>`` where ``<codes>`` are
        the sorted source languages followed by the sorted target languages,
        joined with ``-``.
        """
        # Model identifiers
        self.teacher_model_name = teacher_model_name
        self.student_model_name = student_model_name

        # Languages
        self.source_languages = source_languages
        self.target_languages = target_languages

        # Hyper parameters for training
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.inference_batch_size = inference_batch_size
        self.max_sentences_per_language = max_sentences_per_language
        self.train_max_sentence_length = train_max_sentence_length
        self.num_epochs = num_epochs
        self.num_warmup_steps = num_warmup_steps

        # Evaluation settings
        self.num_evaluation_steps = num_evaluation_steps
        self.dev_sentences = dev_sentences

        # Where the trained student model is stored
        language_tags = sorted(source_languages) + sorted(target_languages)
        self.output_path = output_path + "/Multilingual-Model-for-" + "-".join(language_tags)

    def build_model(self):
        """Instantiate the teacher model and assemble the student model.

        The student is a ``SentenceTransformer`` made of a ``models.Transformer``
        (limited to ``self.max_seq_length``) followed by a ``models.Pooling``
        layer sized to the transformer's word-embedding dimension.
        """
        self.teacher_model = SentenceTransformer(self.teacher_model_name)

        transformer = models.Transformer(self.student_model_name, max_seq_length=self.max_seq_length)
        pooling = models.Pooling(transformer.get_word_embedding_dimension())
        self.student_model = SentenceTransformer(modules=[transformer, pooling])

    @staticmethod
    def _read_sentence_pairs(pair_file):
        """Read a gzip TSV file and return (first-column, second-column) lists.

        Lines whose first or second column is empty are skipped.
        """
        sources, targets = [], []
        with gzip.open(pair_file, 'rt', encoding='utf8') as fIn:
            for line in fIn:
                columns = line.strip().split('\t')
                if columns[0] == "" or columns[1] == "":
                    continue
                sources.append(columns[0])
                targets.append(columns[1])
        return sources, targets

    def prepare_dataloader_and_evaluators(self, train_files, dev_files):
        """Create the training dataloader, the loss and the dev evaluators.

        Sets ``self.train_dataloader``, ``self.train_loss`` and
        ``self.evaluators``.  For every dev file one ``MSEEvaluator`` (given
        the teacher model) and one ``TranslationEvaluator`` are appended, both
        named after the file's basename.
        """
        parallel_data = ParallelSentencesDataset(student_model=self.student_model,
                                                 teacher_model=self.teacher_model,
                                                 batch_size=self.inference_batch_size,
                                                 use_embedding_cache=True)
        for path in train_files:
            parallel_data.load_data(path,
                                    max_sentences=self.max_sentences_per_language,
                                    max_sentence_length=self.train_max_sentence_length)

        self.train_dataloader = DataLoader(parallel_data, shuffle=True, batch_size=self.train_batch_size)
        self.train_loss = losses.MSELoss(model=self.student_model)

        self.evaluators = []
        for path in dev_files:
            sources, targets = self._read_sentence_pairs(path)
            evaluator_name = os.path.basename(path)

            self.evaluators.append(
                evaluation.MSEEvaluator(sources, targets, name=evaluator_name,
                                        teacher_model=self.teacher_model,
                                        batch_size=self.inference_batch_size))
            self.evaluators.append(
                evaluation.TranslationEvaluator(sources, targets, name=evaluator_name,
                                                batch_size=self.inference_batch_size))

    def train_student_model(self):
        """Fit the student model with the prepared dataloader, loss and evaluators.

        All evaluators run through one ``SequentialEvaluator`` whose main score
        is the mean of the individual scores.  The best model is saved to
        ``self.output_path``.
        """
        def mean_score(scores):
            return np.mean(scores)

        combined_evaluator = evaluation.SequentialEvaluator(self.evaluators, main_score_function=mean_score)
        optimizer_settings = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}

        self.student_model.fit(
            train_objectives=[(self.train_dataloader, self.train_loss)],
            evaluator=combined_evaluator,
            epochs=self.num_epochs,
            warmup_steps=self.num_warmup_steps,
            evaluation_steps=self.num_evaluation_steps,
            output_path=self.output_path,
            save_best_model=True,
            optimizer_params=optimizer_settings,
        )

    def load_student_model(self,path = None):
        """Load a student model from ``path`` (default: ``self.output_path``)."""
        model_dir = self.output_path if path is None else path
        self.student_model = SentenceTransformer(model_dir)

In [None]:
# Languages to be provided for Teacher and Student Models
source_languages = set(['en'])
target_languages = set(['de', 'ru', 'fr'])

# Downloading Ted Dataset and creating Parallel Datasets
#download_dataset(["/datasets/ted2020.tsv.gz"])
train_files, dev_files, test_files = make_parallel_sentences_dataset(
    parallel_sentences_path="parallel-sentences/",
    train_corpus="/datasets/ted2020.tsv.gz",
    source_languages=source_languages,
    target_languages=target_languages,
)

# Pre Trained Teacher and Student Models
teacher_model = 'bert-base-nli-stsb-mean-tokens'
student_model = 'sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens'

In [None]:
# Creating an instance of Encoder and Training
# (real comments are used here: a bare string literal as the last expression
# of a cell would be echoed as the cell's output)

encoder = SentenceEncoder(teacher_model_name = teacher_model,
                          student_model_name = student_model,
                          source_languages = source_languages,
                          target_languages = target_languages)
encoder.build_model()

# Uncomment the following lines for creating DataLoader and Training the Model
#encoder.prepare_dataloader_and_evaluators(train_files, dev_files)
#encoder.train_student_model()

100%|██████████| 405M/405M [00:14<00:00, 27.2MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=541.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1112256686.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=147.0, style=ProgressStyle(description_…




In [None]:
# Replace the freshly built student with the model stored at this path.
encoder.load_student_model(path='output/Multilingual-Model-for-en-de-fr-ru')

In [None]:
def similarity_scores(lang1,lang2,test_file):
    """Print the average cosine similarity of the sentence pairs in ``test_file``.

    ``test_file`` is a gzip-compressed, header-less TSV file with one sentence
    pair per line (first column / second column), i.e. the format written by
    ``make_parallel_sentences_dataset``.  Both columns are embedded with the
    module-level ``encoder.student_model`` and compared pairwise with
    ``cos_sim``.

    Args:
        lang1: Display name of the first-column language (used in the message).
        lang2: Display name of the second-column language (used in the message).
        test_file: Path of the ``.tsv.gz`` file holding the sentence pairs.

    Raises:
        ValueError: If the file contains no complete sentence pair.
    """
    src_sentences = []
    trg_sentences = []
    with gzip.open(test_file, 'rt', encoding='utf8') as fIn:
        # A plain reader is used on purpose: the file has no header row, so a
        # DictReader would silently consume the first sentence pair.
        reader = csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) < 2 or row[0] == "" or row[1] == "":
                continue  # skip blank or incomplete lines
            src_sentences.append(row[0])
            trg_sentences.append(row[1])

    if not src_sentences:
        raise ValueError('No sentence pairs found in {}'.format(test_file))

    # One batched encode call per column; numpy output also works when the
    # model lives on a GPU (no manual .detach()/.cpu() needed).
    src_embeddings = np.asarray(encoder.student_model.encode(src_sentences, convert_to_numpy=True))
    trg_embeddings = np.asarray(encoder.student_model.encode(trg_sentences, convert_to_numpy=True))

    total_cos_score = 0.0
    for src_vec, trg_vec in zip(src_embeddings, trg_embeddings):
        # float() keeps the running sum in double precision.
        total_cos_score += float(cos_sim(src_vec, trg_vec))

    print('Similarity between {} and {} sentences : {}'.format(lang1,lang2,total_cos_score/len(src_sentences)))

In [None]:
# Average similarity of the English side against each target-language test set.
_similarity_test_sets = [
    ('French', 'parallel-sentences/TED2020-en-fr-test.tsv.gz'),
    ('German', 'parallel-sentences/TED2020-en-de-test.tsv.gz'),
    ('Russian', 'parallel-sentences/TED2020-en-ru-test.tsv.gz'),
]
for _target_language, _test_set_path in _similarity_test_sets:
    similarity_scores('English', _target_language, _test_set_path)

Similarity between English and French sentences : 0.9192259997258028
Similarity between English and German sentences : 0.9146437048539333
Similarity between English and Russian sentences : 0.8768994552094961


In [None]:
def _read_id_sentence_csv(path):
    """Read a comma-separated ``id,sentence`` file into an insertion-ordered dict."""
    sentences = {}
    with open(path, encoding='utf8', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) >= 2:  # skip blank / malformed rows
                sentences[row[0]] = row[1]
    return sentences


def _encode_unit_length(sentences):
    """Embed sentences in one batched call and scale every vector to unit norm."""
    embeddings = np.asarray(encoder.student_model.encode(list(sentences), convert_to_numpy=True))
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)


def f1_score(sent1_path, sent2_path, gold_path,
             k_neighbors = 5, score_threshold = 1.06,
             cos_sim_threshold = 0.75, verbose = False):
    """Score sentence-pair mining between two sentence files against gold pairs.

    Every sentence of both files is embedded with the module-level
    ``encoder.student_model``.  For each (s1, s2) combination whose cosine
    similarity exceeds ``cos_sim_threshold`` a score is computed as

        cos(s1, s2) / (topk_sum(s1) / (2k) + topk_sum(s2) / (2k))

    where ``topk_sum`` is the sum of the ``k`` largest cosine similarities of
    that sentence to the other file.  All other combinations score 0.  A
    combination is predicted as a pair when its score is at least
    ``score_threshold``.

    Args:
        sent1_path: CSV file with ``id,sentence`` rows for the first language.
        sent2_path: CSV file with ``id,sentence`` rows for the second language.
        gold_path: Tab-separated file whose rows are
            ``<id from sent2_path>\\t<id from sent1_path>``.
        k_neighbors: Number of nearest neighbours used for normalisation;
            clamped to the number of available sentences.
        score_threshold: Minimum score for a combination to count as predicted.
        cos_sim_threshold: Combinations at or below this cosine get score 0.
        verbose: Print progress messages and the final metrics.

    Returns:
        ``(F1, precision, recall)``.  A metric whose denominator is zero is
        reported as ``0.0``.  Gold pairs referring to ids that are absent from
        the sentence files are ignored, as only combinations of loaded
        sentences are evaluated.

    Raises:
        ValueError: If a sentence file is empty or ``k_neighbors`` < 1.
    """
    if k_neighbors < 1:
        raise ValueError('k_neighbors must be at least 1')

    s1_sentences = _read_id_sentence_csv(sent1_path)
    s2_sentences = _read_id_sentence_csv(sent2_path)

    if verbose:
        print('Number of sentences of language 1 : {} and language 2 : {}'.format(len(s1_sentences),len(s2_sentences)))

    if not s1_sentences or not s2_sentences:
        raise ValueError('Both sentence files must contain at least one "id,sentence" row')

    # Row order of the embedding matrices follows the dict insertion order.
    s1_data = _encode_unit_length(s1_sentences.values())
    s2_data = _encode_unit_length(s2_sentences.values())

    if verbose:
        print('Completed Calculating Embeddings of Sentences.')

    # Unit-length rows make the dot product equal to the cosine similarity.
    cos_sims = np.matmul(s1_data,s2_data.T)
    if verbose:
        print('Completed Calculating Cosine Similarities between Sentences.')

    n_s1, n_s2 = cos_sims.shape
    k_row = min(k_neighbors, n_s2)  # neighbours available for an s1 sentence
    k_col = min(k_neighbors, n_s1)  # neighbours available for an s2 sentence

    s1_sum_topk = np.array([
        np.sum(cos_sims[i, np.argpartition(cos_sims[i], -k_row)[-k_row:]])/(2*k_row)
        for i in range(n_s1)
    ])
    s2_sum_topk = np.array([
        np.sum(cos_sims[np.argpartition(cos_sims[:,j], -k_col)[-k_col:], j])/(2*k_col)
        for j in range(n_s2)
    ])

    s1_index = {s1_id: i for i, s1_id in enumerate(s1_sentences)}
    s2_index = {s2_id: j for j, s2_id in enumerate(s2_sentences)}

    gold_pairs = []
    with open(gold_path, encoding='utf8', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter='\t'):
            if len(row) >= 2:
                # Gold rows list the second-file id first; store as (s1, s2).
                gold_pairs.append((row[1],row[0]))

    if verbose:
        print('Number of Gold Standard Pairs : {}'.format(len(gold_pairs)))

    # Column indices of the gold partners for every s1 row.  The set removes
    # duplicate gold rows so each combination is counted once.
    gold_columns = {}
    for s1_id, s2_id in set(gold_pairs):
        if s1_id in s1_index and s2_id in s2_index:
            gold_columns.setdefault(s1_index[s1_id], []).append(s2_index[s2_id])

    # Score and count one row at a time: vectorised over the s2 axis without
    # materialising a second full score matrix.
    TP,FP,FN = 0,0,0
    for i in range(n_s1):
        row = cos_sims[i]
        candidates = row > cos_sim_threshold
        row_scores = np.zeros_like(row)
        row_scores[candidates] = row[candidates]/(s1_sum_topk[i]+s2_sum_topk[candidates])
        predicted = row_scores >= score_threshold

        gold_cols = gold_columns.get(i, [])
        hits = int(np.count_nonzero(predicted[gold_cols])) if gold_cols else 0
        TP += hits
        FP += int(np.count_nonzero(predicted)) - hits
        FN += len(gold_cols) - hits

    if verbose:
        print('Completed Calculating Scores between Sentences.')

    P = TP/(TP+FP) if (TP+FP) > 0 else 0.0
    R = TP/(TP+FN) if (TP+FN) > 0 else 0.0
    F1 = (2*P*R)/(P+R) if (P+R) > 0 else 0.0
    if verbose:
        print('F1-Score : {} \t Precision : {} \t Recall : {}'.format(F1,P,R))

    return F1, P, R


In [None]:
# F1 Score of En-De
f1_score(sent1_path='datasets/bucc2017-de-en.sample-gold/bucc2017/de-en/en-sent.csv',
         sent2_path='datasets/bucc2017-de-en.sample-gold/bucc2017/de-en/de-sent.csv',
         gold_path='datasets/bucc2017-de-en.sample-gold/bucc2017/de-en/de-en.sample.gold',
         verbose=True)

Number of sentences of language 1 : 9663 and language 2 : 13567
Completed Calculating Embeddings of Sentences.
Completed Calculating Cosine Similarities between Sentences.
Completed Calculating Scores between Sentences.
Number of Gold Standard Pairs : 1038
F1-Score : 0.8538851351351351 	 Precision : 0.7601503759398496 	 Recall : 0.9739884393063584


In [None]:
# F1 Score of En-Fr
f1_score(sent1_path='datasets/bucc2017-fr-en.sample-gold/bucc2017/fr-en/en-sent.csv',
         sent2_path='datasets/bucc2017-fr-en.sample-gold/bucc2017/fr-en/fr-sent.csv',
         gold_path='datasets/bucc2017-fr-en.sample-gold/bucc2017/fr-en/fr-en.sample.gold',
         verbose=True)

Number of sentences of language 1 : 9097 and language 2 : 9116
Completed Calculating Embeddings of Sentences.
Completed Calculating Cosine Similarities between Sentences.
Completed Calculating Scores between Sentences.
Number of Gold Standard Pairs : 929
F1-Score : 0.8624210014584346 	 Precision : 0.7863475177304965 	 Recall : 0.9547900968783638


In [None]:
# F1 Score of En-Ru
f1_score(sent1_path='datasets/bucc2017-ru-en.sample-gold/bucc2017/ru-en/en-sent.csv',
         sent2_path='datasets/bucc2017-ru-en.sample-gold/bucc2017/ru-en/ru-sent.csv',
         gold_path='datasets/bucc2017-ru-en.sample-gold/bucc2017/ru-en/ru-en.gold',
         verbose=True)

Number of sentences of language 1 : 10678 and language 2 : 9927
Completed Calculating Embeddings of Sentences.
Completed Calculating Cosine Similarities between Sentences.
Completed Calculating Scores between Sentences.
Number of Gold Standard Pairs : 904
F1-Score : 0.7481513701609396 	 Precision : 0.6164874551971327 	 Recall : 0.9513274336283186
