In [29]:
import json
import pandas as pd
from tqdm.notebook import tqdm
from transformers import MarianTokenizer, MarianMTModel
import torch
import copy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [34]:
def read_json(filename):
    with open(filename) as f:
        return json.load(f)
    
def write_json(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [35]:
source_langs = ['fr', 'uk', 'lt', 'be', 'ba'] #
target_langs = ['en', 'de', 'ru', 'fr']

In [36]:
def find_representation(lang, questions):
    for q in questions:
        if q['language'] == lang:
            return q['string']
    return None

# Helsinki NLP

In [37]:
def process(data_path, d_type='train'):
    for src in source_langs:
        clear_dict = dict()
        dataset = read_json(data_path)
        dataset_new = copy.deepcopy(dataset)
        for tgt in target_langs:
            if tgt == src:
                continue
            try:
                model = MarianMTModel.from_pretrained(f'Helsinki-NLP/opus-mt-{src}-{tgt}').to(device)
                tokenizer = MarianTokenizer.from_pretrained(f'Helsinki-NLP/opus-mt-{src}-{tgt}')
            except:
                model = None
                print('no model exists')
                continue
            print(src, tgt)
            for i in range(len(dataset['questions'])):
                try:
                    text = find_representation(src, dataset['questions'][i]['question']) # scan for representations in QALD+
                    if not text:
                        dataset_new['questions'][i]['question'].clear()
                        continue

                    batch = tokenizer([text], return_tensors="pt", padding=True).to(device)
                    batch["input_ids"] = batch["input_ids"][:, :512]
                    batch["attention_mask"] = batch["attention_mask"][:, :512]

                    translation = model.generate(**batch)
                    translation = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
                    # append to the dataset
                    if dataset['questions'][i]['id'] not in clear_dict.keys():
                        dataset_new['questions'][i]['question'].clear()
                        clear_dict[dataset['questions'][i]['id']] = True
                    dataset_new['questions'][i]['question'].append({'language': tgt, 'string': translation})
                except Exception as e:
                    print(str(e))
        if model:
            dataset_new['questions'] = [q for q in dataset_new['questions'] if len(q['question']) > 0]
            write_json(dataset_new, f"../data/translated/qald_{d_type}_wikidata-helsinki-{src}.json")

In [38]:
process("/home/ins-alex/qald_plus/data/qald_train_wikidata.json", 'train')

fr en
fr de
fr ru
uk en
uk de
uk ru
uk fr


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-lt-en/resolve/main/config.json


no model exists
lt de
lt ru
lt fr


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-en/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-de/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-ru/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-fr/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-en/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-de/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-ru/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-fr/resolve/main/config.json


no model exists


In [39]:
process("/home/ins-alex/qald_plus/data/qald_test_wikidata.json", 'test')

fr en
fr de
fr ru
uk en
uk de
uk ru
uk fr


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-lt-en/resolve/main/config.json


no model exists
lt de
lt ru
lt fr


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-en/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-de/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-ru/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-be-fr/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-en/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-de/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-ru/resolve/main/config.json


no model exists


404 Client Error: Not Found for url: https://huggingface.co/Helsinki-NLP/opus-mt-ba-fr/resolve/main/config.json


no model exists


# Yandex

In [40]:
import requests
import json
import os

headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer t1.9euelZqdmMacnZLLkMjOzMjJkZKJie3rnpWay8bOl5yJycbNk5abm52VmpHl8_caGztz-e8qOCIe_t3z91pJOHP57yo4Ih7-.YFtkWCIwAOKzsZY2kMrvj3MSgejqD3YyDQgxCso0LbwfEGOLIAkHBpK22N_h6CUocHFGTBS0pYfpS8qw9bWaCg'
}

In [41]:
def process_yandex(data_path, d_type='train'):
    for src in source_langs:
        clear_dict = dict()
        dataset = read_json(data_path)
        dataset_new = copy.deepcopy(dataset)
        for tgt in target_langs:
            if tgt == src:
                continue
            print(src, tgt)
            for i in range(len(dataset['questions'])):
                try:
                    text = find_representation(src, dataset['questions'][i]['question']) # scan for representations in QALD+
                    
                    if not text:
                        dataset_new['questions'][i]['question'].clear()
                        continue
                    print(text)
                    data = {
                        "folder_id": "b1gokvlkpm64tv8932m8",
                        "texts": [text],
                        "sourceLanguageCode": src if src != 'arm' else 'hy',
                        "targetLanguageCode": tgt
                    }
                    print(data)
                    response = requests.post(
                        "https://translate.api.cloud.yandex.net/translate/v2/translate",
                        json=data,
                        headers=headers,
                        timeout=10
                    ).json()
                    
                    translation = response['translations'][0]['text']
                    # append to the dataset
                    if dataset['questions'][i]['id'] not in clear_dict.keys():
                        dataset_new['questions'][i]['question'].clear()
                        clear_dict[dataset['questions'][i]['id']] = True
                    dataset_new['questions'][i]['question'].append({'language': tgt, 'string': translation})
                except Exception as e:
                    dataset_new['questions'][i]['question'].clear()
                    print(str(e))
                    
        dataset_new['questions'] = [q for q in dataset_new['questions'] if len(q['question']) > 0]
        write_json(dataset_new, f"../data/translated/qald_{d_type}_wikidata-yandex-{src}.json")

In [43]:
process_yandex("/home/ins-alex/qald_plus/data/qald_test_wikidata.json", 'test')

fr en
Quels acteurs jouent dans la Théorie du Big Bang?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Quels acteurs jouent dans la Théorie du Big Bang?'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Lequel des spécialistes en informatique a remporté un Oscar?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Lequel des spécialistes en informatique a remporté un Oscar?'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Qui a écrit Harry Potter?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Qui a écrit Harry Potter?'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Donne-moi tous les écrivains qui ont reçu le prix Nobel de littérature.
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Donne-moi tous les écrivains qui ont reçu le prix Nobel de littérature.'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Donnez-moi des acteurs anglais jouant dans " Remember All Links»
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Donnez-moi des acteurs anglais

In [42]:
process_yandex("/home/ins-alex/qald_plus/data/qald_train_wikidata.json", 'train')

fr en
Listez tous les jeux de société de GMT.
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Listez tous les jeux de société de GMT.'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Qui a développé Skype ?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Qui a développé Skype ?'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Qui est né à Heraklion ?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Qui est né à Heraklion ?'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Dans quel Etat des Etats-Unis se situe l'Area 51 ?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ["Dans quel Etat des Etats-Unis se situe l'Area 51 ?"], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Qui est le maire de la cité de New York ?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Qui est le maire de la cité de New York ?'], 'sourceLanguageCode': 'fr', 'targetLanguageCode': 'en'}
Où est décédé Abraham Lincoln ?
{'folder_id': 'b1gokvlkpm64tv8932m8', 'texts': ['Où est dé