In [18]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [19]:
from conllu import parse
import pandas as pd
import os

# Define the language code, used in the file names
lang_code = "PT"

# Main path
main_path = "/home/tajak/Parlamint-translation"

# Check whether the path to the folder with conllu files is ok
path = "{}/Source-data/ParlaMint-{}.conllu/ParlaMint-{}.conllu".format(main_path, lang_code, lang_code)

# Define other paths
extracted_dataframe_path = "{}/results/{}/ParlaMint-{}-extracted-source-data.csv".format(main_path, lang_code, lang_code)

We need to translate the following corpora into English:
- Belgian (BE) - in Dutch and French, separate CONLL-us (!!). French: ["fr", "itc","roa"], Dutch: ["nl", "gem", "gmw"]
- Bulgarian (BG): ["bg", "sla", "zls"]
- Croatian (HR) -> "zls"
- Czech (CZ) -> "cs"
- Danish (DK): ["da", "gmq", "gem"]
- Dutch (NL): ["nl", "gem", "gmw"]
- French (FR): ["fr", "itc","roa"]
- Hungarian (HU): ["hu", "fiu", "urj"]
- Icelandic (IS): ["is","gmq", "gem"]
- Italian (IT): ["it", "roa", "itc"]
- Latvian (LV): ["lv", "bat"]
- Lithuanian (LT): only "bat"
- Polish (PL): ["pl", "sla", "zlw"]
- Slovenian (SI) - We will use "Slavic MT" based on the results of the manual analysis --> "sla"
- Spanish? (ES): ["es", "roa", "itc"]
- Turkish (TR): ["tr", "trk" ]
- Austrian (AT): ["de", "gem", "gmw"]
- Basque (ES-PV): ["eu", "mul"]
- Bosnian (BA): ["sla", "zls"]
- Catalan (ES-CT): ["ca", "roa", "itc"]
- Estonian (EE): ["et", "urj", "fiu"]
- Finnish (FI): ["fi", "urj", "fiu"]
- Galician (ES-GA): ["gl", "roa", "itc"]
- Greek (GR):  ["el", "grk"]
- Norwegian (NO): ["gem", "gmq"]
- Portuguese (PT): ["pt", "roa", "itc"]
- Romanian (RO): ["roa", "itc"]
- Serbian (RS): ["sla", "zls"]
- Swedish (SE): ["sv", "gmq", "gem"]
- Ukrainian (UA): ["uk", "sla", "zle"]

Languages with only one option: Lithuanian ("LT": "bat")

Explanation of language codes:
- sla = Slavic
- zls = South Slavic
- zlw = West Slavic
- zle = East Slavic
- gmq = North Germanic
- gem = Germanic
- gmw = West Germanic
- roa = Romance
- itc = Italic
- bat = Baltic
- trk = Turkic
- urj = Uralic
- fiu = Finno-Ugrian

In [25]:
from easynmt import EasyNMT

# Define the model
model = EasyNMT('opus-mt')

print("\n\nAll languages with target_lang=de. I.e., we can translate from these languages to English (en).")
print(model.get_languages(source_lang='en'))





All languages with target_lang=de. I.e., we can translate from these languages to English (en).
['aav', 'af', 'alv', 'ar', 'az', 'bat', 'bcl', 'bem', 'ber', 'bg', 'bi', 'bnt', 'bzs', 'ca', 'ceb', 'cel', 'chk', 'cpf', 'crs', 'cs', 'cus', 'cy', 'da', 'de', 'dra', 'ee', 'efi', 'el', 'eo', 'es', 'et', 'eu', 'euq', 'fi', 'fj', 'fr', 'ga', 'gaa', 'gil', 'gl', 'grk', 'guw', 'gv', 'ha', 'he', 'hi', 'hil', 'ho', 'ht', 'hu', 'hy', 'id', 'ig', 'ilo', 'is', 'iso', 'it', 'jap', 'kg', 'kj', 'kqn', 'kwn', 'kwy', 'lg', 'ln', 'loz', 'lu', 'lua', 'lue', 'lun', 'luo', 'lus', 'map', 'mfe', 'mg', 'mh', 'mk', 'mkh', 'ml', 'mos', 'mr', 'mt', 'mul', 'ng', 'nic', 'niu', 'nl', 'nso', 'ny', 'nyk', 'om', 'pag', 'pap', 'phi', 'pis', 'pon', 'poz', 'pqe', 'pqw', 'rn', 'rnd', 'ro', 'roa', 'ru', 'run', 'rw', 'sal', 'sg', 'sit', 'sk', 'sm', 'sn', 'sq', 'ss', 'st', 'sv', 'sw', 'swc', 'tdt', 'ti', 'tiv', 'tl', 'tll', 'tn', 'to', 'toi', 'tpi', 'trk', 'ts', 'tut', 'tvl', 'tw', 'ty', 'uk', 'umb', 'ur', 'vi', 'xh', 'zh']


In [31]:
def choose_model(lang_code, extracted_dataframe_path):
	"""
	Compare a small sample of translations of all OPUS-MT models that are available
	for the language, to decide which one to use. The function prints out a dataframe with all translations of the sample and saves it as ParlaMint-{lang_code}-sample-model-comparison.csv.

	Args:
	- lang_code: the lang code that is used in the names of the files, it should be the same as for extract_text()
	- extracted_dataframe_path: path to the final output of 1-conllu-to-df.py
	"""
	import pandas as pd
	import regex as re
	from easynmt import EasyNMT
	from IPython.display import display
	
	lang_models_dict = {"BG": ["bg", "sla", "zls"], "HR": ["zls", "sla"], "CZ": ["cs", "sla", "zlw" ], "DK": ["da", "gmq", "gem"], "NL": ["nl", "gem", "gmw"], "FR": ["fr", "itc","roa"], "HU": ["hu", "fiu", "urj"], "IS": ["is","gmq", "gem"], "IT": ["it", "roa", "itc"], "LV": ["lv","bat"], "LT": ["bat"], "PL": ["pl", "sla", "zlw"], "SI": ["sla"], "ES": ["es", "roa", "itc"], "TR": ["tr", "trk" ], "AT": ["de", "gem", "gmw"], "ES-PV": ["eu", "mul"], "BA": ["sla", "zls"], "ES-CT": ["ca", "roa", "itc"], "EE": ["et", "urj", "fiu"], "FI": ["fi", "urj", "fiu"], "ES-GA": ["gl", "roa", "itc"], "GR": ["el","grk"], "NO": ["gem", "gmq"], "PT": ["roa", "itc"], "RO":["roa", "itc"], "RS": ["zls", "sla"], "SE": ["sv", "gmq", "gem"], "UA":["uk", "sla", "zle"]}


	# Open the file, created in the previous step
	df = pd.read_csv("{}".format(extracted_dataframe_path), sep="\t", index_col=0)

	# Define the model
	model = EasyNMT('opus-mt')

	print("Entire corpus has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

	# Create a smaller sample - just a couple of sentences from one file
	df = df[df.file == list(df["file"].unique())[0]][:30]

	print("Sample files has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

	# Create a list of sentences from the df
	sentence_list = df.text.to_list()

	# Translate the sample using all available models for this language
	for opus_lang_code in lang_models_dict[lang_code]:
		translation_list = model.translate(sentence_list, source_lang = "{}".format(opus_lang_code), target_lang='en')

		# Add the translations to the df
		df["translation-{}".format(opus_lang_code)] = translation_list
	
	df = df.drop(columns=["file", "sentence_id", "tokenized_text", "proper_nouns", "length"])

	# For Portuguese, let's try another Portuguese model that is on HF, but is not OPUS-MT
	if lang_code == "PT":
		new_translation_list = []

		from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
		ckpt = 'Narrativa/mbart-large-50-finetuned-opus-pt-en-translation'

		tokenizer = MBart50TokenizerFast.from_pretrained(ckpt)
		model = MBartForConditionalGeneration.from_pretrained(ckpt).to("cuda")

		tokenizer.src_lang = 'pt_XX'

		def translate(text):
			inputs = tokenizer(text, return_tensors='pt')
			input_ids = inputs.input_ids.to('cuda')
			attention_mask = inputs.attention_mask.to('cuda')
			output = model.generate(input_ids, attention_mask=attention_mask, forced_bos_token_id=tokenizer.lang_code_to_id['en_XX'])
			return tokenizer.decode(output[0], skip_special_tokens=True)

		for sentence in sentence_list:
			translation = translate(sentence)
			new_translation_list.append(translation)
		
		df["translation-narrativa"] = new_translation_list

	# Save the df
	df.to_csv("/home/tajak/Parlamint-translation/results/{}/ParlaMint-{}-sample-model-comparison.csv".format(lang_code, lang_code))

	print("The file is saved as/home/tajak/Parlamint-translation/results/{}/ParlaMint-{}-sample-model-comparison.csv. ".format(lang_code, lang_code))

	return df


In [32]:
df = choose_model(lang_code, extracted_dataframe_path)

Entire corpus has 458643 sentences and 18336113 words.
Sample files has 30 sentences and 1376 words.


Downloading (…)olve/main/source.spm: 100%|██████████| 800k/800k [00:00<00:00, 1.23MB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 779k/779k [00:00<00:00, 1.21MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.46M/1.46M [00:00<00:00, 1.94MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 265/265 [00:00<00:00, 111kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.36k/1.36k [00:00<00:00, 689kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 312M/312M [00:08<00:00, 35.7MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 178kB/s]
Downloading (…)olve/main/source.spm: 100%|██████████| 792k/792k [00:00<00:00, 1.22MB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 787k/787k [00:00<00:00, 1.22MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.27M/1.27M [00:00<00:00, 1.47MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 17.2kB/s]
Downloading (…)lve/main/config.j

The file is saved as/home/tajak/Parlamint-translation/results/PT/ParlaMint-PT-sample-model-comparison.csv. 


In [5]:
# Then open the sample and manually evaluate which model is better in the column "comparison"
# Open the analysed sample

sample = pd.read_csv("/home/tajak/Parlamint-translation/results/{}/ParlaMint-{}-sample-model-comparison.csv".format(lang_code, lang_code), index_col = 0)
sample.head(2)

Unnamed: 0,file_path,text,translation-bg,translation-sla,translation-zls
0,/home/tajak/Parlamint-translation/Source-data/...,"Уважаеми народни представители, добър ден!","Dear MPs, good afternoon!","Ladies and gentlemen, good afternoon!","Dear People's Representatives, good afternoon!"
1,/home/tajak/Parlamint-translation/Source-data/...,Моля да се регистрираме.,Please register.,Please sign up.,Please sign up.


In [6]:
sample.comparison.value_counts()

zls    10
sla     1
Name: comparison, dtype: int64

We will use the "zls" model.