In [1]:
# Set the CUDA environment variables in the first cell, before any library is imported.
# Presumably this pins the kernel to the single GPU with PCI-bus index 3 - confirm on the target machine.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=3


In [2]:
from conllu import parse
import pandas as pd
import os

# ParlaMint corpus code; it is interpolated into every input and output file name below
lang_code = "AT"

# Root folder of the translation project
main_path = "/home/tajak/Parlamint-translation"

# Location of the CoNLL-U source files for this corpus (verify that it exists before running)
path = f"{main_path}/Source-data/ParlaMint-{lang_code}.conllu/ParlaMint-{lang_code}.conllu"

# File with the extracted source sentences that the model comparison reads
extracted_dataframe_path = f"{main_path}/results/{lang_code}/ParlaMint-{lang_code}-extracted-source-data.csv"

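We need to translate the following corpora into English. For each corpus, the list in brackets gives the OPUS-MT source-language codes (i.e. the candidate models) that are available and will be compared: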
- Belgian (BE) - in Dutch and French, separate CONLL-us (!!). French: ["fr", "itc","roa"], Dutch: ["nl", "gem", "gmw"]
- Bulgarian (BG): ["bg", "sla", "zls"]
- Croatian (HR) -> "zls"
- Czech (CZ) -> "cs"
- Danish (DK): ["da", "gmq", "gem"]
- Dutch (NL): ["nl", "gem", "gmw"]
- French (FR): ["fr", "itc","roa"]
- Hungarian (HU): ["hu", "fiu", "urj"]
- Icelandic (IS): ["is","gmq", "gem"]
- Italian (IT): ["it", "roa", "itc"]
- Latvian (LV): ["lv", "bat"]
- Lithuanian (LT): only "bat"
- Polish (PL): ["pl", "sla", "zlw"]
- Slovenian (SI) - We will use "Slavic MT" based on the results of the manual analysis --> "sla"
- Spanish? (ES): ["es", "roa", "itc"]
- Turkish (TR): ["tr", "trk" ]
- Austrian (AT): ["de", "gem", "gmw"]
- Basque (ES-PV): ["eu", "mul"]
- Bosnian (BA): ["sla", "zls"]
- Catalan (ES-CT): ["ca", "roa", "itc"]
- Estonian (EE): ["et", "urj", "fiu"]
- Finnish (FI): ["fi", "urj", "fiu"]
- Galician (ES-GA): ["gl", "roa", "itc"]
- Greek (GR):  ["el", "grk"]
- Norwegian (NO): ["gem", "gmq"]
- Portuguese (PT): ["pt", "roa", "itc"]
- Romanian (RO): ["roa", "itc"]
- Serbian (RS): ["sla", "zls"]
- Swedish (SE): ["sv", "gmq", "gem"]
- Ukrainian (UA): ["uk", "sla", "zle"]

Languages with only one option: Lithuanian ("LT": "bat")

Explanation of language codes:
- sla = Slavic
- zls = South Slavic
- zlw = West Slavic
- zle = East Slavic
- gmq = North Germanic
- gem = Germanic
- gmw = West Germanic
- roa = Romance
- itc = Italic
- bat = Baltic
- trk = Turkic
- urj = Uralic
- fiu = Finno-Ugrian
- grk = Greek (Hellenic)
- mul = multilingual

In [3]:
def choose_model(lang_code, extracted_dataframe_path, results_dir=None, sample_size=30):
	"""
	Compare a small sample of translations of all OPUS-MT models that are available
	for the language, to decide which one to use. The function translates the first
	sentences of the first file in the corpus with every candidate model, saves the
	result as ParlaMint-{lang_code}-sample-model-comparison.csv and returns it.

	Args:
	- lang_code: the lang code that is used in the names of the files, it should be the same as for extract_text()
	- extracted_dataframe_path: path to the final output of 1-conllu-to-df.py (tab-separated, first column is the index)
	- results_dir: folder into which the comparison CSV is written; defaults to
	  /home/tajak/Parlamint-translation/results/{lang_code}
	- sample_size: maximum number of sentences to translate (default 30)

	Returns:
	- dataframe with the source "text" column and one "translation-{code}" column per candidate model

	Raises:
	- KeyError: if lang_code has no entry in the table of candidate models
	"""
	import os
	import pandas as pd
	from easynmt import EasyNMT

	# Candidate OPUS-MT source-language codes for each ParlaMint corpus
	lang_models_dict = {
		"BG": ["bg", "sla", "zls"],
		"HR": ["zls", "sla"],
		"CZ": ["cs", "sla", "zlw"],
		"DK": ["da", "gmq", "gem"],
		"NL": ["nl", "gem", "gmw"],
		"FR": ["fr", "itc", "roa"],
		"HU": ["hu", "fiu", "urj"],
		"IS": ["is", "gmq", "gem"],
		"IT": ["it", "roa", "itc"],
		"LV": ["lv", "bat"],
		"LT": ["bat"],
		"PL": ["pl", "sla", "zlw"],
		"SI": ["sla"],
		"ES": ["es", "roa", "itc"],
		"TR": ["tr", "trk"],
		"AT": ["de", "gem", "gmw"],
		"ES-PV": ["eu", "mul"],
		"BA": ["sla", "zls"],
		"ES-CT": ["ca", "roa", "itc"],
		"EE": ["et", "urj", "fiu"],
		"FI": ["fi", "urj", "fiu"],
		"ES-GA": ["gl", "roa", "itc"],
		"GR": ["el", "grk"],
		"NO": ["gem", "gmq"],
		"PT": ["pt", "roa", "itc"],
		"RO": ["roa", "itc"],
		"RS": ["zls", "sla"],
		"SE": ["sv", "gmq", "gem"],
		"UA": ["uk", "sla", "zle"],
	}

	# Fail fast, before the corpus and the model are loaded, if the code is unknown
	if lang_code not in lang_models_dict:
		raise KeyError("Unknown lang_code '{}'. Supported codes: {}".format(lang_code, ", ".join(sorted(lang_models_dict))))

	if results_dir is None:
		results_dir = "/home/tajak/Parlamint-translation/results/{}".format(lang_code)
	output_path = os.path.join(results_dir, "ParlaMint-{}-sample-model-comparison.csv".format(lang_code))

	# Open the file, created in the previous step
	df = pd.read_csv(extracted_dataframe_path, sep="\t", index_col=0)

	# Define the model
	model = EasyNMT('opus-mt')

	print("Entire corpus has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

	# Create a smaller sample - just a couple of sentences from the first file.
	# .copy() makes the sample independent of the full corpus, so that adding the
	# translation columns below does not write into a view (SettingWithCopyWarning).
	first_file = df["file"].iloc[0]
	sample = df.loc[df["file"] == first_file].head(sample_size).copy()

	print("Sample files has {} sentences and {} words.".format(sample["text"].count(), sample["length"].sum()))

	# Create a list of sentences from the sample
	sentence_list = sample["text"].to_list()

	# Translate the sample using all available models for this language
	for opus_lang_code in lang_models_dict[lang_code]:
		translation_list = model.translate(sentence_list, source_lang=opus_lang_code, target_lang='en')

		# Add the translations to the sample
		sample["translation-{}".format(opus_lang_code)] = translation_list

	# Keep only the source text and the translations for the manual comparison
	sample = sample.drop(columns=["file", "sentence_id", "tokenized_text", "proper_nouns", "length"])

	# Save the sample (comma-separated, with the index as the first column)
	os.makedirs(results_dir, exist_ok=True)
	sample.to_csv(output_path)

	print("The file is saved as {}.".format(output_path))

	return sample


In [4]:
# Translate the sample with every candidate model for this corpus and keep the comparison table
df = choose_model(lang_code=lang_code, extracted_dataframe_path=extracted_dataframe_path)

  from .autonotebook import tqdm as notebook_tqdm
2023-02-06 08:42:18.226823: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-06 08:42:19.025168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-06 08:42:19.025237: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


Entire corpus has 3919672 sentences and 59959897 words.
Sample files has 30 sentences and 349 words.


Downloading (…)olve/main/source.spm: 100%|██████████| 797k/797k [00:00<00:00, 879kB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 768k/768k [00:00<00:00, 1.15MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.27M/1.27M [00:01<00:00, 901kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 19.3kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.38k/1.38k [00:00<00:00, 534kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 298M/298M [00:42<00:00, 6.96MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 80.5kB/s]
Downloading (…)olve/main/source.spm: 100%|██████████| 790k/790k [00:00<00:00, 1.42MB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 784k/784k [00:00<00:00, 1.16MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.28M/1.28M [00:01<00:00, 1.15MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 21.1kB/s]
Downloading (…)lve/main/config

The file is saved as/home/tajak/Parlamint-translation/results/AT/ParlaMint-AT-sample-model-comparison.csv. 


In [5]:
# Manual step: open the saved sample outside the notebook and note, for each row,
# which model translated it better in a column named "comparison".
# Afterwards, load the analysed sample back in.

sample = pd.read_csv(
	"/home/tajak/Parlamint-translation/results/{}/ParlaMint-{}-sample-model-comparison.csv".format(lang_code, lang_code),
	index_col=0,
)
sample.head(2)

Unnamed: 0,file_path,text,translation-de,translation-gem,translation-gmw
0,/home/tajak/Parlamint-translation/Source-data/...,"Guten Tag, meine Damen und Herren!","Hello, ladies and gentlemen!","Good afternoon, ladies and gentlemen!","Good day, ladies and gentlemen!"
1,/home/tajak/Parlamint-translation/Source-data/...,Ich eröffne die 190. Sitzung des Nationalrates...,I'll open the 190. Meeting of the National Cou...,I'll open the 190. Meeting of the National Cou...,I'll open the 190. Meeting of the National Cou...


In [6]:
# How often each model was judged the better one in the manual evaluation
sample["comparison"].value_counts()

zls    10
sla     1
Name: comparison, dtype: int64

We will use the "zls" model.