# In [None]:
def download_unzip(url, folder_name):
	"""
	Download a gzipped file from a URL (e.g. from the CLARIN.SI repository) and
	decompress it into the current working directory.

	It is intended for the MaCoCu data, where the archive is named after the
	corpus file with a ".gz" suffix appended.

	Args:
	- url (string): the URL from which the file can be downloaded
	- folder_name (string): name of the decompressed output file, i.e. the
	  archive name without ".gz"

	Returns:
	- None. The decompressed content is written to a file named `folder_name`.
	"""

	# Only the modules this function actually uses are imported here.
	import gzip
	import shutil
	import wget

	# Download the archive. wget.download returns the path the file was saved
	# to; that path is used below instead of guessing it from folder_name, so
	# the archive that was just downloaded is always the one decompressed.
	corpus_file = wget.download(url)
	print('Downloading Completed')

	# Stream-decompress the archive into the output file, so the whole corpus
	# never has to be held in memory at once.
	with gzip.open(corpus_file, 'rb') as f_in:
		with open(f'{folder_name}', 'wb') as f_out:
			shutil.copyfileobj(f_in, f_out)

def _first_group(pattern, text):
	"""Return capture group 1 of the first match of `pattern` in `text`, or None if there is no match."""
	match = pattern.search(text)
	return match.group(1) if match else None


def tmx_to_json(file_name, lang_code):
	"""
	Takes the TMX file of the MaCoCu corpora and transforms it into a JSON.
	It saves the JSON file as "MaCoCu-{lang_code}-en.json" in the current
	working directory.

	Every translation unit (<tu>) becomes one dictionary with the keys
	"score_bicleaner_ai" (float), "biroamer_entities", "translation_direction",
	"en_source", "en_par_id", "en_par", "en_var_doc", "en_var_dom",
	"{lang_code}_source", "{lang_code}_par_id" and "{lang_code}_par".
	A field that cannot be found in a unit is stored as None (JSON null).

	Args:
	- file_name (string): name of the TMX file
	- lang_code (string): the language code for the language, used along English - it is the same as in the name of the corpus (e.g. "mk" for MaCoCu-mk-en)

	Returns:
	- None. The result is written to disk and progress is printed.
	"""
	# Imported locally so the function does not rely on imports made elsewhere.
	import json
	import re

	# The context manager closes the file; the file is read as UTF-8 (assumed
	# encoding of the TMX file) instead of the platform default.
	with open(file_name, "r", encoding="utf-8") as tmx_file:
		corpus = tmx_file.read()

	# Prepare all the regexes
	# Escape the language code so it is always matched literally.
	lang = re.escape(lang_code)

	# Matches the content of every tu
	tu_re = re.compile(r'<tu tuid=".*?>\n(.*?)</tu>', re.DOTALL)

	# Relevant information inside a tu, keyed by the name used in the JSON output.
	# The insertion order defines the key order of every output dictionary.
	field_patterns = {
		"score_bicleaner_ai": re.compile(r'<prop type="score-bicleaner-ai">(.*?)</prop>'),
		"biroamer_entities": re.compile(r'<prop type="biroamer-entities">(.*?)</prop>'),
		"translation_direction": re.compile(r'<prop type="translation-direction">(.*?)</prop>'),
		"en_source": re.compile(r'<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL),
		"en_par_id": re.compile(r'<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL),
		"en_par": re.compile(r'<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL),
		"en_var_doc": re.compile(r'<prop type="english-variant-document">(.*?)</prop>'),
		"en_var_dom": re.compile(r'<prop type="english-variant-domain">(.*?)</prop>'),
		f"{lang_code}_source": re.compile(rf'<tuv xml:lang="{lang}">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL),
		f"{lang_code}_par_id": re.compile(rf'<tuv xml:lang="{lang}">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL),
		f"{lang_code}_par": re.compile(rf'<tuv xml:lang="{lang}">.*?<seg>(.*?)</seg>', re.DOTALL),
	}

	# Create a list of all tus from the corpus
	tus_list = tu_re.findall(corpus)

	# Show one tu and what each regex extracts from it, as a sanity check.
	# The second tu is used when available, otherwise the first one; with an
	# empty corpus the check is skipped.
	if tus_list:
		sample_tu = tus_list[1] if len(tus_list) > 1 else tus_list[0]

		print("A sample of the tus in the corpora:\n")
		print(sample_tu)

		print("A check if regexes work:")
		for rex in field_patterns.values():
			print(rex.findall(sample_tu))

	print("All tus from the corpora were extracted. The number of sentence pairs (tus) is:\n")
	print(len(tus_list))

	# Create a list of dictionaries from the tus_list based on regexes
	tus_content = []

	for tu in tus_list:
		# Find all relevant information based on regexes
		current_tu = {key: _first_group(rex, tu) for key, rex in field_patterns.items()}

		# The score is stored as a number; a missing score stays None
		score = current_tu["score_bicleaner_ai"]
		current_tu["score_bicleaner_ai"] = float(score) if score is not None else None

		# Append the dictionary to the list
		tus_content.append(current_tu)

	print("The JSON format created. A sample of an instance: \n")
	# Print some instances of the tus_content
	print(tus_content[:2])

	# Save json
	with open(f"MaCoCu-{lang_code}-en.json", "w", encoding="utf-8") as file:
		json.dump(tus_content, file, indent="")

	print("The JSON file is saved.")