In [1]:
import gzip
import json
import os
import shutil

import numpy as np
import pandas as pd
import regex as re
import wget

In [2]:
# Download the corpus archive (skipped when it is already on disk)

# URL of the gzipped TMX file
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1525/MaCoCu-mt-en.tmx.gz"

# Local name of the archive: the last path component of the URL, which is the
# file name the unzip cell reads.
corpus_file = os.path.basename(url)

# wget.download() does not overwrite an existing file; it stores a renamed
# copy (e.g. "name (1).gz") instead. Re-running this cell would therefore
# fetch the whole archive again while the rest of the notebook keeps reading
# the old file, so only download when the archive is missing.
if os.path.exists(corpus_file):
    print('Archive already present, skipping download')
else:
    corpus_file = wget.download(url, out=corpus_file)
    print('Downloading Completed')

Downloading Completed


In [3]:
# Decompress the downloaded .gz archive into a plain .tmx file.
# Both handles are managed by one `with`, so they are closed even on error;
# copyfileobj streams the data in chunks instead of loading it all at once.
with gzip.open('MaCoCu-mt-en.tmx.gz', 'rb') as archive, \
        open('MaCoCu-mt-en.tmx', 'wb') as extracted:
    shutil.copyfileobj(archive, extracted)

In [4]:
# Read the whole corpus into memory as one string.
# The TMX header declares o-encoding="utf-8" and the segments contain
# non-ASCII Maltese letters, so decode explicitly as UTF-8 instead of relying
# on the platform default; the `with` block makes sure the handle is closed.
with open('MaCoCu-mt-en.tmx', "r", encoding="utf-8") as corpus_fh:
    corpus = corpus_fh.read()

In [5]:
# Create and view a sample of the corpus (its first 5000 characters).
# Decode explicitly as UTF-8 (the encoding declared in the TMX header) and
# close the handle once the sample has been read.
with open('MaCoCu-mt-en.tmx', "r", encoding="utf-8") as sample_fh:
    corpus_sample = sample_fh.read(5000)
print(corpus_sample)

<?xml version="1.0"?>
<tmx version="1.4">
 <header
   adminlang="C"
   srclang="en"
   o-tmf="PlainText"
   creationtool="bitextor"
   creationtoolversion="8.2"
   datatype="PlainText"
   segtype="sentence"
   creationdate="20220420T120753"
   o-encoding="utf-8">
 </header>
 <body>
   <tu tuid="1" datatype="Text">
    <prop type="score-bicleaner-ai">0.999</prop>
    <prop type="biroamer-entities">No</prop>
    <prop type="translation-direction">en-orig</prop>
    <prop type="type">1:1</prop>
    <prop type="info">different numbers in TUVs</prop>
    <tuv xml:lang="en">
     <prop type="source-document">https://eur-lex.europa.eu/legal-content/en/TXT/?uri=CELEX:62009CJ0325</prop>
     <prop type="checksum-seg">f24019104ded0be7</prop>
     <prop type="paragraph-id">p319s0</prop>
    <prop type="english-variant-document">A</prop>
    <prop type="english-variant-domain">B</prop>
     <seg>18 Income support is a means-tested benefit for various groups of persons.</seg>
    </tuv>
    <tuv xm

In [6]:
# Prepare all the regexes.
# Patterns are raw strings so that no backslash is ever interpreted as a
# (possibly invalid) string escape; "/" needs no escaping in a regex.

# Body of one translation unit: everything between the opening <tu ...> line
# and the closing </tu>. DOTALL lets "." span the newlines inside the unit.
tu_re = re.compile(r'<tu tuid=".*?>\n(.*?)</tu>', re.DOTALL)

# TU-level properties
bi_score_re = re.compile(r'<prop type="score-bicleaner-ai">(.*?)</prop>')
biroamer_re = re.compile(r'<prop type="biroamer-entities">(.*?)</prop>')
translation_dir_re = re.compile(r'<prop type="translation-direction">(.*?)</prop>')

# English TUV: the lazy ".*?" skips from the <tuv> tag to the first matching
# element that follows it.
en_source_re = re.compile(r'<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
en_par_id_re = re.compile(r'<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop>', re.DOTALL)
en_par_re = re.compile(r'<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL)
en_var_doc_re = re.compile(r'<prop type="english-variant-document">(.*?)</prop>')
en_var_dom_re = re.compile(r'<prop type="english-variant-domain">(.*?)</prop>')

# Maltese TUV. Despite the "sl_" prefix these patterns match the
# xml:lang="mt" side; the names are kept because other cells reference them.
sl_source_re = re.compile(r'<tuv xml:lang="mt">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
sl_par_id_re = re.compile(r'<tuv xml:lang="mt">.*?<prop type="paragraph-id">(.*?)</prop>', re.DOTALL)
sl_par_re = re.compile(r'<tuv xml:lang="mt">.*?<seg>(.*?)</seg>', re.DOTALL)


In [7]:
# Collect the body (capture group 1) of every complete <tu> element in the
# sample; a unit cut off at the end of the sample has no </tu> and is skipped.
tus_list_sample = [tu_match.group(1) for tu_match in tu_re.finditer(corpus_sample)]
len(tus_list_sample)

3

In [8]:
# View the second TU (index 1) extracted from the sample corpus
tus_list_sample[1]

'    <prop type="score-bicleaner-ai">1.000</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">mt-orig</prop>\n    <prop type="type">1:1</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://www.europarl.europa.eu/doceo/document/A-8-2015-0209_EN.html</prop>\n     <prop type="checksum-seg">56f5b20ff10cb6e5</prop>\n     <prop type="paragraph-id">p168s0</prop>\n    <prop type="english-variant-document">B</prop>\n    <prop type="english-variant-domain">B</prop>\n     <seg>Stresses the importance of taking into account the conclusions of the numerous experiments being undertaken by the book industry to establish fair, balanced and viable business models;</seg>\n    </tuv>\n    <tuv xml:lang="mt">\n     <prop type="source-document">https://www.europarl.europa.eu/doceo/document/A-8-2015-0209_MT.html</prop>\n     <prop type="checksum-seg">3f1f2fccb1774b2c</prop>\n     <prop type="paragraph-id">p168s0</prop>\n     <seg>Jenfasizza l-i

In [9]:
# Sanity check: run every field pattern over one sample TU and print what it
# captures (one list per pattern, in the order listed below).
regexes = [
    bi_score_re,
    biroamer_re,
    translation_dir_re,
    en_source_re,
    en_par_id_re,
    en_par_re,
    en_var_doc_re,
    en_var_dom_re,
    sl_source_re,
    sl_par_id_re,
    sl_par_re,
]

for rex in regexes:
    print(rex.findall(tus_list_sample[1]))

['1.000']
['No']
['mt-orig']
['https://www.europarl.europa.eu/doceo/document/A-8-2015-0209_EN.html']
['p168s0']
['Stresses the importance of taking into account the conclusions of the numerous experiments being undertaken by the book industry to establish fair, balanced and viable business models;']
['B']
['B']
['https://www.europarl.europa.eu/doceo/document/A-8-2015-0209_MT.html']
['p168s0']
["Jenfasizza l-importanza li jiġu meqjusa l-konklużjonijiet ta' diversi esperimenti li qed jitwettqu mill-industrija tal-kotba biex jiġu stabbiliti mudelli tan-negozju ġusti, bilanċjati u vijabbli;"]


In [10]:
# Collect the body (capture group 1) of every <tu> element in the full corpus
tus_list = [tu_match.group(1) for tu_match in tu_re.finditer(corpus)]
len(tus_list)

1231654

In [11]:
# Inspect the first TU of the full corpus (a one-element slice, so the
# result is displayed as a list)
tus_list[:1]

['    <prop type="score-bicleaner-ai">0.999</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">en-orig</prop>\n    <prop type="type">1:1</prop>\n    <prop type="info">different numbers in TUVs</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://eur-lex.europa.eu/legal-content/en/TXT/?uri=CELEX:62009CJ0325</prop>\n     <prop type="checksum-seg">f24019104ded0be7</prop>\n     <prop type="paragraph-id">p319s0</prop>\n    <prop type="english-variant-document">A</prop>\n    <prop type="english-variant-domain">B</prop>\n     <seg>18 Income support is a means-tested benefit for various groups of persons.</seg>\n    </tuv>\n    <tuv xml:lang="mt">\n     <prop type="source-document">https://eur-lex.europa.eu/legal-content/MT/TXT/?uri=CELEX:62009CJ0162</prop>\n     <prop type="checksum-seg">871be9dd9a460ba4</prop>\n     <prop type="paragraph-id">p285s0</prop>\n     <seg>L-appoġġ tad-dħul huwa benefiċċju mogħti skont ir-riżorsi lil g

In [12]:
# Create a list of dictionaries from the tus_list based on regexes

# Output key -> compiled pattern that captures its value inside one TU.
# The insertion order here is the key order of every resulting record.
tu_field_regexes = {
    "score_bicleaner_ai": bi_score_re,
    "biroamer_entities": biroamer_re,
    "translation_direction": translation_dir_re,
    "en_source": en_source_re,
    "en_par_id": en_par_id_re,
    "en_par": en_par_re,
    "en_var_doc": en_var_doc_re,
    "en_var_dom": en_var_dom_re,
    "mt_source": sl_source_re,
    "mt_par_id": sl_par_id_re,
    "mt_par": sl_par_re,
}


def parse_tu(tu_text):
    """Extract the metadata and both segments of one translation unit.

    Parameters
    ----------
    tu_text : str
        Body of a single <tu> element, as captured by ``tu_re``.

    Returns
    -------
    dict
        One entry per key of ``tu_field_regexes``. All values are strings
        except ``score_bicleaner_ai``, which is converted to ``float``.

    Raises
    ------
    ValueError
        If a field cannot be found in ``tu_text`` (the message names the field
        and shows the start of the unit), or if the score is not a number.
    """
    record = {}
    for field, pattern in tu_field_regexes.items():
        found = pattern.search(tu_text)
        if found is None:
            # Without this check the failure would be an AttributeError on
            # None that says neither which field nor which unit is broken.
            raise ValueError(
                f"Field '{field}' not found in translation unit: {tu_text[:200]!r}"
            )
        record[field] = found.group(1)
    # Re-assigning an existing key keeps its position in the dictionary
    record["score_bicleaner_ai"] = float(record["score_bicleaner_ai"])
    return record


tus_content = [parse_tu(tu_text) for tu_text in tus_list]

print(len(tus_content))

# Print some instances of the tus_content
tus_content[:2]

1231654


[{'score_bicleaner_ai': 0.999,
  'biroamer_entities': 'No',
  'translation_direction': 'en-orig',
  'en_source': 'https://eur-lex.europa.eu/legal-content/en/TXT/?uri=CELEX:62009CJ0325',
  'en_par_id': 'p319s0',
  'en_par': '18 Income support is a means-tested benefit for various groups of persons.',
  'en_var_doc': 'A',
  'en_var_dom': 'B',
  'mt_source': 'https://eur-lex.europa.eu/legal-content/MT/TXT/?uri=CELEX:62009CJ0162',
  'mt_par_id': 'p285s0',
  'mt_par': "L-appoġġ tad-dħul huwa benefiċċju mogħti skont ir-riżorsi lil gruppi differenti ta' persuni."},
 {'score_bicleaner_ai': 1.0,
  'biroamer_entities': 'No',
  'translation_direction': 'mt-orig',
  'en_source': 'https://www.europarl.europa.eu/doceo/document/A-8-2015-0209_EN.html',
  'en_par_id': 'p168s0',
  'en_par': 'Stresses the importance of taking into account the conclusions of the numerous experiments being undertaken by the book industry to establish fair, balanced and viable business models;',
  'en_var_doc': 'B',
  'en_v

In [13]:
# Save the parsed TUs as a JSON file

# indent="" makes json.dump put every array element and object member on its
# own line without adding any leading whitespace.
with open("Macocu-mt-en.json", "w") as json_out:
    json.dump(tus_content, json_out, indent="")