In [1]:
import gzip
import json
import os
import shutil

import numpy as np
import pandas as pd
import regex as re
import wget

In [2]:
# Download the file

# Defining the zip file URL and the local file name the archive is stored under.
# The unzip cell below opens this exact name, so it is pinned here explicitly.
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1524/MaCoCu-is-en.tmx.gz"
archive_name = "MaCoCu-is-en.tmx.gz"

# Downloading the file by sending the request to the URL.
# Skip the request when the archive is already on disk so that re-running the
# notebook does not fetch the whole corpus again.
if os.path.exists(archive_name):
    corpus_file = archive_name
else:
    corpus_file = wget.download(url, out=archive_name)
print('Downloading Completed')

Downloading Completed


In [3]:
# Decompress the downloaded .gz archive into a plain .tmx file next to it
with gzip.open('MaCoCu-is-en.tmx.gz', 'rb') as archive, open('MaCoCu-is-en.tmx', 'wb') as extracted:
    # Stream the bytes across instead of loading the whole archive into memory
    shutil.copyfileobj(archive, extracted)

In [4]:
# Read the whole TMX file into memory as text.
# The TMX header declares utf-8, so decode explicitly instead of relying on the
# platform default, and use a context manager so the handle is closed.
with open('MaCoCu-is-en.tmx', "r", encoding="utf-8") as tmx_file:
    corpus = tmx_file.read()

In [5]:
# Create and view a sample of the corpus (the first 5000 characters).
# Decode explicitly as utf-8 (as declared in the TMX header) and close the
# handle via the context manager once the sample has been read.
with open('MaCoCu-is-en.tmx', "r", encoding="utf-8") as tmx_file:
    corpus_sample = tmx_file.read(5000)
print(corpus_sample)

<?xml version="1.0"?>
<tmx version="1.4">
 <header
   adminlang="C"
   srclang="en"
   o-tmf="PlainText"
   creationtool="bitextor"
   creationtoolversion="8.2"
   datatype="PlainText"
   segtype="sentence"
   creationdate="20220420T120753"
   o-encoding="utf-8">
 </header>
 <body>
   <tu tuid="1" datatype="Text">
    <prop type="score-bicleaner-ai">0.962</prop>
    <prop type="biroamer-entities">Yes</prop>
    <prop type="translation-direction">en-orig</prop>
    <prop type="type">1:1</prop>
    <tuv xml:lang="en">
     <prop type="source-document">https://en.alda.is/about-alda/</prop>
     <prop type="checksum-seg">a89f39aaf4b2adf4</prop>
     <prop type="paragraph-id">p15s0</prop>
    <prop type="english-variant-document">B</prop>
    <prop type="english-variant-domain">A</prop>
     <seg>Alda is not a political party and never will be.</seg>
    </tuv>
    <tuv xml:lang="is">
     <prop type="source-document">https://alda.is/um-oldu/</prop>
     <prop type="checksum-seg">214c1a7cb0

In [6]:
# Prepare all the regexes.
# All patterns are raw strings so that backslashes reach the regex engine
# untouched (the previous '\/' was an invalid Python string escape).

# Compile all tus: capture everything between a <tu ...> opening line and </tu>
tu_re = re.compile(r'<tu tuid=".*?>\n(.*?)</tu>', re.DOTALL)

# Compile relevant information inside tus
# -- properties that belong to the translation unit as a whole
bi_score_re = re.compile(r'<prop type="score-bicleaner-ai">(.*?)</prop>')
biroamer_re = re.compile(r'<prop type="biroamer-entities">(.*?)</prop>')
translation_dir_re = re.compile(r'<prop type="translation-direction">(.*?)</prop>')

# -- English side: DOTALL lets '.*?' skip over the lines between the <tuv> tag
#    and the first matching property/segment that follows it
en_source_re = re.compile(r'<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
en_par_id_re = re.compile(r'<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop>', re.DOTALL)
en_par_re = re.compile(r'<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL)
en_var_doc_re = re.compile(r'<prop type="english-variant-document">(.*?)</prop>')
en_var_dom_re = re.compile(r'<prop type="english-variant-domain">(.*?)</prop>')

# -- Icelandic side (xml:lang="is"); the sl_ prefix of the names is kept because
#    other cells refer to these variables
sl_source_re = re.compile(r'<tuv xml:lang="is">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
sl_par_id_re = re.compile(r'<tuv xml:lang="is">.*?<prop type="paragraph-id">(.*?)</prop>', re.DOTALL)
sl_par_re = re.compile(r'<tuv xml:lang="is">.*?<seg>(.*?)</seg>', re.DOTALL)



In [7]:
# Collect the body (first capture group) of every complete <tu> found in the sample
tus_list_sample = [tu_match.group(1) for tu_match in tu_re.finditer(corpus_sample)]
len(tus_list_sample)

4

In [8]:
# View one entry of tus_list_sample: index 1 is the second translation unit
# extracted from the sample, shown as the raw text captured by tu_re
tus_list_sample[1]

'    <prop type="score-bicleaner-ai">0.958</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">is-orig</prop>\n    <prop type="type">1:1</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://www.norden.org/en/nominee/sara-lundberg</prop>\n     <prop type="checksum-seg">82ffb63f9904d0a5</prop>\n     <prop type="paragraph-id">p163s8</prop>\n    <prop type="english-variant-document">B</prop>\n    <prop type="english-variant-domain">B</prop>\n     <seg>And I’m scared that I’m holding too tightly, So I open my hands... and it makes a run for it and flies!</seg>\n    </tuv>\n    <tuv xml:lang="is">\n     <prop type="source-document">https://www.norden.org/is/nominee/sara-lundberg</prop>\n     <prop type="checksum-seg">9499845d74253251+8ca0aeac53794842</prop>\n     <prop type="paragraph-id">p163s8+p163s9</prop>\n     <seg>Og ég verð hrædd um að ég haldi of fast, svo opna ég lófana... Þá hoppar hann og flýgur!</seg>\n    </tuv>\n   

In [9]:
# Sanity check: run every field pattern against one sample translation unit
regexes = [
    bi_score_re,
    biroamer_re,
    translation_dir_re,
    en_source_re,
    en_par_id_re,
    en_par_re,
    en_var_doc_re,
    en_var_dom_re,
    sl_source_re,
    sl_par_id_re,
    sl_par_re,
]

sample_tu = tus_list_sample[1]
for pattern in regexes:
    # Each pattern is expected to print a one-element list holding its field value
    print(pattern.findall(sample_tu))

['0.958']
['No']
['is-orig']
['https://www.norden.org/en/nominee/sara-lundberg']
['p163s8']
['And I’m scared that I’m holding too tightly, So I open my hands... and it makes a run for it and flies!']
['B']
['B']
['https://www.norden.org/is/nominee/sara-lundberg']
['p163s8+p163s9']
['Og ég verð hrædd um að ég haldi of fast, svo opna ég lófana... Þá hoppar hann og flýgur!']


In [10]:
# Collect the body (first capture group) of every <tu> in the full corpus
tus_list = [tu_match.group(1) for tu_match in tu_re.finditer(corpus)]
len(tus_list)

355100

In [11]:
# Inspect an instance: a one-element slice, so the first translation unit is
# displayed as a list containing its raw text
tus_list[:1]

['    <prop type="score-bicleaner-ai">0.962</prop>\n    <prop type="biroamer-entities">Yes</prop>\n    <prop type="translation-direction">en-orig</prop>\n    <prop type="type">1:1</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://en.alda.is/about-alda/</prop>\n     <prop type="checksum-seg">a89f39aaf4b2adf4</prop>\n     <prop type="paragraph-id">p15s0</prop>\n    <prop type="english-variant-document">B</prop>\n    <prop type="english-variant-domain">A</prop>\n     <seg>Alda is not a political party and never will be.</seg>\n    </tuv>\n    <tuv xml:lang="is">\n     <prop type="source-document">https://alda.is/um-oldu/</prop>\n     <prop type="checksum-seg">214c1a7cb018fb57</prop>\n     <prop type="paragraph-id">p20s0</prop>\n     <seg>Alda er ekki stjórnmálaflokkur og mun ekki verða.</seg>\n    </tuv>\n   ']

In [12]:
# Create a list of dictionaries from the tus_list based on regexes

# Output key -> compiled pattern whose first capture group holds the value.
# The insertion order here fixes the key order of every record.
# The sl_* patterns match the xml:lang="is" side, hence the is_* keys.
TU_FIELD_PATTERNS = {
    "score_bicleaner_ai": bi_score_re,
    "biroamer_entities": biroamer_re,
    "translation_direction": translation_dir_re,
    "en_source": en_source_re,
    "en_par_id": en_par_id_re,
    "en_par": en_par_re,
    "en_var_doc": en_var_doc_re,
    "en_var_dom": en_var_dom_re,
    "is_source": sl_source_re,
    "is_par_id": sl_par_id_re,
    "is_par": sl_par_re,
}


def parse_tu(tu_text, tu_index=None):
    """Extract the metadata and both segments of one translation unit.

    Args:
        tu_text: Inner text of a single <tu> element, as captured by tu_re.
        tu_index: Optional position of the unit in its source list; used only
            to make error messages traceable.

    Returns:
        A dict with the keys of TU_FIELD_PATTERNS (in that order). All values
        are strings except "score_bicleaner_ai", which is converted to float.

    Raises:
        ValueError: If a field pattern does not match tu_text, or if the
            bicleaner score cannot be converted to float.
    """
    record = {}
    for key, pattern in TU_FIELD_PATTERNS.items():
        match = pattern.search(tu_text)
        if match is None:
            # Name the unit and the field instead of failing with an opaque
            # "'NoneType' object has no attribute 'group'"
            raise ValueError(
                f"translation unit {tu_index!r}: no match for field {key!r}"
            )
        # NOTE(review): values are copied verbatim from the TMX text; if the
        # corpus contains XML entities (e.g. &amp;) they are not unescaped
        # here -- confirm whether downstream users expect that.
        record[key] = match.group(1)
    # Re-assigning an existing key keeps its position, so the score stays first
    record["score_bicleaner_ai"] = float(record["score_bicleaner_ai"])
    return record


tus_content = [parse_tu(tu_text, tu_index) for tu_index, tu_text in enumerate(tus_list)]

print(len(tus_content))

# Print some instances of the tus_content
tus_content[:2]

355100


[{'score_bicleaner_ai': 0.962,
  'biroamer_entities': 'Yes',
  'translation_direction': 'en-orig',
  'en_source': 'https://en.alda.is/about-alda/',
  'en_par_id': 'p15s0',
  'en_par': 'Alda is not a political party and never will be.',
  'en_var_doc': 'B',
  'en_var_dom': 'A',
  'is_source': 'https://alda.is/um-oldu/',
  'is_par_id': 'p20s0',
  'is_par': 'Alda er ekki stjórnmálaflokkur og mun ekki verða.'},
 {'score_bicleaner_ai': 0.958,
  'biroamer_entities': 'No',
  'translation_direction': 'is-orig',
  'en_source': 'https://www.norden.org/en/nominee/sara-lundberg',
  'en_par_id': 'p163s8',
  'en_par': 'And I’m scared that I’m holding too tightly, So I open my hands... and it makes a run for it and flies!',
  'en_var_doc': 'B',
  'en_var_dom': 'B',
  'is_source': 'https://www.norden.org/is/nominee/sara-lundberg',
  'is_par_id': 'p163s8+p163s9',
  'is_par': 'Og ég verð hrædd um að ég haldi of fast, svo opna ég lófana... Þá hoppar hann og flýgur!'}]

In [13]:
# Write the parsed translation units to disk as JSON.
# indent="" puts every item on its own line with zero-width indentation.
with open("Macocu-is-en.json", "w") as json_out:
    json.dump(tus_content, json_out, indent="")