In [1]:
import gzip
import shutil
import wget
import regex as re
import pandas as pd
import numpy as np
import json

In [2]:
# Download the file

#Defining the zip file URL - the TMX
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1513/MaCoCu-en-mk.tmx.gz"

# Downloading the file by sending the request to the URL
corpus_file = wget.download(url)
print('Downloading Completed')

Downloading Completed


In [3]:
# Unzip the file
with gzip.open('MaCoCu-en-mk.tmx.gz', 'rb') as f_in:
    with open('MaCoCu-en-mk.tmx', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [4]:
corpus = open('MaCoCu-en-mk.tmx', "r").read()

In [5]:
# Create and view a sample of the corpus
corpus_sample = open('MaCoCu-en-mk.tmx', "r").read(5000)
print(corpus_sample)

<?xml version="1.0"?>
<tmx version="1.4">
 <header
   adminlang="C"
   srclang="en"
   o-tmf="PlainText"
   creationtool="bitextor"
   creationtoolversion="8.2"
   datatype="PlainText"
   segtype="sentence"
   creationdate="20220420T120753"
   o-encoding="utf-8">
 </header>
 <body>
   <tu tuid="1" datatype="Text">
    <prop type="score-bicleaner-ai">0.785</prop>
    <prop type="biroamer-entities">No</prop>
    <prop type="translation-direction">en-orig</prop>
    <prop type="type">1:1</prop>
    <tuv xml:lang="en">
     <prop type="source-document">https://www.sobranie.mk/2020-2024-delegations-ns_article-delegation-to-the-parliamentary-assembly-of-the-organization-for-security-and-cooperation-in-europe.nspx</prop>
     <prop type="checksum-seg">a24a091bca5c2c8e</prop>
     <prop type="paragraph-id">p4s0</prop>
    <prop type="english-variant-document">UNK</prop>
    <prop type="english-variant-domain">B</prop>
     <seg>DELEGATION TO THE PARLIAMENTARY ASSEMBLY OF THE ORGANIZATION FOR S

In [6]:
# Prepare all the regexes
# Compile all tus
tu_re = re.compile('<tu tuid=".*?>\n(.*?)<\/tu>', re.DOTALL)
# Compile relevant information inside tus
bi_score_re = re.compile('<prop type="score-bicleaner-ai">(.*?)</prop>')
biroamer_re = re.compile('<prop type="biroamer-entities">(.*?)</prop>')
translation_dir_re = re.compile('<prop type="translation-direction">(.*?)</prop>')
en_source_re = re.compile('<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
en_par_id_re = re.compile('<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL)
en_par_re = re.compile('<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL)
en_var_doc_re = re.compile('<prop type="english-variant-document">(.*?)</prop>')
en_var_dom_re = re.compile('<prop type="english-variant-domain">(.*?)</prop>')
sl_source_re = re.compile('<tuv xml:lang="mk">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
sl_par_id_re = re.compile('<tuv xml:lang="mk">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL)
sl_par_re = re.compile('<tuv xml:lang="mk">.*?<seg>(.*?)</seg>', re.DOTALL)


In [7]:
# Create a list of all tus from the sample corpus
tus_list_sample = tu_re.findall(corpus_sample)
len(tus_list_sample)

4

In [8]:
# View the tus_list
tus_list_sample[1]

'    <prop type="score-bicleaner-ai">0.947</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">mk-orig</prop>\n    <prop type="type">1:1</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">http://fbe.edu.mk/index.php?option=com_content&amp;view=article&amp;id=144&amp;Itemid=166&amp;lang=en</prop>\n     <prop type="checksum-seg">1ede8430c336b603</prop>\n     <prop type="paragraph-id">p66s3</prop>\n    <prop type="english-variant-document">UNK</prop>\n    <prop type="english-variant-domain">A</prop>\n     <seg>Depending on the demonstrated quality (depth and scope of research and knowledge), the best students will receive the highest grade.</seg>\n    </tuv>\n    <tuv xml:lang="mk">\n     <prop type="source-document">http://www.fbe.edu.mk/index.php?option=com_content&amp;view=article&amp;id=144&amp;Itemid=166&amp;lang=mk</prop>\n     <prop type="checksum-seg">8dbe7ef10015787a</prop>\n     <prop type="paragraph-id">p65s3</prop>\n    

In [9]:
# Check if regexes work
regexes =  [bi_score_re, biroamer_re, translation_dir_re, en_source_re, en_par_id_re, en_par_re, en_var_doc_re, en_var_dom_re, sl_source_re, sl_par_id_re, sl_par_re]

for rex in regexes:
    test_list = rex.findall(tus_list_sample[1])
    print(test_list)

['0.947']
['No']
['mk-orig']
['http://fbe.edu.mk/index.php?option=com_content&amp;view=article&amp;id=144&amp;Itemid=166&amp;lang=en']
['p66s3']
['Depending on the demonstrated quality (depth and scope of research and knowledge), the best students will receive the highest grade.']
['UNK']
['A']
['http://www.fbe.edu.mk/index.php?option=com_content&amp;view=article&amp;id=144&amp;Itemid=166&amp;lang=mk']
['p65s3']
['Зависно од покажаниот квалитет (длабочина и опфат на истражувањата и сознанијата), најдобрите студенти ќе добијат и највисоки оценки.']


In [10]:
# Create a list of all tus from the corpus
tus_list = tu_re.findall(corpus)
len(tus_list)

478059

In [11]:
# Inspect an instance
tus_list[:1]

['    <prop type="score-bicleaner-ai">0.785</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">en-orig</prop>\n    <prop type="type">1:1</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://www.sobranie.mk/2020-2024-delegations-ns_article-delegation-to-the-parliamentary-assembly-of-the-organization-for-security-and-cooperation-in-europe.nspx</prop>\n     <prop type="checksum-seg">a24a091bca5c2c8e</prop>\n     <prop type="paragraph-id">p4s0</prop>\n    <prop type="english-variant-document">UNK</prop>\n    <prop type="english-variant-domain">B</prop>\n     <seg>DELEGATION TO THE PARLIAMENTARY ASSEMBLY OF THE ORGANIZATION FOR SECURITY AND COOPERATION IN EUROPE (OSCE PA)</seg>\n    </tuv>\n    <tuv xml:lang="mk">\n     <prop type="source-document">https://tvm.mk/vesti/makedonija?start=2235</prop>\n     <prop type="checksum-seg">19c0c13c2e8d42b5</prop>\n     <prop type="paragraph-id">p14s0</prop>\n     <seg>"Вредностите на поли

In [12]:
# Create a list of dictionaries from the tus_list based on regexes
tus_content = []

for i in tus_list:
	# Find all relevant information based on regexes
	bi_score = bi_score_re.search(i).group(1)
	biroamer = biroamer_re.search(i).group(1)
	translation_dir = translation_dir_re.search(i).group(1)
	en_source = en_source_re.search(i).group(1)
	en_par_id = en_par_id_re.search(i).group(1)
	en_par = en_par_re.search(i).group(1)
	en_var_doc = en_var_doc_re.search(i).group(1)
	en_var_dom = en_var_dom_re.search(i).group(1)
	sl_source = sl_source_re.search(i).group(1)
	sl_par_id = sl_par_id_re.search(i).group(1)
	sl_par = sl_par_re.search(i).group(1)
	# Add information to the dictionary
	current_tu = {"score_bicleaner_ai": float(bi_score), "biroamer_entities": biroamer, "translation_direction": translation_dir, "en_source": en_source, "en_par_id": en_par_id, "en_par": en_par, "en_var_doc": en_var_doc, "en_var_dom": en_var_dom, "mk_source": sl_source, "mk_par_id": sl_par_id, "mk_par": sl_par}
	# Append the dictionary to the list
	tus_content.append(current_tu)

print(len(tus_content))

# Print some instances of the tus_content
tus_content[:2]

478059


[{'score_bicleaner_ai': 0.785,
  'biroamer_entities': 'No',
  'translation_direction': 'en-orig',
  'en_source': 'https://www.sobranie.mk/2020-2024-delegations-ns_article-delegation-to-the-parliamentary-assembly-of-the-organization-for-security-and-cooperation-in-europe.nspx',
  'en_par_id': 'p4s0',
  'en_par': 'DELEGATION TO THE PARLIAMENTARY ASSEMBLY OF THE ORGANIZATION FOR SECURITY AND COOPERATION IN EUROPE (OSCE PA)',
  'en_var_doc': 'UNK',
  'en_var_dom': 'B',
  'mk_source': 'https://tvm.mk/vesti/makedonija?start=2235',
  'mk_par_id': 'p14s0',
  'mk_par': '"Вредностите на политичкиот плурализам" е тема на тркалезна маса на која денеска ќе дебатираат лидери и претставници на парламентарните партии во Македонија.'},
 {'score_bicleaner_ai': 0.947,
  'biroamer_entities': 'No',
  'translation_direction': 'mk-orig',
  'en_source': 'http://fbe.edu.mk/index.php?option=com_content&amp;view=article&amp;id=144&amp;Itemid=166&amp;lang=en',
  'en_par_id': 'p66s3',
  'en_par': 'Depending on the

In [13]:
# Save json

with open("Macocu-mk-en.json", "w") as file:
	json.dump(tus_content,file, indent= "")