In [1]:
import gzip
import json
import os
import shutil

import numpy as np
import pandas as pd
import regex as re
import wget

In [2]:
# Download the file

# Defining the zip file URL - the TMX
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1520/MaCoCu-tr-en.tmx.gz"

# Local archive name: the last path component of the URL, which is also the
# name the unzip cell opens.
corpus_file = url.rsplit("/", 1)[-1]

# wget.download does not overwrite an existing file; it stores the new copy
# under a " (1)"-suffixed name instead. Re-running this cell would therefore
# fetch the archive again while the following cells keep reading the old
# copy, so the download is skipped when the archive is already on disk.
if os.path.exists(corpus_file):
    print('File already present, skipping download')
else:
    # Downloading the file by sending the request to the URL
    corpus_file = wget.download(url, out=corpus_file)
    print('Downloading Completed')

Downloading Completed


In [3]:
# Decompress the downloaded gzip archive into a plain TMX file.
# Both handles are managed by a single `with` so they are closed together.
with gzip.open('MaCoCu-tr-en.tmx.gz', 'rb') as compressed, \
        open('MaCoCu-tr-en.tmx', 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

In [4]:
# Read the whole TMX file into memory as one string.
# The encoding is stated explicitly (the TMX header declares
# o-encoding="utf-8") so the result does not depend on the platform default,
# and the `with` block closes the file handle once the text has been read.
with open('MaCoCu-tr-en.tmx', "r", encoding="utf-8") as corpus_handle:
    corpus = corpus_handle.read()

In [5]:
# Create and view a sample of the corpus: the first 5000 characters.
# The file is opened as UTF-8 explicitly (matching the o-encoding declared in
# the TMX header) and closed again as soon as the sample has been read.
with open('MaCoCu-tr-en.tmx', "r", encoding="utf-8") as sample_handle:
    corpus_sample = sample_handle.read(5000)
print(corpus_sample)

<?xml version="1.0"?>
<tmx version="1.4">
 <header
   adminlang="C"
   srclang="en"
   o-tmf="PlainText"
   creationtool="bitextor"
   creationtoolversion="8.2"
   datatype="PlainText"
   segtype="sentence"
   creationdate="20220422T100340"
   o-encoding="utf-8">
 </header>
 <body>
   <tu tuid="1" datatype="Text">
    <prop type="score-bicleaner-ai">0.994</prop>
    <prop type="biroamer-entities">No</prop>
    <prop type="translation-direction">tr-orig</prop>
    <prop type="type">1:1</prop>
    <prop type="info">different numbers in TUVs</prop>
    <tuv xml:lang="en">
     <prop type="source-document">https://kocaeli.ktb.gov.tr/EN-176348/united-kingdom.html</prop>
     <prop type="checksum-seg">5b26b2c1eff5d02e</prop>
     <prop type="paragraph-id">p63s0</prop>
    <prop type="english-variant-document">UNK</prop>
    <prop type="english-variant-domain">B</prop>
     <seg>His resistance at Galippoli peninsula in WW I and his ingenious struggle in the War of Independence has brought him

In [6]:
# Prepare all the regexes
# Every pattern is a raw string so that backslashes reach the regex engine
# untouched (the previous '<\/tu>' was an invalid string escape sequence).

# Compile all tus: capture everything between the opening <tu ...> line and
# the closing </tu>. DOTALL lets '.' run across the newlines inside a unit.
tu_re = re.compile(r'<tu tuid=".*?>\n(.*?)</tu>', re.DOTALL)

# Compile relevant information inside tus
# Unit-level properties (appear once per tu, before the <tuv> elements).
bi_score_re = re.compile(r'<prop type="score-bicleaner-ai">(.*?)</prop>')
biroamer_re = re.compile(r'<prop type="biroamer-entities">(.*?)</prop>')
translation_dir_re = re.compile(r'<prop type="translation-direction">(.*?)</prop>')

# English side: each pattern first anchors on the English <tuv> and then takes
# the first matching element after it, hence DOTALL.
en_source_re = re.compile(r'<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
en_par_id_re = re.compile(r'<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop>', re.DOTALL)
en_par_re = re.compile(r'<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL)
en_var_doc_re = re.compile(r'<prop type="english-variant-document">(.*?)</prop>')
en_var_dom_re = re.compile(r'<prop type="english-variant-domain">(.*?)</prop>')

# Turkish side: same approach, anchored on the Turkish <tuv>.
sl_source_re = re.compile(r'<tuv xml:lang="tr">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
sl_par_id_re = re.compile(r'<tuv xml:lang="tr">.*?<prop type="paragraph-id">(.*?)</prop>', re.DOTALL)
sl_par_re = re.compile(r'<tuv xml:lang="tr">.*?<seg>(.*?)</seg>', re.DOTALL)


In [7]:
# Collect the body of every complete translation unit found in the sample
# (the single capture group of tu_re), then show how many were found.
tus_list_sample = [tu_match.group(1) for tu_match in tu_re.finditer(corpus_sample)]
len(tus_list_sample)

4

In [8]:
# View one entry of the sample list: the translation unit at index 1,
# i.e. the text captured between its <tu ...> line and its </tu> tag.
tus_list_sample[1]

'    <prop type="score-bicleaner-ai">0.974</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">en-orig</prop>\n    <prop type="type">1:1</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://www.anadoluhayat.com.tr/en/privacy-policy</prop>\n     <prop type="checksum-seg">cd1dfb0d3c063de9</prop>\n     <prop type="paragraph-id">p59s0</prop>\n    <prop type="english-variant-document">B</prop>\n    <prop type="english-variant-domain">UNK</prop>\n     <seg>Anadolu Hayat Emeklilik reserves the right to change this policy at any time to maintain the up-to-datedness of privacy and data protection principles and to enable compliance of the said to applicable legislation or when necessary in terms of new services provided by Anadolu Hayat Emeklilik.</seg>\n    </tuv>\n    <tuv xml:lang="tr">\n     <prop type="source-document">https://www.izbas.net/cerez-politikamiz</prop>\n     <prop type="checksum-seg">11ee1cccf796d248</prop>\n     <

In [9]:
# Sanity-check every field regex against one translation unit of the sample:
# each line printed below is the list of matches that regex finds in it.
regexes = [
    bi_score_re,
    biroamer_re,
    translation_dir_re,
    en_source_re,
    en_par_id_re,
    en_par_re,
    en_var_doc_re,
    en_var_dom_re,
    sl_source_re,
    sl_par_id_re,
    sl_par_re,
]

sample_tu = tus_list_sample[1]
for rex in regexes:
    print(rex.findall(sample_tu))

['0.974']
['No']
['en-orig']
['https://www.anadoluhayat.com.tr/en/privacy-policy']
['p59s0']
['Anadolu Hayat Emeklilik reserves the right to change this policy at any time to maintain the up-to-datedness of privacy and data protection principles and to enable compliance of the said to applicable legislation or when necessary in terms of new services provided by Anadolu Hayat Emeklilik.']
['B']
['UNK']
['https://www.izbas.net/cerez-politikamiz']
['p93s0']
['Şirket, işbu Çerez Politikası hükümlerini dilediği zaman değiştirebilir. Güncel Çerez Politikası, Veri Sahibi’ne herhangi bir yöntemle sunulduğu tarihte yürürlük kazanır.']


In [10]:
# Collect the body of every translation unit in the full corpus
# (the single capture group of tu_re), then show how many were found.
tus_list = [tu_match.group(1) for tu_match in tu_re.finditer(corpus)]
len(tus_list)

10323996

In [11]:
# Inspect an instance: a one-element slice holding the first translation unit
# extracted from the full corpus.
tus_list[:1]

['    <prop type="score-bicleaner-ai">0.994</prop>\n    <prop type="biroamer-entities">No</prop>\n    <prop type="translation-direction">tr-orig</prop>\n    <prop type="type">1:1</prop>\n    <prop type="info">different numbers in TUVs</prop>\n    <tuv xml:lang="en">\n     <prop type="source-document">https://kocaeli.ktb.gov.tr/EN-176348/united-kingdom.html</prop>\n     <prop type="checksum-seg">5b26b2c1eff5d02e</prop>\n     <prop type="paragraph-id">p63s0</prop>\n    <prop type="english-variant-document">UNK</prop>\n    <prop type="english-variant-domain">B</prop>\n     <seg>His resistance at Galippoli peninsula in WW I and his ingenious struggle in the War of Independence has brought him glamorous victories.</seg>\n    </tuv>\n    <tuv xml:lang="tr">\n     <prop type="source-document">https://www.ktb.gov.tr/TR-96495/ingiltere.html</prop>\n     <prop type="checksum-seg">c997e14feac4c7e6</prop>\n     <prop type="paragraph-id">p61s0</prop>\n     <seg>1. Cihan Savaşında, Gelibolu yarımada

In [12]:
# Create a list of dictionaries from the tus_list based on regexes

# Output key -> compiled regex whose first capture group holds the value.
# The insertion order here is the key order of every record produced below.
field_regexes = {
    "score_bicleaner_ai": bi_score_re,
    "biroamer_entities": biroamer_re,
    "translation_direction": translation_dir_re,
    "en_source": en_source_re,
    "en_par_id": en_par_id_re,
    "en_par": en_par_re,
    "en_var_doc": en_var_doc_re,
    "en_var_dom": en_var_dom_re,
    "tr_source": sl_source_re,
    "tr_par_id": sl_par_id_re,
    "tr_par": sl_par_re,
}

tus_content = []

for tu_index, tu in enumerate(tus_list):
    # Find all relevant information based on regexes
    current_tu = {}
    for field, field_re in field_regexes.items():
        match = field_re.search(tu)
        if match is None:
            # Fail with the field name and position instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                f"Field {field!r} not found in translation unit at index {tu_index}"
            )
        current_tu[field] = match.group(1)
    # The bicleaner score is the only numeric field; store it as a float.
    current_tu["score_bicleaner_ai"] = float(current_tu["score_bicleaner_ai"])
    # Append the dictionary to the list
    tus_content.append(current_tu)

print(len(tus_content))

# Print some instances of the tus_content
tus_content[:2]

10323996


[{'score_bicleaner_ai': 0.994,
  'biroamer_entities': 'No',
  'translation_direction': 'tr-orig',
  'en_source': 'https://kocaeli.ktb.gov.tr/EN-176348/united-kingdom.html',
  'en_par_id': 'p63s0',
  'en_par': 'His resistance at Galippoli peninsula in WW I and his ingenious struggle in the War of Independence has brought him glamorous victories.',
  'en_var_doc': 'UNK',
  'en_var_dom': 'B',
  'tr_source': 'https://www.ktb.gov.tr/TR-96495/ingiltere.html',
  'tr_par_id': 'p61s0',
  'tr_par': "1. Cihan Savaşında, Gelibolu yarımadasındaki kahramanlık destanı olan mücadelede ve Kurtuluş Savaşı'daki davada yüksek dehâsı kendisine tam ve parlak zaferler kazandırmıştır."},
 {'score_bicleaner_ai': 0.974,
  'biroamer_entities': 'No',
  'translation_direction': 'en-orig',
  'en_source': 'https://www.anadoluhayat.com.tr/en/privacy-policy',
  'en_par_id': 'p59s0',
  'en_par': 'Anadolu Hayat Emeklilik reserves the right to change this policy at any time to maintain the up-to-datedness of privacy and 

In [13]:
# Save json

# Serialise all extracted records into one JSON file. indent="" puts every
# item on its own line without adding any leading indentation.
with open("Macocu-tr-en.json", "w") as json_out:
    json.dump(tus_content, json_out, indent="")