In [1]:
import pandas as pd
import numpy as np
import regex as re
import json

## Preparation of document format from sentence format

In [2]:
# Load the sentence-level corpus from JSON.
# The encoding is given explicitly: the corpus contains Turkish text, and without it
# open() falls back to the platform default, which is not UTF-8 on every system.
# NOTE(review): assumes the file is UTF-8 encoded (the JSON standard encoding) — confirm.
with open("Macocu-tr-en.json", "r", encoding="utf-8") as file:
    tus_content = json.load(file)

In [3]:
# Convert data to a dataframe
# (built directly from the object loaded from the JSON file above).

corpus_df = pd.DataFrame(tus_content)

# Preview the first rows.
corpus_df.head()

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par
0,0.994,No,tr-orig,https://kocaeli.ktb.gov.tr/EN-176348/united-ki...,p63s0,His resistance at Galippoli peninsula in WW I ...,UNK,B,https://www.ktb.gov.tr/TR-96495/ingiltere.html,p61s0,"1. Cihan Savaşında, Gelibolu yarımadasındaki k..."
1,0.974,No,en-orig,https://www.anadoluhayat.com.tr/en/privacy-policy,p59s0,Anadolu Hayat Emeklilik reserves the right to ...,B,UNK,https://www.izbas.net/cerez-politikamiz,p93s0,"Şirket, işbu Çerez Politikası hükümlerini dile..."
2,0.591,No,tr-orig,https://www.fatfreekitchen.com/home-remedy/con...,p30s0,Prevention of Constipation,A,UNK,https://www.posta.com.tr/bebeklerde-kabizlik-n...,p53s0,Bebeklerde kabızlık belirtileri
3,0.939,Yes,tr-orig,http://teklas.com/bizbizeyeterizturkiyem,p12s0,"As Teklas, we support the 'National Solidarity...",UNK,UNK,https://www.bgazete.com.tr/haberleri/milli-day...,p17s0,"Yataş Grup'tan yapılan açıklamada, Biz Bize Ye..."
4,0.99,No,tr-orig,https://www.savunmahaber.com/en/desan-shipyard...,p42s0,The data distribution systems and the entire e...,B,UNK,https://www.savunmahaber.com/desan-tersanesi-a...,p42s0,Veri dağıtım sistemleri ve tüm elektrik-elektr...


In [5]:
# Sort by English URL and then by paragraph position, so that the paragraphs of each
# text end up in document order.
# `en_par_id` is a string such as "p63s0" or "p154s0+p154s1". Sorting it as plain text
# would put "p100s0" before "p20s0", so the leading paragraph and sentence numbers are
# extracted and compared numerically; the raw id is only used as a final tie-breaker.
# NOTE(review): assumes every id starts with "p<number>s<number>" (true for the ids
# displayed in this notebook); ids that do not match are sorted last within their text.
_par_position = corpus_df["en_par_id"].str.extract(r"^p(\d+)s(\d+)").astype(float)
corpus_df = (
    corpus_df
    .assign(_par_no=_par_position[0], _sent_no=_par_position[1])
    .sort_values(by=["en_source", "_par_no", "_sent_no", "en_par_id"])
    .drop(columns=["_par_no", "_sent_no"])
)

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par
4980941,0.632,No,en-orig,http://1-----a-oabya.hotels-tokyo-jp.com/en/,p154s0+p154s1,Located 300 metres to Nishi-shinjuku undergrou...,UNK,UNK,http://ambassador-hotel.hotel-istanbul.net/tr/,p151s0+p160s0,Lapis Inn Hotel &amp; Spa 3* Sultanahmet Meyda...
9390665,0.853,No,tr-orig,http://1-----a-oabya.hotels-tokyo-jp.com/en/,p155s0+p155s1,The accommodation is placed within a 5-minute ...,UNK,UNK,http://ambassador-hotel.hotel-istanbul.net/tr/,p161s0,"Otel, Çemberlitaş Sütunu'a 0.4 km'den daha az ..."


In [6]:
# Add information about domains
# Group 1 captures the host part of an http(s) URL, without a leading "www.".
# The host ends at the first "/", "?" or "#", or at the end of the string. The previous
# pattern ended in `[/$]`, a character class that matches a literal "$" rather than
# end-of-string, so a URL with nothing after the host (e.g. "http://example.com")
# did not match at all and `.search(...).group(1)` raised AttributeError.
domain_re = re.compile(r'^https?://(?:www\.)?([^/?#]+)')

In [7]:
# Extract the domain (first capture group of `domain_re`) from every English source URL.
en_domain_list = [
    domain_re.search(url).group(1)
    for url in corpus_df["en_source"].tolist()
]

# Peek at the first three extracted domains.
en_domain_list[:3]

['1-----a-oabya.hotels-tokyo-jp.com',
 '1-----a-oabya.hotels-tokyo-jp.com',
 '1-----a-oabya.hotels-tokyo-jp.com']

In [8]:
# Store the extracted English domains as a new column.
corpus_df["en_domain"] = en_domain_list

In [9]:
# Same domain extraction for the Turkish source URLs.
# NOTE(review): the `sl_` prefix looks like a leftover from a notebook for another
# language pair; the name is kept so that nothing referring to it breaks.
sl_domain_list = [
    domain_re.search(url).group(1)
    for url in corpus_df["tr_source"].tolist()
]
corpus_df["tr_domain"] = sl_domain_list

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain
4980941,0.632,No,en-orig,http://1-----a-oabya.hotels-tokyo-jp.com/en/,p154s0+p154s1,Located 300 metres to Nishi-shinjuku undergrou...,UNK,UNK,http://ambassador-hotel.hotel-istanbul.net/tr/,p151s0+p160s0,Lapis Inn Hotel &amp; Spa 3* Sultanahmet Meyda...,1-----a-oabya.hotels-tokyo-jp.com,ambassador-hotel.hotel-istanbul.net
9390665,0.853,No,tr-orig,http://1-----a-oabya.hotels-tokyo-jp.com/en/,p155s0+p155s1,The accommodation is placed within a 5-minute ...,UNK,UNK,http://ambassador-hotel.hotel-istanbul.net/tr/,p161s0,"Otel, Çemberlitaş Sütunu'a 0.4 km'den daha az ...",1-----a-oabya.hotels-tokyo-jp.com,ambassador-hotel.hotel-istanbul.net


In [10]:
# Flag each row with "yes"/"no" depending on whether both pages share a domain.
corpus_df["same_domains"] = np.where(
    corpus_df["en_domain"].eq(corpus_df["tr_domain"]),
    "yes",
    "no",
)

# Space-separated "<en_domain> <tr_domain>" pair, used to inspect mismatching domains.
corpus_df["different_domains"] = corpus_df["en_domain"].str.cat(
    corpus_df["tr_domain"], sep=" "
)

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain,same_domains,different_domains
4980941,0.632,No,en-orig,http://1-----a-oabya.hotels-tokyo-jp.com/en/,p154s0+p154s1,Located 300 metres to Nishi-shinjuku undergrou...,UNK,UNK,http://ambassador-hotel.hotel-istanbul.net/tr/,p151s0+p160s0,Lapis Inn Hotel &amp; Spa 3* Sultanahmet Meyda...,1-----a-oabya.hotels-tokyo-jp.com,ambassador-hotel.hotel-istanbul.net,no,1-----a-oabya.hotels-tokyo-jp.com ambassador-h...
9390665,0.853,No,tr-orig,http://1-----a-oabya.hotels-tokyo-jp.com/en/,p155s0+p155s1,The accommodation is placed within a 5-minute ...,UNK,UNK,http://ambassador-hotel.hotel-istanbul.net/tr/,p161s0,"Otel, Çemberlitaş Sütunu'a 0.4 km'den daha az ...",1-----a-oabya.hotels-tokyo-jp.com,ambassador-hotel.hotel-istanbul.net,no,1-----a-oabya.hotels-tokyo-jp.com ambassador-h...


In [11]:
# Count how many rows have matching ("yes") and differing ("no") domains.
corpus_df["same_domains"].value_counts()

yes    5704063
no     4619933
Name: same_domains, dtype: int64

In [12]:
# Baseline sizes before any filtering:
# rows with a non-null English URL, and number of distinct English URLs.
previous_no_sentences = corpus_df["en_source"].count()
previous_no_texts = len(corpus_df["en_source"].unique())
print(
    f"Number of sentences: {previous_no_sentences}",
    f"Number of texts: {previous_no_texts}",
    sep="\n",
)

Number of sentences: 10323996
Number of texts: 796473


In [14]:
# Analyze instances from different domains: frequency of each (English, Turkish)
# domain pair among the rows whose domains do not match, as a markdown table.
# NOTE(review): the recorded output of this cell lists ".mk" domains, so it appears to
# stem from a run on a different corpus — re-run the cell to refresh it.
print(
    corpus_df.loc[corpus_df["same_domains"] != "yes", "different_domains"]
    .value_counts()
    .to_markdown()
)

|                                                               |   different_domains |
|:--------------------------------------------------------------|--------------------:|
| wordplanet.org m-p-c.org                                      |               13699 |
| truthmeter.mk vistinomer.mk                                   |                3885 |
| marxist.com novaiskra.mk                                      |                2838 |
| en.scoop.mk scoop.mk                                          |                2585 |
| mcgo.org.mk mk.mcgo.org.mk                                    |                2566 |
| factchecking.mk proverkanafakti.mk                            |                2308 |
| clp.mk civilmedia.mk                                          |                2047 |
| biblegateway.com m-p-c.org                                    |                1846 |
| megjashi.org.mk childrensembassy.org.mk                       |                1592 |
| wp.mpc.org.mk mpc.org.mk      

In [13]:
# Discard instances that are from different domains.
# The explicit .copy() makes the result an independent frame, so the column assignments
# in the following cells do not trigger pandas' SettingWithCopyWarning.
corpus_df = corpus_df[corpus_df["same_domains"] == "yes"].copy()

In [14]:
# See number of discarded texts and sentences
def calculate_discarded(previous_no_sentences, previous_no_texts, df=None):
    """Print and return the current corpus size compared with a previous stage.

    Args:
        previous_no_sentences: Number of rows before the latest filtering step.
        previous_no_texts: Number of distinct ``en_source`` values before the step.
        df: Frame to measure. Defaults to the notebook-level ``corpus_df`` as it is
            at call time, which is what the function always used before.

    Returns:
        Tuple ``(new_number_sentences, new_number_texts)``: the count of non-null
        ``en_source`` values and the number of distinct ``en_source`` values.

    Note:
        The value printed after "percentage:" is a fraction between 0 and 1, not a
        value scaled to 100. It is reported as 0.0 when the previous count is 0
        instead of dividing by zero.
    """
    if df is None:
        df = corpus_df

    new_number_sentences = df["en_source"].count()
    new_number_texts = len(df["en_source"].unique())

    discarded_sentences = previous_no_sentences - new_number_sentences
    discarded_texts = previous_no_texts - new_number_texts

    # Guard the ratios so an empty previous stage does not break the report.
    sentence_share = discarded_sentences / previous_no_sentences if previous_no_sentences else 0.0
    text_share = discarded_texts / previous_no_texts if previous_no_texts else 0.0

    print(f"New number of sentences: {new_number_sentences}")
    print(f"New number of texts: {new_number_texts}")

    print(f"No. of discarded sentences: {discarded_sentences}, percentage: {sentence_share}")

    print(f"No. of discarded texts: {discarded_texts}, percentage: {text_share}")

    return new_number_sentences, new_number_texts

In [16]:
# Report what the same-domain filter removed, relative to the unfiltered corpus,
# and keep the new counts as the baseline for the next step.
sentences_same_domains, texts_same_domains = calculate_discarded(previous_no_sentences, previous_no_texts)

New number of sentences: 5704063
New number of texts: 465720
No. of discarded sentences: 4619933, percentage: 0.44749465226449137
No. of discarded texts: 330753, percentage: 0.415272080786166


In [17]:
# Save the dataframe to csv (tab-separated).
# NOTE(review): this snapshot is taken right after the same-domain filter, but it uses
# the same file name as the export at the end of the preparation section, which
# overwrites it.
corpus_df.to_csv("Macocu-tr-en-doc-format-filtered.csv", sep= "\t")

In [18]:
# Mean Bicleaner AI score of all rows sharing the same English URL,
# broadcast back to every row of that URL.
corpus_df["average_score"] = (
    corpus_df.groupby("en_source")["score_bicleaner_ai"].transform("mean")
)

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain,same_domains,different_domains,average_score
7311776,0.988,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim apartment welcomes travellers ...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815
3169889,0.975,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0+p157s1,Istanbul Museum of Modern Art is a 5-minute dr...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0,"Daire, Taksim semtinde İstanbul Arkeoloji Müze...",1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815


In [19]:
# Build "<en_par_id>-<en_par>" as a key for spotting repeated paragraphs.
corpus_df["en-par-text"] = corpus_df["en_par_id"].str.cat(corpus_df["en_par"], sep="-")
corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain,same_domains,different_domains,average_score,en-par-text
7311776,0.988,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim apartment welcomes travellers ...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p156s0-1 Bed In Taksim apartment welcomes trav...
3169889,0.975,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0+p157s1,Istanbul Museum of Modern Art is a 5-minute dr...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0,"Daire, Taksim semtinde İstanbul Arkeoloji Müze...",1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p157s0+p157s1-Istanbul Museum of Modern Art is...


In [20]:
# Keep only the first row for every distinct "<par id>-<English paragraph>" key.
# NOTE(review): duplicates are detected across the whole corpus, not per en_source —
# confirm that this is intended.
corpus_df = corpus_df.drop_duplicates(subset="en-par-text", keep="first")

In [21]:
# Report what the paragraph de-duplication removed, relative to the same-domain stage.
sentences_dupl_sent, text_dupl_sent = calculate_discarded(sentences_same_domains, texts_same_domains)

New number of sentences: 2971997
New number of texts: 455778
No. of discarded sentences: 2732066, percentage: 0.47896841251578043
No. of discarded texts: 9942, percentage: 0.021347590827106414


In [22]:
# Give every row the full English text of its document: all `en_par` values that share
# the row's `en_source`, joined with single spaces in the current row order.
corpus_df["en_doc"] = corpus_df.groupby("en_source")["en_par"].transform(" ".join)

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain,same_domains,different_domains,average_score,en-par-text,en_doc
7311776,0.988,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim apartment welcomes travellers ...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p156s0-1 Bed In Taksim apartment welcomes trav...,1 Bed In Taksim apartment welcomes travellers ...
3169889,0.975,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0+p157s1,Istanbul Museum of Modern Art is a 5-minute dr...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0,"Daire, Taksim semtinde İstanbul Arkeoloji Müze...",1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p157s0+p157s1-Istanbul Museum of Modern Art is...,1 Bed In Taksim apartment welcomes travellers ...


In [23]:
# Same for the Turkish side: join all `tr_par` values that share the row's `tr_source`.
# NOTE(review): this groups by `tr_source`, whereas `en_doc` groups by `en_source`; a
# Turkish page paired with several English pages therefore collects the paragraphs of
# all of them — confirm that this is intended.
corpus_df["tr_doc"] = corpus_df.groupby("tr_source")["tr_par"].transform(" ".join)
corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain,same_domains,different_domains,average_score,en-par-text,en_doc,tr_doc
7311776,0.988,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim apartment welcomes travellers ...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p156s0-1 Bed In Taksim apartment welcomes trav...,1 Bed In Taksim apartment welcomes travellers ...,1 Bed In Taksim dairesinde mutfağın yanı sıra ...
3169889,0.975,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0+p157s1,Istanbul Museum of Modern Art is a 5-minute dr...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p157s0,"Daire, Taksim semtinde İstanbul Arkeoloji Müze...",1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p157s0+p157s1-Istanbul Museum of Modern Art is...,1 Bed In Taksim apartment welcomes travellers ...,1 Bed In Taksim dairesinde mutfağın yanı sıra ...


In [24]:
# Collapse to one row per distinct English document text (the first row is kept).
corpus_df = corpus_df.drop_duplicates(subset="en_doc", keep="first")

In [26]:
# From here on each row is a whole text, so the "sentences" figures printed by
# calculate_discarded are no longer meaningful - only the number of texts is.
sentences_after_text_deduplication, texts_after_text_deduplication = calculate_discarded(sentences_dupl_sent, text_dupl_sent)

New number of sentences: 453645
New number of texts: 453645
No. of discarded sentences: 2518352, percentage: 0.8473602093138048
No. of discarded texts: 2133, percentage: 0.004679909956162869


In [27]:
# Length of the English document in whitespace-separated tokens.
corpus_df["en_length"] = corpus_df["en_doc"].str.split().str.len()

corpus_df.head(3)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,tr_par,en_domain,tr_domain,same_domains,different_domains,average_score,en-par-text,en_doc,tr_doc,en_length
7311776,0.988,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim apartment welcomes travellers ...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p156s0-1 Bed In Taksim apartment welcomes trav...,1 Bed In Taksim apartment welcomes travellers ...,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,55
78914,0.988,No,tr-orig,http://1-hotel-central-park.hotelinewyork.com/,p159s0,1 Hotel Central Park New York is situated in v...,UNK,UNK,http://1-hotel-central-park.hotelinewyork.com/tr/,p159s0,"New York şehrinde Times Meydanı'den 1,2 km uza...",1-hotel-central-park.hotelinewyork.com,1-hotel-central-park.hotelinewyork.com,yes,1-hotel-central-park.hotelinewyork.com 1-hotel...,0.90275,p159s0-1 Hotel Central Park New York is situat...,1 Hotel Central Park New York is situated in v...,"New York şehrinde Times Meydanı'den 1,2 km uza...",81
9742185,0.814,No,tr-orig,http://10-karakoy-a-morgans-original.hotel-ist...,p159s0,Situated near restaurants and bars about 600 m...,B,UNK,http://10-karakoy-a-morgans-original.hotel-ist...,p150s0+p159s0,"10 Karakoy Istanbul Otel 5* Döviz bürosu, eman...",10-karakoy-a-morgans-original.hotel-istanbul.net,10-karakoy-a-morgans-original.hotel-istanbul.net,yes,10-karakoy-a-morgans-original.hotel-istanbul.n...,0.895667,p159s0-Situated near restaurants and bars abou...,Situated near restaurants and bars about 600 m...,"10 Karakoy Istanbul Otel 5* Döviz bürosu, eman...",65


In [28]:
# Summary statistics of the English text length, printed as a markdown table.
print(corpus_df.en_length.describe().to_markdown())

|       |   en_length |
|:------|------------:|
| count |  453645     |
| mean  |     163.379 |
| std   |     311.403 |
| min   |       1     |
| 25%   |      33     |
| 50%   |      74     |
| 75%   |     175     |
| max   |   26552     |


In [30]:
# Length of the Turkish document in whitespace-separated tokens.
corpus_df["tr_length"] = corpus_df["tr_doc"].str.split().str.len()

corpus_df.head(3)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,tr_source,tr_par_id,...,en_domain,tr_domain,same_domains,different_domains,average_score,en-par-text,en_doc,tr_doc,en_length,tr_length
7311776,0.988,No,tr-orig,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,1 Bed In Taksim apartment welcomes travellers ...,B,UNK,http://1-bedroom-apartment-in-taksim.hotel-ist...,p156s0,...,1-bedroom-apartment-in-taksim.hotel-istanbul.net,1-bedroom-apartment-in-taksim.hotel-istanbul.net,yes,1-bedroom-apartment-in-taksim.hotel-istanbul.n...,0.9815,p156s0-1 Bed In Taksim apartment welcomes trav...,1 Bed In Taksim apartment welcomes travellers ...,1 Bed In Taksim dairesinde mutfağın yanı sıra ...,55,69
78914,0.988,No,tr-orig,http://1-hotel-central-park.hotelinewyork.com/,p159s0,1 Hotel Central Park New York is situated in v...,UNK,UNK,http://1-hotel-central-park.hotelinewyork.com/tr/,p159s0,...,1-hotel-central-park.hotelinewyork.com,1-hotel-central-park.hotelinewyork.com,yes,1-hotel-central-park.hotelinewyork.com 1-hotel...,0.90275,p159s0-1 Hotel Central Park New York is situat...,1 Hotel Central Park New York is situated in v...,"New York şehrinde Times Meydanı'den 1,2 km uza...",81,92
9742185,0.814,No,tr-orig,http://10-karakoy-a-morgans-original.hotel-ist...,p159s0,Situated near restaurants and bars about 600 m...,B,UNK,http://10-karakoy-a-morgans-original.hotel-ist...,p150s0+p159s0,...,10-karakoy-a-morgans-original.hotel-istanbul.net,10-karakoy-a-morgans-original.hotel-istanbul.net,yes,10-karakoy-a-morgans-original.hotel-istanbul.n...,0.895667,p159s0-Situated near restaurants and bars abou...,Situated near restaurants and bars about 600 m...,"10 Karakoy Istanbul Otel 5* Döviz bürosu, eman...",65,80


In [31]:
# Keep only texts with at least 79 English tokens
# (79 is the median length taken from other datasets).
corpus_df = corpus_df[corpus_df["en_length"].gt(78)]

corpus_df.shape

(218554, 21)

In [32]:
# Rows are whole texts at this point, so only the number of texts is meaningful;
# the comparison is against the stage after text de-duplication.
sentences_after_length, texts_after_length = calculate_discarded(sentences_after_text_deduplication, texts_after_text_deduplication)

New number of sentences: 218554
New number of texts: 218554
No. of discarded sentences: 235091, percentage: 0.5182268073052717
No. of discarded texts: 235091, percentage: 0.5182268073052717


In [33]:
# List the available columns before dropping the ones no longer needed.
corpus_df.columns

Index(['score_bicleaner_ai', 'biroamer_entities', 'translation_direction',
       'en_source', 'en_par_id', 'en_par', 'en_var_doc', 'en_var_dom',
       'tr_source', 'tr_par_id', 'tr_par', 'en_domain', 'tr_domain',
       'same_domains', 'different_domains', 'average_score', 'en-par-text',
       'en_doc', 'tr_doc', 'en_length', 'tr_length'],
      dtype='object')

In [34]:
# Remove the sentence-level columns and the helper columns that were only needed
# for the filtering steps above.
corpus_df = corpus_df.drop(
    columns=[
        "score_bicleaner_ai",
        "en_par_id",
        "en_par",
        "tr_par_id",
        "tr_par",
        "en-par-text",
        "same_domains",
        "different_domains",
    ]
)

In [35]:
# Check the remaining columns on the first row.
corpus_df.head(1)

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,tr_source,en_domain,tr_domain,average_score,en_doc,tr_doc,en_length,tr_length
78914,No,tr-orig,http://1-hotel-central-park.hotelinewyork.com/,UNK,UNK,http://1-hotel-central-park.hotelinewyork.com/tr/,1-hotel-central-park.hotelinewyork.com,1-hotel-central-park.hotelinewyork.com,0.90275,1 Hotel Central Park New York is situated in v...,"New York şehrinde Times Meydanı'den 1,2 km uza...",81,92


In [36]:
# Filter out the non-textual texts

# Calculate ratio of punctuations per words

# Tokeniser: a run of word characters, or any single non-whitespace character.
# Compiled once here instead of on every call.
_TOKEN_RE = re.compile(r'\w+|\S', re.UNICODE)
# Single-character tokens that are counted as punctuation.
_PUNCT_TOKENS = frozenset('.;,!?:')


def paragraph_punct_ratio(text):
    """Return the share of punctuation tokens among all tokens of ``text``.

    The text is split into tokens that are either a run of word characters or a
    single non-whitespace character; a token counts as punctuation when it is one
    of ``. ; , ! ? :``.

    Args:
        text: The text to analyse (a string).

    Returns:
        ``punctuation tokens / all tokens`` as a float between 0 and 1. A text without
        any token (empty or whitespace only) yields 0.0 instead of raising
        ZeroDivisionError.
    """
    tokens = _TOKEN_RE.findall(text)
    if not tokens:
        return 0.0
    punct = sum(1 for token in tokens if token in _PUNCT_TOKENS)
    return punct / len(tokens)

# Punctuation ratio of every English document.
corpus_df["punct_ratio"] = corpus_df["en_doc"].apply(paragraph_punct_ratio)

corpus_df.head(3)

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,tr_source,en_domain,tr_domain,average_score,en_doc,tr_doc,en_length,tr_length,punct_ratio
78914,No,tr-orig,http://1-hotel-central-park.hotelinewyork.com/,UNK,UNK,http://1-hotel-central-park.hotelinewyork.com/tr/,1-hotel-central-park.hotelinewyork.com,1-hotel-central-park.hotelinewyork.com,0.90275,1 Hotel Central Park New York is situated in v...,"New York şehrinde Times Meydanı'den 1,2 km uza...",81,92,0.103093
5162533,No,tr-orig,http://1001documentary.net/festival_en.php?pag...,A,UNK,http://1001documentary.net/festival.php?sayfa=...,1001documentary.net,1001documentary.net,0.942778,Aims and Goals of 1001 Documentary Film Festiv...,1001 Uluslararası Belgesel Film Festivali'nin ...,201,147,0.07489
2137420,No,tr-orig,http://1001documentary.net/index_en.php,A,UNK,http://1001documentary.net/,1001documentary.net,1001documentary.net,0.967143,18th International 1001 Documentary Film Festi...,18. Uluslararası 1001 Belgesel Film Festivali ...,181,89,0.085859


In [37]:
# Distribution of the punctuation ratio, used to choose the filtering thresholds.
corpus_df.punct_ratio.describe()

count    218554.000000
mean          0.096994
std           0.037664
min           0.000000
25%           0.074598
50%           0.091030
75%           0.111607
max           0.475728
Name: punct_ratio, dtype: float64

In [38]:
# Texts with a ratio below 0.015 are non-textual content with (almost) no punctuation;
# show the first five of them.
for doc in corpus_df.query("punct_ratio < 0.015")["en_doc"].head(5):
    print(doc)

Investigation of ignition advance effects for CNG usage in a sequential dual ignition gasoline engine by using in-cylinder combustion cfd analysis [Sirali çift ateşlemeli bir benzin motorunda CNG kullanimi için ateşleme avansi etkilerinin silindir içi yanma had analizi ile incelenmesi] Investigation of ignition advance effects for CNG usage in a sequential dual ignition gasoline engine by using in-cylinder combustion cfd analysis [Sirali çift ateşlemeli bir benzin motorunda CNG kullanimi için ateşleme avansi etkilerinin silindir içi yanma had analizi ile incelenmesi]
CHAIRMAN OF THE BOARD OF DIRECTORS AHMET EMİN MAKASCI PUBLISHED A MESSAGE FOR NOVEMBER 10 Akşehir Chamber of Commerce and Industry Chairman Ahmet Emin Makascı issued a message about November 10 CHAIRMAN OF THE BOARD OF DIRECTORS AHMET EMİN MAKASCI PUBLISHED A MESSAGE FOR MAWLİD AN-NABİ (THE BİRTH OF PROPHET MUHAMMAD) CHAIRMAN OF THE BOARD OF DIRECTORS AHMET EMİN MAKASCI PUBLISHED A MESSAGE FOR MAWLİD AN-NABİ (THE BİRTH OF 

In [39]:
# With ratio above 0.2, we catch non-textual texts with a lot of punctuations.
# Only the first five examples are printed, as in the low-ratio check; printing every
# match floods the notebook output with whole documents.
for doc in corpus_df.query("punct_ratio > 0.2")["en_doc"].head(5):
    print(doc)

Black Section Stories about true life war, violence, social depression and the people who experienced or resisted these times... Red Section The conditions of work and labor, workers and laborers in the world order. Orange Section Modern day stories about close or far geography... Yellow Section Migrating... Being away from home... Blue Section Ordinary stories about extraordinary people, extraordinary stories about ordinary people. Purple Section Women in society, women in family, women at work, women against violence and violence against women... Green Section Nature, people, cities... Films that struggle for sustainable world.
Erogul, O., Department of Ophthalmology, Faculty of Medicine, Afyonkarahisar University of Health Sciences, Afyonkarahisar, Turkey; Yozgat, Z., Department of Ophthalmology, Faculty of Medicine, Afyonkarahisar University of Health Sciences, Afyonkarahisar, Turkey; Sabaner, M.C., Department of Ophthalmology, Faculty of Medicine, Afyonkarahisar University of Heal

The analysis showed that filtering of non-textual texts is necessary.

In [40]:
# Keep only texts whose punctuation ratio lies in the closed interval [0.015, 0.2].
corpus_df = corpus_df.loc[corpus_df["punct_ratio"].between(0.015, 0.2)]

corpus_df.head()

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,tr_source,en_domain,tr_domain,average_score,en_doc,tr_doc,en_length,tr_length,punct_ratio
78914,No,tr-orig,http://1-hotel-central-park.hotelinewyork.com/,UNK,UNK,http://1-hotel-central-park.hotelinewyork.com/tr/,1-hotel-central-park.hotelinewyork.com,1-hotel-central-park.hotelinewyork.com,0.90275,1 Hotel Central Park New York is situated in v...,"New York şehrinde Times Meydanı'den 1,2 km uza...",81,92,0.103093
5162533,No,tr-orig,http://1001documentary.net/festival_en.php?pag...,A,UNK,http://1001documentary.net/festival.php?sayfa=...,1001documentary.net,1001documentary.net,0.942778,Aims and Goals of 1001 Documentary Film Festiv...,1001 Uluslararası Belgesel Film Festivali'nin ...,201,147,0.07489
2137420,No,tr-orig,http://1001documentary.net/index_en.php,A,UNK,http://1001documentary.net/,1001documentary.net,1001documentary.net,0.967143,18th International 1001 Documentary Film Festi...,18. Uluslararası 1001 Belgesel Film Festivali ...,181,89,0.085859
8257785,No,tr-orig,http://115-st-joseph-apartments.gzira.hotels-m...,B,UNK,http://115-st-joseph-apartments.gzira.hotels-m...,115-st-joseph-apartments.gzira.hotels-mt.com,115-st-joseph-apartments.gzira.hotels-mt.com,0.8615,"Featuring a sauna, a Jacuzzi and a sun deck, U...","Ulusal Güzel Sanatlar Müzesi'den 1,5 km uzaklı...",89,83,0.111111
7254342,No,tr-orig,http://128a-alacati-hotel.alacati.hotels-tr.net/,UNK,UNK,http://128a-alacati-hotel.alacati.hotels-tr.ne...,128a-alacati-hotel.alacati.hotels-tr.net,128a-alacati-hotel.alacati.hotels-tr.net,0.800667,The budget 128A Alacati Hotel is nestled not f...,128A Otel ünlü Dutlu Kahve'a çok yakın bulunma...,102,82,0.117647


In [41]:
# Rows are whole texts at this point, so only the number of texts is meaningful;
# the comparison is against the stage after the length filter.
sentences_after_heuristic, texts_after_heuristic = calculate_discarded(sentences_after_length, texts_after_length)

New number of sentences: 213147
New number of texts: 213147
No. of discarded sentences: 5407, percentage: 0.024739881219286768
No. of discarded texts: 5407, percentage: 0.024739881219286768


In [42]:
# Save the dataframe to csv
# (tab-separated; written after the length and punctuation-ratio filters).
corpus_df.to_csv("Macocu-tr-en-doc-format-filtered.csv", sep= "\t")

## Analysis of prepared corpus

In [43]:
# Inspect corpus information: summary statistics for every column,
# both categorical (count/unique/top/freq) and numeric.
corpus_df.describe(include="all")

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,tr_source,en_domain,tr_domain,average_score,en_doc,tr_doc,en_length,tr_length,punct_ratio
count,213147,213147,213147,213147,213147,213147,213147,213147,213147.0,213147,213147,213147.0,213147.0,213147.0
unique,2,2,213147,4,1,166026,12937,12937,,213147,165766,,,
top,No,tr-orig,http://1-hotel-central-park.hotelinewyork.com/,UNK,UNK,http://www.hotels-tokyo-jp.com/tr/near-shinjuk...,booking.com,booking.com,,1 Hotel Central Park New York is situated in v...,"SlotoHit Casino, Net Entertainment, Microgamin...",,,
freq,191120,142650,1,113025,213147,17,13928,13928,,1,17,,,
mean,,,,,,,,,0.867585,,,303.055877,377.009303,0.093762
std,,,,,,,,,0.081293,,,410.129595,582.61643,0.029125
min,,,,,,,,,0.5,,,79.0,2.0,0.015038
25%,,,,,,,,,0.817786,,,116.0,103.0,0.074324
50%,,,,,,,,,0.879833,,,184.0,202.0,0.090476
75%,,,,,,,,,0.9305,,,339.0,436.0,0.10989


In [44]:
# Inspect en_var_doc statistics: relative frequency of each value, as a markdown table.
print(corpus_df.en_var_doc.value_counts(normalize = True).to_markdown())

|     |   en_var_doc |
|:----|-------------:|
| UNK |   0.530268   |
| A   |   0.338189   |
| B   |   0.124989   |
| MIX |   0.00655416 |


In [46]:
# Inspect en_var_dom statistics: relative frequency of each value, as a markdown table.
print(corpus_df.en_var_dom.value_counts(normalize = True).to_markdown())

|     |   en_var_dom |
|:----|-------------:|
| UNK |            1 |


In [47]:
# Inspect translation direction: relative frequency of each value, as a markdown table.
print(corpus_df.translation_direction.value_counts(normalize = True).to_markdown())

|         |   translation_direction |
|:--------|------------------------:|
| tr-orig |                0.669256 |
| en-orig |                0.330744 |


In [48]:
# Summary statistics of the per-text average Bicleaner AI score, as a markdown table.
print(corpus_df.average_score.describe().to_markdown())

|       |   average_score |
|:------|----------------:|
| count |  213147         |
| mean  |       0.867585  |
| std   |       0.0812931 |
| min   |       0.5       |
| 25%   |       0.817786  |
| 50%   |       0.879833  |
| 75%   |       0.9305    |
| max   |       0.9975    |


In [49]:
# Summary statistics of the English text length in the final corpus, as a markdown table.
print(corpus_df.en_length.describe().to_markdown())

|       |   en_length |
|:------|------------:|
| count |  213147     |
| mean  |     303.056 |
| std   |     410.13  |
| min   |      79     |
| 25%   |     116     |
| 50%   |     184     |
| 75%   |     339     |
| max   |   26552     |


In [None]:
# Inspect duplicated Turkish texts
# Show the full text of each cell instead of a truncated preview.
pd.set_option('display.max_colwidth', None)
# All rows whose Turkish document also occurs in another row, grouped by sorting on it.
# (The column is `tr_doc`; `mk_doc` does not exist in this corpus and raised AttributeError.)
duplicated = corpus_df[corpus_df.tr_doc.duplicated(keep=False)].sort_values("tr_doc")
duplicated[["tr_doc", "en_doc", "average_score"]]

In [51]:
# Analyze English domains in the corpus_df: number of texts per domain and their share
# of all texts in percent, most frequent domain first.
# The counts are computed once and the share is derived from them; the previous version
# called value_counts() three times and passed the string "True" for `normalize`, which
# only worked because any non-empty string is truthy.
count = (
    corpus_df["en_domain"]
    .value_counts()
    .to_frame("Count")
    .assign(Percentage=lambda counts: counts["Count"] / counts["Count"].sum() * 100)
)

print(count.to_markdown())

|                                                    |   Count |   Percentage |
|:---------------------------------------------------|--------:|-------------:|
| booking.com                                        |   13928 |   6.53446    |
| support.apple.com                                  |    6443 |   3.0228     |
| debis.deu.edu.tr                                   |    3390 |   1.59045    |
| atilim.edu.tr                                      |    2292 |   1.07531    |
| dergipark.org.tr                                   |    2283 |   1.07109    |
| yandex.com.tr                                      |    2180 |   1.02277    |
| ninova.itu.edu.tr                                  |    2166 |   1.0162     |
| gbs.gelisim.edu.tr                                 |    2095 |   0.98289    |
| vivi.com.tr                                        |    1836 |   0.861377   |
| yapikatalogu.com                                   |    1813 |   0.850587   |
| bbc.com                               