In [7]:
import pandas as pd
import numpy as np
import regex as re
import json

## Preparation of document format from sentence format

In [2]:
# Import json
with open("Macocu-sl-en.json", "r") as file:
	tus_content = json.load(file)

In [None]:
# Convert data to a dataframe

corpus_df = pd.DataFrame(tus_content)

corpus_df.head()

In [4]:
# Analyze distribution
corpus_df.describe(include="all")

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par
count,3176311.0,3176311,3176311,3176311,3176311,3176311,3176311,3176311,3176311,3176311,3176311
unique,,2,2,285892,93755,2379921,4,4,456366,86818,2542560
top,,No,sl-orig,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,You should consider whether you understand how...,B,B,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,Izdelana je iz prijetne mešanice naravnega in ...
freq,,2897662,2812317,5112,36423,745,1640246,1967189,4294,33594,207
mean,0.880997,,,,,,,,,,
std,0.1299268,,,,,,,,,,
min,0.5,,,,,,,,,,
25%,0.825,,,,,,,,,,
50%,0.944,,,,,,,,,,
75%,0.976,,,,,,,,,,


#### Bi-Cleaner Scores on sentence level

As we can see from the column on score_bicleaner_ai, it seems that there are no sentences with score lower than 0.5 - they must have been already filtered out? (?). We can also see, that was majority of sentences were originally written in Slovene (based on translation_direction), and that 700 English sentences and 200 Slovene sentences are repeated which could indicate some errors.

In [None]:
# Sort by english url and then by en_par_id to order the paragraphs into texts
corpus_df = corpus_df.sort_values(by = ["en_source", "en_par_id"])

corpus_df

In [8]:
# Add information about domains
domain_re=re.compile(r'^https?://(?:www\.)?(.+?)[/$]')

en_domain_list = [domain_re.search(i).group(1) for i in corpus_df.en_source.to_list()]

en_domain_list[:3]

['15.liffe.si', '15.liffe.si', '15.liffe.si']

In [None]:
corpus_df["en_domain"] = en_domain_list

corpus_df.head(2)

In [None]:
# Repeat with Slovene domain
sl_domain_list = [domain_re.search(i).group(1) for i in corpus_df.sl_source.to_list()]
corpus_df["sl_domain"] = sl_domain_list

corpus_df.head(2)

In [None]:
# Add information whether the domains are the same
corpus_df["same_domains"] = np.where(corpus_df["en_domain"] == corpus_df["sl_domain"], "yes", 'no')

# Add column for domains that are different
corpus_df["different_domains"] = corpus_df["en_domain"] + " " + corpus_df["sl_domain"]

corpus_df.head(2)

In [12]:
corpus_df["same_domains"].value_counts()

yes    2347120
no      829191
Name: same_domains, dtype: int64

In [13]:
# Analyze instances from different domains
corpus_df[corpus_df["same_domains"] != "yes"]["different_domains"] .value_counts()

eur-lex.europa.eu uradni-list.si                 21956
eur-lex.europa.eu europarl.europa.eu             16475
europarl.europa.eu eur-lex.europa.eu              8431
croatiabeachacc.com privatapartmajihrvaska.si     6907
predsednik.si up-rs.si                            6439
                                                 ...  
eesc.europa.eu okolje.si                             1
eesc.europa.eu nova-gorica.si                        1
eesc.europa.eu plastika-lvp.si                       1
eesc.europa.eu e-uprava.gov.si                       1
savacommission.org posavskiobzornik.si               1
Name: different_domains, Length: 118892, dtype: int64

In [None]:
# Calculate average bicleaner ai score based on the en_source
corpus_df["average_score"] = corpus_df["score_bicleaner_ai"].groupby(corpus_df['en_source']).transform('mean')

corpus_df.head(2)

In [None]:
# Add to each instance from the same en_source joint text from all sentences
corpus_df["en_doc"] = corpus_df["en_par"].groupby(corpus_df['en_source']).transform(' '.join)

corpus_df.head(2)

In [None]:
# Repeat with the Slovene text
corpus_df["sl_doc"] = corpus_df["sl_par"].groupby(corpus_df['sl_source']).transform(' '.join)
corpus_df.head(2)

In [18]:
# Keep only one example of each text
corpus_df = corpus_df.drop_duplicates("en_doc")

corpus_df.shape

(281475, 18)

In [19]:
corpus_df.describe(include="all")

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc
count,281475.0,281475,281475,281475,281475,281475,281475,281475,281475,281475,281475,281475,281475,281475,281475,281475.0,281475,281475
unique,,2,2,281475,15104,268451,4,4,232628,13766,250662,11959,17279,2,52579,,281475,230097
top,,No,sl-orig,http://15.liffe.si/?lang_chg=en,p10s0,You should consider whether you understand how...,UNK,B,https://www.realitica.com/en/listing/2447504,p26s0,"Če želite izvedeti več, lahko preberete našo p...",rumenestrani.si,rumenestrani.si,yes,rumenestrani.si rumenestrani.si,,It went out with a bang. The evening sparkled ...,"Če želite izvedeti več, lahko preberete našo p..."
freq,,245570,241480,1,12036,181,155208,129844,13,9302,101,17392,16582,191600,16494,,1,53
mean,0.841095,,,,,,,,,,,,,,,0.846546,,
std,0.143561,,,,,,,,,,,,,,,0.109657,,
min,0.5,,,,,,,,,,,,,,,0.5,,
25%,0.74,,,,,,,,,,,,,,,0.78575,,
50%,0.898,,,,,,,,,,,,,,,0.876,,
75%,0.963,,,,,,,,,,,,,,,0.93175,,


In [None]:
corpus_df.head()

In [None]:
# Add information about length
corpus_df["en_length"] = corpus_df.en_doc.str.split().str.len()

corpus_df.head(3)

In [22]:
corpus_df.en_length.describe()

count    281475.000000
mean        258.774364
std        1384.547486
min           1.000000
25%          29.000000
50%          79.000000
75%         207.000000
max      106893.000000
Name: en_length, dtype: float64

In [None]:
# Add information about Slovene length
corpus_df["sl_length"] = corpus_df.sl_doc.str.split().str.len()

corpus_df.head(3)

In [48]:
# Analyze Slovene length
corpus_df.sl_length.describe()

count    141066.000000
mean        453.143635
std        2290.479800
min           2.000000
25%          78.000000
50%         148.000000
75%         307.000000
max       98530.000000
Name: sl_length, dtype: float64

In [23]:
# Discard instances that have length less than the median - less than 79
corpus_df = corpus_df[corpus_df["en_length"] > 78]

corpus_df.shape

(141066, 19)

After discarding texts that have length less than the median - less than 79 words, the Slovene-English corpus consists of 141066 aligned texts.

In [24]:
corpus_df.columns

Index(['score_bicleaner_ai', 'biroamer_entities', 'translation_direction',
       'en_source', 'en_par_id', 'en_par', 'en_var_doc', 'en_var_dom',
       'sl_source', 'sl_par_id', 'sl_par', 'en_domain', 'sl_domain',
       'same_domains', 'different_domains', 'average_score', 'en_doc',
       'sl_doc', 'en_length'],
      dtype='object')

In [25]:
# Discard irrelevant columns
corpus_df = corpus_df.drop(columns = ['score_bicleaner_ai', 'en_par_id', 'en_par', 'sl_par_id', 'sl_par'])
corpus_df.head()

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc,en_length
2584979,No,sl-orig,http://15.liffe.si/?lang_chg=en,B,B,http://15.liffe.si/?lang_chg=sl,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...,601
1212933,No,sl-orig,http://16.liffe.si/?lang_chg=en,B,B,http://16.liffe.si/index.php?menu_item=domov,16.liffe.si,16.liffe.si,yes,16.liffe.si 16.liffe.si,0.9,Some days ago the organisers of the 17th Liffe...,Pred dnevi smo se iz 59. mednarodnega filmskeg...,293
598330,Yes,sl-orig,http://17.liffe.si/?lang_chg=en,B,B,http://17.liffe.si/?lang_chg=sl,17.liffe.si,17.liffe.si,yes,17.liffe.si 17.liffe.si,0.957875,17th LIFFe was brought to an end with the best...,S podelitvijo nagrad in predvajanjem Režiserja...,445
2796982,No,sl-orig,http://1proti1.mg-lj.si/en/concept/,UNK,UNK,http://www.mg-lj.si/si/razstave/98/vmesna-postaja,1proti1.mg-lj.si,mg-lj.si,no,1proti1.mg-lj.si mg-lj.si,0.945633,"Concept Not interested in showcasing art, 1:1 ...",VMESNO POSTAJO 1:1 organiziramo v koprodukciji...,827
642105,Yes,sl-orig,http://2006.fdf.si/?lang_chg=en,B,B,http://2006.fdf.si/?menu_item=podatki&amp;menu...,2006.fdf.si,2006.fdf.si,yes,2006.fdf.si 2006.fdf.si,0.941667,The 8th International Documentary Film Festiva...,S slovenskim filmom Poročno potovanje se je ko...,423


In [26]:
# Inspect corpus information
corpus_df.describe(include="all")

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc,en_length
count,141066,141066,141066,141066,141066,141066,141066,141066,141066,141066,141066.0,141066,141066,141066.0
unique,2,2,141066,4,4,126398,8507,11529,2,25040,,141066,126147,
top,No,sl-orig,http://15.liffe.si/?lang_chg=en,UNK,B,https://www.sofascore.com/sl/ekipa/nogomet/vik...,oblacila.si,oblacila.si,yes,oblacila.si oblacila.si,,It went out with a bang. The evening sparkled ...,"Ali se strinjate, da na vaš računalnik namesti...",
freq,122869,124561,1,55840,77117,9,3982,3952,104853,3949,,1,23,
mean,,,,,,,,,,,0.875767,,,483.234918
std,,,,,,,,,,,0.074288,,,1929.667052
min,,,,,,,,,,,0.502,,,79.0
25%,,,,,,,,,,,0.833667,,,126.0
50%,,,,,,,,,,,0.894077,,,207.0
75%,,,,,,,,,,,0.932,,,392.0


In [55]:
# Inspect en_var_doc statistics
print(corpus_df.en_var_doc.value_counts(normalize = True).to_markdown())

|     |   en_var_doc |
|:----|-------------:|
| B   |    0.390137  |
| UNK |    0.389793  |
| A   |    0.162199  |
| MIX |    0.0578715 |


In [57]:
print(corpus_df.en_var_dom.value_counts(normalize = True).to_markdown())

|     |   en_var_dom |
|:----|-------------:|
| B   |   0.539765   |
| MIX |   0.284293   |
| A   |   0.16629    |
| UNK |   0.00965161 |


In [58]:
# Inspect translation direction
print(corpus_df.translation_direction.value_counts(normalize = True).to_markdown())

|         |   translation_direction |
|:--------|------------------------:|
| sl-orig |                0.886241 |
| en-orig |                0.113759 |


In [60]:
print(corpus_df.average_score.describe().to_markdown())

|       |   average_score |
|:------|----------------:|
| count |  104853         |
| mean  |       0.886736  |
| std   |       0.0682533 |
| min   |       0.502     |
| 25%   |       0.8519    |
| 50%   |       0.904     |
| 75%   |       0.93675   |
| max   |       0.9905    |


#### Main information on the corpus

As we can see, almost all of the documents were originally written in Slovene (88%). Most of them are identified as "UNK" (40%), followed by British (39%), and much less American texts, by English variety detection on document level. On the domain level, most of them (55%) were identified to be British. Most of the texts have quality higher than 0.90 based on the bicleaner score. There are 23 Slovene duplicated texts.

In [40]:
# Analyze instances from different domains
corpus_df["same_domains"].value_counts()

yes    104853
no      36213
Name: same_domains, dtype: int64

In [41]:
corpus_df[corpus_df["same_domains"] != "yes"]["different_domains"] .value_counts()

eugo.gov.si spot.gov.si                                           366
plus500.com plus500.si                                            283
croatiabeachacc.com privatapartmajihrvaska.si                     281
hajdi.net hajdi.si                                                258
bsi.si bsi.azurewebsites.net                                      257
                                                                 ... 
etnamaar.com casmatino.si                                           1
etnologija.etnoinfolab.org anglistika.ff.uni-lj.si                  1
etnologija.etnoinfolab.org ffa.uni-lj.si                            1
etnologija.etnoinfolab.org pedagogika-andragogika.ff.uni-lj.si      1
塞尔维亚.realigro.asia oglasi.si                                        1
Name: different_domains, Length: 19060, dtype: int64

In [51]:
len(corpus_df.different_domains.unique())

25040

In [None]:
# Inspect duplicated Slovene texts
pd.set_option('display.max_colwidth', None)
duplicated = corpus_df[corpus_df.sl_doc.duplicated(keep=False)].sort_values("sl_doc")
duplicated[["sl_doc", "en_doc", "average_score"]]

In [50]:
# Inspect a random sample
pd.set_option('display.max_colwidth', 300)
corpus_df.sample(n=20)

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc,en_length,sl_length
1548798,Yes,sl-orig,https://www.bled.si/en/what-to-see-do/cuisine/2019100208524126/educational-beehive,B,B,https://www.bled.si/sl/kaj-videti-poceti/kulinarika/2019100208285395/ucni-cebelnjak,bled.si,bled.si,yes,bled.si bled.si,0.902182,"Danijela and Blaž Ambrožič, who have been in the beekeeping tourism business since 2011, made it possible to observe the life of the Grey bee of Carniola (""kranjska sivka"") in the beehive from a safe distance in their village of Selo pri Bledu. Around 30 bee colonies live in the educational beeh...","Učni čebelnjak Da je življenje kranjske sivke v čebelnjaku mogoče opazovati na varnem mestu, sta na Selu pri Bledu poskrbela Danijela in Blaž Ambrožič, ki se s čebelarskim turizmom ukvarjata od leta 2011. V učnem čebelnjaku živi okoli 30 čebeljih družin, čebelnjak pa ima ročno poslikane panjske ...",245,133
84755,Yes,sl-orig,http://pefprints.pef.uni-lj.si/469/,A,B,http://pefprints.pef.uni-lj.si/450/,pefprints.pef.uni-lj.si,pefprints.pef.uni-lj.si,yes,pefprints.pef.uni-lj.si pefprints.pef.uni-lj.si,0.8105,"This paper presents the results, gathered in Slovenian kindergartens participating in the project ""Professional training of professional staff to implement specific elements of teaching the principles of Reggio Emilia concept in pre-school."" This paper presents the results, gathered in Slovenian...",V raziskavi so sodelovali vzgojitelji prvega kroga izobraževanja »Profesionalno usposabljanje strokovnih delavcev za izvajanje elementov posebnih pedagoških načel koncepta Reggio Emilia na področju predšolske vzgoje v letih 2008– 2013« na Pedagoški fakulteti v Ljubljani.,94,34
2562108,No,sl-orig,http://newsletter.etwinning.net/en/pub/2013/etnl_062013.htm,B,B,http://newsletter.etwinning.net/sl/pub/2013/etnl_062013.htm,newsletter.etwinning.net,newsletter.etwinning.net,yes,newsletter.etwinning.net newsletter.etwinning.net,0.958364,"After a careful analysis of pedagogical methods used by each of the partners we want to understand children's needs and difficulties in mathematics, make some special activity and develop new teaching practices. Many eTwinning countries have seen the development of Ambassador networks – but what...","Po analizi pedagoških metod vseh partnerjev, bomo vzeli pod drobnogled potrebe učencev in njihove težave pri učenju matematike, pričeli bomo s posebnimi aktivnostmi in razvili nove učne prakse. Mreža ambasadorjev eTwinning je vzpostavljena v večini držav, ki sodelujejo v akciji. Kdo pa pravzapra...",240,212
348403,No,sl-orig,https://oblacila.si/under-armour-under-armour-tricko-performance-polo-2.0-blu__7wyPxjjEtG,A,MIX,https://oblacila.si/siksilk-siksilk-majica-mornarska__El84a70qEH,oblacila.si,oblacila.si,yes,oblacila.si oblacila.si,0.824923,The shirt then also features short sleeves for a classic look and is produced in a lightweight construction with Climalite technology that quickly wicks sweats away from your body. The shirt then also features short sleeves for a classic look and is produced in a lightweight construction with Cl...,"Obdelava Microban® učinkovito preprečuje neprijeten vonj. Majica je kot prvi sloj namenjena različnim športnim aktivnostim v hladnejšem vremenu, še posebej dobro pa se izkaže pri aktivnostih, kjer se izdatno potimo. Izdelana je iz mehkega mrežastega materiala, ki po zaslugi tehnologije Dri-FIT o...",205,55
2031466,No,sl-orig,http://eugo.gov.si/en/activities/activity/14091/showActivity/,B,B,https://spot.gov.si/sl/dejavnosti-in-poklici/dejavnosti-skd/obdelava-semen/,eugo.gov.si,spot.gov.si,no,eugo.gov.si spot.gov.si,0.947,"This subclass includes activities related to the improvement of the quality of seed units by means of cleaning, removal of unwanted impurities, damaged or small seeds as well as by drying seeds for storage. The production, preparation for the market, import and/or marketing of agricultural seeds...","V ta podrazred spadajo dejavnosti v zvezi z izboljševanjem kakovosti semenskega materiala s čiščenjem, odstranjevanjem neželenih primesi, poškodovanih ali premajhnih semen ter s sušenjem semen za skladiščenje.",98,27
609292,No,sl-orig,https://www.pokerharder.com/en-gb/learn-poker/variants/,MIX,MIX,https://stave-online.com/video-poker-igre/,pokerharder.com,stave-online.com,no,pokerharder.com stave-online.com,0.7335,"Players receive whole poker hands and then have the option to swap any of their cards for new ones from the rest of the deck, this action is why the games are known as ‘draw’ poker. Poker Game Variants Stud poker is not classified as a single set game, like Texas Hold’em or Omaha but rather a gr...","Če sta tudi po zadnjem krogu stav vsaj dva igralca še vedno v igri, morata oba na koncu razkriti karte. Če nimate nobene kombinacije, zmaga igralec z najvišjo karto. Če sta tudi po zadnjem krogu stav vsaj dva igralca še vedno v igri, morata oba na koncu razkriti karte. Čeprav velja Texas Hold’em...",112,132
1805773,No,sl-orig,http://visitcerkno.si/en/boulders-on-the-crni-vrh-hill-above-cerkno/,A,A,http://visitcerkno.si/balvani-crni-vrh-nad-cerknim/,visitcerkno.si,visitcerkno.si,yes,visitcerkno.si visitcerkno.si,0.874625,"There are many different types of sport climbing and one of them is boulder climbing – climbing on boulders and lower parts of walls. The boulders on the Črni Vrh hill above Cerkno lie in a mixed forest at the altitude of 1,000 m, which is why summer is the best time for a visit, but it is possi...","Obstaja več različnih zvrsti športnega plezanja, eno izmed njih je balvansko plezanje – plezanje na balvanih in spodnjih delih sten. Balvani v Črnem vrhu nad Cerknim ležijo v mešanem gozdu na nadmorski višini 1.000 m, zato je poletje najprimernejši čas za obisk. Možno je plezanje preko celega le...",160,89
1737177,No,sl-orig,https://www.nuk.uni-lj.si/eng/node/489,UNK,B,https://www.nuk.uni-lj.si/nuk/organizacija/zbirka-rokopisov,nuk.uni-lj.si,nuk.uni-lj.si,yes,nuk.uni-lj.si nuk.uni-lj.si,0.83,"They include 122 manuscript items, mainly codices and individual fragments of medieval manuscripts. The largest and most comprehensive is the collection of Latin manuscripts from the 9th to the end of the 15th century, which contains 91 items. Besides some bound volumes of manuscripts from the p...","Najobsežnejša je zbirka latinskih rokopisov od 9. pa do konca 15. stoletja, izredno pomembni so tudi srednjeveški rokopisi iz zbirk slavista Jerneja Kopitarja ter barona Žige Zoisa, napisani v cirilici in glagolici. Fond novoveških rokopisov sestavljajo poleg vezanih rokopisov iz obdobja humaniz...",104,74
1993535,No,sl-orig,https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=LEGISSUM:l28056&amp;from=HR,B,B,https://www.mrezaprostor.si/aktualno/pripomocki-za-sodelovanje-javnosti-pri-urejanju-prostora/,eur-lex.europa.eu,mrezaprostor.si,no,eur-lex.europa.eu mrezaprostor.si,0.758364,"to enable public officials and authorities to help and advise the public on access to information, participation in decision-making and access to justice; to enable public officials and authorities to help and advise the public on access to information, participation in decision-making and acces...","Konvencija o dostopu do informacij, udeležbi javnosti pri odločanju in dostopu do pravnega varstva v okoljskih zadevah (Aarhuška konvencija) se sicer nanaša na varstvo okolja.",625,25
930061,No,sl-orig,https://www.agencia.si/en/print/243524-oddaja-poslovni-prostor-skladisce-obalno-kraska-sezana-storje,UNK,A,https://www.gbnepremicnine.si/nepremicnine/stanovanje.v1/trisobno.t7/Ljubljana.r1/_850eur.q/_900eur.w/p1.html,agencia.si,gbnepremicnine.si,no,agencia.si gbnepremicnine.si,0.8355,"Costs are not included in the rent. Rental conditions: monthly rent + income tax deduction + security deposit two rents, as a condition for signing the rental agreement, the lessee has to pay to the real estate agency the following costs: drafting of the rental agreement, preparation of hand ove...",Stroški niso vključeni.,139,3


In [52]:
# Discard instances document that are from different domains
corpus_df = corpus_df[corpus_df["same_domains"] == "yes"]
corpus_df.describe(include="all")

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc,en_length,sl_length
count,104853,104853,104853,104853,104853,104853,104853,104853,104853,104853,104853.0,104853,104853,104853.0,104853.0
unique,2,2,104853,4,4,95040,5980,5980,1,5980,,104853,94869,,
top,No,sl-orig,http://15.liffe.si/?lang_chg=en,B,B,https://www.sofascore.com/sl/ekipa/nogomet/viktoria-plzen/4502,oblacila.si,oblacila.si,yes,oblacila.si oblacila.si,,"It went out with a bang. The evening sparkled with glitter and gold and Gallus Hall hosted yet another closing and awards ceremony of the jubilee Ljubljana International Film Festival. Dashing in their black ties and led by eminent conductor Helmut Imig, the RTV Slovenia Symphony Orchestra perfo...","Ali se strinjate, da na vaš računalnik namestimo piškotke in omogočimo pravilno delovanje strani?",,
freq,91558,92925,1,40907,56596,9,3949,3949,104853,3949,,1,23,,
mean,,,,,,,,,,,0.886736,,,487.442829,495.742315
std,,,,,,,,,,,0.068253,,,2058.219685,2462.451608
min,,,,,,,,,,,0.502,,,79.0,2.0
25%,,,,,,,,,,,0.8519,,,128.0,92.0
50%,,,,,,,,,,,0.904,,,210.0,166.0
75%,,,,,,,,,,,0.93675,,,392.0,333.0


In [53]:
# Save the dataframe to csv
corpus_df.to_csv("Macocu-sl-en-doc-format.csv", sep= "\t")