In [1]:
import pandas as pd
import numpy as np
import regex as re
import json

## Preparation of document format from sentence format

In [2]:
# Import json
with open("Macocu-sl-en.json", "r") as file:
	tus_content = json.load(file)

In [3]:
# Convert data to a dataframe

corpus_df = pd.DataFrame(tus_content)

corpus_df.head()

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par
0,0.95,No,sl-orig,http://www.damapapir.si/en/ecology,p10s2,This significantly contributes to a clean and ...,B,MIX,http://www.damapapir.si/sl/ekologija,p10s2,S tem bomo preprečili onesnaževanje narave in ...
1,0.929,No,en-orig,https://www.mesimedical.com/mesi-mtablet/mesi-...,p38s0,MESI mTABLET TBI is not only an advanced Toe-B...,UNK,B,https://www.mesimedical.com/sl/mesi-mtablet/me...,p31s0,MESI mTABLET TBI ni samo napredna naprava za m...
2,0.95,No,sl-orig,https://www.uirs.si/en-us/About-us/Mission,p33s0,UIRS has a Research Programme Group which is c...,B,MIX,https://www.uirs.si/Poslanstvo,p33s0,"Na UIRS deluje programska skupina, ki je s str..."
3,0.984,No,sl-orig,https://www.visitkras.info/en/divaca-karst-nat...,p32s1,"You can start the trail at different points, a...",B,B,https://www.visitkras.info/krasoslovna-naravos...,p32s1,"Pot, ki jo lahko začnete na različnih točkah, ..."
4,0.886,No,sl-orig,https://fotoformat.si/en/dslr-cameras/7226-nik...,p137s0,"-3 to +1 EV in steps of 1/3 or 1/2 EV, in P, S...",A,A,https://www.pikto.si/proizvodi/13860/nikon-dig...,p101s0,Lahko jo prilagodite od –3 do +1 EV v korakih ...


In [4]:
# Analyze distribution
corpus_df.describe(include="all")

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par
count,3176311.0,3176311,3176311,3176311,3176311,3176311,3176311,3176311,3176311,3176311,3176311
unique,,2,2,285892,93755,2379921,4,4,456366,86818,2542560
top,,No,sl-orig,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,You should consider whether you understand how...,B,B,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,Izdelana je iz prijetne mešanice naravnega in ...
freq,,2897662,2812317,5112,36423,745,1640246,1967189,4294,33594,207
mean,0.880997,,,,,,,,,,
std,0.1299268,,,,,,,,,,
min,0.5,,,,,,,,,,
25%,0.825,,,,,,,,,,
50%,0.944,,,,,,,,,,
75%,0.976,,,,,,,,,,


#### Bi-Cleaner Scores on sentence level

As we can see from the column on score_bicleaner_ai, it seems that there are no sentences with score lower than 0.5 - they must have been already filtered out? (?). We can also see, that was majority of sentences were originally written in Slovene (based on translation_direction), and that 700 English sentences and 200 Slovene sentences are repeated which could indicate some errors.

In [5]:
# Sort by english url and then by en_par_id to order the paragraphs into texts
corpus_df = corpus_df.sort_values(by = ["en_source", "en_par_id"])

corpus_df

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...
1756060,0.908,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p44s0,Dashing in their black ties and led by eminent...,B,B,http://15.liffe.si/?lang_chg=sl,p44s0,Simfonični orkester RTV Slovenija je eleganten...
3012889,0.969,No,sl-orig,http://15.liffe.si/?lang_chg=en,p45s0,But the film crowd gathered with but one purpo...,B,B,http://15.liffe.si/?lang_chg=sl,p45s0,Filmski navdušenci pa so se zbrali le z enim n...
2928121,0.973,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p45s1,"First the FIPRESCI jury, of the International ...",B,B,http://15.liffe.si/?lang_chg=sl,p45s1,Najprej je žirija mednarodnega združenja films...
...,...,...,...,...,...,...,...,...,...,...,...
3139299,0.934,No,sl-orig,https://塞尔维亚.realigro.asia/%e5%87%ba%e5%94%ae/...,p105s1,The apartment consists of a living room with a...,UNK,A,https://www.nepremicnine.net/oglasi-prodaja/ko...,p95s0,"Stanovanje obsega: predprostor (hodnik), dnevn..."
642519,0.911,No,sl-orig,https://巴西.realigro.asia/%e5%87%ba%e7%a7%9f/%e...,p67s2,Are you in need of private or business finance...,UNK,UNK,https://www.knjiznica-velenje.si/events/kaj-mo...,p0s0,Kaj moramo vedeti o osebnih in poslovnih finan...
263106,0.718,No,sl-orig,https://罗马尼亚.realigro.asia/%e5%87%ba%e5%94%ae/...,p76s1,Price 2400 euros per hectare.,UNK,UNK,https://kristijanhrastar.kmeckiglas.com/post/2...,p33s2,"Povprečni pridelek naj bi bil 5,9 tone na hektar."
485460,0.744,No,sl-orig,https://西班牙.realigro.asia/%e5%87%ba%e5%94%ae/%...,p103s0,"ID:R-1470 Two rooms, close to the sea Two bedr...",UNK,UNK,http://www.oglasi.si/oglas_nepremicnine/apartm...,p33s0,"Apartma se nahaja v okolici Krškega, 50 m do a..."


In [6]:
# Add information about domains
domain_re=re.compile(r'^https?://(?:www\.)?(.+?)[/$]')

en_domain_list = [domain_re.search(i).group(1) for i in corpus_df.en_source.to_list()]

en_domain_list[:3]

['15.liffe.si', '15.liffe.si', '15.liffe.si']

In [7]:
corpus_df["en_domain"] = en_domain_list

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si


In [8]:
# Repeat with Slovene domain
sl_domain_list = [domain_re.search(i).group(1) for i in corpus_df.sl_source.to_list()]
corpus_df["sl_domain"] = sl_domain_list

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si,15.liffe.si


In [9]:
# Add information whether the domains are the same
corpus_df["same_domains"] = np.where(corpus_df["en_domain"] == corpus_df["sl_domain"], "yes", 'no')

# Add column for domains that are different
corpus_df["different_domains"] = corpus_df["en_domain"] + " " + corpus_df["sl_domain"]

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si


In [10]:
corpus_df["same_domains"].value_counts()

yes    2347120
no      829191
Name: same_domains, dtype: int64

In [11]:
# Analyze instances from different domains
corpus_df[corpus_df["same_domains"] != "yes"]["different_domains"] .value_counts()

eur-lex.europa.eu uradni-list.si                 21956
eur-lex.europa.eu europarl.europa.eu             16475
europarl.europa.eu eur-lex.europa.eu              8431
croatiabeachacc.com privatapartmajihrvaska.si     6907
predsednik.si up-rs.si                            6439
                                                 ...  
eesc.europa.eu okolje.si                             1
eesc.europa.eu nova-gorica.si                        1
eesc.europa.eu plastika-lvp.si                       1
eesc.europa.eu e-uprava.gov.si                       1
savacommission.org posavskiobzornik.si               1
Name: different_domains, Length: 118892, dtype: int64

In [12]:
# Discard instances that are from different domains
corpus_df = corpus_df[corpus_df["same_domains"] == "yes"]
corpus_df.describe(include="all")

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains
count,2347120.0,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120,2347120
unique,,2,2,210217,75347,1945904,4,4,258401,70296,1933838,7301,7301,1,7301
top,,No,sl-orig,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,Manager of personal data: Slovenian Tourist Bo...,B,B,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,Izdelana je iz prijetne mešanice naravnega in ...,eur-lex.europa.eu,eur-lex.europa.eu,yes,eur-lex.europa.eu eur-lex.europa.eu
freq,,2144059,2091190,4719,31662,491,1267294,1484562,4083,29002,207,387442,387442,2347120,387442
mean,0.8977433,,,,,,,,,,,,,,
std,0.1197017,,,,,,,,,,,,,,
min,0.5,,,,,,,,,,,,,,
25%,0.865,,,,,,,,,,,,,,
50%,0.955,,,,,,,,,,,,,,
75%,0.978,,,,,,,,,,,,,,


In [13]:
# Calculate average bicleaner ai score based on the en_source
corpus_df["average_score"] = corpus_df["score_bicleaner_ai"].groupby(corpus_df['en_source']).transform('mean')

corpus_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df["average_score"] = corpus_df["score_bicleaner_ai"].groupby(corpus_df['en_source']).transform('mean')


Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808


In [14]:
# Join par id and text
corpus_df["en-par-text"] = corpus_df["en_par_id"] + "-" + corpus_df["en_par"]
corpus_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df["en-par-text"] = corpus_df["en_par_id"] + "-" + corpus_df["en_par"]


Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s0-It went out with a bang.
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s1-The evening sparkled with glitter and go...


In [15]:
# Find duplicated English sentences that have the same par id
corpus_df[corpus_df["en-par-text"].duplicated(keep=False)]

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text
62307,0.794,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p56s0,"Jelka Stergel, Liffe Festival Director, attend...",B,B,http://15.liffe.si/?menu_item=podatki&amp;menu...,p50s1,Srečala se je z direktorico festivala na Portu...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,"p56s0-Jelka Stergel, Liffe Festival Director, ..."
831882,0.856,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p56s0,"Jelka Stergel, Liffe Festival Director, attend...",B,B,http://15.liffe.si/?lang_chg=sl,p56s0+p57s0+p57s1,Aktualno Tudi Jelka Stergel je bila na festiva...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,"p56s0-Jelka Stergel, Liffe Festival Director, ..."
2185018,0.546,No,sl-orig,http://2007-2013.ita-slo.eu/,p35s2,CALL FOR INDEPENDENT EXPERTS FOR THE QUALITY A...,UNK,B,http://2007-2013.ita-slo.eu/novice_in_informac...,p30s0,Javni Poziv za oblikovanje seznama neodvisnih ...,2007-2013.ita-slo.eu,2007-2013.ita-slo.eu,yes,2007-2013.ita-slo.eu 2007-2013.ita-slo.eu,0.770333,p35s2-CALL FOR INDEPENDENT EXPERTS FOR THE QUA...
2623298,0.975,No,sl-orig,http://2007-2013.ita-slo.eu/,p35s2,CALL FOR INDEPENDENT EXPERTS FOR THE QUALITY A...,UNK,B,http://2007-2013.ita-slo.eu/novice_in_informac...,p30s2,JAVNI POZIV ZA IZBOR NEODVISNIH STROKOVNJAKOV ...,2007-2013.ita-slo.eu,2007-2013.ita-slo.eu,yes,2007-2013.ita-slo.eu 2007-2013.ita-slo.eu,0.770333,p35s2-CALL FOR INDEPENDENT EXPERTS FOR THE QUA...
918963,0.781,No,sl-orig,http://2009-2010.isak.si/eng/index.php,p0s0,A project for promoting African culture with a...,B,B,http://2009-2010.isak.si/aktualno-novica.php?id=5,p10s1,Projekt ISAK je projekt širjenja afriške kultu...,2009-2010.isak.si,2009-2010.isak.si,yes,2009-2010.isak.si 2009-2010.isak.si,0.848600,p0s0-A project for promoting African culture w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381508,0.984,No,en-orig,https://zwave.si/index.php?_route_=FIB_FGD_212,p455s1,Each Z-Wave module functions as a wireless rep...,UNK,A,https://zwave.si/index.php?_route_=Aeotec/AEO-...,p442s1,Vsak Z-Wave modul deluje kot brezžični repetit...,zwave.si,zwave.si,yes,zwave.si zwave.si,0.975000,p455s1-Each Z-Wave module functions as a wirel...
2924267,0.956,No,en-orig,https://zwave.si/index.php?_route_=FibaroWallP...,p443s1,Each module Z-Wave operates as a wireless repe...,A,A,https://zwave.si/index.php?_route_=Philio-Tech...,p439s5,Ta Z- Wave modul deluje kot brezžični repetito...,zwave.si,zwave.si,yes,zwave.si zwave.si,0.970000,p443s1-Each module Z-Wave operates as a wirele...
3149698,0.984,No,en-orig,https://zwave.si/index.php?_route_=FibaroWallP...,p443s1,Each module Z-Wave operates as a wireless repe...,A,A,https://zwave.si/index.php?_route_=sensors/pow...,p442s1,Vsak Z-Wave modul deluje kot brezžični repetit...,zwave.si,zwave.si,yes,zwave.si zwave.si,0.970000,p443s1-Each module Z-Wave operates as a wirele...
151207,0.926,No,en-orig,https://zwave.si/index.php?_route_=all/AllRF43...,p440s0,This interface will increase the ability to tr...,UNK,A,https://zwave.si/index.php?_route_=newsletter0...,p443s1,Ta vmesnik bo povečal zmožnost oddajanja in sp...,zwave.si,zwave.si,yes,zwave.si zwave.si,0.895250,p440s0-This interface will increase the abilit...


In [16]:
# Discard all duplicated English paragraphs with the same par id - this could destroy structure of some of the Slovene texts, but we are interested only in English texts.
corpus_df = corpus_df.drop_duplicates("en-par-text")

In [17]:
corpus_df.describe(include="all")

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text
count,2047953.0,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953,2047953.0,2047953
unique,,2,2,207525,75347,1945904,4,4,231434,67428,1823951,7301,7301,1,7301,,2047953
top,,No,sl-orig,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,The sweatshirt is made of high quality material.,B,B,http://nl.ijs.si/ME/Vault/V3/htm/mte-cesdoc-mt...,p26s0,Izdelane so iz trpežnega in rahlo raztegljiveg...,eur-lex.europa.eu,eur-lex.europa.eu,yes,eur-lex.europa.eu eur-lex.europa.eu,,p43s0-It went out with a bang.
freq,,1872834,1818807,4276,15844,116,1139607,1324035,3912,15665,138,339909,339909,2047953,339909,,1
mean,0.905268,,,,,,,,,,,,,,,0.9045701,
std,0.1151669,,,,,,,,,,,,,,,0.05848313,
min,0.5,,,,,,,,,,,,,,,0.5,
25%,0.883,,,,,,,,,,,,,,,0.8879344,
50%,0.96,,,,,,,,,,,,,,,0.9206226,
75%,0.979,,,,,,,,,,,,,,,0.9406818,


In [19]:
# Add to each instance from the same en_source joint text from all sentences
corpus_df["en_doc"] = corpus_df["en_par"].groupby(corpus_df['en_source']).transform(' '.join)

corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text,en_doc
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s0-It went out with a bang.,It went out with a bang. The evening sparkled ...
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s1-The evening sparkled with glitter and go...,It went out with a bang. The evening sparkled ...


In [20]:
# Repeat with the Slovene text
corpus_df["sl_doc"] = corpus_df["sl_par"].groupby(corpus_df['sl_source']).transform(' '.join)
corpus_df.head(2)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text,en_doc,sl_doc
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s0-It went out with a bang.,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...
3046105,0.974,Yes,sl-orig,http://15.liffe.si/?lang_chg=en,p43s1,The evening sparkled with glitter and gold and...,B,B,http://15.liffe.si/?lang_chg=sl,p43s1,Večer je bleščavo mežikal in Gallusova dvorana...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s1-The evening sparkled with glitter and go...,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...


In [21]:
# Keep only one example of each text
corpus_df = corpus_df.drop_duplicates("en_doc")

corpus_df.shape

(205785, 19)

In [22]:
corpus_df.describe(include="all")

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text,en_doc,sl_doc
count,205785.0,205785,205785,205785,205785,205785,205785,205785,205785,205785,205785,205785,205785,205785,205785,205785.0,205785,205785,205785
unique,,2,2,205785,12093,200156,4,4,170244,10991,183477,7300,7300,1,7300,,205785,205785,168462
top,,No,sl-orig,http://15.liffe.si/?lang_chg=en,p26s0,The material perfectly diverts sweat and it wi...,UNK,B,https://www.realitica.com/en/listing/2447504,p26s0,Nahajate se na arhivskem spletnem mestu Festiv...,rumenestrani.si,rumenestrani.si,yes,rumenestrani.si rumenestrani.si,,p43s0-It went out with a bang.,It went out with a bang. The evening sparkled ...,Deloglasnik.si je danes med najbolj obiskanimi...
freq,,179191,178411,1,10082,19,109044,97925,13,8692,84,16674,16674,205785,16674,,1,1,44
mean,0.865134,,,,,,,,,,,,,,,0.870281,,,
std,0.13449,,,,,,,,,,,,,,,0.0995,,,
min,0.5,,,,,,,,,,,,,,,0.5,,,
25%,0.792,,,,,,,,,,,,,,,0.823,,,
50%,0.926,,,,,,,,,,,,,,,0.900833,,,
75%,0.97,,,,,,,,,,,,,,,0.9438,,,


In [23]:
# Add information about length
corpus_df["en_length"] = corpus_df.en_doc.str.split().str.len()

corpus_df.head(3)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,sl_par,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text,en_doc,sl_doc,en_length
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,Končalo se je razburljivo in z razkošjem.,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s0-It went out with a bang.,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...,574
1212933,0.899,No,sl-orig,http://16.liffe.si/?lang_chg=en,p39s0,Some days ago the organisers of the 17th Liffe...,B,B,http://16.liffe.si/index.php?menu_item=domov,p39s0+p39s1,Pred dnevi smo se iz 59. mednarodnega filmskeg...,16.liffe.si,16.liffe.si,yes,16.liffe.si 16.liffe.si,0.9,p39s0-Some days ago the organisers of the 17th...,Some days ago the organisers of the 17th Liffe...,Pred dnevi smo se iz 59. mednarodnega filmskeg...,293
598330,0.976,Yes,sl-orig,http://17.liffe.si/?lang_chg=en,p2s0,17th LIFFe was brought to an end with the best...,B,B,http://17.liffe.si/?lang_chg=sl,p2s0,S podelitvijo nagrad in predvajanjem Režiserja...,17.liffe.si,17.liffe.si,yes,17.liffe.si 17.liffe.si,0.957875,p2s0-17th LIFFe was brought to an end with the...,17th LIFFe was brought to an end with the best...,S podelitvijo nagrad in predvajanjem Režiserja...,445


In [34]:
corpus_df.en_length.describe()

count    205785.000000
mean        228.741604
std        1208.042942
min           1.000000
25%          27.000000
50%          75.000000
75%         189.000000
max       98761.000000
Name: en_length, dtype: float64

In [35]:
# Add information about Slovene length
corpus_df["sl_length"] = corpus_df.sl_doc.str.split().str.len()

corpus_df.head(3)

Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,sl_source,sl_par_id,...,en_domain,sl_domain,same_domains,different_domains,average_score,en-par-text,en_doc,sl_doc,en_length,sl_length
2584979,0.947,No,sl-orig,http://15.liffe.si/?lang_chg=en,p43s0,It went out with a bang.,B,B,http://15.liffe.si/?lang_chg=sl,p43s0,...,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,p43s0-It went out with a bang.,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...,574,463
1212933,0.899,No,sl-orig,http://16.liffe.si/?lang_chg=en,p39s0,Some days ago the organisers of the 17th Liffe...,B,B,http://16.liffe.si/index.php?menu_item=domov,p39s0+p39s1,...,16.liffe.si,16.liffe.si,yes,16.liffe.si 16.liffe.si,0.9,p39s0-Some days ago the organisers of the 17th...,Some days ago the organisers of the 17th Liffe...,Pred dnevi smo se iz 59. mednarodnega filmskeg...,293,184
598330,0.976,Yes,sl-orig,http://17.liffe.si/?lang_chg=en,p2s0,17th LIFFe was brought to an end with the best...,B,B,http://17.liffe.si/?lang_chg=sl,p2s0,...,17.liffe.si,17.liffe.si,yes,17.liffe.si 17.liffe.si,0.957875,p2s0-17th LIFFe was brought to an end with the...,17th LIFFe was brought to an end with the best...,S podelitvijo nagrad in predvajanjem Režiserja...,445,418


In [None]:
# Analyze Slovene length
corpus_df.sl_length.describe()

In [36]:
# Discard instances that have length less than the median - less than 75
corpus_df = corpus_df[corpus_df["en_length"] > 74]

corpus_df.shape

(103281, 21)

In [37]:
corpus_df.columns

Index(['score_bicleaner_ai', 'biroamer_entities', 'translation_direction',
       'en_source', 'en_par_id', 'en_par', 'en_var_doc', 'en_var_dom',
       'sl_source', 'sl_par_id', 'sl_par', 'en_domain', 'sl_domain',
       'same_domains', 'different_domains', 'average_score', 'en-par-text',
       'en_doc', 'sl_doc', 'en_length', 'sl_length'],
      dtype='object')

In [None]:
# Discard irrelevant columns
corpus_df = corpus_df.drop(columns = ['score_bicleaner_ai', 'en_par_id', 'en_par', 'sl_par_id', 'sl_par', 'en-par-text', 'same_domains', 'different_domains'])

In [None]:
# Save the dataframe to csv
corpus_df.to_csv("Macocu-sl-en-doc-format.csv", sep= "\t")

## Analysis of prepared corpus

After this preparation, I added additional pre-processing based on the ratio of punctuation per words - see *2.1-Filtering-non-textual.ipynb*. We will now analyse the final file where non-textual texts were removed based on this ratio.

In [2]:
# Import filtered corpus
corpus_df = pd.read_csv("Macocu-sl-en-doc-format-filtered.csv", sep = "\t", index_col = 0)

corpus_df.head()

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,average_score,en_doc,sl_doc,en_length,sl_length,punct_ratio
2584979,No,sl-orig,http://15.liffe.si/?lang_chg=en,B,B,http://15.liffe.si/?lang_chg=sl,15.liffe.si,15.liffe.si,0.936808,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...,574,463,0.103501
1212933,No,sl-orig,http://16.liffe.si/?lang_chg=en,B,B,http://16.liffe.si/index.php?menu_item=domov,16.liffe.si,16.liffe.si,0.9,Some days ago the organisers of the 17th Liffe...,Pred dnevi smo se iz 59. mednarodnega filmskeg...,293,184,0.07622
598330,Yes,sl-orig,http://17.liffe.si/?lang_chg=en,B,B,http://17.liffe.si/?lang_chg=sl,17.liffe.si,17.liffe.si,0.957875,17th LIFFe was brought to an end with the best...,S podelitvijo nagrad in predvajanjem Režiserja...,445,418,0.07393
374561,No,sl-orig,http://1proti1.mg-lj.si/en/concept/,UNK,UNK,http://1proti1.mg-lj.si/koncept/,1proti1.mg-lj.si,1proti1.mg-lj.si,0.947393,"Not interested in showcasing art, 1:1 would be...",Koncept 1:1 noče biti razstava v smislu izložb...,769,650,0.115178
642105,Yes,sl-orig,http://2006.fdf.si/?lang_chg=en,B,B,http://2006.fdf.si/?menu_item=podatki&amp;menu...,2006.fdf.si,2006.fdf.si,0.941667,The 8th International Documentary Film Festiva...,S slovenskim filmom Poročno potovanje se je ko...,423,13,0.084567


In [3]:
# Inspect corpus information
corpus_df.describe(include="all")

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,average_score,en_doc,sl_doc,en_length,sl_length,punct_ratio
count,101807,101807,101807,101807,101807,101807,101807,101807,101807.0,101807,101807,101807.0,101807.0,101807.0
unique,2,2,101807,4,4,92708,6066,6066,,101807,92544,,,
top,No,sl-orig,http://15.liffe.si/?lang_chg=en,B,B,https://www.sofascore.com/sl/ekipa/nogomet/vik...,oblacila.si,oblacila.si,,It went out with a bang. The evening sparkled ...,"Ali se strinjate, da na vaš računalnik namesti...",,,
freq,89024,90537,1,42890,57737,9,3600,3600,,1,30,,,
mean,,,,,,,,,0.897452,,,428.811084,495.158761,0.092997
std,,,,,,,,,0.063443,,,1694.062268,2320.090506,0.027555
min,,,,,,,,,0.502,,,75.0,2.0,0.015
25%,,,,,,,,,0.868429,,,119.0,93.0,0.07483
50%,,,,,,,,,0.913667,,,190.0,165.0,0.089552
75%,,,,,,,,,0.942684,,,346.0,324.0,0.106952


In [6]:
# Inspect en_var_doc statistics
print(corpus_df.en_var_doc.value_counts(normalize = True).to_markdown())

|     |   en_var_doc |
|:----|-------------:|
| B   |    0.421287  |
| UNK |    0.351813  |
| A   |    0.165755  |
| MIX |    0.0611451 |


In [7]:
print(corpus_df.en_var_dom.value_counts(normalize = True).to_markdown())

|     |   en_var_dom |
|:----|-------------:|
| B   |   0.567122   |
| MIX |   0.281886   |
| A   |   0.140992   |
| UNK |   0.00999931 |


In [8]:
# Inspect translation direction
print(corpus_df.translation_direction.value_counts(normalize = True).to_markdown())

|         |   translation_direction |
|:--------|------------------------:|
| sl-orig |                  0.8893 |
| en-orig |                  0.1107 |


In [9]:
print(corpus_df.average_score.describe().to_markdown())

|       |   average_score |
|:------|----------------:|
| count |  101807         |
| mean  |       0.897452  |
| std   |       0.0634431 |
| min   |       0.502     |
| 25%   |       0.868429  |
| 50%   |       0.913667  |
| 75%   |       0.942684  |
| max   |       0.9905    |


In [11]:
corpus_df.en_length.describe()

count    101807.000000
mean        428.811084
std        1694.062268
min          75.000000
25%         119.000000
50%         190.000000
75%         346.000000
max       98761.000000
Name: en_length, dtype: float64

In [None]:
# Inspect duplicated Slovene texts
pd.set_option('display.max_colwidth', None)
duplicated = corpus_df[corpus_df.sl_doc.duplicated(keep=False)].sort_values("sl_doc")
duplicated[["sl_doc", "en_doc", "average_score"]]

In [12]:
# Inspect whether this solved any issues with non-textual texts
non_textual_index = [1887229, 798879, 5579561,  150635, 2050301,  472170,  249792, 65284,  477792,  282858, 1738274, 2754062,   84675]

discarded_counter = 0

remaining_index = []

for i in non_textual_index:
	try:
		print(corpus_df.loc[i].en_doc)
		remaining_index.append(i)

	except KeyError:
		print(f"{i} not in final corpus.")
		discarded_counter += 1

print(discarded_counter)
print(remaining_index)

1887229 not in final corpus.
798879 not in final corpus.
5579561 not in final corpus.
Dejan Habicht - Zvočni film / Sound Movie (00:05:22) Subtitle: 12 dolgočasnih pesmi Production: Center in Galerija P74, 2007 The sound of an airplane over a forest. Dejan Habicht - Nevarni vzorci Subtitle: 12 dolgočasnih pesmi Production: Center in Galerija P74, 2006 Dangerous Patterns is an electronic book, which can be printed at home (557KB). Dejan Habicht - die Heißeste Nummer Subtitle: 12 dolgočasnih pesmi Production: Center in Galerija P74, 2004/06 die Heißeste Nummer / The Hottest Number is an e-book that one can print at home. Saša Spačal - 7K: new life form (00:10:13) Production: Saša Spačal, Muzej sodobne umetnosti Metelkova - MSUM+, 2016 The video conceptually reflects on 7K: new life form, an artwork by Saša Spačal.
2050301 not in final corpus.
472170 not in final corpus.
249792 not in final corpus.
65284 not in final corpus.
477792 not in final corpus.
282858 not in final corpus.
1738274 

In [24]:
from itertools import islice

def chunk(arr_range, arr_size):
	arr_range = iter(arr_range)
	return iter(lambda: tuple(islice(arr_range, arr_size)), ())

batches_list = list(chunk(corpus_df.en_source[:10], 3))

for i in batches_list:
	print(list(i))

['http://15.liffe.si/?lang_chg=en', 'http://16.liffe.si/?lang_chg=en', 'http://17.liffe.si/?lang_chg=en']
['http://1proti1.mg-lj.si/en/concept/', 'http://2006.fdf.si/?lang_chg=en', 'http://2007-2013.ita-slo.eu/news/contacts/']
['http://2007-2013.ita-slo.eu/programme/basic_information/', 'http://2007-2013.ita-slo.eu/programme/erdf_co_funding/', 'http://2007-2013.ita-slo.eu/programme/objectives/']
['http://2007-2013.ita-slo.eu/programme/priorities/']
