Here, we prepare a sample from MaCoCu-sl, on which we use the genre classifiers. To prepare the sample, we first discard all texts that are not longer than 75 words: I created a list of the domains and URLs of all texts that are long enough. Then I calculated the frequency of the domains and discarded the domains that have fewer than 10 texts (if I didn't, the median would be 6 texts per domain). Then I calculated the median and took the domains with the median number of texts, plus the same number of domains above and below the median, so that in the end the sample has around 1000 different domains.

In [1]:
import gzip
import json
import os
import random

import numpy as np
import pandas as pd
import regex as re
import wget
from tqdm import tqdm

In [2]:
# Pre-compiled patterns that pull the `url` and `domain` attribute values out
# of a `<doc ...>` opening tag. The groups are non-greedy, so each match stops
# at the first closing double quote.
URL_ATTRIBUTE_PATTERN = 'url="(.*?)"'
DOMAIN_ATTRIBUTE_PATTERN = 'domain="(.*?)"'

url_re, domain_re = (
    re.compile(pattern)
    for pattern in (URL_ATTRIBUTE_PATTERN, DOMAIN_ATTRIBUTE_PATTERN)
)

## Download and open the XML file

In [2]:
# Download the corpus (skipped when the archive is already on disk, so that
# re-running the cell does not fetch the whole corpus again).

# URL of the gzipped corpus file
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1517/MaCoCu-sl.xml.gz"

# Local file name = last path component of the URL. This is the name that the
# gzip.open() call in the next cell expects.
corpus_file = url.rsplit("/", 1)[-1]

if os.path.exists(corpus_file):
    # NOTE(review): wget.download presumably saves under a different name when
    # the target already exists, which would leave the next cell reading the
    # old file - confirm against the wget package documentation.
    print('Corpus file already present, skipping download')
else:
    # Downloading the file by sending the request to the URL
    corpus_file = wget.download(url)
    print('Downloading Completed')

Downloading Completed


In [3]:
# Open the gzipped corpus as a UTF-8 text stream. The handle is kept open
# because the parsing cells below iterate over it line by line.
file = gzip.open(filename='MaCoCu-sl.xml.gz', mode='rt', encoding='utf-8')

## Create a dataframe with the most frequent domains and a sample of 10 URLs from each domain

First, get a list of the domains and URLs of all texts that are more than 75 words long.

In [7]:
# First pass over the corpus: collect [domain, url] for every document whose
# text is longer than 75 words.
#
# Results:
#   texts        - list of [domain, url] pairs, one per sufficiently long document
#   text_counter - number of documents collected (equals len(texts))

text_counter = 0

texts = []

# Per-document state, initialised up front so that a text line appearing
# before the first <doc> tag cannot hit an undefined name.
current_url = ""
current_domain = ""
text_length = 0

# Rewind, so that re-running the cell (or running it after another pass over
# the same handle) does not silently iterate over an exhausted stream.
file.seek(0)

for line in tqdm(file):
    if line.startswith("<doc"):
        # New document: read its metadata from the opening tag and reset the count
        current_url = url_re.search(line).group(1)
        current_domain = domain_re.search(line).group(1)
        text_length = 0
    elif line.startswith("</doc"):
        # End of document: keep it only if it is longer than 75 words
        if text_length > 75:
            texts.append([current_domain, current_url])
            text_counter += 1
    elif line.startswith(("<p", "</p", "<corpus", "</corpus")):
        # Structural markup does not count towards the text length
        continue
    else:
        # Text line. Counting words per line gives the same total as splitting
        # the concatenated text (every line ends in a newline), without having
        # to hold the whole document text in memory.
        text_length += len(line.split())

151967742it [07:38, 331784.12it/s]


In [9]:
# Turn the collected [domain, url] pairs into a two-column dataframe
df = pd.DataFrame(texts, columns=["domain", "url"])

df.head()

Unnamed: 0,domain,url
0,e3.si,https://www.e3.si/
1,x5.si,http://www.x5.si/
2,a1.si,https://www.a1.si/
3,op.si,https://www.op.si/
4,fd.si,https://fd.si/


In [10]:
# Order the rows alphabetically by domain, so that texts from one domain sit together
df = df.sort_values(by="domain", ascending=True)

df.head()

Unnamed: 0,domain,url
337411,007.com.hr,http://www.007.com.hr/bugdetector.html
177449,090linije.si,http://www.090linije.si/pogoji.htm
501444,090linije.si,http://www.090linije.si/simobil/pogoji.htm
550412,090vedezevanje.com,https://090vedezevanje.com/tarot-karte/
96050,090vedezevanje.com,https://090vedezevanje.com/


In [11]:
# Summary of both columns: row count, number of distinct values,
# most common value and how often it occurs
df.describe(include="all")

Unnamed: 0,domain,url
count,3787272,3787272
unique,49096,3779253
top,najdi.si,https://www.rtvslo.si/sport
freq,41255,8


We got 3,787,272 texts (with 3,779,253 unique URLs) in 49,096 domains.

In [14]:
# Number of texts per domain as a {domain: count} dict, most frequent domain first
df.domain.value_counts().to_dict()

{'najdi.si': 41255,
 'rtvslo.si': 39012,
 'regionalobala.si': 35803,
 'primorske.si': 31999,
 'zurnal24.si': 29880,
 '1zavse.si': 28335,
 'sodnapraksa.si': 27388,
 'slo-tech.com': 26412,
 'mladina.si': 26191,
 'uradni-list.si': 25820,
 'radiostudent.si': 24408,
 'novice.svet24.si': 20773,
 'govorise.metropolitan.si': 19664,
 'dk.um.si': 18882,
 'radio1.si': 18151,
 'mojmojster.net': 18087,
 'dnevnik.si': 18001,
 'muziker.si': 17008,
 'tax-fin-lex.si': 16593,
 'siol.net': 16535,
 'ringaraja.net': 16507,
 'monitor.si': 16082,
 'moj-letak.si': 15774,
 'mojaobcina.si': 15596,
 'moja-lekarna.com': 14498,
 'vsi.si': 14387,
 'instore.si': 14279,
 'publishwall.si': 14159,
 'citymagazine.si': 14128,
 'sloski.si': 12958,
 'elektronik.si': 12233,
 'nogomania.com': 12215,
 'deloindom.delo.si': 11834,
 'politikis.si': 11517,
 'viva.si': 11371,
 'joker.muzej.si': 11280,
 'nova24tv.si': 10936,
 'pesem.si': 10301,
 'gov.si': 10134,
 'blog.uporabnastran.si': 9510,
 'sodisce.si': 9321,
 'mladipodjetnik.

In [10]:
# Calculate domain distribution: one row per domain with its number of texts,
# most frequent domain first, on a fresh 0..n-1 index.
# value_counts() is evaluated once and reshaped directly, instead of being
# computed twice over the whole dataframe and passed through a dict.
domain_distribution = (
    df["domain"]
    .value_counts()
    .rename_axis("domain")
    .reset_index(name="frequency")
)
domain_distribution

Unnamed: 0,domain,frequency
0,najdi.si,41255
1,rtvslo.si,39012
2,regionalobala.si,35803
3,primorske.si,31999
4,zurnal24.si,29880
...,...,...
49091,tapetnistvo-damjan.si,1
49092,eurograf.si,1
49093,tapetnistvo-kolar.si,1
49094,tapetnistvo-kopac.si,1


In [11]:
# Keep only the domains that contribute at least 10 texts
has_enough_texts = domain_distribution["frequency"] >= 10
domain_distribution = domain_distribution[has_enough_texts]

domain_distribution.shape

(20400, 2)

If we don't discard domains with fewer than 10 texts, the median is 6 texts per domain, which is not enough. So, I first discarded domains with fewer than 10 texts and then calculated the median. The remaining number of domains was 20,400.

In [12]:
# Find the median: it is the "50%" row of the summary statistics
# of the per-domain text counts
domain_distribution.frequency.describe()

count    20400.000000
mean       181.051225
std       1044.961694
min         10.000000
25%         16.000000
50%         32.000000
75%         84.000000
max      41255.000000
Name: frequency, dtype: float64

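The median number of texts per domain is 32. Now, I'll take all domains with exactly this frequency, plus an equal number of domains directly above and directly below them, so that the sample has around 500 domains on each side of the middle (around 1000 domains in total).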

In [13]:
# How many domains have exactly the median frequency (32)?
domain_distribution[domain_distribution["frequency"] == 32].shape

(219, 2)

In [14]:
# 219 domains are already at the median (frequency 32), so 391 domains are taken
# from each side of the median block (219 + 2 * 391 = 1001 domains in total)

# First row of the median block: it marks where the slice of domains above the median ends
domain_distribution[domain_distribution["frequency"] == 32].head(1)

Unnamed: 0,domain,frequency
9983,lkrv.fri.uni-lj.si,32


In [15]:
# Take the 391 domains ranked directly above the block of median-frequency
# domains. The slice position is computed from the data instead of being copied
# by hand from a previous output, so it stays correct if the corpus or the
# filtering changes.
MEDIAN_FREQUENCY = 32
DOMAINS_PER_SIDE = 391

# domain_distribution is ordered by descending frequency, so the number of
# domains with a frequency above the median is the position of the first
# median row (9983 in the recorded run).
first_median_pos = int((domain_distribution["frequency"] > MEDIAN_FREQUENCY).sum())

# max(..., 0) prevents a negative start from wrapping around to the end of the
# frame when fewer than DOMAINS_PER_SIDE domains lie above the median.
slice_start = max(first_median_pos - DOMAINS_PER_SIDE, 0)
distr_above = domain_distribution.iloc[slice_start:first_median_pos]

distr_above

Unnamed: 0,domain,frequency
9592,apkhome.net,35
9593,dsi2011.si,35
9594,zlatarnica.si,35
9595,pd-polzela.si,35
9596,pgd-cusperk.si,35
...,...,...
9978,kado.si,33
9979,began.si,33
9980,manuela-aleunam.blogspot.com,33
9981,mojporod.si,33


In [16]:
# Last row of the median block: the slice of domains below the median starts right after it
domain_distribution[domain_distribution["frequency"] == 32].tail(1)

Unnamed: 0,domain,frequency
10201,diabetes-loka.si,32


In [17]:
# Take the 391 domains ranked directly below the block of median-frequency
# domains. The slice position is computed from the data instead of being copied
# by hand from a previous output.
MEDIAN_FREQUENCY = 32
DOMAINS_PER_SIDE = 391

# domain_distribution is ordered by descending frequency, so the number of
# domains with a frequency at or above the median is the position of the first
# row below the median block (10202 in the recorded run).
first_below_pos = int((domain_distribution["frequency"] >= MEDIAN_FREQUENCY).sum())

distr_below = domain_distribution.iloc[first_below_pos:first_below_pos + DOMAINS_PER_SIDE]
distr_below

Unnamed: 0,domain,frequency
10202,luster.si,31
10203,sportnestave.org,31
10204,bodifit.net,31
10205,polyregion.org,31
10206,ma-ko.si,31
...,...,...
10588,ka-komunikacije.si,30
10589,kubu.si,30
10590,troblja.info,30
10591,karnion.si,30


In [18]:
# Stack the median-frequency domains on top of the domains just above the median
# (the domains below the median are appended in the following cell)
at_median = domain_distribution.loc[domain_distribution["frequency"] == 32]
intermediate = pd.concat([at_median, distr_above])

intermediate

Unnamed: 0,domain,frequency
9983,lkrv.fri.uni-lj.si,32
9984,rts.si,32
9985,biotechnology-gmo.gov.si,32
9986,gml-drustvo.si,32
9987,kamen-mojster.si,32
...,...,...
9978,kado.si,33
9979,began.si,33
9980,manuela-aleunam.blogspot.com,33
9981,mojporod.si,33


In [19]:
# Complete the domain sample by appending the domains below the median
domain_sample = pd.concat(objs=[intermediate, distr_below])
domain_sample

Unnamed: 0,domain,frequency
9983,lkrv.fri.uni-lj.si,32
9984,rts.si,32
9985,biotechnology-gmo.gov.si,32
9986,gml-drustvo.si,32
9987,kamen-mojster.si,32
...,...,...
10588,ka-komunikacije.si,30
10589,kubu.si,30
10590,troblja.info,30
10591,karnion.si,30


In [5]:
# Plain Python list of the sampled domain names
domain_sample_list = domain_sample["domain"].tolist()
domain_sample_list[:10]

['lkrv.fri.uni-lj.si',
 'rts.si',
 'biotechnology-gmo.gov.si',
 'gml-drustvo.si',
 'kamen-mojster.si',
 'bio-pharma.si',
 'binova.si',
 'tiego.si',
 'st-laboratoriji.si',
 'osmatijecopa.si']

In [17]:
# For each sampled domain, 10 texts are drawn at random from the initial dataframe.

# Start the result with the draw for the first domain; the draws for the
# remaining domains are appended to it afterwards.
first_domain = domain_sample_list[0]
is_first_domain = df["domain"] == first_domain
final_sample = df.loc[is_first_domain].sample(n=10)

final_sample

Unnamed: 0,domain,url
1461252,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/seminar/in...
2129748,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/np_...
2710918,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/africa/?page=navodil...
2130932,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/np_...
1482576,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/dn/...
2708401,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/africa/?page=navodil...
1631675,lkrv.fri.uni-lj.si,https://lkrv.fri.uni-lj.si/~ajurisic/seminar/p...
1517203,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/mercedes/i...
1706165,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/tec...
1077648,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/tex/tex.html


In [18]:
# Add all other domains
remaining_list = domain_sample_list[1:]

# Narrow the large dataframe to the sampled domains once, so that the
# per-domain selection below does not rescan every row of df for each domain.
# Row order is preserved, so the draws are the same as when selecting from df.
candidates = df[df["domain"].isin(remaining_list)]

# Draw 10 texts per domain, in list order.
# NOTE: no random_state is set, so every run draws a different sample.
per_domain_samples = [
    candidates[candidates["domain"] == domain].sample(n=10)
    for domain in remaining_list
]

# Concatenate once; calling pd.concat inside the loop would copy the growing
# result on every iteration.
final_sample = pd.concat([final_sample, *per_domain_samples])

final_sample.shape

(10010, 2)

In [19]:
# Summary of the sample: row count, distinct domains/URLs, most common value and its frequency
final_sample.describe()

Unnamed: 0,domain,url
count,10010,10010
unique,1001,10005
top,lkrv.fri.uni-lj.si,https://www.easistent.com/navodila_za_placilo
freq,10,2


The final sample has 10,010 instances from 1,001 domains.

In [21]:
# Persist the sampled domain/URL pairs as a tab-separated file (row index included)
final_sample.to_csv(
    "MaCoCu-sample-domains-and-urls.csv",
    sep="\t",
)

## Extract the text from the XML based on the URL list

In [4]:
# Reload the saved domain/URL sample; the first CSV column is used as the row index
final_sample = pd.read_csv(
    "MaCoCu-sample-domains-and-urls.csv",
    sep="\t",
    index_col=0,
)

final_sample.head(2)

Unnamed: 0,domain,url
1461252,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/seminar/in...
2129748,lkrv.fri.uni-lj.si,http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/np_...


In [5]:
# Sanity check: (rows, columns) of the reloaded sample
final_sample.shape

(10010, 2)

In [6]:
# Distinct URLs of the sample, in order of first appearance
url_list = final_sample["url"].unique().tolist()
url_list[:10]

['http://lkrv.fri.uni-lj.si/~ajurisic/seminar/index.html',
 'http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/np_nasveti/pro_mot.html',
 'http://lkrv.fri.uni-lj.si/africa/?page=navodila&amp;sys=lin&amp;browser=firefox&amp;lang=eng',
 'http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/np_nasveti/nastopi.html',
 'http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/dn/dn9.html',
 'http://lkrv.fri.uni-lj.si/africa/?page=navodila&amp;sys=win&amp;browser=express&amp;lang=eng',
 'https://lkrv.fri.uni-lj.si/~ajurisic/seminar/projekti.html',
 'http://lkrv.fri.uni-lj.si/~ajurisic/mercedes/index.html',
 'http://lkrv.fri.uni-lj.si/~ajurisic/tec_ac/tec_pro_ac.html',
 'http://lkrv.fri.uni-lj.si/~ajurisic/tex/tex.html']

In [7]:
# Number of distinct URLs in the sample
len(url_list)

10005

In [8]:
# Second pass over MaCoCu-sl.xml.gz: for every document whose URL is in the
# sample, collect its metadata, its plain text and its full XML.
#
# Results:
#   texts_all        - list of [domain, url, pure_text, doc_xml], one per matching document
#   text_all_counter - number of documents collected (equals len(texts_all))
#
# No length filter is applied here: every document stored under a sampled URL
# is collected.

text_all_counter = 0

texts_all = []

# Set lookup is constant-time; testing membership in the list would compare
# against every sampled URL for each document in the corpus.
wanted_urls = set(url_list)

# Per-document state, initialised up front so that a line appearing before the
# first <doc> tag cannot hit an undefined name.
current_url = ""
current_domain = ""
keep_doc = False
doc_lines = []   # every line of the current document, markup included
text_lines = []  # text lines only

# Rewind: if the first pass ran on the same handle, the stream is exhausted
# and the loop would otherwise iterate over nothing.
file.seek(0)

for line in tqdm(file):
    if line.startswith("<doc"):
        # New document: read its metadata and decide once whether it is wanted
        current_url = url_re.search(line).group(1)
        current_domain = domain_re.search(line).group(1)
        keep_doc = current_url in wanted_urls
        doc_lines = [line]
        text_lines = []
    elif line.startswith("</doc"):
        if keep_doc:
            doc_lines.append(line)
            # Join once per document instead of growing strings with +=
            texts_all.append([current_domain, current_url, "".join(text_lines), "".join(doc_lines)])
            text_all_counter += 1
    elif line.startswith(("<corpus", "</corpus")):
        continue
    elif keep_doc:
        # Lines of unwanted documents are not accumulated at all
        doc_lines.append(line)
        if not line.startswith(("<p", "</p")):
            # Paragraph tags belong to the XML only, not to the plain text
            text_lines.append(line)

151967742it [16:46, 151024.45it/s]


In [9]:
# Number of documents extracted for the sampled URLs
text_all_counter

10041

In [11]:
# Build a dataframe from the extracted documents: one row per document with
# its domain, URL, plain text and full XML
df_long_texts = pd.DataFrame(texts_all, columns=["domain", "url", "text", "doc"])

df_long_texts.head()

Unnamed: 0,domain,url,text,doc
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica..."
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak..."
2,osi.si,https://www.osi.si/,OSI je podjetje z dolgoletnimi izkušnjami na p...,"<doc id=""macocu.si.712"" title=""OSI sistemske i..."
3,ecpd.si,http://www.ecpd.si/,SVETOVNI DAN HRANE 2020\nRegistracija in več i...,"<doc id=""macocu.si.1004"" title=""ECPD - Domov"" ..."
4,jeko.si,https://www.jeko.si/,Oskrba z vodo\nOskrba z vodo prinaša občanom z...,"<doc id=""macocu.si.1011"" title=""Uvod | Jeko"" c..."


In [12]:
# Summary of all columns: row count, distinct values, most common value and its frequency
df_long_texts.describe(include="all")

Unnamed: 0,domain,url,text,doc
count,10041,10041,10041,10041
unique,1001,10005,10041,10041
top,stave-sportne.com,https://europacantat.jskd.si/sl/novice/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica..."
freq,12,3,1,1


In [25]:
# Number of extracted texts per domain, most frequent domain first
df_long_texts.domain.value_counts()

stave-sportne.com    12
oskosmac.si          12
easistent.com        12
belvin.si            12
toner123.si          12
                     ..
luminum.si           10
paloma.si            10
lovska-zveza.si      10
lekarne-ptuj.si      10
nkt-z.si             10
Name: domain, Length: 1001, dtype: int64

Some URLs appear multiple times with different texts, so in the end our sample consists of 10,041 texts. The problem with this is a) that some domains have more instances than others, and b) that the texts under some of the URLs might not be longer than 75 words. That is why we calculated the length of the texts again and discarded those that are not longer than 75 words. Then we also sampled down the domains with more than 10 texts, so that in the end all domains have 10 instances.

In [13]:
# Add a "length" column holding the number of whitespace-separated words per text
tokenised_texts = df_long_texts["text"].str.split()
df_long_texts["length"] = tokenised_texts.str.len()

df_long_texts.head()

Unnamed: 0,domain,url,text,doc,length
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124
2,osi.si,https://www.osi.si/,OSI je podjetje z dolgoletnimi izkušnjami na p...,"<doc id=""macocu.si.712"" title=""OSI sistemske i...",173
3,ecpd.si,http://www.ecpd.si/,SVETOVNI DAN HRANE 2020\nRegistracija in več i...,"<doc id=""macocu.si.1004"" title=""ECPD - Domov"" ...",4509
4,jeko.si,https://www.jeko.si/,Oskrba z vodo\nOskrba z vodo prinaša občanom z...,"<doc id=""macocu.si.1011"" title=""Uvod | Jeko"" c...",340


In [15]:
# Distribution of text lengths (in words); the minimum shows whether too-short texts slipped in
df_long_texts["length"].describe()

count    10041.000000
mean       411.700229
std        890.426440
min         16.000000
25%        122.000000
50%        212.000000
75%        417.000000
max      30002.000000
Name: length, dtype: float64

In [26]:
# Keep only texts that are longer than 75 words
is_long_enough = df_long_texts["length"] > 75
df_long_texts = df_long_texts[is_long_enough]
df_long_texts.shape

(10027, 5)

In [30]:
# Checkpoint the length-filtered dataframe as a tab-separated file.
# The final save at the end of the notebook writes to this same path.
df_long_texts.to_csv(
    "MaCoCu-sl-sample.csv",
    sep="\t",
)

In [32]:
# {domain: number of texts} after the length filter, most frequent domain first
per_domain_counts = df_long_texts["domain"].value_counts()
all_domains_frequency = per_domain_counts.to_dict()

all_domains_frequency

{'europacantat.jskd.si': 12,
 'easistent.com': 12,
 'belvin.si': 11,
 'bobri.si': 11,
 'os-hpuhar.si': 11,
 'gorenc.si': 11,
 'gs-lendava.si': 11,
 'psj.ff.uni-lj.si': 11,
 'osdobova.si': 11,
 'register.si': 11,
 'os-kobarid.si': 11,
 'oskosmac.si': 11,
 'hram-holding.si': 11,
 'st-garden.si': 11,
 'prodom.si': 11,
 'uc.fmf.uni-lj.si': 10,
 'medica-center.si': 10,
 'ghd-ilirskabistrica.si': 10,
 'odpocij.si': 10,
 'avtoplus.si': 10,
 'spekter-kr.si': 10,
 'nutrida.si': 10,
 'festival-gg.si': 10,
 'najdibon.si': 10,
 'kdlpp.calivita.si': 10,
 'spiklja.blogspot.com': 10,
 'kokos.shopamine.si': 10,
 'instalacije-strus.si': 10,
 'legama.si': 10,
 'lucijacevnik.si': 10,
 'ambulanta-zdravje.si': 10,
 'pdradovljica.si': 10,
 'hotel-mangart.com': 10,
 'artekcenter.com': 10,
 'register.prohereditate.com': 10,
 'letnoporocilo.triglav.eu': 10,
 'zumtobel.com': 10,
 'slo.castingbronzebushing.com': 10,
 'my-favouriteday.blogspot.com': 10,
 'gasilci-bistrica.org': 10,
 'zaper-1a.si': 10,
 'kafrca.bl

In [None]:
# Filter out a part of texts from domains that have more than 10 texts, so
# that no domain keeps more than 10.
# The affected domains and the number of texts to remove are derived from the
# current counts instead of being typed in from a previous output. This also
# makes the cell safe to re-run: once every domain is at 10 texts, nothing
# further is removed.
TEXTS_PER_DOMAIN = 10

domain_counts = df_long_texts["domain"].value_counts()
oversized = domain_counts[domain_counts > TEXTS_PER_DOMAIN]

for item, count in oversized.items():
    # Randomly pick the surplus rows of this domain and drop them by index label.
    # NOTE: no random_state is set, so the removed texts differ between runs.
    surplus = df_long_texts[df_long_texts["domain"] == item].sample(n=int(count) - TEXTS_PER_DOMAIN)
    df_long_texts = df_long_texts.drop(surplus.index)

In [34]:
# Summary of the balanced sample (all columns, including the numeric length statistics)
df_long_texts.describe(include="all")

Unnamed: 0,domain,url,text,doc,length
count,10010,10010,10010,10010,10010.0
unique,1001,9991,10010,10010,
top,ahp.si,https://psj.ff.uni-lj.si/obvestila,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",
freq,10,2,1,1,
mean,,,,,412.45025
std,,,,,891.6065
min,,,,,76.0
25%,,,,,122.0
50%,,,,,213.0
75%,,,,,418.0


In [35]:
# Check if all domains have the same number of instances
# (every count in the output should be identical)
df_long_texts.domain.value_counts()

ahp.si                          10
medica-center.si                10
gradovivoblakih.blogspot.com    10
my-favouriteday.blogspot.com    10
snjiksnjak.blogspot.com         10
                                ..
lepaznaravo.si                  10
pd-polzela.si                   10
camincam.si                     10
pgd-vipava.si                   10
nkt-z.si                        10
Name: domain, Length: 1001, dtype: int64

In [36]:
# Write the final, balanced sample to disk as a tab-separated file.
# This replaces the intermediate checkpoint saved earlier under the same name.
df_long_texts.to_csv(
    "MaCoCu-sl-sample.csv",
    sep="\t",
)