Here, we prepare a sample from MaCoCu-sl, on which we use the genre classifiers. To prepare the sample, we first need to discard all texts that are not longer than 75 words - I created a list of the domains and URLs of all texts that are long enough. Then we want to extract 10 texts from each of the domains to which I've assigned genres during the manual checkup.

In [1]:
import gzip
import json
import random
from pathlib import Path

import numpy as np
import pandas as pd
import regex as re
import wget
from tqdm import tqdm

In [2]:
# Precompiled patterns capturing the quoted value of the url="..." and domain="..." attributes
url_re = re.compile(r'url="(.*?)"')
domain_re = re.compile(r'domain="(.*?)"')

## Download and open the XML file

In [3]:
# Download the corpus (skipped when a local copy is already present)

# Defining the zip file URL
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1517/MaCoCu-sl.xml.gz"

# Local file name of the archive; this is the name the corpus is opened under afterwards.
corpus_file = "MaCoCu-sl.xml.gz"

# Only fetch the archive when it is missing, so that re-running the notebook
# does not download the whole corpus again.
if Path(corpus_file).exists():
    print('Corpus already downloaded')
else:
    # Downloading the file by sending the request to the URL
    corpus_file = wget.download(url, out=corpus_file)
    print('Downloading Completed')

Downloading Completed


In [3]:
# Open the gzipped corpus for reading as UTF-8 text. The handle is consumed as it is
# iterated: once a loop has read it to the end, it has to be reopened (or rewound)
# before the corpus can be read again.
file = gzip.open('MaCoCu-sl.xml.gz', 'rt', encoding='utf-8')

## Create a dataframe with the most frequent domains and a sample of 10 URLs from each domain

In [6]:
# Get a list of domains you are interested in
# (loaded from the CSV of manually checked domains; the domain names are read
# from its "Web domain" column)
manual_an_df = pd.read_csv("genres-in-Slovene-domains.csv")
manual_an_df.head(2)

Unnamed: 0,Web domain,Genre
0,sodnapraksa.si,legal/regulation
1,dkom.si,legal/regulation


In [9]:
# Distinct annotated domains, kept in order of first appearance
annotated_domains = manual_an_df["Web domain"].drop_duplicates().tolist()
print(len(annotated_domains))
annotated_domains[:3]

301


['sodnapraksa.si', 'dkom.si', 'uradni-list.si']

First, go through the corpus and collect the domain and URL of every text that comes from one of the domains in the annotated_domains list and is more than 75 words long.

In [10]:
# Pass over the corpus: collect [domain, url] for every document that comes from an
# annotated domain and whose text is longer than 75 whitespace-separated tokens.
text_counter = 0

texts = []

# The membership test below runs once per document, so use a set instead of a list.
annotated_domain_set = set(annotated_domains)

# State of the document currently being read; initialised here so that a stray line
# before the first <doc ...> header cannot hit an undefined name.
pure_text = ""
current_url = ""
current_domain = ""

# Open a fresh handle for this pass. A file object can only be iterated once, so a
# shared handle that another loop has already read to the end would yield nothing.
with gzip.open('MaCoCu-sl.xml.gz', 'rt', encoding='utf-8') as corpus:
    for line in tqdm(corpus):
        if line.startswith("<doc"):
            # Header of a new document: reset the text buffer and read its attributes.
            pure_text = ""
            url_match = url_re.search(line)
            domain_match = domain_re.search(line)
            # A header without the attribute leaves the value empty; such a document
            # can then never match an annotated domain and is skipped.
            current_url = url_match.group(1) if url_match else ""
            current_domain = domain_match.group(1) if domain_match else ""
        elif line.startswith("</doc"):
            # End of the document: keep it if it passes the domain and length filters.
            if current_domain in annotated_domain_set:
                text_length = len(pure_text.split())
                if text_length > 75:
                    texts.append([current_domain, current_url])
                    text_counter += 1
        elif line.startswith(("<p", "</p", "<corpus", "</corpus")):
            # Structural markup carries no text.
            continue
        else:
            # Everything else is document text.
            pure_text += line

151967742it [06:43, 376199.72it/s]


In [11]:
# Turn the collected [domain, url] pairs into a two-column table
df = pd.DataFrame(texts, columns=["domain", "url"])

df.head()

Unnamed: 0,domain,url
0,bal.si,https://bal.si/
1,e30.si,https://e30.si/
2,rsg.si,https://www.rsg.si/
3,vsi.si,https://www.vsi.si/
4,sta.si,https://www.sta.si/


In [12]:
# Per-column summary: number of rows, number of distinct values, most frequent value and its count
df.describe(include="all")

Unnamed: 0,domain,url
count,1261519,1261519
unique,300,1258301
top,rtvslo.si,https://www.rtvslo.si/sport
freq,39012,8


We got 1,261,519 texts from 300 of the annotated domains.

In [13]:
# Number of collected texts per domain, most frequent domain first
domain_counts = df["domain"].value_counts()
domain_distribution = pd.DataFrame(
    {
        "domain": domain_counts.index.tolist(),
        "frequency": domain_counts.tolist(),
    }
)
domain_distribution

Unnamed: 0,domain,frequency
0,rtvslo.si,39012
1,regionalobala.si,35803
2,primorske.si,31999
3,zurnal24.si,29880
4,1zavse.si,28335
...,...,...
295,adrialog.com,113
296,bal.si,110
297,dom24.si,87
298,murjeplovec.mojforum.si,78


In [14]:
# Keep only the domains that have at least 10 texts
has_enough_texts = domain_distribution["frequency"] >= 10
domain_distribution = domain_distribution.loc[has_enough_texts]

domain_distribution.shape

(300, 2)

In [19]:
# For each domain, sample out 10 texts from the initial dataframe

# First create the initial df to which all others in the loop will be added.
# The fixed random_state makes the draw repeatable when the notebook is re-run.
final_sample = df[df["domain"] == annotated_domains[0]].sample(n=10, random_state=42)

final_sample

Unnamed: 0,domain,url
1213635,sodnapraksa.si,http://sodnapraksa.si/?q=285%20&amp;database%5...
1259525,sodnapraksa.si,http://sodnapraksa.si/?q=podatki%20iz%20uradni...
1242556,sodnapraksa.si,http://www.sodnapraksa.si/?q=Odpoved%20pravici...
1209561,sodnapraksa.si,http://www.sodnapraksa.si/search.php?q=G35/12&...
1211501,sodnapraksa.si,http://www.sodnapraksa.si/search.php?q=G35/12&...
1256542,sodnapraksa.si,http://www.sodnapraksa.si/?q=sklep%20G%2034/20...
1215644,sodnapraksa.si,http://www.sodnapraksa.si/?q=id:20150811114380...
1216726,sodnapraksa.si,http://www.sodnapraksa.si/?q=id:20150811114470...
1178712,sodnapraksa.si,http://sodnapraksa.si/?q=ViiiIps193/2010&amp;d...
1255387,sodnapraksa.si,http://www.sodnapraksa.si/?q=id:20100408152525...


In [20]:
# Add all other domains
remaining_list = annotated_domains[1:]

# Collect the per-domain samples and concatenate once at the end instead of
# growing the dataframe inside the loop.
sampled_parts = [final_sample]

for domain in remaining_list:
    domain_texts = df[df["domain"] == domain]
    try:
        # The fixed random_state makes the draw repeatable when the notebook is re-run.
        sampled_parts.append(domain_texts.sample(n=10, random_state=42))
    except ValueError:
        # sample() raises ValueError when the domain has fewer than 10 texts;
        # show what is available for it and leave the domain out. Any other
        # error is not expected here and is allowed to surface.
        print(domain_texts[:2].to_markdown())

final_sample = pd.concat(sampled_parts)

final_sample.shape

| domain   | url   |
|----------|-------|


(3000, 2)

In [21]:
# Summary of the sample: row count, number of distinct domains/URLs, most frequent values
final_sample.describe()

Unnamed: 0,domain,url
count,3000,3000
unique,300,3000
top,sodnapraksa.si,http://sodnapraksa.si/?q=285%20&amp;database%5...
freq,10,1


In [26]:
# Save the df
# (tab-separated; the dataframe index is written as the first column)
final_sample.to_csv("MaCoCu-manual_ann_sample-domains-and-urls.csv", sep="\t")

## Extract the text from the XML corpus based on the URL list

In [4]:
# Open the df with domains and urls for the sample
# (tab-separated file; the first column is used as the dataframe index)
final_sample = pd.read_csv("MaCoCu-manual_ann_sample-domains-and-urls.csv", sep="\t", index_col = 0)

final_sample.head(2)

Unnamed: 0,domain,url
1213635,sodnapraksa.si,http://sodnapraksa.si/?q=285%20&amp;database%5...
1259525,sodnapraksa.si,http://sodnapraksa.si/?q=podatki%20iz%20uradni...


In [5]:
# (rows, columns) of the reloaded sample
final_sample.shape

(3000, 2)

In [6]:
# Distinct URLs of the sample, in order of first appearance
url_list = final_sample["url"].drop_duplicates().tolist()
url_list[:10]

['http://sodnapraksa.si/?q=285%20&amp;database%5BSOVS%5D=SOVS&amp;database%5BIESP%5D=IESP&amp;_submit=išči&amp;order=date&amp;direction=asc&amp;rowsPerPage=50&amp;page=12',
 'http://sodnapraksa.si/?q=podatki%20iz%20uradnih%20evidenc&amp;database%5BSOVS%5D=SOVS&amp;database%5BIESP%5D=IESP&amp;database%5BVDSS%5D=VDSS&amp;database%5BUPRS%5D=UPRS&amp;database%5BSEU%5D=SEU&amp;database%5BNEGM%5D=NEGM&amp;database%5BSOSC%5D=SOSC&amp;database%5BSOPM%5D=SOPM&amp;_submit=išči&amp;rowsPerPage=20&amp;page=3&amp;id=2012032113044158',
 'http://www.sodnapraksa.si/?q=Odpoved%20pravici%20zoper%20pritožbe%20o%20dedovanju%20&amp;database%5BIESP%5D=IESP&amp;_submit=išči&amp;rowsPerPage=20&amp;page=0&amp;moreLikeThis=1&amp;id=doc_53571',
 'http://www.sodnapraksa.si/search.php?q=G35/12&amp;database%5BSOVS%5D=SOVS&amp;_submit=i�èi&amp;order=date&amp;direction=asc&amp;page=2&amp;moreLikeThis=1&amp;id=doc_18218',
 'http://www.sodnapraksa.si/search.php?q=G35/12&amp;database%5BSOVS%5D=SOVS&amp;_submit=i�èi&amp;

In [7]:
# Number of distinct URLs in the sample
len(url_list)

3000

In [8]:
# Now that I have the URL list, I will extract texts from the MaCoCu-sl.xml.gz for the sample based on the URL list.
# Every document whose URL is in the list is kept as [domain, url, text]; a URL that
# occurs in several documents therefore contributes several entries.

text_all_counter = 0

texts_all = []

# The membership test below runs once per document, so use a set instead of a list.
sample_urls = set(url_list)

# State of the document currently being read; initialised here so that a stray line
# before the first <doc ...> header cannot hit an undefined name.
pure_text = ""
current_url = ""
current_domain = ""

# Open a fresh handle for this pass. A file object can only be iterated once, so a
# shared handle that another loop has already read to the end would yield nothing.
with gzip.open('MaCoCu-sl.xml.gz', 'rt', encoding='utf-8') as corpus:
    for line in tqdm(corpus):
        if line.startswith("<doc"):
            # Header of a new document: reset the text buffer and read its attributes.
            pure_text = ""
            url_match = url_re.search(line)
            domain_match = domain_re.search(line)
            # A header without the attribute leaves the value empty instead of
            # aborting the whole pass.
            current_url = url_match.group(1) if url_match else ""
            current_domain = domain_match.group(1) if domain_match else ""
        elif line.startswith("</doc"):
            # End of the document: keep it if its URL belongs to the sample.
            if current_url in sample_urls:
                texts_all.append([current_domain, current_url, pure_text])
                text_all_counter += 1
        elif line.startswith(("<p", "</p", "<corpus", "</corpus")):
            # Structural markup carries no text.
            continue
        else:
            # Everything else is document text.
            pure_text += line

151967742it [08:20, 303624.01it/s]


In [9]:
# Number of documents whose URL matched the sample
text_all_counter

3033

In [11]:
# Turn the extracted [domain, url, text] entries into a table

df_long_texts = pd.DataFrame(texts_all, columns=["domain", "url", "text"])

df_long_texts.head()

Unnamed: 0,domain,url,text
0,solemio.si,https://solemio.si/,Ker nam ni vseeno\nTočke zvestobe\nZvestobo na...
1,bimbo.si,https://www.bimbo.si/cam,"Cam\nGledati na svet z otroškimi očmi, da bi r..."
2,bimbo.si,https://www.bimbo.si/cam,Cam\nCam CULLAMI antracite col 153 - Otroška o...
3,kwon.si,https://www.kwon.si/acerr,"Ne glede na to, ali gledate filme, igrate igre..."
4,kwon.si,https://www.kwon.si/IIYAMA,Gaming monitor IIYAMA G-MASTER GB2730HSU-B1 Bl...


In [12]:
# Per-column summary: number of rows, number of distinct values, most frequent value and its count
df_long_texts.describe(include="all")

Unnamed: 0,domain,url,text
count,3033,3033,3033
unique,300,3000,3033
top,telex.si,https://www.izstopaj.si/sl/za-punce/oblacila,Ker nam ni vseeno\nTočke zvestobe\nZvestobo na...
freq,17,3,1


In [13]:
# Number of extracted texts per domain
df_long_texts.domain.value_counts()

telex.si              17
indigo-nails.si       13
podsvojostreho.net    12
extremevital.com      12
tenisportal.si        12
                      ..
liderpnevmatik.si     10
besedilo.si           10
mazda-si.net          10
strelec.si            10
sodnapraksa.si        10
Name: domain, Length: 300, dtype: int64

Some URLs appear multiple times with different texts, so in the end our sample consists of 3033 texts. The problem with this is a) that some domains have more instances than others, and b) that texts under some of the URLs might be shorter than 75 words. That is why we calculated the length of the texts again and discarded those that are not longer than 75 words. Then we also downsampled the domains with more than 10 texts, so that in the end all domains have 10 instances.

In [14]:
# Text length = number of whitespace-separated tokens
token_lists = df_long_texts["text"].str.split()
df_long_texts["length"] = token_lists.str.len()

df_long_texts.head()

Unnamed: 0,domain,url,text,length
0,solemio.si,https://solemio.si/,Ker nam ni vseeno\nTočke zvestobe\nZvestobo na...,468
1,bimbo.si,https://www.bimbo.si/cam,"Cam\nGledati na svet z otroškimi očmi, da bi r...",230
2,bimbo.si,https://www.bimbo.si/cam,Cam\nCam CULLAMI antracite col 153 - Otroška o...,33
3,kwon.si,https://www.kwon.si/acerr,"Ne glede na to, ali gledate filme, igrate igre...",246
4,kwon.si,https://www.kwon.si/IIYAMA,Gaming monitor IIYAMA G-MASTER GB2730HSU-B1 Bl...,633


In [15]:
# Distribution of the text lengths (count, mean, spread, quartiles, extremes)
df_long_texts["length"].describe()

count     3033.000000
mean       500.803825
std       1324.283252
min          9.000000
25%        124.000000
50%        225.000000
75%        451.000000
max      30308.000000
Name: length, dtype: float64

In [16]:
# Keep only the texts that are longer than 75 words
is_long_enough = df_long_texts["length"] > 75
df_long_texts = df_long_texts.loc[is_long_enough]
df_long_texts.shape

(3020, 4)

In [17]:
# Map each domain to the number of texts it still has after the length filter
all_domains_frequency = df_long_texts.domain.value_counts().to_dict()

all_domains_frequency

{'telex.si': 17,
 'podsvojostreho.net': 12,
 'dobrakarma.si': 12,
 'extremevital.com': 12,
 'elle.metropolitan.si': 11,
 'vestnik.si': 11,
 'gnes.si': 11,
 'tenisportal.si': 11,
 'izstopaj.si': 11,
 'gorenjskiglas.si': 11,
 'oglasi.si': 11,
 'mislec.net': 10,
 'pomagamo-zivalim.si': 10,
 'repozitorij.uni-lj.si': 10,
 'nas-stik.si': 10,
 'solemio.si': 10,
 'revis.openscience.si': 10,
 'novacebela.mojforum.si': 10,
 'nogomania.com': 10,
 'avto-fokus.si': 10,
 'sony.si': 10,
 'dk.um.si': 10,
 'moj-letak.si': 10,
 's5tech.net': 10,
 'mountvacation.si': 10,
 'dirros.openscience.si': 10,
 'bead.si': 10,
 'murjeplovec.mojforum.si': 10,
 'siol.net': 10,
 'primorske.si': 10,
 'robin.si': 10,
 'podjetnik.si': 10,
 'redoljub.si': 10,
 'regionalgoriska.si': 10,
 'joker.muzej.si': 10,
 'zurnal24.si': 10,
 'lokalne-ajdovscina.si': 10,
 'sta.si': 10,
 'malizakladi.si': 10,
 'rokometna-zveza.si': 10,
 'audio-kontakt.com': 10,
 'maxximum-portal.com': 10,
 'pisrs.si': 10,
 'elektronik.si': 10,
 'repozit

In [18]:
# Filter out a part of texts from domains that have more than 10 texts.
# The over-represented domains and the size of their surplus are read from the data
# rather than listed by hand, so the cell stays correct if the sample changes and
# does nothing when it is run a second time.
domain_sizes = df_long_texts["domain"].value_counts()

for item, size in domain_sizes[domain_sizes > 10].items():
    # Randomly pick exactly as many rows as the domain has above 10 and drop them.
    # The fixed random_state makes the selection repeatable across runs.
    surplus = df_long_texts[df_long_texts['domain'] == item].sample(n=int(size) - 10, random_state=42)
    df_long_texts = df_long_texts.drop(surplus.index)

In [19]:
# Summary of the trimmed sample, including the statistics of the length column
df_long_texts.describe(include="all")

Unnamed: 0,domain,url,text,length
count,3000,3000,3000,3000.0
unique,300,2986,3000,
top,solemio.si,http://www.telex.si/index.php?s=143&amp;page=2,Ker nam ni vseeno\nTočke zvestobe\nZvestobo na...,
freq,10,3,1,
mean,,,,503.810333
std,,,,1330.9626
min,,,,76.0
25%,,,,125.0
50%,,,,227.5
75%,,,,452.25


In [20]:
# Check if all domains have the same number of instances
# (number of remaining texts per domain)
df_long_texts.domain.value_counts()

solemio.si                10
mountvacation.si          10
novacebela.mojforum.si    10
mislec.net                10
pomagamo-zivalim.si       10
                          ..
avdio.ognjisce.si         10
emundia.si                10
leoss.si                  10
mojaleta.si               10
sodnapraksa.si            10
Name: domain, Length: 300, dtype: int64

In [21]:
# Save the final sample
# (tab-separated; the dataframe index is written as the first column)
df_long_texts.to_csv("MaCoCu-sl-manual-checkup-sample.csv", sep="\t")