In [77]:
import gzip
import regex as re
import sys
import pandas as pd
import json

In [7]:
corpus_path = "datasets/sample.vert"

In [35]:
def vert_to_jsonl(corpus_path, gzip_file=True):
	"""
	Function to transform vertical format (vert) to jsonl format, based on the CLASSLA-web files. Change the function if the files contain other metadata.

	Args:
	- corpus_path: path to the dataset in VERT format
	- gzip_file: whether the file is gzipped. Defaults to true.
	"""
	if gzip_file == True:
		corpus = gzip.open(corpus_path, "rt")
	else:
		corpus = open(corpus_path, "r")

	# Open a new file to which we will append each json line
	new_file = open("{}.jsonl".format(corpus_path), "w")
	new_file.close()
	new_file = open("{}.jsonl".format(corpus_path), "a")

	text_id_re = re.compile('id="(.+?)"')
	url_re = re.compile('url="(.+?)"')
	domain_re = re.compile('domain="(.+?)"')
	genre_re = re.compile('genre="(.+?)"')

	text_counter = 0

	for line in corpus:
		if line.startswith("<text"):
			current_text = {}
			text_string = ""
			current_text["text_id"] = text_id_re.search(line).group(1)
			current_text["url"] = url_re.search(line).group(1)
			current_text["domain"] = domain_re.search(line).group(1)
			current_text["genre"] = genre_re.search(line).group(1)
			current_text["text"] = ""
			current_text["text_length"] = 0
			current_ling_anno = []
		elif line.startswith("<p"):
			continue
		elif line.startswith("<s"):
			continue
		elif line.startswith("</p"):
			text_string = text_string.rstrip()
			text_string += "\n"
		elif line.startswith("</s"):
			continue
		elif line.startswith("<g"):
			# Remove space before the last word if there is a symbol <g (= glue, meaning no space between words)
			text_string = text_string.rstrip()
		elif line.startswith("</text>"):
			current_text["ling_anno"] = current_ling_anno
			current_text["text"] = text_string
			current_text["text_length"] = len(text_string.split())
			new_file.write("{}".format(current_text))
			new_file.write("\n")
			text_counter += 1
			if text_counter%10 == 0:
				print("Processed {} files.".format(text_counter))
		else:
			current_line = line.split("\t")
			current_line_dict = {"word": current_line[0], "lemma": current_line[1], "xpos": current_line[2], "upos": current_line[3], "feats": current_line[4], "id": current_line[5].replace("\n", "")}
			current_ling_anno.append(current_line_dict)
			current_word = current_line[0]
			text_string += current_word
			text_string += " "
	
	new_file.close()
	print("Processing completed. The new file is saved as {}.jsonl".format(corpus_path))

In [93]:
def vert_to_txt_sample(corpus_path, gzip_file=True):
	"""
	Function to transform vertical format (vert) to jsonl format, based on the CLASSLA-web files.
	We will only extract a sample of 250 000 texts.

	Args:
	- corpus_path: path to the dataset in VERT format
	- gzip_file: whether the file is gzipped. Defaults to true.
	"""
	if gzip_file == True:
		corpus = gzip.open(corpus_path, "rt")
	else:
		corpus = open(corpus_path, "r")

	# Open a new file to which we will append each json line
	new_file = open("{}-sample.txt".format(corpus_path), "w")
	new_file.write("text_id\turl\tdomain\tgenre\ttext\ttext_length\n")
	new_file.close()
	new_file = open("{}-sample.txt".format(corpus_path), "a")

	text_id_re = re.compile('id="(.+?)"')
	url_re = re.compile('url="(.+?)"')
	domain_re = re.compile('domain="(.+?)"')
	genre_re = re.compile('genre="(.+?)"')

	text_counter = 0

	for line in corpus:
		if text_counter < 100000:
			if line.startswith("<text"):
				current_text = {}
				text_string = ""
				current_text["text_id"] = text_id_re.search(line).group(1)
				current_text["url"] = url_re.search(line).group(1)
				current_text["domain"] = domain_re.search(line).group(1)
				current_text["genre"] = genre_re.search(line).group(1)
				current_text["text"] = ""
				current_text["text_length"] = 0
			elif line.startswith("<p"):
				continue
			elif line.startswith("<s"):
				continue
			elif line.startswith("</p"):
				text_string = text_string.rstrip()
				text_string += "<p>"
			elif line.startswith("</s"):
				continue
			elif line.startswith("<g"):
				# Remove space before the last word if there is a symbol <g (= glue, meaning no space between words)
				text_string = text_string.rstrip()
			elif line.startswith("</text>"):
				current_text["text"] = text_string
				current_text["text_length"] = len(text_string.split())
				#new_file.write("{}".format(current_text))
				#new_file.write("\n")
				new_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(current_text["text_id"],current_text["url"],current_text["domain"], current_text["genre"], current_text["text"], current_text["text_length"]))
				text_counter += 1
				if text_counter%10 == 0:
					print("Processed {} files.".format(text_counter))
			else:
				current_line = line.split("\t")
				current_word = current_line[0]
				text_string += current_word
				text_string += " "
		else:
			break
	
	new_file.close()
	print("Processing completed. The sample is saved as {}-sample.txt".format(corpus_path))

In [None]:
vert_to_txt_sample("datasets/CLASSLA-web.mk.1.0.vert.gz")

In [100]:
def extract_genre_sample(sample_path):
	df = pd.read_csv(sample_path, sep="\t")

	# We will extract all labels
	labels_list=['Other', 'Information/Explanation', 'News', 'Instruction', 'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal', 'Promotion']

	# Remove all texts, longer than 500 words
	df = df[df["text_length"] < 500]

	# First create the initial df to which all others in the loop will be added
	final_sample = df[df["genre"] == labels_list[0]].sample(n=10)

	# Add all other domains
	remaining_list = labels_list[1:]

	for i in remaining_list:
		try:
			added_instances = df[df["genre"] == i].sample(n=10)
			final_sample = pd.concat([final_sample, added_instances])
		except:
			print(df[df["genre"] == i][:2].to_markdown())

	# Shuffle rows
	final_sample = final_sample.sample(frac=1)

	# Save sample
	final_sample.to_csv("{}-genre-sample.txt".format(sample_path), sep="\t")
	
	return final_sample

In [109]:
mk_genre = extract_genre_sample("datasets/CLASSLA-web.mk.1.0.vert.gz-sample.txt")

mk_genre

Unnamed: 0,text_id,url,domain,genre,text,text_length
15197,CLASSLA-web.mk.18600,https://radar.mk/?p=29847,radar.mk,Information/Explanation,Топењето на мразот е посебен феномен од 21-от ...,269
46057,CLASSLA-web.mk.138126,https://emiter.com.mk/napis/10922,emiter.com.mk,Information/Explanation,Широката распространетост на РЕЛ-заварувањето ...,204
38999,CLASSLA-web.mk.921342,http://www.interesno.mk/nauka/38-nauka/50238-k...,interesno.mk,Instruction,Кои хороскопски знаци ќе ги погоди Розевата По...,195
48491,CLASSLA-web.mk.384653,https://www.mn.mk/pesni-za-makedonija/6075-Dob...,mn.mk,Prose/Lyrical,Валентина Ѓоргиевска Парго Добриот човек го им...,373
99412,CLASSLA-web.mk.1050389,"http://forum.carclub.mk/index.php/topic,102.ms...",forum.carclub.mk,Forum,Провери си тука http://www.autobulbsdirect.co....,440
...,...,...,...,...,...,...
68027,CLASSLA-web.mk.874985,https://crithink.mk/zloupotrebata-na-naslovite...,crithink.mk,Other,Злоупотребата на насловите за повеќе кликови е...,471
23267,CLASSLA-web.mk.828276,https://skopjeinfo.mk/sakate-da-ste-posrekjni-...,skopjeinfo.mk,Instruction,Сакате да сте посреќни? Намалете го хаосот во ...,444
86580,CLASSLA-web.mk.1247279,http://proverkanafakti.mk/recenzija-brojot-na-...,proverkanafakti.mk,Opinion/Argumentation,"ФАЛБИ БЕЗ СРАМ!<p>Имено, овој исклучително нео...",329
62432,CLASSLA-web.mk.313556,http://1000knigi.mon.gov.mk/book.php?id=1731,1000knigi.mon.gov.mk,Information/Explanation,Сексуално преносливи болести<p>Автори:<p>978-6...,230


In [110]:
mk_genre.describe(include="all")

Unnamed: 0,text_id,url,domain,genre,text,text_length
count,90,90,90,90,90,90.0
unique,90,90,63,9,90,
top,CLASSLA-web.mk.18600,https://radar.mk/?p=29847,forum.carclub.mk,Information/Explanation,Топењето на мразот е посебен феномен од 21-от ...,
freq,1,1,5,10,1,
mean,,,,,,230.355556
std,,,,,,113.578989
min,,,,,,86.0
25%,,,,,,134.5
50%,,,,,,200.5
75%,,,,,,311.75


In [106]:
hr_genre = extract_genre_sample("datasets/CLASSLA-web.hr.1.0.vert.gz-sample.txt")

hr_genre

Unnamed: 0,text_id,url,domain,genre,text,text_length
97669,CLASSLA-web.hr.4346000,https://podravske-sirine.com.hr/arhiva/11701,podravske-sirine.com.hr,Information/Explanation,Batine kao mjera kažnjavanja u Vojnoj krajini<...,202
65197,CLASSLA-web.hr.1088566,https://www.moj-film.hr/film/info/casino-royale/,moj-film.hr,Other,Sinopsis<p>CASINO ROYALE predstavlja JAMESA BO...,121
56399,CLASSLA-web.hr.202668,https://nogometplus.net/guingamp-vise-clanova-...,nogometplus.net,Opinion/Argumentation,Guingamp: Više članova nego stanovnika<p>Prije...,111
38714,CLASSLA-web.hr.3837178,https://hrvatska-danas.com/2021/08/26/splicani...,hrvatska-danas.com,News,Splićanin na radu u Wuhanu: U tri dana testira...,94
60756,CLASSLA-web.hr.643955,https://www.ictbusiness.info/ictbusiness-tv/ic...,ictbusiness.info,News,ICTbusiness TV: Brza i efikasna softverska rje...,133
...,...,...,...,...,...,...
49283,CLASSLA-web.hr.4908191,https://teatarexit.hr/politike-privatnosti/,teatarexit.hr,Legal,Politike privatnosti<p>Online prodaja karata o...,194
30515,CLASSLA-web.hr.3031426,https://www.pula.hr/hr/uprava/gradonacelnik/za...,pula.hr,Legal,Zamjenici gradonačelnika<p>Gradonačelnik ima j...,116
78461,CLASSLA-web.hr.2419400,https://www.zooplus.hr/shop/psi/hrana_za_pse_s...,zooplus.hr,Promotion,Happy Dog Supreme Young Junior Original<p>294....,254
24999,CLASSLA-web.hr.2486817,https://www.cromoda.com/nose-se-cizme-i-tobice...,cromoda.com,Other,Nose se čizme i tobice u kaubojskom stilu: Evo...,100


In [107]:
hr_genre.describe(include="all")

Unnamed: 0,text_id,url,domain,genre,text,text_length
count,90,90,90,90,90,90.0
unique,90,90,84,9,90,
top,CLASSLA-web.hr.4346000,https://podravske-sirine.com.hr/arhiva/11701,metro-portal.hr,Information/Explanation,Batine kao mjera kažnjavanja u Vojnoj krajini<...,
freq,1,1,3,10,1,
mean,,,,,,204.955556
std,,,,,,116.402818
min,,,,,,74.0
25%,,,,,,118.0
50%,,,,,,171.5
75%,,,,,,258.75


In [101]:
sl_genre = extract_genre_sample("datasets/CLASSLA-web.sl.1.0.vert.gz-sample.txt")

sl_genre

Unnamed: 0,text_id,url,domain,genre,text,text_length
47895,CLASSLA-web.sl.693122,https://www.sloski.si/index.php?t=news&amp;amp...,sloski.si,News,Nordijska kombinacija<p>Mladi kombinatorci so ...,454
64374,CLASSLA-web.sl.2342984,http://duhresnice.blog.siol.net/2015/07/26/mli...,duhresnice.blog.siol.net,Prose/Lyrical,Strani<p>Jul<p>26<p>V vertikali je neskončna m...,245
84107,CLASSLA-web.sl.269562,https://www.vzmd.si/novice/mediji-o-malih-deln...,vzmd.si,Other,"RA SLOVENIJA 1, 02.08.2012, DRUGA JUTRANJA KRO...",291
28638,CLASSLA-web.sl.2846516,https://www.drugisvet.com/tag/drzava,drugisvet.com,Opinion/Argumentation,V tej državi se ne znamo lotevati problemov. R...,77
94595,CLASSLA-web.sl.1320205,https://www.sodisce.si/znanje/sodna_praksa/vis...,sodisce.si,Legal,JEDRO: Verjetnost obstoja terjatve se presoja ...,425
...,...,...,...,...,...,...
35571,CLASSLA-web.sl.3521045,http://bos.zrc-sazu.si/c/neva.exe?n=a_si_s&amp...,bos.zrc-sazu.si,Prose/Lyrical,"Mi, kar nas je velikih, smo bili tako zamaknje...",93
45195,CLASSLA-web.sl.420198,https://www.tekac.si/novica/5-razteznih-vaj-za...,tekac.si,Instruction,5 razteznih vaj za tekače<p>Poglejte kako se u...,358
39905,CLASSLA-web.sl.3956557,https://www.igrace-populi.com/izdelek/kuharski...,igrace-populi.com,Promotion,Ta obsežen set za peko bo vsakega mladega kuha...,75
69750,CLASSLA-web.sl.2886985,https://www.ecco-verde.si/erbe-de-janas/indigo...,ecco-verde.si,Instruction,Opis<p>Indigo v prahu je prijeten naraven izde...,146


In [102]:
sl_genre.describe(include="all")

Unnamed: 0,text_id,url,domain,genre,text,text_length
count,90,90,90,90,90,90.0
unique,90,90,79,9,90,
top,CLASSLA-web.sl.693122,https://www.sloski.si/index.php?t=news&amp;amp...,besedilo.si,News,Nordijska kombinacija<p>Mladi kombinatorci so ...,
freq,1,1,5,10,1,
mean,,,,,,203.844444
std,,,,,,121.000038
min,,,,,,64.0
25%,,,,,,108.5
50%,,,,,,156.0
75%,,,,,,267.25


In [104]:
sl_genre[sl_genre["domain"] == "besedilo.si"]

Unnamed: 0,text_id,url,domain,genre,text,text_length
96495,CLASSLA-web.sl.1507108,https://www.besedilo.si/aleksander-jez/vroce-j...,besedilo.si,Prose/Lyrical,"Vroče je, ti pa si hladna<p>Zunaj vroče je in ...",64
55640,CLASSLA-web.sl.1463952,https://www.besedilo.si/werner/ti-ljubezen-si,besedilo.si,Prose/Lyrical,Ti ljubezen si<p>Izvajalec: Werner Izvajalec: ...,110
56091,CLASSLA-web.sl.1506939,https://www.besedilo.si/ansambel-kvinta/nisem-...,besedilo.si,Prose/Lyrical,"Nisem taka kot so druge<p>Nočem rož, ne bombon...",150
96377,CLASSLA-web.sl.1494196,https://www.besedilo.si/ansambel-borst/kdor-je...,besedilo.si,Prose/Lyrical,Kdor je v srcu mlad<p>Svet vrti se vedno v ist...,86
55741,CLASSLA-web.sl.1475509,https://www.besedilo.si/ansambel-petra-finka/r...,besedilo.si,Prose/Lyrical,"Ribič<p>Nad mestom se dan budi, galebi kričijo...",91
