In [1]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=6

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=6


In [2]:
import gzip
import regex as re
import sys
import pandas as pd
import json
from tqdm import tqdm

In [7]:
corpus_path = "datasets/sample.vert"

## Vert to JSONL

In [35]:
def vert_to_jsonl(corpus_path, gzip_file=True):
	"""
	Function to transform vertical format (vert) to jsonl format, based on the CLASSLA-web files. Change the function if the files contain other metadata.

	Args:
	- corpus_path: path to the dataset in VERT format
	- gzip_file: whether the file is gzipped. Defaults to true.
	"""
	if gzip_file == True:
		corpus = gzip.open(corpus_path, "rt")
	else:
		corpus = open(corpus_path, "r")

	# Open a new file to which we will append each json line
	new_file = open("{}.jsonl".format(corpus_path), "w")
	new_file.close()
	new_file = open("{}.jsonl".format(corpus_path), "a")

	text_id_re = re.compile('id="(.+?)"')
	url_re = re.compile('url="(.+?)"')
	domain_re = re.compile('domain="(.+?)"')
	genre_re = re.compile('genre="(.+?)"')

	text_counter = 0

	for line in corpus:
		if line.startswith("<text"):
			current_text = {}
			text_string = ""
			current_text["text_id"] = text_id_re.search(line).group(1)
			current_text["url"] = url_re.search(line).group(1)
			current_text["domain"] = domain_re.search(line).group(1)
			current_text["genre"] = genre_re.search(line).group(1)
			current_text["text"] = ""
			current_text["text_length"] = 0
			current_ling_anno = []
		elif line.startswith("<p"):
			continue
		elif line.startswith("<s"):
			continue
		elif line.startswith("</p"):
			text_string = text_string.rstrip()
			text_string += "\n"
		elif line.startswith("</s"):
			continue
		elif line.startswith("<g"):
			# Remove space before the last word if there is a symbol <g (= glue, meaning no space between words)
			text_string = text_string.rstrip()
		elif line.startswith("</text>"):
			current_text["ling_anno"] = current_ling_anno
			current_text["text"] = text_string
			current_text["text_length"] = len(text_string.split())
			new_file.write("{}".format(current_text))
			new_file.write("\n")
			text_counter += 1
			if text_counter%10 == 0:
				print("Processed {} files.".format(text_counter))
		else:
			current_line = line.split("\t")
			current_line_dict = {"word": current_line[0], "lemma": current_line[1], "xpos": current_line[2], "upos": current_line[3], "feats": current_line[4], "id": current_line[5].replace("\n", "")}
			current_ling_anno.append(current_line_dict)
			current_word = current_line[0]
			text_string += current_word
			text_string += " "
	
	new_file.close()
	print("Processing completed. The new file is saved as {}.jsonl".format(corpus_path))

## Vert to TXT

In [93]:
def vert_to_txt_sample(corpus_path, gzip_file=True):
	"""
	Function to transform vertical format (vert) to jsonl format, based on the CLASSLA-web files.
	We will only extract a sample of 250 000 texts.

	Args:
	- corpus_path: path to the dataset in VERT format
	- gzip_file: whether the file is gzipped. Defaults to true.
	"""
	if gzip_file == True:
		corpus = gzip.open(corpus_path, "rt")
	else:
		corpus = open(corpus_path, "r")

	# Open a new file to which we will append each json line
	new_file = open("{}-sample.txt".format(corpus_path), "w")
	new_file.write("text_id\turl\tdomain\tgenre\ttext\ttext_length\n")
	new_file.close()
	new_file = open("{}-sample.txt".format(corpus_path), "a")

	text_id_re = re.compile('id="(.+?)"')
	url_re = re.compile('url="(.+?)"')
	domain_re = re.compile('domain="(.+?)"')
	genre_re = re.compile('genre="(.+?)"')

	text_counter = 0

	for line in corpus:
		if text_counter < 100000:
			if line.startswith("<text"):
				current_text = {}
				text_string = ""
				current_text["text_id"] = text_id_re.search(line).group(1)
				current_text["url"] = url_re.search(line).group(1)
				current_text["domain"] = domain_re.search(line).group(1)
				current_text["genre"] = genre_re.search(line).group(1)
				current_text["text"] = ""
				current_text["text_length"] = 0
			elif line.startswith("<p"):
				continue
			elif line.startswith("<s"):
				continue
			elif line.startswith("</p"):
				text_string = text_string.rstrip()
				text_string += "<p>"
			elif line.startswith("</s"):
				continue
			elif line.startswith("<g"):
				# Remove space before the last word if there is a symbol <g (= glue, meaning no space between words)
				text_string = text_string.rstrip()
			elif line.startswith("</text>"):
				current_text["text"] = text_string
				current_text["text_length"] = len(text_string.split())
				#new_file.write("{}".format(current_text))
				#new_file.write("\n")
				new_file.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(current_text["text_id"],current_text["url"],current_text["domain"], current_text["genre"], current_text["text"], current_text["text_length"]))
				text_counter += 1
				if text_counter%10 == 0:
					print("Processed {} files.".format(text_counter))
			else:
				current_line = line.split("\t")
				current_word = current_line[0]
				text_string += current_word
				text_string += " "
		else:
			break
	
	new_file.close()
	print("Processing completed. The sample is saved as {}-sample.txt".format(corpus_path))

In [None]:
vert_to_txt_sample("datasets/CLASSLA-web.mk.1.0.vert.gz")

## Vert to genre sample (txt)

In [25]:
def extract_genre_sample(sample_path):
	df = pd.read_csv(sample_path, sep="\t")

	# We will extract all labels
	labels_list=['Other', 'Information/Explanation', 'News', 'Instruction', 'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal', 'Promotion']

	# Remove all texts, longer than 500 words
	df = df[df["text_length"] < 500]

	# First create the initial df to which all others in the loop will be added
	final_sample = df[df["genre"] == labels_list[0]].sample(n=10)

	# Add all other domains
	remaining_list = labels_list[1:]

	for i in remaining_list:
		try:
			added_instances = df[df["genre"] == i].sample(n=10)
			final_sample = pd.concat([final_sample, added_instances])
		except:
			print(df[df["genre"] == i][:2].to_markdown())

	# Shuffle rows
	final_sample = final_sample.sample(frac=1)

	# Save sample
	final_sample.to_csv("{}-genre-sample.txt".format(sample_path), sep="\t")
	
	return final_sample

In [109]:
mk_genre = extract_genre_sample("datasets/CLASSLA-web.mk.1.0.vert.gz-sample.txt")

mk_genre

Unnamed: 0,text_id,url,domain,genre,text,text_length
15197,CLASSLA-web.mk.18600,https://radar.mk/?p=29847,radar.mk,Information/Explanation,Топењето на мразот е посебен феномен од 21-от ...,269
46057,CLASSLA-web.mk.138126,https://emiter.com.mk/napis/10922,emiter.com.mk,Information/Explanation,Широката распространетост на РЕЛ-заварувањето ...,204
38999,CLASSLA-web.mk.921342,http://www.interesno.mk/nauka/38-nauka/50238-k...,interesno.mk,Instruction,Кои хороскопски знаци ќе ги погоди Розевата По...,195
48491,CLASSLA-web.mk.384653,https://www.mn.mk/pesni-za-makedonija/6075-Dob...,mn.mk,Prose/Lyrical,Валентина Ѓоргиевска Парго Добриот човек го им...,373
99412,CLASSLA-web.mk.1050389,"http://forum.carclub.mk/index.php/topic,102.ms...",forum.carclub.mk,Forum,Провери си тука http://www.autobulbsdirect.co....,440
...,...,...,...,...,...,...
68027,CLASSLA-web.mk.874985,https://crithink.mk/zloupotrebata-na-naslovite...,crithink.mk,Other,Злоупотребата на насловите за повеќе кликови е...,471
23267,CLASSLA-web.mk.828276,https://skopjeinfo.mk/sakate-da-ste-posrekjni-...,skopjeinfo.mk,Instruction,Сакате да сте посреќни? Намалете го хаосот во ...,444
86580,CLASSLA-web.mk.1247279,http://proverkanafakti.mk/recenzija-brojot-na-...,proverkanafakti.mk,Opinion/Argumentation,"ФАЛБИ БЕЗ СРАМ!<p>Имено, овој исклучително нео...",329
62432,CLASSLA-web.mk.313556,http://1000knigi.mon.gov.mk/book.php?id=1731,1000knigi.mon.gov.mk,Information/Explanation,Сексуално преносливи болести<p>Автори:<p>978-6...,230


In [110]:
mk_genre.describe(include="all")

Unnamed: 0,text_id,url,domain,genre,text,text_length
count,90,90,90,90,90,90.0
unique,90,90,63,9,90,
top,CLASSLA-web.mk.18600,https://radar.mk/?p=29847,forum.carclub.mk,Information/Explanation,Топењето на мразот е посебен феномен од 21-от ...,
freq,1,1,5,10,1,
mean,,,,,,230.355556
std,,,,,,113.578989
min,,,,,,86.0
25%,,,,,,134.5
50%,,,,,,200.5
75%,,,,,,311.75


In [106]:
hr_genre = extract_genre_sample("datasets/CLASSLA-web.hr.1.0.vert.gz-sample.txt")

hr_genre

Unnamed: 0,text_id,url,domain,genre,text,text_length
97669,CLASSLA-web.hr.4346000,https://podravske-sirine.com.hr/arhiva/11701,podravske-sirine.com.hr,Information/Explanation,Batine kao mjera kažnjavanja u Vojnoj krajini<...,202
65197,CLASSLA-web.hr.1088566,https://www.moj-film.hr/film/info/casino-royale/,moj-film.hr,Other,Sinopsis<p>CASINO ROYALE predstavlja JAMESA BO...,121
56399,CLASSLA-web.hr.202668,https://nogometplus.net/guingamp-vise-clanova-...,nogometplus.net,Opinion/Argumentation,Guingamp: Više članova nego stanovnika<p>Prije...,111
38714,CLASSLA-web.hr.3837178,https://hrvatska-danas.com/2021/08/26/splicani...,hrvatska-danas.com,News,Splićanin na radu u Wuhanu: U tri dana testira...,94
60756,CLASSLA-web.hr.643955,https://www.ictbusiness.info/ictbusiness-tv/ic...,ictbusiness.info,News,ICTbusiness TV: Brza i efikasna softverska rje...,133
...,...,...,...,...,...,...
49283,CLASSLA-web.hr.4908191,https://teatarexit.hr/politike-privatnosti/,teatarexit.hr,Legal,Politike privatnosti<p>Online prodaja karata o...,194
30515,CLASSLA-web.hr.3031426,https://www.pula.hr/hr/uprava/gradonacelnik/za...,pula.hr,Legal,Zamjenici gradonačelnika<p>Gradonačelnik ima j...,116
78461,CLASSLA-web.hr.2419400,https://www.zooplus.hr/shop/psi/hrana_za_pse_s...,zooplus.hr,Promotion,Happy Dog Supreme Young Junior Original<p>294....,254
24999,CLASSLA-web.hr.2486817,https://www.cromoda.com/nose-se-cizme-i-tobice...,cromoda.com,Other,Nose se čizme i tobice u kaubojskom stilu: Evo...,100


In [107]:
hr_genre.describe(include="all")

Unnamed: 0,text_id,url,domain,genre,text,text_length
count,90,90,90,90,90,90.0
unique,90,90,84,9,90,
top,CLASSLA-web.hr.4346000,https://podravske-sirine.com.hr/arhiva/11701,metro-portal.hr,Information/Explanation,Batine kao mjera kažnjavanja u Vojnoj krajini<...,
freq,1,1,3,10,1,
mean,,,,,,204.955556
std,,,,,,116.402818
min,,,,,,74.0
25%,,,,,,118.0
50%,,,,,,171.5
75%,,,,,,258.75


In [101]:
sl_genre = extract_genre_sample("datasets/CLASSLA-web.sl.1.0.vert.gz-sample.txt")

sl_genre

Unnamed: 0,text_id,url,domain,genre,text,text_length
47895,CLASSLA-web.sl.693122,https://www.sloski.si/index.php?t=news&amp;amp...,sloski.si,News,Nordijska kombinacija<p>Mladi kombinatorci so ...,454
64374,CLASSLA-web.sl.2342984,http://duhresnice.blog.siol.net/2015/07/26/mli...,duhresnice.blog.siol.net,Prose/Lyrical,Strani<p>Jul<p>26<p>V vertikali je neskončna m...,245
84107,CLASSLA-web.sl.269562,https://www.vzmd.si/novice/mediji-o-malih-deln...,vzmd.si,Other,"RA SLOVENIJA 1, 02.08.2012, DRUGA JUTRANJA KRO...",291
28638,CLASSLA-web.sl.2846516,https://www.drugisvet.com/tag/drzava,drugisvet.com,Opinion/Argumentation,V tej državi se ne znamo lotevati problemov. R...,77
94595,CLASSLA-web.sl.1320205,https://www.sodisce.si/znanje/sodna_praksa/vis...,sodisce.si,Legal,JEDRO: Verjetnost obstoja terjatve se presoja ...,425
...,...,...,...,...,...,...
35571,CLASSLA-web.sl.3521045,http://bos.zrc-sazu.si/c/neva.exe?n=a_si_s&amp...,bos.zrc-sazu.si,Prose/Lyrical,"Mi, kar nas je velikih, smo bili tako zamaknje...",93
45195,CLASSLA-web.sl.420198,https://www.tekac.si/novica/5-razteznih-vaj-za...,tekac.si,Instruction,5 razteznih vaj za tekače<p>Poglejte kako se u...,358
39905,CLASSLA-web.sl.3956557,https://www.igrace-populi.com/izdelek/kuharski...,igrace-populi.com,Promotion,Ta obsežen set za peko bo vsakega mladega kuha...,75
69750,CLASSLA-web.sl.2886985,https://www.ecco-verde.si/erbe-de-janas/indigo...,ecco-verde.si,Instruction,Opis<p>Indigo v prahu je prijeten naraven izde...,146


In [102]:
sl_genre.describe(include="all")

Unnamed: 0,text_id,url,domain,genre,text,text_length
count,90,90,90,90,90,90.0
unique,90,90,79,9,90,
top,CLASSLA-web.sl.693122,https://www.sloski.si/index.php?t=news&amp;amp...,besedilo.si,News,Nordijska kombinacija<p>Mladi kombinatorci so ...,
freq,1,1,5,10,1,
mean,,,,,,203.844444
std,,,,,,121.000038
min,,,,,,64.0
25%,,,,,,108.5
50%,,,,,,156.0
75%,,,,,,267.25


In [104]:
sl_genre[sl_genre["domain"] == "besedilo.si"]

Unnamed: 0,text_id,url,domain,genre,text,text_length
96495,CLASSLA-web.sl.1507108,https://www.besedilo.si/aleksander-jez/vroce-j...,besedilo.si,Prose/Lyrical,"Vroče je, ti pa si hladna<p>Zunaj vroče je in ...",64
55640,CLASSLA-web.sl.1463952,https://www.besedilo.si/werner/ti-ljubezen-si,besedilo.si,Prose/Lyrical,Ti ljubezen si<p>Izvajalec: Werner Izvajalec: ...,110
56091,CLASSLA-web.sl.1506939,https://www.besedilo.si/ansambel-kvinta/nisem-...,besedilo.si,Prose/Lyrical,"Nisem taka kot so druge<p>Nočem rož, ne bombon...",150
96377,CLASSLA-web.sl.1494196,https://www.besedilo.si/ansambel-borst/kdor-je...,besedilo.si,Prose/Lyrical,Kdor je v srcu mlad<p>Svet vrti se vedno v ist...,86
55741,CLASSLA-web.sl.1475509,https://www.besedilo.si/ansambel-petra-finka/r...,besedilo.si,Prose/Lyrical,"Ribič<p>Nad mestom se dan budi, galebi kričijo...",91


## Convert genre sample to JSON file with paragraph structure

In [5]:
# Open samples

sample_paths = {"hr":"datasets/CLASSLA-web.hr.1.0.vert.gz-sample.txt-genre-sample.txt", "mk":"datasets/CLASSLA-web.mk.1.0.vert.gz-sample.txt-genre-sample.txt", "sl":"datasets/CLASSLA-web.sl.1.0.vert.gz-sample.txt-genre-sample.txt", "sq": "datasets/MaCoCu-sq-texts-with-genres.tsv-genre-sample.txt"}

In [7]:
def genre_sample_to_json(sample_paths, lang):
	""" Convert genre sample in TXT to JSON and add translations and paragraph structure.
	
		Args:
		- sample_paths: path to a file, created with the function extract_genre_sample
		- lang: hr, mk, sl"""
	
	sample_df = pd.read_csv(sample_paths[lang], sep="\t", index_col = 0)

	# Change <p> signs to actual new lines
	sample_df["text"] = sample_df["text"].str.replace("<p>", "\n\n")


	# Apply Google Translate and machine translate the data
	import googletrans
	from googletrans import Translator

	# Define the translation model
	translator = Translator()

	# Create the final list
	translation_GT = []

	sentence_list = sample_df["text"].to_list()

	print("Starting translation.")

	# Loop through the list of original sentences,
	# translate each and append the translation to the final list
	for i in sentence_list:
		# Translate the sentence from Slovene (src = "sl") to English (dest = "en")
			current_translation = translator.translate(i, src = lang, dest='en')
		# Append the translated sentence to the final list
			translation_GT.append(current_translation.text)

	print("Translation finished.")

	# Append translations to the sample

	sample_df["translation"] = translation_GT

	# Save to JSON lines
	sample_df.to_json("datasets/CLASSLA-web.{}.1.0.-translated-genre-sample.jsonl".format(lang), orient="records", lines=True)

	print("Final file saved as datasets/CLASSLA-web.{}.1.0.-translated-genre-sample.jsonl".format(lang))

	# Create also a version for the annotation tool, only with translation and labels
	ann_df = sample_df[["translation","genre"]]

	# For annotation, each label should be in a list
	ann_df["genre"] = ann_df["genre"].apply(lambda x:[x])

	# Rename df
	ann_df.columns = ["text", "label"]

	# Add metadata
	text_ids = sample_df["text_id"].to_list()
	domains = sample_df["domain"].to_list()

	metadata_list = []

	for i in list(zip(text_ids,domains)):
		metadata = {"text_id": i[0], "domain": i[1]}
		metadata_list.append(metadata)

	ann_df["metadata"] = metadata_list

	# Save to JSON lines
	ann_df.to_json("datasets/CLASSLA-web.{}.1.0.-translated-genre-sample-for-annotation.jsonl".format(lang), orient="records", lines=True)

	print("File for annotation saved as datasets/CLASSLA-web.{}.1.0.-translated-genre-sample-for-annotation.jsonl".format(lang))

	return sample_df


In [50]:
sample_hr = genre_sample_to_json(sample_paths, "mk")

sample_hr.head()

Starting translation.
Translation finished.
Final file saved as datasets/CLASSLA-web.mk.1.0.-translated-genre-sample.jsonl
File for annotation saved as datasets/CLASSLA-web.mk.1.0.-translated-genre-sample-for-annotation.jsonl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ann_df["genre"] = ann_df["genre"].apply(lambda x:[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ann_df["metadata"] = metadata_list


Unnamed: 0,text_id,url,domain,genre,text,text_length,translation
15197,CLASSLA-web.mk.18600,https://radar.mk/?p=29847,radar.mk,Information/Explanation,Топењето на мразот е посебен феномен од 21-от ...,269,Melting ice is a special 21st century phenomen...
46057,CLASSLA-web.mk.138126,https://emiter.com.mk/napis/10922,emiter.com.mk,Information/Explanation,Широката распространетост на РЕЛ-заварувањето ...,204,The widespread distribution of the relief and ...
38999,CLASSLA-web.mk.921342,http://www.interesno.mk/nauka/38-nauka/50238-k...,interesno.mk,Instruction,Кои хороскопски знаци ќе ги погоди Розевата По...,195,What zodiac signs would hit the pink full moon...
48491,CLASSLA-web.mk.384653,https://www.mn.mk/pesni-za-makedonija/6075-Dob...,mn.mk,Prose/Lyrical,Валентина Ѓоргиевска Парго Добриот човек го им...,373,Valentina Gjorgievska Pargo The good man has s...
99412,CLASSLA-web.mk.1050389,"http://forum.carclub.mk/index.php/topic,102.ms...",forum.carclub.mk,Forum,Провери си тука http://www.autobulbsdirect.co....,440,Check out here http://www.autobulbsdirect.co.u...


## Create JSONL file from MaCoCu.TSV

Code to create JSONL file and genre sample from the MaCoCu.TSV file - in case of corpora that do not have a VERT format (haven't been linguistically annotated yet).

In [25]:
genre_file = pd.read_csv("/cache/tajak/macocu-mt/MaCoCu-mt-2.0.tsv-genres.tsv", sep="\t", names = ["text_id", "genre"])

display(genre_file.head())

Unnamed: 0,text_id,genre
0,macocu.mt.1,Other
1,macocu.mt.2,Prose/Lyrical
2,macocu.mt.5,Information/Explanation
3,macocu.mt.7,Prose/Lyrical
4,macocu.mt.9,Mix


In [26]:
genre_file.shape

(513904, 2)

In [27]:
# Extract texts

texts = []
text_ids = []

for line in open("/cache/tajak/macocu-mt/MaCoCu-mt-2.0.tsv"):
    did,text=line.strip().split('\t')
    texts.append(text)
    text_ids.append(did)

print(len(texts), len(text_ids))

513904 513904


In [28]:
# Create a df out of the extacted texts and ids
text_file = pd.DataFrame({"text_id":text_ids,"text":texts})

text_file.head(3)

Unnamed: 0,text_id,text
0,macocu.mt.1,Bil-Filmat: Diversi Żgħażagħ Barranin Qaqoċċa ...
1,macocu.mt.2,Itlob biex jiġi jżurek Xhud taʼ Ġeħova biex ti...
2,macocu.mt.5,"tal-Għaqda Dilettanti Knisja ta' Lapsi, San Ġi..."


In [29]:
# Merge the dfs

df = text_file.merge(genre_file, on="text_id")

df.head()


Unnamed: 0,text_id,text,genre
0,macocu.mt.1,Bil-Filmat: Diversi Żgħażagħ Barranin Qaqoċċa ...,Other
1,macocu.mt.2,Itlob biex jiġi jżurek Xhud taʼ Ġeħova biex ti...,Prose/Lyrical
2,macocu.mt.5,"tal-Għaqda Dilettanti Knisja ta' Lapsi, San Ġi...",Information/Explanation
3,macocu.mt.7,"Intant, ħu pjaċir. U jekk tkun trid tgħaddilna...",Prose/Lyrical
4,macocu.mt.9,Qegħdin hawn biex ngħinuk. <p>Il-Family Planni...,Mix


In [30]:
df.shape

(513904, 3)

In [31]:
# Add text length information
df["text_length"] = df.text.apply(lambda x:len(x.split()))

df.head()

Unnamed: 0,text_id,text,genre,text_length
0,macocu.mt.1,Bil-Filmat: Diversi Żgħażagħ Barranin Qaqoċċa ...,Other,102
1,macocu.mt.2,Itlob biex jiġi jżurek Xhud taʼ Ġeħova biex ti...,Prose/Lyrical,124
2,macocu.mt.5,"tal-Għaqda Dilettanti Knisja ta' Lapsi, San Ġi...",Information/Explanation,297
3,macocu.mt.7,"Intant, ħu pjaċir. U jekk tkun trid tgħaddilna...",Prose/Lyrical,413
4,macocu.mt.9,Qegħdin hawn biex ngħinuk. <p>Il-Family Planni...,Mix,367


In [32]:
df.describe(include="all")

Unnamed: 0,text_id,text,genre,text_length
count,513904,513904,513904,513904.0
unique,513904,513904,10,
top,macocu.mt.1,Bil-Filmat: Diversi Żgħażagħ Barranin Qaqoċċa ...,Information/Explanation,
freq,1,1,210002,
mean,,,,423.462176
std,,,,126.537873
min,,,,76.0
25%,,,,366.0
50%,,,,512.0
75%,,,,512.0


In [35]:
print(df.genre.value_counts(normalize="True").to_markdown())

| genre                   |   proportion |
|:------------------------|-------------:|
| Information/Explanation |  0.408641    |
| Mix                     |  0.242082    |
| Forum                   |  0.101274    |
| Opinion/Argumentation   |  0.0943892   |
| Other                   |  0.0712001   |
| Prose/Lyrical           |  0.0700773   |
| Instruction             |  0.00454365  |
| Promotion               |  0.00443857  |
| News                    |  0.00332747  |
| Legal                   |  2.72424e-05 |


In [36]:
print(df.genre.value_counts().to_markdown())

| genre                   |   count |
|:------------------------|--------:|
| Information/Explanation |  210002 |
| Mix                     |  124407 |
| Forum                   |   52045 |
| Opinion/Argumentation   |   48507 |
| Other                   |   36590 |
| Prose/Lyrical           |   36013 |
| Instruction             |    2335 |
| Promotion               |    2281 |
| News                    |    1710 |
| Legal                   |      14 |


In [37]:
# Save the file as it is
df.to_csv("/cache/tajak/macocu-mt/MaCoCu-mt-texts-with-genres.tsv", sep="\t")

In [16]:
# Compare the file created with original code and the file created with the extended improved code
old_file = pd.read_csv("/cache/tajak/macocu-mt/MaCoCu-mt-texts-with-genres.tsv", sep="\t", index_col = 0)

display(old_file)

new_file = pd.read_json("/cache/tajak/macocu-mt/MaCoCu-mt-2.0.tsv-genre-annotated.jsonl", orient="records", lines=True)

display(new_file)

print(old_file.shape, new_file.shape)

Unnamed: 0,text_id,text,genre
0,macocu.sq.1,Te rejat Fondi Shqiptar i Zhvillimit ka hapur ...,News
1,macocu.sq.2,You are using an out of date browser. It may n...,Forum
2,macocu.sq.3,Shkarkoni aplikacionin në celularin tuaj. Nga ...,News
3,macocu.sq.4,"Persona që nuk kanë një kult, që nuk kanë një ...",Opinion/Argumentation
4,macocu.sq.7,4 risitë e Bindjes Demokratike në këto zgjedhj...,Mix
...,...,...,...
1303397,macocu.sq.1684318,Enter full names of the beneficiary of the acc...,Legal
1303398,macocu.sq.1684319,Platforma Magazina franceze “Closer” pretendon...,News
1303399,macocu.sq.1684320,“Ky aktvendim u mor për arsye se ekziston dysh...,News
1303400,macocu.sq.1684321,"Saturday, 30 November 2013 Ka ndërruar jetë so...",News


Unnamed: 0,document_id,text,genre,logit
0,macocu.mt.1,Bil-Filmat: Diversi Żgħażagħ Barranin Qaqoċċa ...,Other,"[6.211301326751709, -1.123962044715881, -2.108..."
1,macocu.mt.2,Itlob biex jiġi jżurek Xhud taʼ Ġeħova biex ti...,Prose/Lyrical,"[1.873501896858215, -0.670612275600433, -1.697..."
2,macocu.mt.5,"tal-Għaqda Dilettanti Knisja ta' Lapsi, San Ġi...",Information/Explanation,"[-0.45298385620117104, 8.026633262634277, -1.0..."
3,macocu.mt.7,"Intant, ħu pjaċir. U jekk tkun trid tgħaddilna...",Prose/Lyrical,"[0.431536048650741, -1.36893618106842, -1.2726..."
4,macocu.mt.9,Qegħdin hawn biex ngħinuk. <p>Il-Family Planni...,Mix,"[3.972864389419555, -0.131693825125694, -2.675..."
...,...,...,...,...
187995,macocu.mt.199503,Kif issir bartender: professjonisti Tips <p>Il...,Opinion/Argumentation,"[1.5924533605575562, -1.841930270195007, -1.72..."
187996,macocu.mt.199504,Pulpetti homemade fenek: diversi riċetti <p>Il...,Mix,"[1.042439937591552, 4.791776657104492, -1.8440..."
187997,macocu.mt.199505,Temperatura wara t-tilqima: hemm periklu? <p>G...,Information/Explanation,"[-0.8246461749076841, 8.31525707244873, -0.820..."
187998,macocu.mt.199506,Kif taħsad il-ħafur biex jitilfu l-piż? <p>Laħ...,Other,"[4.6228227615356445, 1.88374388217926, -2.4615..."


(1303402, 3) (188000, 4)


### Create a genre sample

For the sample, we will remove the "Mix" categories.

In [20]:
sample_path = "datasets/MaCoCu-sq-texts-with-genres.tsv"

In [27]:
genre_sample_sq = extract_genre_sample(sample_path)

In [28]:
genre_sample_sq.head()

Unnamed: 0.1,Unnamed: 0,text_id,text,genre,text_length
814690,814690,macocu.sq.1061396,Blog “Unë të kam dashur me një dashuri të përj...,Opinion/Argumentation,341
1080203,1080203,macocu.sq.1408163,"Fronti Bashkimit Kombëtar Shqiptar (FBKSH), or...",Promotion,231
143319,143319,macocu.sq.183383,Liria nga keqtrajtimi Konventa e të Drejtave t...,Legal,140
914937,914937,macocu.sq.1191613,"Një milimetër larg golit, “VAR” bëhet makth pë...",News,115
846370,846370,macocu.sq.1104611,Ç’MENDONI JU? Pyetje: Nena ime ka pare enderr ...,Other,342


## Create a file from the MaCoCu PREVERT corpus

In [1]:
# Unzip the ZIP folder with the files
import zipfile

folder = "datasets/MaCoCu-sq-1.0.xml.zip"

with zipfile.ZipFile(folder, 'r') as zip_ref:
    zip_ref.extractall()

In [46]:
def merge_prevert_with_sample(sample_path, prevert_path):
    from prevert import dataset

    # Open the file that has the texts with genres sample
    sample = pd.read_csv(sample_path, sep="\t", index_col = 0)

    # Extract the list of all text ids
    text_ids = sample.text_id.to_list()

    prevert_texts = {}
    domains = {}

    # Open the dataset with the prevert parser 
    dset = dataset(prevert_path)

    # loop through the documents in MaCoCu prevert corpus and add
    # the document to the sample list if its id is in the genre sample
    for doc in tqdm(dset): # iterating through documents of a dataset
        current_text = ""
        current_text_id = doc.meta["id"]
        current_domain = doc.meta["domain"]
        if any(text_id == current_text_id for text_id in text_ids):
            for par in doc: # iterating through paragraphs of a document
                current_text += str(par)
                current_text += "\n"
            prevert_texts[current_text_id] = current_text
            domains[current_text_id] = current_domain
        else:
            continue

    print("Processing finished.")

    # Then append the new information to a df
    prevert_df = pd.DataFrame(list(zip(domains.keys(), domains.values(), prevert_texts.values())), columns=["text_id", 'domain', 'text'])

    # Merge the new dataframe to the sample dataframe
    final_df = sample.merge(prevert_df, on="text_id")

    # Remove unnecessary columns
    final_df = final_df.drop(columns=["Unnamed: 0", "text_x"])

    # Rename the text_y column
    final_df.rename(columns={"text_y":"text"}, inplace=True)

    # Save the file
    final_df.to_csv("{}-extracted-text-from-prevert.csv".format(sample_path))

    print("File created and saved as {}-extracted-text-from-prevert.csv".format(sample_path))

    return final_df

In [None]:
merge_prevert_with_sample("datasets/MaCoCu-sq-texts-with-genres.tsv-genre-sample.txt", "datasets/MaCoCu-sq-1.0.xml")

In [44]:
def prevert_enriched_sample_to_json(file_path, lang):
	""" Convert genre sample, enriched with the texts from the prevert corpus that have paragraph structure to a JSON file and translate them. This function is meant for corpora which were not extracted from VERT files, e.g. SQ corpus.

		Args:
		- file_path: path to the genre sample, created with the function merge_prevert_with_sample()
		- lang: sq"""

	sample_df = pd.read_csv(file_path, index_col = 0)

	# Apply Google Translate and machine translate the data
	import googletrans
	from googletrans import Translator

	# Define the translation model
	translator = Translator()

	# Create the final list
	translation_GT = []

	sentence_list = sample_df["text"].to_list()

	print("Starting translation.")

	# Loop through the list of original sentences,
	# translate each and append the translation to the final list
	for i in sentence_list:
		# Translate the sentence from source language, e.g. Slovene (src = "sl") to English (dest = "en")
			current_translation = translator.translate(i, src = lang, dest='en')
		# Append the translated sentence to the final list
			translation_GT.append(current_translation.text)

	print("Translation finished.")

	# Append translations to the sample

	sample_df["translation"] = translation_GT

	# Save to JSON lines
	sample_df.to_json("datasets/CLASSLA-web.{}.1.0.-translated-genre-sample.jsonl".format(lang), orient="records", lines=True)

	print("Final file saved as datasets/CLASSLA-web.{}.1.0.-translated-genre-sample.jsonl".format(lang))

	# Create also a version for the annotation tool, only with translation and labels
	ann_df = sample_df[["translation","genre"]]

	# For annotation, each label should be in a list
	ann_df["genre"] = ann_df["genre"].apply(lambda x:[x])

	# Rename df
	ann_df.columns = ["text", "label"]

	# Add metadata
	text_ids = sample_df["text_id"].to_list()
	domains = sample_df["domain"].to_list()

	metadata_list = []

	for i in list(zip(text_ids,domains)):
		metadata = {"text_id": i[0], "domain": i[1]}
		metadata_list.append(metadata)

	ann_df["metadata"] = metadata_list

	# Save to JSON lines
	ann_df.to_json("datasets/CLASSLA-web.{}.1.0.-translated-genre-sample-for-annotation.jsonl".format(lang), orient="records", lines=True)

	print("File for annotation saved as datasets/CLASSLA-web.{}.1.0.-translated-genre-sample-for-annotation.jsonl".format(lang))

	return sample_df


In [47]:
prevert_enriched_sample_to_json("datasets/MaCoCu-sq-texts-with-genres.tsv-genre-sample.txt-extracted-text-from-prevert.csv", "sq")

Starting translation.
Translation finished.
Final file saved as datasets/CLASSLA-web.sq.1.0.-translated-genre-sample.jsonl
File for annotation saved as datasets/CLASSLA-web.sq.1.0.-translated-genre-sample-for-annotation.jsonl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ann_df["genre"] = ann_df["genre"].apply(lambda x:[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ann_df["metadata"] = metadata_list


Unnamed: 0,text_id,genre,text_length,domain,text,translation
0,macocu.sq.1061396,Opinion/Argumentation,341,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31..."
1,macocu.sq.1408163,Promotion,231,zemrashqiptare.net,"Fronti Bashkimit Kombëtar Shqiptar (FBKSH), or...","The Albanian National Union Front (FBKSH), the..."
2,macocu.sq.183383,Legal,140,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...
3,macocu.sq.1191613,News,115,sportekspres.com,"Një milimetër larg golit, “VAR” bëhet makth pë...","A millimeter away from goal, ""Var"" becomes nig..."
4,macocu.sq.1104611,Other,342,burimijetes.com,Ç’MENDONI JU?\n\nPyetje: Nena ime ka pare ende...,What do you think?\n\nQuestion: My mother has ...
...,...,...,...,...,...,...
85,macocu.sq.915191,Forum,235,intervista.al,Gazeta Intervista\n\nGazeta Intervista\n\nPas ...,Interview newspapers\n\nInterview newspapers\n...
86,macocu.sq.1060790,Information/Explanation,121,radioiliria.net,Aleksandra Stan dhe superhiti i saj “Mr.Saxobe...,"Alexandra Stan and her superhiti ""Mr.Axobeat""\..."
87,macocu.sq.581026,Opinion/Argumentation,437,novamedia.al,Përfundon operacioni i larjes se halesë se Edv...,The operation of Edvin Czech's dump mask is co...
88,macocu.sq.1118037,Other,223,burimijetes.com,A LEJOHET TË THEM “EL HAMDU LILAH” KUR TESHTIJ...,"Is it permissible to say ""Al Hamdu lilah"" when..."


## Create a remaining file for annotation - in case you annotated only a part of corpus

In [4]:
# Open the annotated part

annotated = pd.read_json("datasets/CLASSLA-sq-samo-oznaceni-primeri.jsonl", lines=True)
annotated.head(2)

Unnamed: 0,id,text,metadata,label,Comments
0,1,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",[Opinion/Argumentation],[]
1,2,"The Albanian National Union Front (FBKSH), the...","{'text_id': 'macocu.sq.1408163', 'domain': 'ze...",[Other],[]


In [6]:
annotated.tail()

Unnamed: 0,id,text,metadata,label,Comments
49,50,"Altin Toska, CV\n\nThis is the vital curriculu...","{'text_id': 'macocu.sq.1616592', 'domain': 'pu...",[Information/Explanation],[]
50,51,Dermocosmetics\n\nLA ROCHE POSAY - TOLERIAN CA...,"{'text_id': 'macocu.sq.1486735', 'domain': 'fi...",[Promotion],[]
51,52,Does men's sleep worse with full moon?The stud...,"{'text_id': 'macocu.sq.797062', 'domain': 'val...",[Information/Explanation],[]
52,53,Dear reader!\n\nTitle: Dear Reader!14.09.10 6:...,"{'text_id': 'macocu.sq.1576314', 'domain': 'yo...",[Information/Explanation],[]
53,54,"For example, it is preparing to take responsib...","{'text_id': 'macocu.sq.463621', 'domain': 'rre...",[Opinion/Argumentation],[]


In [11]:
annotated.label.value_counts()

label
[Forum]                      8
[Information/Explanation]    8
[Prose/Lyrical]              7
[Instruction]                7
[Opinion/Argumentation]      6
[Legal]                      5
[Promotion]                  5
[News]                       4
[Other]                      2
[Incomprehensible]           1
[Multiple texts]             1
Name: count, dtype: int64

In [5]:
# Open the entire genre sample for annotation
sample = pd.read_json("datasets/CLASSLA-web.sq.1.0.-translated-genre-sample-for-annotation.jsonl", lines= True)

sample.head(2)

Unnamed: 0,text,label,metadata
0,"Blog\n\n""I loved you with eternal love.""Jer 31...",[Opinion/Argumentation],"{'text_id': 'macocu.sq.1061396', 'domain': 'fj..."
1,"The Albanian National Union Front (FBKSH), the...",[Promotion],"{'text_id': 'macocu.sq.1408163', 'domain': 'ze..."


In [8]:
sample[52:55]

Unnamed: 0,text,label,metadata
52,Dear reader!\n\nTitle: Dear Reader!14.09.10 6:...,[Other],"{'text_id': 'macocu.sq.1576314', 'domain': 'yo..."
53,"For example, it is preparing to take responsib...",[Opinion/Argumentation],"{'text_id': 'macocu.sq.463621', 'domain': 'rre..."
54,"Edi Rama's star, here's what symbolizes\n\nEdi...",[Opinion/Argumentation],"{'text_id': 'macocu.sq.372579', 'domain': 'jav..."


In [7]:
unanno = sample[54:]
unanno.head()

Unnamed: 0,text,label,metadata
54,"Edi Rama's star, here's what symbolizes\n\nEdi...",[Opinion/Argumentation],"{'text_id': 'macocu.sq.372579', 'domain': 'jav..."
55,"If you are always tired, you can actually suff...",[Instruction],"{'text_id': 'macocu.sq.1144450', 'domain': 'ar..."
56,"Enion Cala\n\nProduction, sale, import -export...",[Information/Explanation],"{'text_id': 'macocu.sq.1325438', 'domain': 'op..."
57,The secret falls!Now transparency on Albanians...,[News],"{'text_id': 'macocu.sq.959055', 'domain': 'shq..."
58,Dermedic Sunbrella Sun Protection Spf 50+\n\nP...,[Promotion],"{'text_id': 'macocu.sq.296193', 'domain': 'far..."


In [10]:
# Save the unannotated part
unanno.to_json("datasets/CLASSLA-sq-remaining-part-to-be-annotated.jsonl", orient="records", lines=True)