In [20]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=6

import pandas as pd
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=6


Calculate token distribution - vector of values per tokens (how many times does each token occur) - create a dictionary with all tokens from train set and specific test set, iterate through the tokens and count how many times each ocurrs. Calculate cosine similarity.

Do this on label level as well to see whether this explains good performance on some of the labels for Maltese.

## Tokenize and count tokens for train_df

Code for tokenization (it is now already done):

In [18]:
# Import the train dataset
train = load_dataset("TajaKuzman/X-GENRE-multilingual-text-genre-dataset", "train")

# To open as Pandas DataFrame:
train_df = pd.DataFrame(train["train"])

display(train_df.head(2))

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the train dataset
tokens_train = []
integers_train = []
token_list_train = []

for text in tqdm(train_df.text.to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens_train, except the beginning (<s>) and end (</s>) token
	current_tokens_train = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	tokens_train.append(current_tokens_train)
	token_list_train.extend(current_tokens_train)
	integers_train.append(encoded_text.input_ids[1:-1])

train_df["tokens_train"] = tokens_train
train_df["token_ids"] = integers_train

#print(token_list_train[:10])
#print(len(token_list_train))

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

train_df.head(3)

# Save the tokenized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English


  0%|          | 0/1772 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1810 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1772/1772 [00:09<00:00, 189.74it/s]


699465


Code to count tokens:

In [2]:
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51..."


In [11]:
# Create a dictionary that counts all the token occurrences

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

# Create a dictionary which counts the occurrences of the words

word_dict_train = Counter(train_tokens_shortened)

# Sort the dictionary alphabetically based on keys
word_dict_train = dict(sorted(word_dict_train.items()))

print(list(word_dict_train.items())[:100])
print(len(word_dict_train))

699465
[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1), ('!!!!!!', 1), ('!!!!!!!', 1), ('!"', 14), ('!)', 10), ('!),', 1), ('!).', 2), ('"', 528), ('")', 7), ('"),', 6), ('").', 7), ('",', 56), ('".', 83), ('"...', 3), ('";', 3), ('"?', 6), ('#', 4), ('$', 8), ('%', 4), ('%)', 1), ('&', 46), ("'", 4517), ('(', 31), ('(1', 1), (')', 788), ('),', 242), (').', 297), ('):', 13), (');', 5), ('*', 19), ('**', 2), ('****', 1), ('+', 15), ('+5', 1), (',', 23447), (',«', 12), ('-', 2803), ('---', 3), ('------', 5), ('----------------', 41), ('-0', 3), ('-01', 1), ('-02', 2), ('-02-', 2), ('-03-', 4), ('-06', 1), ('-06-', 3), ('-09-', 1), ('-1', 15), ('-1)', 3), ('-10', 2), ('-10-', 1), ('-11', 6), ('-11-', 1), ('-12', 8), ('-13', 6), ('-14', 3), ('-15', 7), ('-16', 9), ('-17', 3), ('-18', 9), ('-19', 5), ('-2', 10), ('-20', 3), ('-2000', 3), ('-2005', 1), ('-2007', 8), ('-2009', 1), ('-2010', 1), ('-2011', 1), ('-2012', 1), ('-2014', 1), ('-2020', 1), ('-21', 4), ('-22', 3), ('-

The train dataset has 699.465 tokens and 27.025 unique words.

In [12]:
# See the most frequent tokens:

# Sort the dictionary by values (word counts) in descending order
sorted(word_dict_train.items(), key=lambda x: x[1], reverse=True)[:10]


[(',', 23447),
 ('.', 21407),
 ('▁', 19553),
 ('▁the', 18860),
 ('s', 14184),
 ('▁to', 10762),
 ('▁of', 9912),
 ('▁and', 9752),
 ('▁in', 9140),
 ('▁a', 8341)]

In [14]:
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "w") as train_count_file:
	json.dump(word_dict_train, train_count_file)

# Tokenize and count tokens for test sets

Code with which I tokenized the datasets:

In [15]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [None]:
# Define an array of token overlap
token_overlap_results = {}

# Loop through the datasets and calculate token overlap
for lang in list(main_dict.keys()):
	df = pd.DataFrame(main_dict[lang]["dataset"])

	tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

	tokens = []
	integers = []
	token_list = []

	print("Tokenizing text.")

	for text in tqdm(df.text.to_list()):
		encoded_text = tokenizer(text)
		# Take all tokens, except the beginning (<s>) and end (</s>) token
		current_tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
		# Shorten the list to 512, as tokens after that were not observed by the classifier
		current_tokens = current_tokens[:512]
		tokens.append(current_tokens)
		token_list.extend(current_tokens)
		integers.append(encoded_text.input_ids[1:-1][:512])

	df["tokens"] = tokens
	df["token_ids"] = integers

	print(token_list[:10])
	print("All tokens:")
	print(len(token_list))

	# See how many tokens overlap
	overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_token_list = []

	#overlap_counter = sum(1 for element in token_list if element in train_tokens_shortened)
	for token in tqdm(token_list):
		if token in train_tokens_shortened:
			overlap_counter += 1
			overlap_token_list.append(token)

	# Out of all tokens, how many overlap?
	overlap_per = overlap_counter/len(token_list)

	print(f"Number of tokens that overlap: {overlap_counter}")
	print(f"Percentage of overlap: {overlap_per}")

	# Update the dataset in the dictionary
	main_dict[lang]["dataset"] = df.to_dict()

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"] = {"overlap_percentage":overlap_per, "token_list": token_list, "overlap_token_list":overlap_token_list}

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list": overlap_token_list}

	# Convert tokens back to words
	#print(tokenizer.convert_tokens_to_string(tokens))

# Inspect the results
overlap_df = pd.DataFrame(token_overlap_results)
	

In [26]:
overlap_df = pd.DataFrame(token_overlap_results).transpose()
overlap_df

Unnamed: 0,percentage,overlap_list
mt,0.817085,"[▁Angel, o, ▁Che, t, ,, ▁se, ▁j, kun, d, u, ▁p..."
el,0.161428,"[asi, asi, ,, ,, ▁Re, ception, ., ▁driver, ▁es..."
tr,0.521502,"[▁A, L, ▁Der, s, i, ▁ve, ▁Beli, r, leme, ▁S, h..."
sq,0.605775,"[▁Blog, ▁“, U, ▁kam, ▁me, jet, .”, ▁Jer, ▁31, ..."
is,0.517575,"[▁[, is, ], ▁er, fi, ▁reg, ▁sett, ar, lag, sin..."
uk,0.156675,"[., ,, ,, ?, ▁-, ., ?, ▁-, ', ., ,, ., ▁, ▁(, ..."
ca,0.744881,"[▁P, à, gine, s, ▁En, nada, ▁Porto, ▁uns, ▁die..."
mk,0.145989,"[,, ▁T, CL, ,, ▁T, CL, :, ▁Alca, tel, ▁Mobile,..."
hr,0.821517,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, ,, ▁za, ▁pa..."
sl,0.974289,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,..."


In [36]:
for token in overlap_df["overlap_list"].to_list()[0][:2]:
	if len(token) > 1:
		non_short_counter += 1

▁Angel


In [37]:
# Inspect how many tokens are one character long

counters = []
pers = []

for token_list_el in overlap_df["overlap_list"].to_list():
	non_short_counter = 0
	for token in token_list_el:
		if len(token) > 1:
			non_short_counter += 1
	
	counters.append(non_short_counter)
	pers.append(non_short_counter/len(token_list_el))

# Add to df
overlap_df["non_short"] = counters

# Calculate the percentage of overlap tokens that have more than 1 character
overlap_df["non_short_per"] = pers

overlap_df


Unnamed: 0,percentage,overlap_list,non_short,non_short_per
mt,0.817085,"[▁Angel, o, ▁Che, t, ,, ▁se, ▁j, kun, d, u, ▁p...",23083,0.723628
el,0.161428,"[asi, asi, ,, ,, ▁Re, ception, ., ▁driver, ▁es...",1883,0.373389
tr,0.521502,"[▁A, L, ▁Der, s, i, ▁ve, ▁Beli, r, leme, ▁S, h...",11141,0.722269
sq,0.605775,"[▁Blog, ▁“, U, ▁kam, ▁me, jet, .”, ▁Jer, ▁31, ...",12459,0.768315
is,0.517575,"[▁[, is, ], ▁er, fi, ▁reg, ▁sett, ar, lag, sin...",10915,0.711399
uk,0.156675,"[., ,, ,, ?, ▁-, ., ?, ▁-, ', ., ,, ., ▁, ▁(, ...",1258,0.253476
ca,0.744881,"[▁P, à, gine, s, ▁En, nada, ▁Porto, ▁uns, ▁die...",15657,0.763123
mk,0.145989,"[,, ▁T, CL, ,, ▁T, CL, :, ▁Alca, tel, ▁Mobile,...",1270,0.314746
hr,0.821517,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, ,, ▁za, ▁pa...",17678,0.81062
sl,0.974289,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...",21567,0.841935


In [49]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

Add token counts information

In [23]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [25]:
token_number = {}
type_number = {}

for lang in list(main_dict.keys()):
	print(f"Creating token dict for {lang}")
	current_token_list = main_dict[lang]["token_overlap"]["token_list"]

	print(f"No of tokens: {len(current_token_list)}")

	# Create a dictionary which counts the occurrences of the tokens

	word_dict_test = Counter(current_token_list)

	# Sort the dictionary alphabetically based on keys
	word_dict_test = dict(sorted(word_dict_test.items()))

	# Add information on no. of tokens and words to a dict
	token_number[lang] = len(current_token_list)
	type_number[lang] = len(word_dict_test)

	print(list(word_dict_test.items())[:100])
	print(f"No of unique tokens: {len(word_dict_test)}")

	# Add the count of tokens to the main dictionary
	main_dict[lang]["token_overlap"]["token_count"] = word_dict_test

# Create a dataframe for statistics
token_df = pd.DataFrame({"tokens": token_number, "types": type_number})
print(token_df.to_markdown(index=False))

Creating token dict for mt
No of tokens: 39040
[('!', 9), ('"', 34), ('",', 11), ('".', 6), ('#', 20), ("'", 462), ('(', 3), ('(1)', 4), ('(2)', 4), ('(3)', 4), (')', 50), ('),', 15), (').', 15), (');', 1), (',', 884), ('-', 2119), ('-0', 1), ('-1', 1), ('-11', 1), ('-12-', 2), ('-16', 5), ('-17', 1), ('-18', 2), ('-19', 1), ('-2', 5), ('-20', 1), ('-200', 1), ('-2000', 1), ('-2007', 1), ('-2009', 1), ('-2014', 1), ('-2015', 1), ('-2016', 2), ('-2017', 2), ('-21', 1), ('-22', 1), ('-23', 1), ('-24', 2), ('-28', 1), ('-30', 4), ('-35', 1), ('-4', 1), ('-5', 1), ('-500', 1), ('-6', 1), ('-7', 1), ('-90', 2), ('.', 724), ('."', 5), ('...', 1), ('.”', 3), ('/', 11), ('/08', 3), ('/13', 1), ('/14', 3), ('/19', 4), ('/24', 2), ('/3', 6), ('/4', 2), ('/5', 2), ('/7', 2), ('0.7', 1), ('00', 1), ('016', 1), ('02', 2), ('04', 2), ('05', 1), ('050', 1), ('09.', 2), ('1', 5), ('100', 1), ('112', 1), ('12', 2), ('135', 2), ('14', 1), ('152', 1), ('164', 1), ('19', 5), ('1962', 1), ('1998', 1), ('2'

In [26]:
print(token_df.to_markdown())

|    |   tokens |   types |
|:---|---------:|--------:|
| mt |    39040 |    4787 |
| el |    31240 |    4751 |
| tr |    29578 |    6272 |
| sq |    26769 |    4891 |
| is |    29644 |    4615 |
| uk |    31677 |    6507 |
| ca |    27544 |    5314 |
| mk |    27639 |    5468 |
| hr |    26546 |    6222 |
| sl |    26292 |    5763 |


In [35]:
# Let's see the most frequent tokens
most_frequent = {}
# Sort the dictionary by values (word counts) in descending order
for lang in list(main_dict.keys()):
	most_frequent[lang] = (sorted(main_dict[lang]['token_overlap']['token_count'].items(), key=lambda x: x[1], reverse=True)[:10])

print(pd.DataFrame({"most_frequent_type": most_frequent}).to_markdown())

|    | most_frequent_type                                                                                                                       |
|:---|:-----------------------------------------------------------------------------------------------------------------------------------------|
| ca | [(',', 1080), ('▁de', 1014), ('.', 675), ('s', 648), ('▁i', 564), ('▁la', 559), ('▁a', 529), ('▁que', 438), ("'", 360), ('’', 334)]      |
| el | [('▁', 1017), ('.', 801), (',', 782), ('▁και', 553), ('ς', 525), ('▁να', 351), ('▁το', 321), ('▁του', 292), ('▁την', 268), ('▁με', 264)] |
| hr | [(',', 878), ('.', 766), ('▁i', 546), ('▁u', 430), ('a', 413), ('▁je', 350), ('▁na', 282), ('▁za', 253), ('▁se', 239), ('e', 219)]       |
| is | [('.', 1017), ('▁og', 628), ('▁að', 613), (',', 600), ('▁', 528), ('▁í', 434), ('▁á', 388), ('▁er', 357), ('s', 326), ('▁sem', 279)]     |
| mk | [(',', 1021), ('▁на', 983), ('.', 738), ('▁и', 619), ('▁за', 475), ('▁да', 378), ('▁во', 376), ('▁се', 365), ('▁', 34

In [36]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

# Compare train df and test set overlap

Create a dictionary with all tokens from train set and specific test set, iterate through the tokens and count how many times each ocurrs. Calculate cosine similarity.

Do this on label level as well to see whether this explains good performance on some of the labels for Maltese.

Cosine similarity is a metric used to measure the similarity of two vectors. Specifically, it measures the similarity in the direction or orientation of the vectors ignoring differences in their magnitude or scale. Both vectors need to be part of the same inner product space, meaning they must produce a scalar through inner product multiplication. The similarity of two vectors is measured by the cosine of the angle between them. The similarity can take values between -1 and +1. Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity. 

Cosine similarity ignores 0-0 matches. Counting 0-0 matches in sparse data would inflate similarity scores. Another commonly used metric that ignores 0-0 matches is Jaccard Similarity.

In [2]:
def cosine_similarity(x, y):
    
    # Ensure length of x and y are the same
    if len(x) != len(y) :
        return None
    
    # Compute the dot product between x and y
    dot_product = np.dot(x, y)
    
    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2)) 
    magnitude_y = np.sqrt(np.sum(y**2))
    
    # Compute the cosine similarity
    cosine_similarity = dot_product / (magnitude_x * magnitude_y)
    
    return cosine_similarity

In [3]:
# Import train token count
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "r") as train_count_file:
	train_count = json.load(train_count_file)

list(train_count.items())[:5]

[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1)]

In [4]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

list(main_dict["sl"]["token_overlap"]["token_count"].items())[:10]

[('!', 30),
 ('!!', 1),
 ('!"', 1),
 ('!),', 1),
 ('"', 11),
 ('".', 2),
 ('"...', 1),
 ('&', 8),
 ("'", 3),
 ("''", 1)]

In [8]:
cosine_sim = {}
vector_size = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")
	# Get token count for current lang
	current_lang_count = main_dict[lang]["token_overlap"]["token_count"]

	print(f"Number of token types for {lang}: {len(list(current_lang_count.keys()))}")

	# For each test set, create a vector of token counts. Take only tokens that are present either in train_df or test set.
	intersection_dict = {}

	# First, create a list of tokens that are present in either one or the other list
	intersection_keys = []
	intersection_keys.extend(list(current_lang_count.keys()))
	intersection_keys.extend(list(train_count.keys()))
	# Remove duplicated keys
	intersection_keys = list(set(intersection_keys))
	print(f"Number of intersecting types: {len(intersection_keys)}")

	# Then create a dictionary for 1) train df and 2) test df with counts of token types that occur in either of the datasets
	train_intersect_dict = {}
	test_intersect_dict = {}

	for i in intersection_keys:
		try:
			train_intersect_dict[i] = train_count[i]
		except:
			train_intersect_dict[i] = 0
		try:
			test_intersect_dict[i] = current_lang_count[i]
		except:
			test_intersect_dict[i] = 0


	# Create a df with intersecting keys
	intersect_df = pd.DataFrame({"train": train_intersect_dict, "test": test_intersect_dict})

	display(intersect_df.head(10))

	# Calculate cosine similarity
	current_cosine_sim = cosine_similarity(np.array(intersect_df["train"].to_list()), np.array(intersect_df["test"].to_list()))

	print(f"Cosine similarity for {lang}: {current_cosine_sim}")

	# Add to the main dictionary
	main_dict[lang]["token_overlap"]["cosine_similarity"] = current_cosine_sim
	main_dict[lang]["token_overlap"]["intersection_df"] = intersect_df.to_dict()
	main_dict[lang]["token_overlap"]["intersection_vector_size"] = len(intersection_keys)

	# Add to a dict of results
	cosine_sim[lang] = current_cosine_sim
	vector_size[lang] = len(intersection_keys)

Processing mt
Number of token types for mt: 4787
Number of intersecting types: 28226


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,1
▁Ż,0,7
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for mt: 0.4142476865826809
Processing el
Number of token types for el: 4751
Number of intersecting types: 30954


Unnamed: 0,train,test
▁Απριλίου,0,3
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
έλθει,0,1
▁Bran,9,0
NJE,3,0


Cosine similarity for el: 0.5273207128330722
Processing tr
Number of token types for tr: 6272
Number of intersecting types: 30846


Unnamed: 0,train,test
▁boot,11,1
▁Trend,2,0
set,47,0
▁Neden,0,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for tr: 0.5938472232210807
Processing sq
Number of token types for sq: 4891
Number of intersecting types: 29168


Unnamed: 0,train,test
▁boot,11,0
▁pavarur,0,1
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for sq: 0.43448915477115685
Processing is
Number of token types for is: 4615
Number of intersecting types: 29518


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0
TION,18,0


Cosine similarity for is: 0.5251354659540077
Processing uk
Number of token types for uk: 6507
Number of intersecting types: 33121


Unnamed: 0,train,test
та,0,33
▁boot,11,0
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
фи,0,1
NJE,3,0


Cosine similarity for uk: 0.5846121788244142
Processing ca
Number of token types for ca: 5314
Number of intersecting types: 29443


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,1
TION,18,0


Cosine similarity for ca: 0.5254392489246424
Processing mk
Number of token types for mk: 5468
Number of intersecting types: 31837


Unnamed: 0,train,test
та,0,206
▁boot,11,0
▁Јан,0,1
ензи,0,3
▁Trend,2,0
сметаат,0,5
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0


Cosine similarity for mk: 0.4225320009293607
Processing hr
Number of token types for hr: 6222
Number of intersecting types: 28864


Unnamed: 0,train,test
▁boot,11,0
▁Essen,0,2
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for hr: 0.5662098277228971
Processing sl
Number of token types for sl: 5763
Number of intersecting types: 27507


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,1
▁primordial,1,0
TION,18,0


Cosine similarity for sl: 0.6334513674796362


In [12]:
# Show results
cosine_sim_df = pd.DataFrame({"cosine_similarity": cosine_sim, "vector_size": vector_size}).sort_values(by="cosine_similarity", ascending=False)
print(cosine_sim_df.to_markdown())

|    |   cosine_similarity |   vector_size |
|:---|--------------------:|--------------:|
| sl |            0.633451 |         27507 |
| tr |            0.593847 |         30846 |
| uk |            0.584612 |         33121 |
| hr |            0.56621  |         28864 |
| el |            0.527321 |         30954 |
| ca |            0.525439 |         29443 |
| is |            0.525135 |         29518 |
| sq |            0.434489 |         29168 |
| mk |            0.422532 |         31837 |
| mt |            0.414248 |         28226 |


In [19]:
main_dict["sl"]["token_overlap"].keys()

dict_keys(['overlap_percentage', 'token_list', 'overlap_token_list', 'token_count', 'cosine_similarity', 'intersection_df', 'intersection_vector_size'])

In [18]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

# Compare token overlap on label level 

# Compare train df and test set overlap

Separate texts in train and test sets into genre datasets. Create a dictionary with all tokens from train set and specific test set for each genre dataset for each language, iterate through the tokens and count how many times each ocurrs. Calculate cosine similarity.

In [21]:
def cosine_similarity(x, y):
    
    # Ensure length of x and y are the same
    if len(x) != len(y) :
        return None
    
    # Compute the dot product between x and y
    dot_product = np.dot(x, y)
    
    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2)) 
    magnitude_y = np.sqrt(np.sum(y**2))
    
    # Compute the cosine similarity
    cosine_similarity = dot_product / (magnitude_x * magnitude_y)
    
    return cosine_similarity

### Create label-level token counts for train dataset

In [22]:
# Open the tokenized train df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51..."


In [23]:
train_df.labels.unique()

array(['Other', 'Information/Explanation', 'News', 'Instruction',
       'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal',
       'Promotion'], dtype=object)

In [33]:
# Separate the train df into label-based dfs

# Create a dictionary of token counts that are label based
label_token_count_train = {}
token_count = {}
type_count = {}

for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
	print(f"Processing {label}")

	label_df = train_df[train_df["labels"] == label]
	display(label_df.head(3))

	# Create a dictionary that counts all the token occurrences

	# Create a list of tokens, where we take only the first 512 tokens
	train_tokens_shortened = []

	for i in label_df["tokens_train"].to_list():
		train_tokens_shortened.extend(i[:512])

	print(f"Number of all tokens: {len(train_tokens_shortened)}")

	# Create a dictionary which counts the occurrences of the words

	word_dict_train = Counter(train_tokens_shortened)

	# Sort the dictionary alphabetically based on keys
	word_dict_train = dict(sorted(word_dict_train.items()))

	print(list(word_dict_train.items())[:20])
	print(f"Number of all types: {len(list(word_dict_train.keys()))}")

	# Add to dictionaries
	label_token_count_train[label] = word_dict_train
	token_count[label] = len(train_tokens_shortened)
	type_count[label] = len(list(word_dict_train.keys()))



Processing Information/Explanation


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,"[▁Abstract, ▁Object, ive, :, ▁Report, ing, ▁bi...","[233973, 134549, 5844, 12, 34798, 214, 333, 16..."
3,In 2009 the song was the focus of a successful...,Information/Explanation,CORE,English,"[▁In, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ...","[360, 1877, 70, 11531, 509, 70, 32153, 111, 10..."
39,Story: Whaling Page 4 -- M?ori and whaling Wha...,Information/Explanation,CORE,English,"[▁Story, :, ▁W, hal, ing, ▁Page, ▁4, ▁--, ▁M, ...","[30575, 12, 601, 4200, 214, 14231, 201, 4210, ..."


Number of all tokens: 124130
[('!', 17), ('!"', 1), ('"', 72), ('")', 1), ('").', 2), ('",', 8), ('".', 15), ('"?', 1), ('%', 3), ('&', 14), ("'", 393), ('(', 4), ('(1', 1), (')', 138), ('),', 64), (').', 80), ('):', 1), ('*', 2), (',', 4231), (',«', 2)]
Number of all types: 14678
Processing News


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
4,QuotW This was the week when neither rumours o...,News,CORE,English,"[▁Quo, t, W, ▁This, ▁was, ▁the, ▁week, ▁when, ...","[43851, 18, 1456, 3293, 509, 70, 5895, 3229, 2..."
5,KaZaA claims it can't stop users sharing music...,News,CORE,English,"[▁Ka, Za, A, ▁claims, ▁it, ▁can, ', t, ▁stop, ...","[1136, 16737, 284, 140526, 442, 831, 25, 18, 7..."
9,Nebraska fans checking out airfare for a trip ...,News,CORE,English,"[▁Ne, bra, ska, ▁fans, ▁checking, ▁out, ▁air, ...","[799, 2844, 937, 35992, 175199, 1810, 1831, 44..."


Number of all tokens: 136557
[('!', 38), ('!!!', 1), ('!"', 1), ('"', 177), ('").', 1), ('",', 21), ('".', 25), ('";', 1), ('"?', 1), ('#', 1), ('$', 3), ('&', 2), ("'", 968), ('(', 7), (')', 124), ('),', 47), (').', 31), ('+', 7), ('+5', 1), (',', 4559)]
Number of all types: 15319
Processing Instruction


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
6,When you first sign up with an online casino a...,Instruction,CORE,English,"[▁When, ▁you, ▁first, ▁sign, ▁up, ▁with, ▁an, ...","[14847, 398, 5117, 24092, 1257, 678, 142, 1118..."
7,How to be the BEST Workplace Supervisor A work...,Instruction,CORE,English,"[▁How, ▁to, ▁be, ▁the, ▁BEST, ▁Work, place, ▁S...","[11249, 47, 186, 70, 121300, 27985, 23935, 426..."
29,I am hungry and now have an hour with a tobler...,Instruction,CORE,English,"[▁I, ▁am, ▁hun, gry, ▁and, ▁now, ▁have, ▁an, ▁...","[87, 444, 1926, 47285, 136, 5036, 765, 142, 56..."


Number of all tokens: 83750
[('!', 57), ('!!', 6), ('!!!', 4), ('!!!!', 2), ('!)', 4), ('"', 47), ('").', 1), ('",', 2), ('".', 8), ('";', 1), ('"?', 1), ('%)', 1), ('&', 8), ("'", 557), ('(', 11), (')', 79), ('),', 11), (').', 51), ('):', 3), ('*', 6)]
Number of all types: 8929
Processing Opinion/Argumentation


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
8,popular themes AllMusic relies heavily on Java...,Opinion/Argumentation,CORE,English,"[▁popular, ▁them, es, ▁All, Music, ▁reli, es, ...","[5700, 2856, 90, 3164, 158257, 28702, 90, 1730..."
10,"I was just recalling how, about a year ago, my...",Opinion/Argumentation,CORE,English,"[▁I, ▁was, ▁just, ▁recall, ing, ▁how, ,, ▁abou...","[87, 509, 1660, 189232, 214, 3642, 4, 1672, 10..."
32,Combining our love of shiny things with some t...,Opinion/Argumentation,CORE,English,"[▁Combi, ning, ▁our, ▁love, ▁of, ▁shi, ny, ▁th...","[106935, 592, 2446, 5161, 111, 6544, 299, 8966..."


Number of all tokens: 103141
[('!', 95), ('!!', 7), ('!!!', 3), ('!"', 2), ('!)', 2), ('!).', 1), ('"', 109), ('")', 1), ('"),', 3), ('").', 1), ('",', 9), ('".', 13), ('"...', 3), ('$', 1), ('%', 1), ('&', 2), ("'", 770), ('(', 1), (')', 132), ('),', 52)]
Number of all types: 13088
Processing Forum


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
13,Quote ryan mead: I would like something that g...,Forum,CORE,English,"[▁Quote, ▁ry, an, ▁me, ad, :, ▁I, ▁would, ▁lik...","[109216, 5535, 66, 163, 712, 12, 87, 2806, 188..."
15,Changing ISP re Broadband - what happens to em...,Forum,CORE,English,"[▁Chang, ing, ▁I, SP, ▁re, ▁Bro, ad, band, ▁-,...","[108193, 214, 87, 9434, 456, 13177, 712, 8262,..."
16,Comments for Post (25) I'm there with you. I'v...,Forum,CORE,English,"[▁Comments, ▁for, ▁Post, ▁(25), ▁I, ', m, ▁the...","[11427, 100, 2795, 59791, 87, 25, 39, 2685, 67..."


Number of all tokens: 58900
[('!', 121), ('!!', 8), ('!!!', 5), ('!!!!', 4), ('!!!!!', 1), ('!!!!!!', 1), ('!!!!!!!', 1), ('!"', 1), ('!)', 2), ('"', 46), ('").', 1), ('",', 11), ('".', 9), ('"?', 2), ('#', 1), ('$', 2), ('&', 12), ("'", 719), ('(', 8), (')', 128)]
Number of all types: 8555
Processing Prose/Lyrical


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
14,One Dance Too Many The night I first met Ziegl...,Prose/Lyrical,CORE,English,"[▁One, ▁Dance, ▁Too, ▁Many, ▁The, ▁night, ▁I, ...","[6561, 67022, 56374, 52455, 581, 17431, 87, 51..."
57,"Household Tales, by Brothers Grimm The Story o...",Prose/Lyrical,CORE,English,"[▁House, hold, ▁Tale, s, ,, ▁by, ▁Brother, s, ...","[13038, 16200, 59144, 7, 4, 390, 67921, 7, 106..."
72,Empire! Empire! I Would Have Stolen You A Whol...,Prose/Lyrical,CORE,English,"[▁Empire, !, ▁Empire, !, ▁I, ▁Would, ▁Have, ▁S...","[145359, 38, 145359, 38, 87, 154559, 31901, 73..."


Number of all tokens: 46860
[('!', 22), ('!!', 1), ('!"', 5), ('!)', 1), ('!),', 1), ('"', 34), ('";', 1), ('#', 1), ('&', 2), ("'", 542), (')', 42), ('),', 3), (').', 5), ('*', 1), ('+', 1), (',', 1982), ('-', 262), ('------', 2), ('----------------', 16), ('-09-', 1)]
Number of all types: 5990
Processing Legal


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
35,Full Terms and Conditions Eligibility to enter...,Legal,CORE,English,"[▁Full, ▁Terms, ▁and, ▁Condi, tions, ▁E, ligi,...","[9312, 165504, 136, 46347, 5256, 241, 7883, 83..."
110,Commonwealth Consolidated Acts INCOME TAX ASSE...,Legal,CORE,English,"[▁Common, we, al, th, ▁Con, solid, ated, ▁Act,...","[151301, 1177, 289, 927, 1657, 97281, 27686, 2..."
122,"This Terms of Use Agreement (""Agreement"") is b...",Legal,CORE,English,"[▁This, ▁Terms, ▁of, ▁Use, ▁Agreement, ▁("", A,...","[3293, 165504, 111, 36836, 186670, 24073, 284,..."


Number of all tokens: 28496
[('"', 7), ('")', 4), ('"),', 1), ('").', 1), ('".', 2), ("'", 47), (')', 61), ('),', 10), (').', 11), (');', 2), (',', 833), ('-', 66), ('----------------', 20), ('-1', 5), ('-2', 1), ('-2000', 2), ('-21', 1), ('-30', 1), ('-31', 1), ('-5', 1)]
Number of all types: 4425
Processing Promotion


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
79,Post navigation Citizen Fish are back on the r...,Promotion,CORE,English,"[▁Post, ▁navigation, ▁Citizen, ▁Fish, ▁are, ▁b...","[2795, 134470, 193223, 104796, 621, 4420, 98, ..."
130,Win yourself a FREE copy of the BradyGames off...,Promotion,CORE,English,"[▁Win, ▁yourself, ▁a, ▁FREE, ▁copy, ▁of, ▁the,...","[17686, 31949, 10, 86697, 43658, 111, 70, 5859..."
179,Do You Want To Know The Quick Secret for On Pa...,Promotion,CORE,English,"[▁Do, ▁You, ▁Want, ▁To, ▁Know, ▁The, ▁Quick, ▁...","[984, 2583, 42335, 717, 70829, 581, 89038, 390..."


Number of all tokens: 88626
[('!', 51), ('!!!', 1), ('!).', 1), ('"', 10), ('"),', 1), ('",', 1), ('#', 1), ('&', 1), ("'", 235), (')', 34), ('),', 10), (').', 21), ('*', 1), (',', 2782), ('-', 352), ('-02', 2), ('-1', 2), ('-12', 5), ('-14', 2), ('-16', 1)]
Number of all types: 12548


In [35]:
# Create df with results
label_results_train = pd.DataFrame({"token_count": token_count, "type_count": type_count})

print(label_results_train.to_markdown())

|                         |   token_count |   type_count |
|:------------------------|--------------:|-------------:|
| Information/Explanation |        124130 |        14678 |
| News                    |        136557 |        15319 |
| Instruction             |         83750 |         8929 |
| Opinion/Argumentation   |        103141 |        13088 |
| Forum                   |         58900 |         8555 |
| Prose/Lyrical           |         46860 |         5990 |
| Legal                   |         28496 |         4425 |
| Promotion               |         88626 |        12548 |


In [37]:
# Save the label token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count.json", "w") as train_label_count_file:
	json.dump(label_token_count_train, train_label_count_file)

### Create label-level token counts for test sets

In [44]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

pd.DataFrame(main_dict["sl"]["dataset"]).head(2)

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."


In [46]:
# Do the same as with the dataset, but on every language
lang_results = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")

	label_token_dict_test = {}
	token_count_test = {}
	type_count_test = {}

	# Current df
	df = pd.DataFrame(main_dict[lang]["dataset"])
	display(df.head(2))

	# Create a dictionary of token counts that are label based
	label_token_dict = {}
	token_count = {}
	type_count = {}

	for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
		print(f"Processing {label}")

		label_df = df[df["y_true"] == label]
		display(label_df.head(3))

		# Create a dictionary that counts all the token occurrences

		# Create a list of tokens
		token_list = []

		for i in label_df["tokens"].to_list():
			token_list.extend(i)

		print(f"Number of all tokens: {len(token_list)}")

		# Create a dictionary which counts the occurrences of the words

		word_dict = Counter(token_list)

		# Sort the dictionary alphabetically based on keys
		word_dict = dict(sorted(word_dict.items()))

		print(list(word_dict.items())[:20])
		print(f"Number of all types: {len(list(word_dict.keys()))}")

		# Add to dictionaries
		label_token_dict_test[label] = word_dict
		token_count_test[label] = len(token_list)
		type_count_test[label] = len(list(word_dict.keys()))

	# Add these to a lang-based dict
	lang_results[lang] = {"token_count_dict": label_token_dict_test, "token_count": token_count_test, "type_count": type_count_test }

	# Add this information to the main dictionary
	main_dict[lang]["token_overlap"]["label-level"] = lang_results[lang]


Processing mt


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262..."
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
8,macocu.mt.241966,Prose/Lyrical,Tales dwar il-magna. Tales Moderna għat-Tfal \...,Tales about the machine.Modern tales for child...,{'text_id': 'macocu.mt.241966'},Information/Explanation,"[▁Tale, s, ▁d, war, ▁il, -, magn, a, ., ▁Tale,...","[59144, 7, 104, 4205, 211, 9, 86049, 11, 5, 59..."
14,macocu.mt.383508,Information/Explanation,L-Isqof ta 'Bridget kien wieħed mid-dsatax-il ...,The Bishop of Bridget was one of the nineteen ...,{'text_id': 'macocu.mt.383508'},Information/Explanation,"[▁L, -, Is, q, of, ▁ta, ▁', B, ridge, t, ▁ki, ...","[339, 9, 29598, 864, 4390, 308, 242, 571, 9185..."
15,macocu.mt.294792,Forum,"Kif tpoġġi ""kelb"" ikona? @ Għaliex huwa msejja...","How to put a ""dog"" icon?@ Why it's called ""dog...",{'text_id': 'macocu.mt.294792'},Information/Explanation,"[▁Ki, f, ▁t, po, ġ, ġ, i, ▁"", kel, b, "", ▁ikon...","[1519, 420, 808, 771, 245673, 245673, 14, 44, ..."


Number of all tokens: 6656
[('"', 9), ('",', 6), ("'", 68), (')', 3), ('),', 2), (').', 4), (',', 193), ('-', 404), ('-12-', 2), ('-16', 3), ('-35', 1), ('-90', 2), ('.', 137), ('."', 1), ('/4', 1), ('12', 2), ('19', 1), ('2.5', 1), ('22', 1), ('25', 1)]
Number of all types: 1446
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262..."
13,macocu.mt.470885,News,SPANJA: Pulizija jmut waqt ġlied qabel partita...,Spain: Police die during fighting before a Eur...,{'text_id': 'macocu.mt.470885'},News,"[▁SP, ANJA, :, ▁Puli, zija, ▁j, mut, ▁waqt, ▁,...","[12047, 94661, 12, 125870, 35858, 1647, 14311,..."
16,macocu.mt.369113,News,Ryan Searle (ritratt) jipproduċi l-aktar riżul...,Ryan Searle (photo) produces the most result o...,{'text_id': 'macocu.mt.369113'},News,"[▁Ryan, ▁Se, ar, le, ▁(, rit, rat, t, ), ▁ji, ...","[78201, 503, 147, 133, 15, 2783, 2175, 18, 16,..."


Number of all tokens: 6978
[('"', 3), ('",', 1), ("'", 159), (')', 3), (',', 126), ('-', 474), ('-1', 1), ('-17', 1), ('-18', 2), ('-19', 1), ('-2', 2), ('-20', 1), ('-2016', 1), ('-2017', 1), ('-28', 1), ('-500', 1), ('.', 114), ('...', 1), ('.”', 3), ('1', 2)]
Number of all types: 1614
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
3,macocu.mt.243402,Forum,Kif tkellem lit-tfal dwar id-diżabbiltajiet \n...,How to talk to children about disabilities\n\n...,{'text_id': 'macocu.mt.243402'},Instruction,"[▁Ki, f, ▁t, kel, lem, ▁lit, -, t, fal, ▁d, wa...","[1519, 420, 808, 2590, 6153, 16060, 9, 18, 871..."
4,macocu.mt.213859,Forum,Kif tneħħi hangover sewwa u bla perikolu \n\nH...,How to remove a proper and safe hangover\n\nHa...,{'text_id': 'macocu.mt.213859'},Instruction,"[▁Ki, f, ▁, tne, ħ, ħ, i, ▁hang, over, ▁se, w,...","[1519, 420, 6, 23738, 245766, 245766, 14, 1075..."
5,macocu.mt.70136,Information/Explanation,Għalf Hills għall-qtates \n\nGħall-iżvilupp xi...,Feed hills for cats\n\nFor the proper developm...,{'text_id': 'macocu.mt.70136'},Instruction,"[▁G, ħ, al, f, ▁Hills, ▁g, ħ, all, -, q, tate,...","[527, 245766, 289, 420, 117889, 706, 245766, 5..."


Number of all tokens: 9288
[('!', 2), ('"', 3), ('",', 1), ('".', 2), ("'", 81), (')', 1), ('),', 4), (').', 3), (',', 219), ('-', 511), ('-30', 3), ('.', 200), ('."', 2), ('1', 1), ('100', 1), ('135', 1), ('200', 1), ('30%', 1), (':', 9), (';', 1)]
Number of all types: 1585
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24..."
2,macocu.mt.109995,Forum,Chrysler: Brand ta 'lussu jew le? \n\nBrand ji...,Chrysler: Luxury brand or not?\n\nBrand moves ...,{'text_id': 'macocu.mt.109995'},Opinion/Argumentation,"[▁Chrysler, :, ▁Brand, ▁ta, ▁', lus, su, ▁je, ...","[237562, 12, 23243, 308, 242, 5782, 1159, 55, ..."
10,macocu.mt.413333,Opinion/Argumentation,Il-ktieb Iqarribna fis ieħor fid-dinja. Huwa t...,The book brings us into another in the world.I...,{'text_id': 'macocu.mt.413333'},Opinion/Argumentation,"[▁Il, -, kti, eb, ▁I, qar, rib, na, ▁fis, ▁i, ...","[891, 9, 13089, 6403, 87, 29199, 11049, 76, 43..."


Number of all tokens: 4096
[('!', 4), ('"', 1), ('",', 1), ("'", 56), (')', 3), ('),', 1), (').', 2), (',', 93), ('-', 208), ('-0', 1), ('-200', 1), ('-2000', 1), ('-6', 1), ('.', 76), ('."', 2), ('152', 1), ('2)', 1), ('2003', 1), ('3-5', 1), (':', 10)]
Number of all types: 1109
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
23,macocu.mt.496182,Forum,Jista' l-Ministru jagħti rendikont tal-aħħar K...,Can the Minister give an account of the last c...,{'text_id': 'macocu.mt.496182'},Forum,"[▁J, ista, ', ▁l, -, Mini, stru, ▁jag, ħ, ti, ...","[821, 1035, 25, 96, 9, 114986, 10058, 810, 245..."


Number of all tokens: 512
[("'", 10), (',', 11), ('-', 48), ('-21', 1), ('.', 9), (':', 1), ('?', 2), ('A', 2), ('Amerika', 1), ('Ap', 2), ('Bel', 2), ('Dar', 1), ('Dia', 2), ('E', 1), ('Em', 1), ('Fu', 1), ('I', 1), ('Il', 1), ('K', 1), ('Kap', 1)]
Number of all types: 240
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
63,macocu.mt.85701,Prose/Lyrical,Versi Bibliċi dwar il-Fidwa \n\nQari permezz t...,Biblical verses about redemption\n\nReading th...,{'text_id': 'macocu.mt.85701'},Prose/Lyrical,"[▁Versi, ▁Bibli, ċ, i, ▁d, war, ▁il, -, Fi, dw...","[145121, 66429, 245384, 14, 104, 4205, 211, 9,..."


Number of all tokens: 512
[("'", 2), (')', 4), (',', 10), ('-', 25), ('-24', 1), ('.', 8), (':', 1), (':14', 1), (':18', 1), (':19', 1), ('A', 1), ('Bi', 1), ('Fi', 2), ('Is', 1), ('Kristu', 1), ('L', 1), ('LT', 2), ('N', 2), ('NI', 2), ('V', 2)]
Number of all types: 241
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
20,macocu.mt.269132,Legal,Document 62009CJ0162 \n\nJudgment of the Court...,Document 62009CJ0162\n\nJudgment of the Court ...,{'text_id': 'macocu.mt.269132'},Legal,"[▁Document, ▁6, 2009, C, J, 016, 2, ▁Jud, g, m...","[43101, 305, 15841, 441, 1375, 169887, 304, 59..."
26,macocu.mt.24684,Legal,Press Release \n\nWhilst referring to the intr...,Press release\n\nWhilst Referring to the intro...,{'text_id': 'macocu.mt.24684'},Legal,"[▁Press, ▁Release, ▁W, hil, st, ▁refer, ring, ...","[21072, 152590, 601, 16001, 271, 15005, 2852, ..."
28,macocu.mt.507549,Legal,Plaintiff \n\nDefendant \n\nKeywords \n\nUnfai...,Plaintiff\n\nDefendant\n\nKeywords\n\nUnfair C...,{'text_id': 'macocu.mt.507549'},Legal,"[▁Pla, inti, ff, ▁De, fen, dant, ▁Keyword, s, ...","[8950, 11379, 4902, 262, 6211, 58349, 189755, ..."


Number of all tokens: 5120
[('#', 20), ("'", 43), ('(', 3), ('(1)', 4), ('(2)', 4), ('(3)', 4), (')', 32), ('),', 5), (').', 5), (');', 1), (',', 78), ('-', 115), ('-11', 1), ('-16', 2), ('-2', 3), ('-2014', 1), ('-22', 1), ('-23', 1), ('-24', 1), ('-30', 1)]
Number of all types: 1439
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
9,macocu.mt.528970,Information/Explanation,"""lagi Braslavsky"" - a park nazzjonali. ""Brasla...","""Braslavsky lakes"" - a national park.""Braslav ...",{'text_id': 'macocu.mt.528970'},Promotion,"[▁"", lagi, ▁Bra, slav, sky, "", ▁-, ▁a, ▁park, ...","[44, 44003, 6163, 29505, 4922, 58, 20, 10, 920..."
35,macocu.mt.154411,Promotion,Aħmar Stag Lemonade Cocktail Recipe \n\nIl-Lem...,Red Stag Lemonade Cocktail Recipe\n\nThe red l...,{'text_id': 'macocu.mt.154411'},Promotion,"[▁A, ħ, mar, ▁S, tag, ▁Lemon, ade, ▁Cocktail, ...","[62, 245766, 1727, 159, 6936, 182508, 2873, 21..."
39,macocu.mt.148939,Promotion,"""Viking Hotel"". Dundjani. deskrizzjoni \n\nSpa...","""Viking Hotel"".Turkey.description\n\nSpacious ...",{'text_id': 'macocu.mt.148939'},Promotion,"[▁"", Vi, king, ▁Hotel, ""., ▁Du, nd, jani, ., ▁...","[44, 6609, 6048, 2352, 740, 786, 2208, 45946, ..."


Number of all tokens: 5878
[('!', 3), ('"', 18), ('",', 2), ('".', 4), ("'", 43), (')', 4), ('),', 3), (').', 1), (',', 154), ('-', 334), ('-2007', 1), ('-2009', 1), ('-2015', 1), ('-2016', 1), ('-2017', 1), ('-5', 1), ('.', 113), ('0.7', 1), ('1', 1), ('135', 1)]
Number of all types: 1361
Processing el


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ..."
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
10,macocu.el.11057302,Information/Explanation,Ο Ηρόδοτος αναφέρει ότι το κράνος αυτό έχει τη...,Herodotus states that this helmet is originall...,{'text_id': 'macocu.el.11057302'},Information/Explanation,"[▁Ο, ▁Η, ρό, δο, τος, ▁αναφέρει, ▁ότι, ▁το, ▁,...","[2108, 1700, 19385, 12497, 11261, 98971, 2135,..."
19,macocu.el.11396017,Information/Explanation,Τα Χριστούγεννα ανέκαθεν ήταν η γιορτή της αγά...,Christmas has always been the celebration of l...,{'text_id': 'macocu.el.11396017'},Information/Explanation,"[▁Τα, ▁Χριστούγεννα, ▁ανέ, κα, θεν, ▁ήταν, ▁η,...","[7222, 231152, 72529, 9165, 104803, 6426, 781,..."
28,macocu.el.9616758,Instruction,Σύνδρομο Δυσοσμίας Μαλλιών \n\nΜπορεί να λούζε...,Hair Syndrome\n\nYou may bathe regularly and y...,{'text_id': 'macocu.el.9616758'},Information/Explanation,"[▁Σ, ύν, δρομο, ▁Δ, υσ, οσ, μίας, ▁Μ, αλλ, ιών...","[5127, 73537, 115638, 6732, 38994, 36869, 1387..."


Number of all tokens: 5608
[('!', 1), ('"', 6), ('",', 1), ('".', 4), ("'", 4), ('(', 3), (')', 11), ('),', 6), (').', 6), (',', 149), ('-', 10), ('-1', 1), ('-10', 1), ('-2014', 1), ('.', 157), ('.[1]', 1), ('/', 4), ('/07/', 2), ('10', 1), ('18', 1)]
Number of all types: 1826
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
9,macocu.el.8155472,News,Η κάνναβη ίσως νικήσει τους πόνους από παθήσει...,Cannabis may beat pain from nervous system dis...,{'text_id': 'macocu.el.8155472'},News,"[▁Η, ▁κ, άννα, βη, ▁ίσως, ▁, νικ, ήσει, ▁τους,...","[1700, 4409, 219885, 117929, 85451, 6, 72616, ..."
16,macocu.el.11233519,News,Σήμερα η συνάντηση της δημοτικής αρχής με τους...,Today the meeting of the Municipal Authority w...,{'text_id': 'macocu.el.11233519'},News,"[▁Σήμερα, ▁η, ▁συνάντηση, ▁της, ▁δημοτικ, ής, ...","[111310, 781, 120233, 463, 123671, 4201, 70984..."
32,macocu.el.1068501,News,"Βαξεβάνης για Δικαιοσύνη: ""Τα τελευταία χρόνια...","Vaxevan for justice: ""In recent years it does ...",{'text_id': 'macocu.el.1068501'},News,"[▁Βα, ξε, β, άνης, ▁για, ▁Δικαιοσύνη, :, ▁"", Τ...","[105371, 16548, 5079, 205120, 614, 219609, 12,..."


Number of all tokens: 3501
[('"', 8), ('",', 1), ('".', 4), ('&', 1), ("'", 2), (')', 10), ('),', 3), (').', 2), ('):', 1), (',', 98), ('-', 5), ('-19', 1), ('-52', 1), ('.', 58), ('."', 1), ('...', 1), ('.000', 1), ('01', 1), ('1', 1), ('2%', 1)]
Number of all types: 1344
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ..."
7,macocu.el.7160936,Instruction,Η πνευματική και νευρομυϊκή ισορροπία είναι μι...,Spiritual and neuromuscular balance is a very ...,{'text_id': 'macocu.el.7160936'},Instruction,"[▁Η, ▁πνευματική, ▁και, ▁νευρ, ομ, υ, ϊκ, ή, ▁...","[1700, 227802, 215, 222954, 27448, 1797, 94457..."
29,macocu.el.10313358,Instruction,Κόβουμε τα καρότα σε φέτες και τα ρίχνουμε στο...,Cut the carrots into slices and toss in the bl...,{'text_id': 'macocu.el.10313358'},Instruction,"[▁Κό, β, ουμε, ▁τα, ▁καρ, ό, τα, ▁σε, ▁φ, έτες...","[151611, 5079, 9049, 860, 41288, 1279, 3223, 9..."


Number of all tokens: 2971
[('!', 3), ('!!!', 1), ('"', 2), ('",', 1), ('&', 4), ("'", 2), (')', 10), ('),', 1), (').', 1), ('+', 1), (',', 70), ('-', 4), ('.', 89), ('/', 1), ('00', 2), ('01', 5), ('19', 1), ('2', 10), ('32', 3), (':', 8)]
Number of all types: 1121
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
2,macocu.el.15913298,Opinion/Argumentation,FC: Το κλειστό του ΟΑΚΑ ξανάγινε ελκυστικός πρ...,FC: OAKA's closed re -attractive destination\n...,{'text_id': 'macocu.el.15913298'},Opinion/Argumentation,"[▁FC, :, ▁Το, ▁κλεισ, τό, ▁του, ▁Ο, Α, ΚΑ, ▁ξα...","[15161, 12, 2654, 184894, 14280, 385, 2108, 93..."
5,macocu.el.16383590,Opinion/Argumentation,"Η δική μας οργάνωση, ανάχωμα σε κάθε uber \n\n...","Our organization, embankment on every Uber\n\n...",{'text_id': 'macocu.el.16383590'},Opinion/Argumentation,"[▁Η, ▁δική, ▁μας, ▁οργάνωση, ,, ▁ανά, χ, ωμα, ...","[1700, 86513, 2274, 137517, 4, 28189, 2088, 38..."
20,macocu.el.1715097,Opinion/Argumentation,tanea.gr &gt; Πολιτική &gt; Το προφίλ του μέσο...,tanea.gr & gt;Politics & GT;The profile of the...,{'text_id': 'macocu.el.1715097'},Opinion/Argumentation,"[▁tane, a, ., gr, ▁&, gt, ;, ▁Πολιτική, ▁&, gt...","[61560, 11, 5, 3964, 619, 5386, 74, 109310, 61..."


Number of all tokens: 5607
[('!', 2), ('!!', 1), ('!!!', 1), ('!!!!!!', 1), ('"', 13), ('",', 3), ('".', 5), ("'", 7), ('(', 1), (')', 8), (').', 1), (',', 144), ('-', 2), ('-34', 1), ('.', 133), ('...', 7), ('.....', 1), ('/', 1), ('2', 1), ('2004', 1)]
Number of all types: 1921
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,..."
14,macocu.el.2532316,Forum,Σύνδεση \n\nΠροειδοποίηση \n\nΠΡΟΣΟΧΗ \n\nΌτι ...,Connection\n\nWarning\n\ncaution\n\nThat Janty...,{'text_id': 'macocu.el.2532316'},Forum,"[▁Σ, ύν, δε, ση, ▁Προ, ειδ, οποίηση, ▁ΠΡΟΣ, Ο,...","[5127, 73537, 18388, 5314, 25829, 80348, 38658..."
18,macocu.el.2561810,Forum,Προειδοποίηση \n\nΠΡΟΣΟΧΗ \n\nμια και τα παει ...,"Warning\n\ncaution\n\nSince there is a trial, ...",{'text_id': 'macocu.el.2561810'},Forum,"[▁Προ, ειδ, οποίηση, ▁ΠΡΟΣ, Ο, Χ, Η, ▁μια, ▁κα...","[25829, 80348, 38658, 231860, 10350, 18137, 11..."


Number of all tokens: 5137
[('!', 10), ('!!!', 3), ('"', 9), ('".', 3), ('"?', 1), ('&', 3), ("'", 8), ('(', 2), (')', 11), (').', 1), ('):', 1), ('+', 1), ('+4', 1), (',', 90), ('-', 11), ('-17', 1), ('-50', 1), ('.', 158), ('...', 18), ('......', 1)]
Number of all types: 1658
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
3,macocu.el.15538176,Prose/Lyrical,Ο λογισμός με βασανίζει γέροντα να εγκαταλείψω...,Calculus torture me elder to give up the fight...,{'text_id': 'macocu.el.15538176'},Prose/Lyrical,"[▁Ο, ▁λογ, ισμός, ▁με, ▁βασ, αν, ίζει, ▁γ, έρ,...","[2108, 87026, 30277, 558, 69889, 8240, 17259, ..."
12,macocu.el.9309315,Prose/Lyrical,Είμαι τρελός για τα δυο σου τα μάτια που με κο...,I'm crazy about your two eyes looking at me an...,{'text_id': 'macocu.el.9309315'},Prose/Lyrical,"[▁, Είμαι, ▁, τρελ, ός, ▁για, ▁τα, ▁δυο, ▁σου,...","[6, 192392, 6, 190309, 5981, 614, 860, 77712, ..."
22,macocu.el.11919281,Prose/Lyrical,Χρησιμα \n\nΟικογενεια \n\nO Τοτός λέει στην α...,Useful\n\nFamily\n\nToto says to his sister Ma...,{'text_id': 'macocu.el.11919281'},Prose/Lyrical,"[▁Χρ, ησ, ιμα, ▁Οι, κο, γεν, εια, ▁O, ▁Το, τός...","[80056, 64339, 133012, 5549, 10631, 43052, 459..."


Number of all tokens: 2872
[('!', 4), ('!!', 1), ("'", 6), (',', 93), ('-', 1), ('.', 68), ('...', 4), ('......', 2), ('.........', 1), (':', 3), (';', 3), ('<unk>', 5), ('S', 1), ('blogspot', 1), ('ed', 1), ('erade', 1), ('gr', 1), ('hack', 1), ('isi', 1), ('s', 1)]
Number of all types: 932
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,macocu.el.11442474,Legal,προτείνεται στην παράγραφος 8 να προστεθεί Ο Α...,It is proposed in paragraph 8 to add the PM by...,{'text_id': 'macocu.el.11442474'},Legal,"[▁προ, τε, ίνεται, ▁στην, ▁παρά, γραφο, ς, ▁8,...","[4536, 3101, 107118, 1227, 18038, 153122, 235,..."
6,macocu.el.4074885,Legal,ΟΡΟΙ ΣΥΝΑΛΛΑΓΩΝ \n\nΟι παρόντες Όροι Συναλλαγώ...,Terms of trading\n\nThese transaction terms go...,{'text_id': 'macocu.el.4074885'},Legal,"[▁Ο, ΡΟ, Ι, ▁ΣΥΝ, ΑΛ, ΛΑ, Γ, ΩΝ, ▁Οι, ▁παρόν, ...","[2108, 80990, 15471, 189117, 79603, 55246, 157..."
11,macocu.el.13664703,Legal,της υπ' αριθμ. 1/2022/ΟΣΝΙΕ 2ης Πρόσκλησης Εκδ...,of the 2nd Invitation of Interest 2nd Invitati...,{'text_id': 'macocu.el.13664703'},Legal,"[▁της, ▁υπ, ', ▁αριθμ, ., ▁1, /, 2022, /, ΟΣ, ...","[463, 33023, 25, 152533, 5, 106, 64, 151159, 6..."


Number of all tokens: 3368
[('"', 6), ('"),', 2), ('",', 6), ('".', 4), ("'", 8), ('(', 2), ('(1)', 1), ('(2)', 1), (')', 8), ('),', 5), (').', 1), ('):', 1), (',', 82), ('-', 11), ('-0', 1), ('-00', 4), ('-25', 1), ('.', 91), ('/', 7), ('/02', 1)]
Number of all types: 1237
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
8,macocu.el.15049641,Promotion,"Το Renault Clio R.S. απέκτησε έκδοση Trophy, μ...","Renault Clio R.S.It got a Trophy version, with...",{'text_id': 'macocu.el.15049641'},Promotion,"[▁Το, ▁Renault, ▁C, lio, ▁R, ., S, ., ▁απέ, κτ...","[2654, 60855, 313, 10258, 627, 5, 294, 5, 1021..."
33,macocu.el.13399719,Promotion,Κώστας Μακεδόνας | Καλοκαιρινή Περιοδεία 2015 ...,Costas Macedonia2015 summer tour\n\nThe belove...,{'text_id': 'macocu.el.13399719'},Promotion,"[▁Κώστας, ▁Μακ, εδ, όνα, ς, ▁, |, ▁Καλ, οκ, αι...","[201005, 193340, 65917, 96444, 235, 6, 58745, ..."
42,macocu.el.2552298,Promotion,Έτος έκδοσης : 2001 \n\nISBN : 978-960-14-0484...,Year of issue: 2001\n\nISBN: 978-960-14-0484-4...,{'text_id': 'macocu.el.2552298'},Promotion,"[▁Έ, τος, ▁έκδοση, ς, ▁:, ▁2001, ▁ISBN, ▁:, ▁9...","[19905, 11261, 110873, 235, 152, 6789, 47475, ..."


Number of all tokens: 2176
[('!', 3), ('"', 11), ('",', 5), ('".', 1), ('&', 1), ("'", 2), (')', 2), (',', 56), ('-', 1), ('-0', 1), ('-14', 1), ('-4', 1), ('.', 47), ('/', 1), ('2', 1), ('29', 1), ('484', 1), ('5)', 1), ('500', 1), ('6', 1)]
Number of all types: 978
Processing tr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.tr.15851513,Instruction,AÖL Ders Seçimi ve Sınav Giriş Merkezi Belirle...,AÖL warned of frequent negligence and errors o...,{'text_id': 'macocu.tr.15851513'},Instruction,"[▁A, Ö, L, ▁Der, s, ▁Seçim, i, ▁ve, ▁Sınav, ▁G...","[62, 8655, 866, 1310, 7, 166134, 14, 173, 1762..."
1,macocu.tr.12699738,Legal,Banka promosyonu ihalesinde uygulanacak kriter...,Criteria to be applied in the tender for bank ...,{'text_id': 'macocu.tr.12699738'},Legal,"[▁Banka, ▁promo, syon, u, ▁i, hale, sinde, ▁uy...","[81847, 8891, 10270, 34, 17, 50742, 19209, 633..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
7,macocu.tr.13908190,Information/Explanation,"Bu gruptaki bakteriler, enerji kaynağı olarak ...",Bacteria in this group use inorganic nitrogen ...,{'text_id': 'macocu.tr.13908190'},Information/Explanation,"[▁Bu, ▁grup, taki, ▁bakteri, ler, ,, ▁enerji, ...","[667, 5921, 44694, 28983, 603, 4, 39039, 12159..."
20,macocu.tr.14562023,Information/Explanation,Reis – Raees 2017 Türkçe Altyazını Full HD Fil...,Reis - Raees 2017 Turkish subtitles\n\nDirecte...,{'text_id': 'macocu.tr.14562023'},Information/Explanation,"[▁Reis, ▁–, ▁Ra, e, es, ▁2017, ▁Türkçe, ▁Alt, ...","[57943, 46, 2552, 13, 90, 505, 61579, 10544, 8..."
42,macocu.tr.2700864,Information/Explanation,İtalyan Opera bestecisi Gaspare Luigi Pacifico...,Italian opera composer Gaspare Luigi Pacifico ...,{'text_id': 'macocu.tr.2700864'},Information/Explanation,"[▁İtalya, n, ▁Opera, ▁beste, ci, si, ▁Gas, par...","[116234, 19, 16556, 2184, 318, 172, 27967, 160..."


Number of all tokens: 2289
[('!', 1), ('"', 4), ("'", 45), (')', 16), (').', 1), (',', 89), ('-', 5), ('.', 79), ('/', 2), ('02', 1), ('04', 1), ('04)', 1), ('05', 1), ('07', 2), ('09', 2), ('1', 1), ('100', 1), ('14', 1), ('17', 1), ('1800', 1)]
Number of all types: 1154
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
5,macocu.tr.8676879,News,ATATÜRK'Ü İLELEBET YAŞATACAĞIZ! \n\nATATÜRK'Ü ...,We will live Atatürk forever!\n\nWe will live ...,{'text_id': 'macocu.tr.8676879'},News,"[▁, ATA, TÜRK, ', Ü, ▁İLE, LE, BET, ▁YA, Ş, AT...","[6, 17758, 148373, 25, 8047, 202070, 15300, 75..."
8,macocu.tr.7730349,News,Silivrispor'dan basın açıklaması \n\nSilivrisp...,Press Release from Silivrispor\n\nIn a written...,{'text_id': 'macocu.tr.7730349'},News,"[▁Si, liv, ri, spor, ', dan, ▁basın, ▁açıklama...","[602, 9617, 416, 11946, 25, 549, 79079, 141478..."
14,macocu.tr.13782197,News,21 Mart Dünya Down Sendromu Günü artık ülkemiz...,March 21 World Down Syndrome Day is now celebr...,{'text_id': 'macocu.tr.13782197'},News,"[▁21, ▁Mart, ▁Dünya, ▁Down, ▁Sen, drom, u, ▁Gü...","[952, 13212, 19436, 49472, 4311, 41684, 34, 45..."


Number of all tokens: 3342
[('!', 2), ('"', 17), ("'", 76), (')', 11), ('+', 1), (',', 92), ('-', 11), ('-14', 1), ('-19', 5), ('.', 86), ('."', 2), ('...', 1), ('05', 1), ('1', 1), ('10', 1), ('13', 1), ('15', 2), ('16', 1), ('232', 1), ('3', 2)]
Number of all types: 1478
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.tr.15851513,Instruction,AÖL Ders Seçimi ve Sınav Giriş Merkezi Belirle...,AÖL warned of frequent negligence and errors o...,{'text_id': 'macocu.tr.15851513'},Instruction,"[▁A, Ö, L, ▁Der, s, ▁Seçim, i, ▁ve, ▁Sınav, ▁G...","[62, 8655, 866, 1310, 7, 166134, 14, 173, 1762..."
6,macocu.tr.10188634,Instruction,Ankastreyle Daha Kullanışlı Mutfaklar! \n\nAnk...,More useful kitchens with built -in!\n\nThe bu...,{'text_id': 'macocu.tr.10188634'},Instruction,"[▁An, ka, stre, yle, ▁Daha, ▁Kullan, ış, lı, ▁...","[893, 161, 5967, 7786, 20692, 133679, 13827, 1..."
17,macocu.tr.5815393,Instruction,Bebek biberonu nasıl ısıtılır \n\nEğer bebeğin...,How to Heat a Baby Booton\n\nIf you do not bre...,{'text_id': 'macocu.tr.5815393'},Instruction,"[▁Bebek, ▁biber, on, u, ▁nasıl, ▁ısı, t, ılır,...","[159949, 177898, 191, 34, 17315, 104504, 18, 3..."


Number of all tokens: 4049
[('!', 7), ('&', 1), ("'", 17), ('(', 4), (')', 4), (').', 2), (',', 91), ('-', 8), ('-11-', 1), ('-21', 1), ('-7', 1), ('.', 157), ('."', 1), ('...', 4), ('/', 10), ('09', 4), ('111', 2), ('12)', 1), ('16', 4), ('19', 1)]
Number of all types: 1650
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
3,macocu.tr.9263252,Opinion/Argumentation,----------------------------------------------...,----------------------------------------------...,{'text_id': 'macocu.tr.9263252'},Opinion/Argumentation,"[▁, ----------------, ----------------, ------...","[6, 103428, 103428, 103428, 103428, 103428, 70..."
13,macocu.tr.1850879,Opinion/Argumentation,BLOKZİNCİR ARACILARA KAFA TUTUYOR VEDAT GÜVEN ...,BLOCZincir is challenging the intermediaries V...,{'text_id': 'macocu.tr.1850879'},Opinion/Argumentation,"[▁BL, OK, Zİ, NC, İR, ▁A, RAC, IL, ARA, ▁K, AF...","[68170, 9092, 168383, 36253, 75055, 62, 126349..."
18,macocu.tr.6598485,Opinion/Argumentation,"Eski defterleri karıştırır, durur.. Biz de öyl...","He confuses the old notebooks, stops .. We did...",{'text_id': 'macocu.tr.6598485'},Opinion/Argumentation,"[▁Eski, ▁de, fter, leri, ▁karıştır, ır, ,, ▁du...","[82048, 8, 46327, 1341, 147719, 3772, 4, 7920,..."


Number of all tokens: 5613
[('!', 4), ('!!', 1), ('!"', 2), ('"', 34), ('",', 2), ('"?', 1), ("'", 45), (')', 7), (').', 1), ('+00:00', 3), (',', 206), ('-', 2), ('----------------', 5), ('-02-', 3), ('.', 205), ('."', 1), (".''", 1), ('...', 7), ('......', 1), ('/', 1)]
Number of all types: 2208
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,macocu.tr.12349626,Forum,Gönderen Konu: ARICILIĞA İLK ADIM (Okunma sayı...,Sender Subject: First Step to Beekeeping (Read...,{'text_id': 'macocu.tr.12349626'},Forum,"[▁Gö, nder, en, ▁Konu, :, ▁A, RIC, ILI, Ğ, A, ...","[23912, 7944, 33, 45577, 12, 62, 110804, 18106..."
16,macocu.tr.11900682,Forum,Taklitler Aslını Yüceltir ! \n\nUzun zamandır ...,Imitations glorify the original!\n\nI haven't ...,{'text_id': 'macocu.tr.11900682'},Forum,"[▁Tak, lit, ler, ▁As, l, ını, ▁Yüce, l, tir, ▁...","[2561, 4353, 603, 1301, 141, 4644, 221522, 141..."
23,macocu.tr.2834051,Forum,"""NOHUT YEMEĞİ (TAVUKLU)"" için 7 cevap \n\nımm ...","7 answers for ""chickpea dinner (chicken)""\n\nI...",{'text_id': 'macocu.tr.2834051'},Forum,"[▁"", NO, H, UT, ▁Y, EM, EĞİ, ▁(, TA, V, UK, LU...","[44, 8575, 841, 17632, 990, 17513, 223362, 15,..."


Number of all tokens: 3219
[('!!', 1), ('"', 4), ("'", 18), (')', 3), (')))', 2), ('):', 2), (',', 27), ('-', 13), ('-0', 2), ('-60', 1), ('.', 136), ('...', 6), ('/', 4), ('10', 1), ('1980', 1), ('23', 1), ('3', 1), ('61', 1), ('64', 1), ('75', 1)]
Number of all types: 1448
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
9,macocu.tr.4196875,Prose/Lyrical,SADECE SEN SEVİM... \n\n''meğerse vurulmuşum s...,Just you love ...\n\n'' It turns out that I wa...,{'text_id': 'macocu.tr.4196875'},Prose/Lyrical,"[▁SAD, ECE, ▁SEN, ▁SE, V, İM, ..., ▁'', m, eğe...","[47854, 113712, 70087, 6755, 856, 54291, 27, 5..."
22,macocu.tr.11907651,Opinion/Argumentation,"Mossingham kasabası, şimdi hiç olmadığı kadar ...",The town of Mossingham is more quiet and calm ...,{'text_id': 'macocu.tr.11907651'},Prose/Lyrical,"[▁Mos, sing, ham, ▁kasaba, sı, ,, ▁şimdi, ▁hiç...","[8455, 6953, 3915, 208642, 2540, 4, 43998, 226..."
25,macocu.tr.5305470,Prose/Lyrical,Temel bir gun yanina torununu almis ve askerli...,On a basic day he took his grandson with him a...,{'text_id': 'macocu.tr.5305470'},Prose/Lyrical,"[▁Temel, ▁bir, ▁gun, ▁yani, na, ▁tor, unun, u,...","[124292, 263, 17863, 51375, 76, 9983, 10465, 3..."


Number of all tokens: 3764
[('!', 6), ('!!!', 1), ('"', 1), ('#', 1), ('%', 1), ("'", 21), ("''", 6), ('(', 2), (')', 7), (').', 1), ('*', 1), (',', 102), ('-', 10), ('.', 114), ('...', 11), ('...(', 2), ('.12.2018', 1), ('/', 5), ('/6', 1), ('1.2', 1)]
Number of all types: 1620
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,macocu.tr.12699738,Legal,Banka promosyonu ihalesinde uygulanacak kriter...,Criteria to be applied in the tender for bank ...,{'text_id': 'macocu.tr.12699738'},Legal,"[▁Banka, ▁promo, syon, u, ▁i, hale, sinde, ▁uy...","[81847, 8891, 10270, 34, 17, 50742, 19209, 633..."
11,macocu.tr.7236102,Legal,2011 Thof Hakem Kursu Açılacak \n\nTarih \n\nK...,2011 THOF Referee course will be opened\n\nHis...,{'text_id': 'macocu.tr.7236102'},Legal,"[▁2011, ▁T, hof, ▁Hak, em, ▁Kurs, u, ▁Aç, ılac...","[1392, 384, 21676, 23493, 195, 24289, 34, 1297..."
12,macocu.tr.9532281,Legal,"ESER, İCRA, YAPIM VE YAYINLARIN KULLANILMASI V...",Regulation on the procedures and principles re...,{'text_id': 'macocu.tr.9532281'},Legal,"[▁E, SER, ,, ▁İ, CRA, ,, ▁YAP, IM, ▁VE, ▁YA, Y...","[241, 50184, 4, 4661, 154791, 4, 128706, 17199..."


Number of all tokens: 3677
[('"', 6), ('",', 1), ("'", 15), (')', 28), ('),', 1), ('*', 1), (',', 119), ('-', 13), ('-1)', 1), ('-11', 1), ('-3', 1), ('.', 88), ('/', 13), ('/02/', 1), ('/09/', 1), ('/1', 1), ('/12/', 2), ('/12/2018', 1), ('/14', 2), ('/6', 2)]
Number of all types: 1430
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
10,macocu.tr.9519281,Promotion,Selam beyler ben Antalya Olgun Balıketli Escor...,"Hi gentlemen, I'm Antalya mature fishing escor...",{'text_id': 'macocu.tr.9519281'},Promotion,"[▁Se, lam, ▁be, y, ler, ▁ben, ▁Antalya, ▁Ol, g...","[503, 3719, 186, 53, 603, 1585, 56938, 9295, 6..."
35,macocu.tr.9838296,Promotion,Meme estetiği hakkında mutlaka bilmeniz gereke...,What you need to know about breast aesthetics\...,{'text_id': 'macocu.tr.9838296'},Promotion,"[▁Mem, e, ▁este, tiği, ▁hakkında, ▁mutlaka, ▁b...","[17443, 13, 473, 54292, 21152, 80428, 2193, 62..."
37,macocu.tr.4118269,Promotion,Geniş zum aralığı ve üstün hız performansı \n\...,WIDE ZUM RANGE AND SPECIAL SPEED PERFORMANCE\n...,{'text_id': 'macocu.tr.4118269'},Promotion,"[▁Geni, ş, ▁zum, ▁ara, lığı, ▁ve, ▁üstün, ▁hız...","[71491, 1759, 2388, 5689, 10388, 173, 54840, 1..."


Number of all tokens: 3625
[('!', 1), ('"', 3), ("'", 7), ("''", 1), (')', 7), ('*', 1), (',', 80), ('-', 2), ('-200', 4), ('-70', 1), ('.', 130), ('."', 1), ('...', 2), ('..."', 1), ('/', 2), ('02', 1), ('100', 1), ('2.8', 1), ('4.5', 1), ('68', 1)]
Number of all types: 1576
Processing sq


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
0,macocu.sq.1061396,Opinion/Argumentation,341.0,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",Opinion/Argumentation,"[▁Blog, ▁“, U, në, ▁të, ▁kam, ▁dashur, ▁me, ▁n...","[5061, 52, 1062, 3208, 134, 3840, 57168, 163, ..."
2,macocu.sq.183383,Legal,140.0,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...,"{'text_id': 'macocu.sq.183383', 'domain': 'euk...",Legal,"[▁Li, ria, ▁nga, ▁keq, t, raj, timi, ▁Kon, ven...","[1261, 1651, 817, 39184, 18, 10185, 20520, 369..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
14,macocu.sq.1160383,Opinion/Argumentation,156.0,islamgjakova.net,"Duke studiuar Kuranin, çdokush ndesh shpesh në...","Studying the Qur'an, anyone often encounters w...","{'text_id': 'macocu.sq.1160383', 'domain': 'is...",Information/Explanation,"[▁Duke, ▁studiu, ar, ▁Kura, nin, ,, ▁çdo, kush...","[51978, 77993, 147, 88810, 694, 4, 19432, 5576..."
17,macocu.sq.57195,Information/Explanation,107.0,fjale.al,Fjalor Shqip\n\nPANDASHËM mb.\n\n1. Që nuk mun...,Albanian dictionary\n\nInadvertent mb.\n\n1. t...,"{'text_id': 'macocu.sq.57195', 'domain': 'fjal...",Information/Explanation,"[▁F, ja, lor, ▁, Shqip, ▁P, ANDA, SH, Ë, M, ▁m...","[563, 145, 1484, 6, 108413, 436, 34860, 12927,..."
19,macocu.sq.1622172,Information/Explanation,346.0,knowledgecenter.ubt-uni.net,Date of Award\n\nHyrje: Infeksionet spitalore ...,Date of award\n\nIntroduction: Soft tissue hos...,"{'text_id': 'macocu.sq.1622172', 'domain': 'kn...",Information/Explanation,"[▁Date, ▁of, ▁Award, ▁Hy, rje, :, ▁In, fek, si...","[25512, 111, 60992, 3905, 15614, 12, 360, 2371..."


Number of all tokens: 5196
[('"', 3), ('"),', 1), ('",', 2), ('".', 1), ('%)', 1), (')', 16), ('),', 4), (').', 3), ('):', 1), ('+', 1), (',', 171), ('-', 14), ('-12', 1), ('.', 138), ('.05.', 1), ('.12.', 2), ('.3.', 1), ('.4.', 1), ('.8.', 1), ('/', 9)]
Number of all types: 1663
Processing News


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
3,macocu.sq.1191613,News,115.0,sportekspres.com,"Një milimetër larg golit, “VAR” bëhet makth pë...","A millimeter away from goal, ""Var"" becomes nig...","{'text_id': 'macocu.sq.1191613', 'domain': 'sp...",News,"[▁Një, ▁mil, ime, tër, ▁larg, ▁gol, it, ,, ▁“,...","[16203, 2717, 3602, 20401, 22430, 8104, 217, 4..."
11,macocu.sq.625662,News,113.0,lajme365.com,Shërbimi Spitalor Klinik dhe Universitar i Kos...,The Clinical and University Hospital Service o...,"{'text_id': 'macocu.sq.625662', 'domain': 'laj...",News,"[▁Sh, ër, bi, mi, ▁Spi, ta, lor, ▁Klinik, ▁dhe...","[7525, 10170, 964, 266, 33365, 102, 1484, 5884..."
35,macocu.sq.439446,News,224.0,spektrum.al,"Raport, Përparimi i teknologjisë rrezikon zhdu...","Report, technology progress risks extinction o...","{'text_id': 'macocu.sq.439446', 'domain': 'spe...",News,"[▁Raport, ,, ▁Për, pari, mi, ▁i, ▁teknologji, ...","[83300, 4, 9910, 24980, 266, 17, 167614, 3567,..."


Number of all tokens: 2415
[('!', 3), (')', 1), ('),', 2), (',', 92), ('-', 8), ('-19', 1), ('.', 54), ('/', 2), ('12', 1), ('20', 1), ('21', 1), (':', 1), (';', 1), ('?', 2), ('BE', 1), ('C', 1), ('Click', 1), ('DI', 3), ('FO', 1), ('Ga', 1)]
Number of all types: 921
Processing Instruction


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
24,macocu.sq.1564538,Instruction,164.0,shqiperia.forumotion.com,"Titulli: Kujdes, video me virus në facebook Su...","Title: Care, Video Video on Facebook Sun Aug 0...","{'text_id': 'macocu.sq.1564538', 'domain': 'sh...",Instruction,"[▁Titul, li, :, ▁Kujdes, ,, ▁video, ▁me, ▁viru...","[104665, 150, 12, 238386, 4, 1202, 163, 14994,..."
25,macocu.sq.1056906,Instruction,295.0,fjalaejetes.org,Blog\n\nA ke një plan për jetën tënde — Mediti...,Blog\n\nDo you have a plan for your life - dai...,"{'text_id': 'macocu.sq.1056906', 'domain': 'fj...",Instruction,"[▁Blog, ▁A, ▁ke, ▁një, ▁plan, ▁për, ▁jetën, ▁t...","[5061, 62, 311, 898, 1774, 521, 56879, 134, 10..."
31,macocu.sq.646269,Instruction,298.0,zeropese.com,"Ju shqetëson lodhja nga adrenalina, jeni vazhd...","You worry about adrenaline fatigue, are you co...","{'text_id': 'macocu.sq.646269', 'domain': 'zer...",Instruction,"[▁Ju, ▁, shq, e, tës, on, ▁lod, hja, ▁nga, ▁ad...","[3314, 6, 36136, 13, 23036, 191, 45314, 60869,..."


Number of all tokens: 3204
[('!', 3), ('",', 1), ("'", 1), (')', 4), ('),', 2), (').', 1), ('):', 2), (',', 75), ('-', 4), ('.', 101), ('...', 1), ('.”', 1), ('/', 3), ('1', 3), ('32', 1), ('5000', 1), (':', 27), (':20', 1), (':59', 1), ('?', 6)]
Number of all types: 1173
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
0,macocu.sq.1061396,Opinion/Argumentation,341.0,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",Opinion/Argumentation,"[▁Blog, ▁“, U, në, ▁të, ▁kam, ▁dashur, ▁me, ▁n...","[5061, 52, 1062, 3208, 134, 3840, 57168, 163, ..."
27,macocu.sq.386123,Opinion/Argumentation,159.0,javanews.al,Një pyetje të thjeshtë për antikomunistët e 20...,A simple question for 2017 anti-communists\n\n...,"{'text_id': 'macocu.sq.386123', 'domain': 'jav...",Opinion/Argumentation,"[▁Një, ▁pyetje, ▁të, ▁thjeshtë, ▁për, ▁anti, k...","[16203, 107384, 134, 156723, 521, 2874, 95703,..."
34,macocu.sq.171654,News,254.0,priza.al,Agron Shehaj konfirmoi firmat/ Dy degët e PD j...,Agron Shehaj confirmed the firms/ two DP branc...,"{'text_id': 'macocu.sq.171654', 'domain': 'pri...",Opinion/Argumentation,"[▁Agro, n, ▁She, haj, ▁konfirm, oi, ▁firma, t,...","[51364, 19, 4687, 34789, 58407, 5380, 5239, 18..."


Number of all tokens: 4440
[('!', 6), ('!”', 1), ('"', 1), ("'", 6), ('(', 1), (')', 5), ('),', 2), (').', 1), (',', 141), ('-', 16), ('.', 98), ('...', 7), ('.”', 2), ('/', 3), ('1-6', 1), (':', 9), (':32', 1), (':40', 1), (';', 5), ('?', 8)]
Number of all types: 1467
Processing Forum


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
8,macocu.sq.1340549,Forum,471.0,shkodraonline.com,hahahahahahahaha naq do i kerkoj zamirit me ka...,Hahahahahahahaha naq will I ask Zamir with the...,"{'text_id': 'macocu.sq.1340549', 'domain': 'sh...",Forum,"[▁hahaha, hahahahaha, ▁na, q, ▁do, ▁i, ▁kerk, ...","[58060, 221582, 24, 864, 54, 17, 25808, 897, 8..."
12,macocu.sq.910913,Prose/Lyrical,148.0,respublica.al,"Foto 1 nga 1\n\nQeveri moj qeveri, Na e fute, ...","Photo 1 of 1\n\nGovernment of government, put ...","{'text_id': 'macocu.sq.910913', 'domain': 'res...",Forum,"[▁Foto, ▁1, ▁nga, ▁1, ▁Qe, veri, ▁moj, ▁qeveri...","[5198, 106, 817, 106, 60737, 20628, 22168, 704..."
13,macocu.sq.1281255,Forum,174.0,shkodraonline.com,Nuk ka skandal dhe herezi me te madhe se sa kj...,There is no greater scandal and heresy than th...,"{'text_id': 'macocu.sq.1281255', 'domain': 'sh...",Forum,"[▁Nuk, ▁ka, ▁skandal, ▁dhe, ▁here, zi, ▁me, ▁t...","[23302, 156, 74310, 467, 3688, 708, 163, 120, ..."


Number of all tokens: 3509
[('!', 6), ('!!', 3), ('!!!', 4), ('!!!!!', 1), ('"', 1), ('&', 1), ("'", 12), ('(', 1), (')', 2), (',', 99), ('-', 4), ('---', 1), ('------', 1), ('.', 66), ('...', 13), ('...!!', 1), ('.....', 3), ('......', 3), ('.......', 1), ('...............', 1)]
Number of all types: 1235
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
5,macocu.sq.1049407,Prose/Lyrical,109.0,teksteshqip.com,Nuk je intersant Mos e bo veten t'veçantë. Fja...,You are not interstellar don't make yourself e...,"{'text_id': 'macocu.sq.1049407', 'domain': 'te...",Prose/Lyrical,"[▁Nuk, ▁je, ▁inter, sant, ▁Mos, ▁e, ▁bo, ▁vete...","[23302, 55, 1940, 36800, 8455, 28, 337, 80830,..."
7,macocu.sq.1411292,Prose/Lyrical,126.0,zemrashqiptare.net,Sejdi Berisha: Dudi plak në Patrikanë\n\nP O E...,Sejdi Berisha: old Dudi in Patriarchate\n\nPOE...,"{'text_id': 'macocu.sq.1411292', 'domain': 'ze...",Prose/Lyrical,"[▁Se, j, di, ▁Berisha, :, ▁Du, di, ▁plak, ▁në,...","[503, 170, 428, 135148, 12, 786, 428, 26883, 3..."
9,macocu.sq.1049425,Prose/Lyrical,146.0,teksteshqip.com,Bekim Kastrati - Gurbeti Lyrics\n\nNi muj ne K...,Bekim Kastrati - Gurbeti lyrics\n\nI just in K...,"{'text_id': 'macocu.sq.1049425', 'domain': 'te...",Prose/Lyrical,"[▁Be, kim, ▁Ka, stra, ti, ▁-, ▁Gur, be, ti, ▁L...","[873, 8103, 1136, 2816, 118, 20, 41449, 372, 1..."


Number of all tokens: 3393
[('!', 8), ('!”', 2), ("'", 18), (')', 3), (').', 1), (',', 96), ('-26', 1), ('.', 54), ('...', 9), ('......', 1), ('.”', 1), ('2', 1), (':', 19), (':26', 1), (';', 2), ('?', 3), ('?”', 2), ('@', 2), ('B', 1), ('Do', 1)]
Number of all types: 1163
Processing Legal


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
2,macocu.sq.183383,Legal,140.0,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...,"{'text_id': 'macocu.sq.183383', 'domain': 'euk...",Legal,"[▁Li, ria, ▁nga, ▁keq, t, raj, timi, ▁Kon, ven...","[1261, 1651, 817, 39184, 18, 10185, 20520, 369..."
15,macocu.sq.1588366,Legal,303.0,hoteleriturizemalbania.al,kthehet për rishqyrtim në Kuvend ligjin “Për t...,returns to the Assembly for reconsideration th...,"{'text_id': 'macocu.sq.1588366', 'domain': 'ho...",Legal,"[▁kthehet, ▁për, ▁ri, shq, yr, tim, ▁në, ▁Ku, ...","[190139, 521, 1427, 36136, 12271, 5083, 322, 1..."
16,macocu.sq.1684290,Legal,88.0,komentarielektronik.magjistratura.edu.al,Neni 67-d: Shqyrtimi i çështjes\n\nGjykata Kus...,Article 67-D: Review of the case\n\nThe Consti...,"{'text_id': 'macocu.sq.1684290', 'domain': 'ko...",Legal,"[▁Ne, ni, ▁67, -, d, :, ▁Sh, qy, r, timi, ▁i, ...","[799, 93, 12661, 9, 71, 12, 7525, 73838, 42, 2..."


Number of all tokens: 2817
[('"', 1), ("'", 1), (')', 3), (',', 84), ('-', 9), ('.', 62), ('.03.2015', 1), ('.05.', 1), ('.10.', 1), ('/', 7), ('/1', 1), ('/2', 1), ('/2007', 1), ('/2008', 1), ('/2013', 1), ('/2015', 1), ('/3', 1), ('06.', 1), ('1/4', 1), ('166', 1)]
Number of all types: 903
Processing Promotion


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
18,macocu.sq.772190,Promotion,104.0,oazifestiv.al,Bashkoju komunitetit tonë\n\nA e keni shënuar ...,Join our community\n\nHave you scored your mos...,"{'text_id': 'macocu.sq.772190', 'domain': 'oaz...",Promotion,"[▁Bashk, o, ju, ▁komun, itetit, ▁tonë, ▁A, ▁e,...","[70960, 31, 461, 25560, 92593, 37799, 62, 28, ..."
20,macocu.sq.293086,Promotion,78.0,kontakt.al,"Shes Shtepi\n\nShes Apartamente 1+1, 2+1 prane...","House\n\nSell 1+1, 2+1 apartments near the new...","{'text_id': 'macocu.sq.293086', 'domain': 'kon...",Promotion,"[▁She, s, ▁Sh, te, pi, ▁She, s, ▁Apartament, e...","[4687, 7, 7525, 67, 1434, 4687, 7, 117873, 13,..."
43,macocu.sq.293573,Promotion,110.0,bkt-ks.com,Karriera në BKT\n\nIdetë tuaja sjellin ndryshi...,Career at BKT\n\nYour ideas bring about a diff...,"{'text_id': 'macocu.sq.293573', 'domain': 'bkt...",Promotion,"[▁Kar, rier, a, ▁në, ▁B, KT, ▁Ide, të, ▁tuaja,...","[3423, 25388, 11, 322, 335, 28503, 24037, 3012..."


Number of all tokens: 1795
[('!', 3), ('"', 1), (')', 1), ('+', 2), ('+1', 1), (',', 49), ('-', 5), ('-13', 1), ('-20', 1), ('.', 68), ('...', 1), ('13', 1), ('3', 1), ('3,4', 1), ('40', 1), ('46', 1), ('5', 1), ('5%', 1), ('500', 1), ('6.00', 1)]
Number of all types: 858
Processing is


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.is.639516,Legal,[is] Því er við hæfi að reglur verði settar in...,[IS] It is therefore appropriate that rules be...,{'text_id': 'macocu.is.639516'},Legal,"[▁[, is, ], ▁Því, ▁er, ▁við, ▁hæ, fi, ▁að, ▁re...","[378, 164, 268, 139806, 72, 1497, 33423, 1029,..."
1,macocu.is.1301366,Instruction,Sækja um fulla aðild \n\nKennitala * \n\nNetfa...,Apply full membership\n\nSocial Security numbe...,{'text_id': 'macocu.is.1301366'},Information/Explanation,"[▁Sæ, kja, ▁um, ▁full, a, ▁að, ild, ▁Kenn, ita...","[71595, 28643, 286, 4393, 11, 389, 38472, 5906..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,macocu.is.1301366,Instruction,Sækja um fulla aðild \n\nKennitala * \n\nNetfa...,Apply full membership\n\nSocial Security numbe...,{'text_id': 'macocu.is.1301366'},Information/Explanation,"[▁Sæ, kja, ▁um, ▁full, a, ▁að, ild, ▁Kenn, ita...","[71595, 28643, 286, 4393, 11, 389, 38472, 5906..."
2,macocu.is.1528713,Information/Explanation,Inngangur Íslenskur landbúnaður hefur þróast ö...,Introduction Icelandic agriculture has evolved...,{'text_id': 'macocu.is.1528713'},Information/Explanation,"[▁Inn, gangur, ▁Íslensk, ur, ▁land, búnað, ur,...","[11151, 160409, 122022, 474, 3551, 74026, 474,..."
16,macocu.is.238261,Information/Explanation,Það þarf geysilega öflugan tölvubúnað til að g...,It takes a very powerful computer equipment to...,{'text_id': 'macocu.is.238261'},Information/Explanation,"[▁Það, ▁þarf, ▁gey, si, lega, ▁öflug, an, ▁töl...","[10256, 33130, 97968, 172, 3257, 214160, 66, 1..."


Number of all tokens: 2151
[(')', 2), (').', 1), (',', 40), ('-', 10), ('.', 63), ('/', 1), ('006', 2), ('09.', 1), ('123', 1), ('15', 1), ('2009', 1), ('2°', 1), ('342', 1), ('3°', 1), ('500', 1), ('86', 1), (':', 3), (':15', 1), (';', 1), ('C', 2)]
Number of all types: 907
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
14,macocu.is.1066081,News,Barein í uppnámi. \n\nMótmælin í fyrra voru ba...,Bahrain upset.\n\nLast year's protests were be...,{'text_id': 'macocu.is.1066081'},News,"[▁Bare, in, ▁í, ▁upp, ná, mi, ., ▁Mó, t, mæli,...","[54336, 73, 439, 1407, 1113, 266, 5, 62453, 18..."
22,macocu.is.1619392,Information/Explanation,Aflaverðmæti jan.-okt. jókst um 15 milljarða \...,Catch value of Jan.-Oct.increased by 15 billio...,{'text_id': 'macocu.is.1619392'},News,"[▁Afla, verð, mæt, i, ▁jan, ., -, ok, t, ., ▁j...","[175724, 76729, 141685, 14, 13673, 5, 9, 685, ..."
29,macocu.is.856922,Legal,Ný lög eiga að setja upp varnir gegn hagsmunaá...,New laws are supposed to set up defense agains...,{'text_id': 'macocu.is.856922'},News,"[▁Ný, ▁lög, ▁eiga, ▁að, ▁setja, ▁upp, ▁var, ni...","[54036, 37568, 57176, 389, 93998, 1407, 285, 9..."


Number of all tokens: 4149
[('"', 6), ("'", 1), (',', 75), (',7%', 1), ('-', 16), ('-2011', 1), ('-2016', 1), ('.', 118), ('."', 1), ('/', 1), ('/2018', 1), ('1%', 1), ('16', 1), ('1998', 1), ('2', 1), ('2%', 2), ('2003', 1), ('4', 4), ('4%', 3), ('4,6', 1)]
Number of all types: 1424
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
8,macocu.is.1899016,Instruction,Sumarsúrkál \n\nSúrkál er frábært meðlæti með ...,Summer pelvic\n\nSourcke is a great treat with...,{'text_id': 'macocu.is.1899016'},Instruction,"[▁Suma, rs, úr, k, ál, ▁Sú, rk, ál, ▁er, ▁fráb...","[74663, 4295, 12595, 92, 2978, 27831, 7190, 29..."
19,macocu.is.1628328,Instruction,Kanilterta – í kaffi hjá Þóru Guðmunds \n\nKan...,Kanilterta - in coffee with Thora Guðmunds\n\n...,{'text_id': 'macocu.is.1628328'},Instruction,"[▁Kan, ilte, rta, ▁–, ▁í, ▁kaffi, ▁hjá, ▁Þór, ...","[2734, 53325, 12014, 46, 439, 186063, 16022, 5..."
37,macocu.is.1646749,Instruction,Hollenskir möndluklattar – Gevulde koek \n\nHo...,Dutch almond clutter - Gevulde Koek\n\nDutch a...,{'text_id': 'macocu.is.1646749'},Instruction,"[▁Holl, ens, kir, ▁m, önd, luk, lat, tar, ▁–, ...","[109383, 1755, 8372, 347, 111679, 5865, 2335, ..."


Number of all tokens: 3915
[('!', 1), ('"', 4), (')', 3), ('),', 2), (').', 1), (',', 75), ('-', 3), ('-1', 1), ('-3', 1), ('.', 140), ('."', 1), ('/', 1), ('2', 1), ('5°', 1), (':', 6), ('?', 2), ('@', 1), ('ATH', 1), ('C', 1), ('K', 1)]
Number of all types: 1314
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,macocu.is.2121796,Information/Explanation,Um leitina \n\nAlmenn leit \n\nNr. 11001 p1 He...,About the search\n\nSearch\n\nNo.11001 P1 Auth...,{'text_id': 'macocu.is.2121796'},Opinion/Argumentation,"[▁Um, ▁lei, tina, ▁Al, menn, ▁leit, ▁Nr, ., ▁1...","[2793, 4302, 5516, 884, 10638, 76434, 9308, 5,..."
15,macocu.is.596625,Opinion/Argumentation,Hann fæddist með miklu brambolti drengurinn se...,He was born with the great bramble boy who tur...,{'text_id': 'macocu.is.596625'},Opinion/Argumentation,"[▁Hann, ▁, fædd, ist, ▁með, ▁miklu, ▁bra, mbol...","[24179, 6, 201680, 1419, 1915, 85642, 1620, 36..."
18,macocu.is.286702,Opinion/Argumentation,Trú presta og þjóðar \n\nÍ hinum athyglisverðu...,The faith of priests and nations\n\nIn the not...,{'text_id': 'macocu.is.286702'},Opinion/Argumentation,"[▁T, rú, ▁presta, ▁og, ▁þjóð, ar, ▁Í, ▁hin, um...","[384, 11780, 11209, 60, 34762, 147, 4975, 5122..."


Number of all tokens: 4970
[('!', 4), ('!!!', 1), ('!!!!', 1), ('"', 1), ('".', 1), ('&', 1), ("'", 3), (')', 2), (').', 2), (',', 88), ('-', 17), ('.', 171), ('001', 1), ('1', 1), ('2', 1), ('200', 1), (':', 5), (';', 3), ('?', 1), ('???', 1)]
Number of all types: 1628
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
5,macocu.is.620708,Forum,Vodkabað og After eight... ...en hvað lífið ge...,Vodka bath and after eight ... ... but what li...,{'text_id': 'macocu.is.620708'},Forum,"[▁Vod, kab, að, ▁og, ▁After, ▁eight, ..., ▁......","[47572, 63911, 9068, 60, 24372, 136659, 27, 15..."
34,macocu.is.1448001,Forum,Eins og einhverjir tóku eftir á fiskifundinum ...,"As someone noticed at the last fish meeting, I...",{'text_id': 'macocu.is.1448001'},Forum,"[▁Ein, s, ▁og, ▁einhver, jir, ▁tók, u, ▁eftir,...","[2991, 7, 60, 48424, 50051, 56497, 34, 6643, 3..."
53,macocu.is.553725,Forum,"Athugasemdir \n\nÞað skilja ekki allir, hvað G...",Comments\n\nNot everyone understands what G.W....,{'text_id': 'macocu.is.553725'},Forum,"[▁At, huga, sem, dir, ▁Það, ▁skilja, ▁ekki, ▁a...","[1913, 95046, 5765, 936, 10256, 169146, 2495, ..."


Number of all tokens: 2769
[('!', 3), ('!!!', 1), ('!"', 1), ('"', 6), (')', 4), ('*', 1), (',', 58), ('-', 6), ('-2', 1), ('.', 89), ('...', 21), ('..."', 1), ('.....', 1), ('/', 5), ('17', 2), ('22', 2), ('25', 1), ('28', 1), ('4', 1), ('40', 1)]
Number of all types: 1068
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
6,macocu.is.2120473,Prose/Lyrical,Um leitina \n\nAlmenn leit \n\nNr. 7238 p1 Sók...,About the search\n\nSearch\n\nNo.7238 P1 paris...,{'text_id': 'macocu.is.2120473'},Prose/Lyrical,"[▁Um, ▁lei, tina, ▁Al, menn, ▁leit, ▁Nr, ., ▁7...","[2793, 4302, 5516, 884, 10638, 76434, 9308, 5,..."
13,macocu.is.331680,Prose/Lyrical,Meny \n\nJóhannesarguðspjallið (4/21) \n\nÞað ...,Manry\n\nJohn's Gospel (4/21)\n\nWhat you have...,{'text_id': 'macocu.is.331680'},Prose/Lyrical,"[▁Meny, ▁Jóhann, es, arg, uð, sp, jal, lið, ▁(...","[94600, 143518, 90, 61477, 13140, 7008, 5457, ..."
30,macocu.is.1652153,Prose/Lyrical,"I. KAPITULI. \n\n""Hvenær skyldi sá dagur koma,...","Chapter I.\n\n""When should that day come, that...",{'text_id': 'macocu.is.1652153'},Prose/Lyrical,"[▁I, ., ▁KAP, ITU, LI, ., ▁"", H, ven, ær, ▁sky...","[87, 5, 102379, 56024, 10927, 5, 44, 841, 1353..."


Number of all tokens: 4649
[('!', 7), ('"', 6), ('".', 1), ('(', 1), (')', 3), ('),', 2), (').', 3), ('*', 1), ('**', 1), (',', 126), ('-', 6), ('.', 149), ('."', 9), ('/', 1), ('0', 1), ('00000', 1), ('1', 1), ('10', 2), ('11', 2), ('12', 1)]
Number of all types: 1481
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.is.639516,Legal,[is] Því er við hæfi að reglur verði settar in...,[IS] It is therefore appropriate that rules be...,{'text_id': 'macocu.is.639516'},Legal,"[▁[, is, ], ▁Því, ▁er, ▁við, ▁hæ, fi, ▁að, ▁re...","[378, 164, 268, 139806, 72, 1497, 33423, 1029,..."
9,macocu.is.867399,Legal,Eftirfarandi breytingar verða á 31. gr. lagann...,The following amendments will be made to Artic...,{'text_id': 'macocu.is.867399'},Legal,"[▁Eftir, far, andi, ▁breytingar, ▁verða, ▁á, ▁...","[94685, 3814, 8915, 130168, 35045, 392, 15554,..."
11,macocu.is.534145,News,"Skipulagsráð \n\n142. fundur 2008 \n\nÁr 2008,...","Planning\n\n142nd meeting 2008\n\nYear 2008, W...",{'text_id': 'macocu.is.534145'},Legal,"[▁Skip, u, lag, s, ráð, ▁142, ., ▁, fundur, ▁2...","[91958, 34, 3668, 7, 26068, 71725, 5, 6, 14443..."


Number of all tokens: 4005
[('"', 2), ('",', 1), (')', 8), ('),', 2), (').', 1), (',', 80), ('-', 19), ('-1)', 1), ('-18', 1), ('-2', 1), ('-2005', 1), ('-3', 1), ('-4', 1), ('-5', 1), ('.', 182), ('."', 2), ('.12.', 2), ('/', 9), ('01', 2), ('03', 2)]
Number of all types: 1203
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
12,macocu.is.729974,Promotion,Hefur þú keypt eða selt bíl á bland.is? – Yfir...,Have you bought or sold a car on bland.is?- Ov...,{'text_id': 'macocu.is.729974'},Promotion,"[▁He, fur, ▁þú, ▁keypt, ▁eða, ▁, selt, ▁bíl, ▁...","[1529, 11443, 11746, 206811, 5577, 6, 11197, 6..."
17,macocu.is.2173105,Instruction,Til þess að líta alltaf vel út heimsækja stúlk...,"In order to always look good, girls visit spa ...",{'text_id': 'macocu.is.2173105'},Promotion,"[▁Til, ▁þess, ▁að, ▁lí, ta, ▁alltaf, ▁vel, ▁út...","[5064, 8939, 389, 11652, 102, 65729, 1518, 289..."
20,macocu.is.1835441,Promotion,Netspjall \n\nOther languages \n\nSpennandi st...,Chat\n\nOther language\n\nAn exciting job at t...,{'text_id': 'macocu.is.1835441'},Promotion,"[▁Net, sp, ja, ll, ▁Other, ▁language, s, ▁Spe,...","[10086, 7008, 145, 1181, 64511, 46876, 7, 2653..."


Number of all tokens: 3036
[(').', 1), (',', 58), ('-', 2), ('-16', 1), ('-21', 1), ('-22', 1), ('.', 105), ('."', 1), ('/', 1), ('101', 1), ('105', 1), ('433', 1), ('78', 3), ('990', 1), ('999', 1), (':', 10), (':00', 1), (';', 3), ('?', 4), ('@', 1)]
Number of all types: 1192
Processing uk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307..."
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
21,macocu.uk.14920944,Information/Explanation,РОЛЬ ПРОСТОРОВО-ЧАСОВОЇ ОРГАНІЗАЦІЇ В СТРУКТУР...,The role of space-time organization in the str...,{'text_id': 'macocu.uk.14920944'},Information/Explanation,"[▁Р, ОЛЬ, ▁ПРО, С, ТОР, ОВО, -, ЧА, С, ОВО, Ї,...","[5729, 207725, 29952, 1560, 78834, 144340, 9, ..."
28,macocu.uk.10117711,Information/Explanation,Мета. Метою роботи є моделювання й розробка си...,Goal.The purpose of the work is to model and d...,{'text_id': 'macocu.uk.10117711'},Information/Explanation,"[▁Мета, ., ▁Мет, ою, ▁роботи, ▁є, ▁модел, юван...","[92327, 5, 154124, 2375, 11581, 3273, 23012, 3..."
36,macocu.uk.6120150,Information/Explanation,Стандартизація унормування експертних процесів...,Standardization of normalization of expert pro...,{'text_id': 'macocu.uk.6120150'},Information/Explanation,"[▁Стандарт, из, ація, ▁у, норм, ування, ▁експе...","[136898, 4135, 26809, 84, 110756, 5401, 58219,..."


Number of all tokens: 4352
[('"', 5), ('".', 1), ("'", 6), (')', 6), ('),', 4), (').', 2), (');', 1), (',', 199), ('-', 21), ('-18', 1), ('-5', 1), ('.', 146), ('...', 2), ('004', 1), ('02.', 1), ('105', 1), ('132', 1), ('140', 1), ('150', 1), ('154', 1)]
Number of all types: 1780
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
7,macocu.uk.10146900,News,Для гостей з Румунії в Калуші проведуть майсте...,For guests from Romania in Kalush will hold ma...,{'text_id': 'macocu.uk.10146900'},News,"[▁Для, ▁гостей, ▁з, ▁Ру, мун, ії, ▁в, ▁Кал, у,...","[5074, 121642, 210, 14853, 29824, 3645, 49, 27..."
11,macocu.uk.6201875,News,Двох псевдопрацівниць Пенсійного фонду взято п...,Two pseudo -workers of the Pension Fund is tak...,{'text_id': 'macocu.uk.6201875'},News,"[▁Д, во, х, ▁псевдо, пра, ців, ни, ць, ▁Пенс, ...","[1729, 2325, 244, 202624, 11447, 14530, 358, 3..."
13,macocu.uk.19598355,News,В Одесі презентували стратегії розвитку громад...,Odessa presented community development strateg...,{'text_id': 'macocu.uk.19598355'},News,"[▁В, ▁Одес, і, ▁, презент, ували, ▁стратегії, ...","[417, 88957, 260, 6, 117900, 16931, 243134, 16..."


Number of all tokens: 3386
[('!', 1), ('!).', 1), ('"', 5), ('",', 4), ('".', 1), ('$', 1), ("'", 5), (')', 3), (').', 2), ('*', 1), (',', 155), ('-', 13), ('.', 101), ('...', 1), ('09.', 1), ('2013', 1), (':', 7), (':29', 1), (':53', 1), ('Jan', 1)]
Number of all types: 1453
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307..."
10,macocu.uk.7357466,Instruction,"Пижмо для переривання вагітності, трави виклик...","Tansy to interrupt pregnancy, herbs cause misc...",{'text_id': 'macocu.uk.7357466'},Instruction,"[▁Пи, ж, мо, ▁для, ▁пере, ри, вання, ▁вагітнос...","[16515, 861, 1455, 518, 1741, 3315, 20371, 158..."
22,macocu.uk.20482617,Instruction,Як пізнати щільність молока \n\nЩільністю моло...,How to know the milk density\n\nMilk density i...,{'text_id': 'macocu.uk.20482617'},Instruction,"[▁Як, ▁, пізна, ти, ▁щільн, ість, ▁молока, ▁Щ,...","[4910, 6, 139842, 640, 227346, 8501, 193643, 8..."


Number of all tokens: 4411
[('!', 1), ('"', 2), ("'", 8), (')', 3), ('),', 1), (').', 3), ('):', 1), (',', 182), ('-', 3), ('.', 155), ('...', 1), ('/', 2), ('07.', 1), ('2022', 1), (':', 14), (';', 3), ('?', 3), ('?»', 1), ('ID', 2), ('bank', 1)]
Number of all types: 1709
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
5,macocu.uk.1265510,Promotion,"Цей путівник створено для тих, хто потай сумує...",This guide is created for those who secretly m...,{'text_id': 'macocu.uk.1265510'},Opinion/Argumentation,"[▁Цей, ▁пут, ів, ник, ▁створен, о, ▁для, ▁тих,...","[89562, 12469, 790, 2549, 60258, 197, 518, 262..."
12,macocu.uk.6914295,Opinion/Argumentation,Правда і неправда телефільму «Сага» \n\n2020 р...,"The truth and false film of ""saga""\n\nIn 2020,...",{'text_id': 'macocu.uk.6914295'},Opinion/Argumentation,"[▁Правда, ▁і, ▁не, правда, ▁теле, фільм, у, ▁«...","[93461, 189, 77, 142081, 18293, 212517, 105, 9..."
15,macocu.uk.3300590,Opinion/Argumentation,"Безліч позитивних емоцій, пісні, танці, спілку...","Many positive emotions, songs, dances, communi...",{'text_id': 'macocu.uk.3300590'},Opinion/Argumentation,"[▁Без, ліч, ▁позитивни, х, ▁емоцій, ,, ▁пісні,...","[16041, 37792, 160382, 244, 172059, 4, 154603,..."


Number of all tokens: 5386
[('!', 13), ('!!', 1), ('!»', 3), ('"', 2), ('",', 1), ('%', 1), ('(', 2), (')', 10), (')))', 2), ('),', 1), (').', 11), (',', 234), ('-', 17), ('-16', 2), ('.', 175), ('...', 8), ('1', 1), ('1991', 1), ('1996', 1), ('2', 1)]
Number of all types: 2162
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
2,macocu.uk.8192353,Forum,------ Що ж читаєм молитву ще .......9 Ви ж мо...,------ What we read the prayer yet ....... 9 Y...,{'text_id': 'macocu.uk.8192353'},Forum,"[▁, ------, ▁Що, ▁ж, ▁чита, єм, ▁молитв, у, ▁щ...","[6, 110405, 21565, 2637, 16911, 24032, 103374,..."
6,macocu.uk.8466785,Forum,сподобалось \n\nПрозорий образ. Портрет вічног...,Liked\n\nTransparent image.Portrait of an eter...,{'text_id': 'macocu.uk.8466785'},Forum,"[▁спо, доб, а, лось, ▁Про, зор, ий, ▁образ, .,...","[12375, 14193, 59, 10041, 2443, 27285, 983, 17..."
14,macocu.uk.13839011,Forum,Параметри пошуку \n\nЯкщо це не відповідає ваш...,Search parameters\n\nIf this does not match yo...,{'text_id': 'macocu.uk.13839011'},Forum,"[▁Пара, метри, ▁пошуку, ▁Якщо, ▁це, ▁не, ▁відп...","[59283, 146331, 119528, 15963, 2157, 77, 13813..."


Number of all tokens: 3528
[('!', 12), ('!!!', 6), ('"', 15), ('".', 3), ('"?', 1), ('$', 2), ("'", 6), ('(', 3), (')', 17), (')))', 6), (').', 1), (',', 134), ('-', 11), ('------', 3), ('-100', 1), ('-16', 1), ('.', 137), ('."', 2), ('...', 9), ('..."', 1)]
Number of all types: 1491
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9..."
19,macocu.uk.480365,Prose/Lyrical,"Загадки про комах, павуків. Ігор Голомозий \n\...","Insect puzzles, spiders.Igor Golomoziy\n\nColl...",{'text_id': 'macocu.uk.480365'},Prose/Lyrical,"[▁За, гад, ки, ▁про, ▁ком, ах, ,, ▁па, ву, ків...","[829, 22302, 751, 591, 14468, 1214, 4, 753, 52..."
31,macocu.uk.18757287,Prose/Lyrical,Вихід \n\n8 Після того до Рефідı́ма прийшли ам...,"Entrance\n\n8 After that, the Amalani+ came to...",{'text_id': 'macocu.uk.18757287'},Prose/Lyrical,"[▁Ви, хід, ▁8, ▁Після, ▁того, ▁до, ▁Ре, фі, д,...","[2857, 22011, 382, 39092, 2574, 255, 5560, 213..."


Number of all tokens: 4282
[('!', 13), ('!"', 2), ('"', 1), ('")', 1), ('").', 1), ("'", 5), (')', 9), ('*', 3), ('+', 9), (',', 199), ('-', 11), ('.', 123), ('...', 25), ('1', 1), ('2', 1), ('41', 1), (':', 6), (':28', 1), (';', 1), ('?', 3)]
Number of all types: 1491
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,macocu.uk.13245909,Legal,"Про визнання майнових прав, права власності на...","On recognition of property rights, ownership o...",{'text_id': 'macocu.uk.13245909'},Legal,"[▁Про, ▁визнання, ▁майно, вих, ▁прав, ,, ▁прав...","[2443, 192396, 129232, 31493, 5999, 4, 2868, 7..."
8,macocu.uk.20112201,Legal,З 1 березня 2017 року змінено Порядок забезпеч...,"From March 1, 2017, the procedure for providin...",{'text_id': 'macocu.uk.20112201'},Legal,"[▁З, ▁1, ▁березня, ▁2017, ▁року, ▁змін, ено, ▁...","[1522, 106, 59596, 505, 4509, 21187, 7025, 155..."
17,macocu.uk.10126627,Legal,Оплачуємо викладацьку роботу \n\nКерівні праці...,We pay for teaching\n\nManagers of educational...,{'text_id': 'macocu.uk.10126627'},Legal,"[▁О, плачу, ємо, ▁виклад, аць, ку, ▁роботу, ▁К...","[1089, 126909, 53132, 93208, 11505, 928, 40097..."


Number of all tokens: 3484
[('!»', 1), ('"', 5), ('&', 4), ("'", 13), (')', 11), ('),', 3), (').', 6), (',', 141), ('-', 30), ('-1', 1), ('-21', 1), ('.', 112), ('.1.1', 1), ('.11.20', 1), ('.3.', 2), ('.4.', 1), ('.5.', 1), ('/', 9), ('/08', 1), ('/2006', 8)]
Number of all types: 1302
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
3,macocu.uk.14283095,Promotion,Круглий стіл «Корпоративна соціальна відповіда...,"Round table ""Corporate social responsibility o...",{'text_id': 'macocu.uk.14283095'},Promotion,"[▁Кру, гли, й, ▁стіл, ▁«, Кор, пор, ативна, ▁с...","[61797, 29034, 312, 199472, 94, 110879, 28551,..."
24,macocu.uk.6585419,Promotion,"Купити квартиру в Черкасах, площа кухні, м2: 5...","Buy an apartment in Cherkasy, kitchen area, m2...",{'text_id': 'macocu.uk.6585419'},Promotion,"[▁Куп, ити, ▁квартиру, ▁в, ▁Черкас, ах, ,, ▁пл...","[48228, 3494, 114527, 49, 224761, 1214, 4, 632..."
26,macocu.uk.1776824,Promotion,Rocket Espresso \n\nRocket Espresso Milano - і...,Rocket Espresso\n\nRocket Espresso Milano is a...,{'text_id': 'macocu.uk.1776824'},Promotion,"[▁Rock, et, ▁Espresso, ▁Rock, et, ▁Espresso, ▁...","[14434, 126, 240751, 14434, 126, 240751, 30717..."


Number of all tokens: 2848
[('!', 1), ('&', 7), ("'", 10), (')', 4), ('),', 1), (').', 2), ('+', 1), (',', 108), ('-', 26), ('.', 82), ('/', 1), ('2', 1), ('2.', 1), ('3', 1), ('3-', 1), ('43', 1), ('6', 1), ('61', 1), ('8', 1), (':', 21)]
Number of all types: 1318
Processing ca


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.ca.2248072,Opinion/Argumentation,Pàgines \n\nEnfeinada \n\nPorto uns dies una m...,Pages\n\nCoined\n\nI have been a little busy f...,{'text_id': 'macocu.ca.2248072'},Forum,"[▁P, à, gine, s, ▁En, fei, nada, ▁Porto, ▁uns,...","[436, 1298, 63023, 7, 357, 51899, 28866, 24952..."
1,macocu.ca.756254,Information/Explanation,Info \n\nLa Casa nova dels Banys de Sant Vicen...,Info\n\nThe Casa Nova dels Banys de Sant Vicen...,{'text_id': 'macocu.ca.756254'},Information/Explanation,"[▁Info, ▁La, ▁Casa, ▁nova, ▁dels, ▁Ban, ys, ▁d...","[14048, 239, 8591, 4678, 2323, 5458, 4778, 8, ..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,macocu.ca.756254,Information/Explanation,Info \n\nLa Casa nova dels Banys de Sant Vicen...,Info\n\nThe Casa Nova dels Banys de Sant Vicen...,{'text_id': 'macocu.ca.756254'},Information/Explanation,"[▁Info, ▁La, ▁Casa, ▁nova, ▁dels, ▁Ban, ys, ▁d...","[14048, 239, 8591, 4678, 2323, 5458, 4778, 8, ..."
13,macocu.ca.401308,Information/Explanation,Josep Trò \n\nTrombonista nascut a Calp (Alaca...,Josep Tros\n\nTrombonist born in Calpe (Alican...,{'text_id': 'macocu.ca.401308'},Information/Explanation,"[▁Josep, ▁T, rò, ▁Tro, mbo, nista, ▁nascut, ▁a...","[55337, 384, 23017, 8302, 7567, 81680, 141347,..."
17,macocu.ca.525727,News,El SSIBE presenta la Memòria de Sostenibilitat...,The SSIBE presents the 2020 Sustainability Mem...,{'text_id': 'macocu.ca.525727'},Information/Explanation,"[▁El, ▁, SSI, BE, ▁presenta, ▁la, ▁Mem, ò, ria...","[540, 6, 98529, 20090, 12198, 21, 17443, 2516,..."


Number of all tokens: 4017
[("'", 88), (')', 15), ('),', 1), (').', 5), (',', 127), ('-', 12), ('-19', 3), ('.', 100), ('...', 1), ('.[1]', 1), ('/', 3), ('/07', 1), ('/08', 1), ('01', 1), ('070', 1), ('08)', 1), ('1/10', 1), ('10', 1), ('1945', 2), ('1968', 2)]
Number of all types: 1465
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
9,macocu.ca.982680,News,Esteu aquí: Inici / Cambrils / Ajuntament / Co...,You are here: Start / Cambrils / City Council ...,{'text_id': 'macocu.ca.982680'},News,"[▁Este, u, ▁aquí, :, ▁Inici, ▁/, ▁Cam, bril, s...","[3224, 34, 9877, 12, 74572, 248, 10071, 74143,..."
27,macocu.ca.3244912,News,"23/09/2015 \n\nLa Fundació Apadis, juntament a...","09/23/2015\n\nThe Apadis Foundation, together ...",{'text_id': 'macocu.ca.3244912'},News,"[▁23, /09/, 2015, ▁La, ▁Fundació, ▁Apa, dis, ,...","[1105, 44270, 5357, 239, 91571, 9795, 3827, 4,..."
38,macocu.ca.3837466,News,BASURAS posa el seu granet de sorra. Projecció...,Basuras puts his little bit of sand.Screening ...,{'text_id': 'macocu.ca.3837466'},News,"[▁BAS, URA, S, ▁posa, ▁el, ▁seu, ▁gran, et, ▁d...","[96971, 45593, 294, 41471, 88, 1169, 2855, 126..."


Number of all tokens: 2612
[('!', 1), ("'", 8), (')', 2), ('),', 3), (').', 1), (',', 97), ('-', 12), ('-3', 1), ('.', 48), ('...).', 1), ('/', 1), ('/04/', 1), ('/09/', 2), ('1', 1), ('137', 1), ('2015', 1), ('2016', 1), ('2018', 1), ('442', 1), ('6.000', 1)]
Number of all types: 994
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
2,macocu.ca.1944492,Instruction,Mousse de llimona Escola Mare de Deu de Montse...,Lemon Mousse School Mare de Deu de Montserrat ...,{'text_id': 'macocu.ca.1944492'},Instruction,"[▁Mou, sse, ▁de, ▁l, limo, na, ▁Escola, ▁Mare,...","[45709, 2619, 8, 96, 70430, 76, 41483, 22172, ..."
45,macocu.ca.1029906,Instruction,Guisat d’azukis amb carbassa \n\nUn plat de ta...,Azukis stew with pumpkin\n\nAn Autumn-Winter P...,{'text_id': 'macocu.ca.1029906'},Instruction,"[▁Gui, sat, ▁d, ’, a, zuki, s, ▁amb, ▁carb, as...","[53513, 6489, 104, 26, 11, 174529, 7, 920, 111..."
64,macocu.ca.3352777,Instruction,"Elaboració Descongelarem el salmó durant 1,5 h...",Elaboration will thaw the salmon for 1.5 hours...,{'text_id': 'macocu.ca.3352777'},Instruction,"[▁E, laboració, ▁Desc, onge, lar, em, ▁el, ▁sa...","[241, 49457, 68320, 63675, 320, 195, 88, 1552,..."


Number of all tokens: 2133
[('!!!!!!!', 1), ("'", 18), (')', 7), ('),', 1), (').', 1), (',', 63), ('-', 25), ('-20', 2), ('.', 63), ('...', 3), ('......', 1), ('.........', 1), ('.............', 1), ('/', 2), ('/12/', 1), ('/2009', 1), ('00', 1), ('0000', 1), ('1', 2), ('23', 2)]
Number of all types: 835
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
7,macocu.ca.538780,Opinion/Argumentation,"Caos \n\nRenou, trànsit, multitud, desordre, c...","Chaos\n\nRenou, traffic, crowd, clutter, chaos...",{'text_id': 'macocu.ca.538780'},Opinion/Argumentation,"[▁Cao, s, ▁Re, nou, ,, ▁trànsit, ,, ▁multitud,...","[56978, 7, 853, 3165, 4, 223676, 4, 95937, 4, ..."
21,macocu.ca.5024991,Opinion/Argumentation,Tele-treball o presencialitat? \n\nEl correu e...,Tele-work or face-to-face?\n\nThe email that E...,{'text_id': 'macocu.ca.5024991'},Opinion/Argumentation,"[▁Tele, -, tre, ball, ▁o, ▁presencial, itat, ?...","[10142, 9, 2921, 12126, 36, 195059, 6152, 32, ..."
24,macocu.ca.1828812,Opinion/Argumentation,Fa uns mesos la vaig repescar en un dels canal...,A few months ago I repeated it in one of the T...,{'text_id': 'macocu.ca.1828812'},Opinion/Argumentation,"[▁Fa, ▁uns, ▁mesos, ▁la, ▁vaig, ▁repe, scar, ▁...","[3036, 2234, 72006, 21, 25728, 69495, 39719, 2..."


Number of all tokens: 3368
[('!', 6), ('!!!', 1), ('!).', 2), ("'", 16), ('(', 1), (')', 2), (').', 1), (',', 155), ('-', 18), ('.', 87), ('...', 7), ('/', 2), ('1', 2), ('2021', 1), ('3/', 2), ('300', 2), (':', 5), ('://', 1), (';', 6), ('?', 6)]
Number of all types: 1187
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.ca.2248072,Opinion/Argumentation,Pàgines \n\nEnfeinada \n\nPorto uns dies una m...,Pages\n\nCoined\n\nI have been a little busy f...,{'text_id': 'macocu.ca.2248072'},Forum,"[▁P, à, gine, s, ▁En, fei, nada, ▁Porto, ▁uns,...","[436, 1298, 63023, 7, 357, 51899, 28866, 24952..."
3,macocu.ca.5371529,Forum,"Mourinho: Les mateixes paranoies a Anglaterra,...","Mourinho: The same paranoia in England, Italy ...",{'text_id': 'macocu.ca.5371529'},Forum,"[▁Mourinho, :, ▁Les, ▁mateix, es, ▁para, no, i...","[136691, 12, 1734, 24253, 90, 121, 157, 3387, ..."
10,macocu.ca.5477726,Forum,3 participantes \n\nAutor \n\nMensaje \n\nTema...,3 participants\n\nAuthor\n\nMensaje\n\nTopic: ...,{'text_id': 'macocu.ca.5477726'},Forum,"[▁3, ▁participantes, ▁Autor, ▁Men, sa, je, ▁Te...","[138, 44247, 11328, 1111, 433, 236, 17533, 12,..."


Number of all tokens: 3755
[('!', 26), ('!!', 6), ('!!!', 7), ('!!!!!!', 1), ('!),', 1), ('"', 5), ('&', 2), ("'", 59), (')', 8), ('),', 1), (').', 3), (');', 1), ('+', 1), (',', 159), ('-', 12), ('.', 72), ('...', 46), ('...)', 1), ('.....', 5), ('......', 3)]
Number of all types: 1257
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
5,macocu.ca.5857784,Opinion/Argumentation,Enllaços ràpids \n\nEmilio i Rosa han creat un...,Fast Links\n\nEmilio and Rosa have created a p...,{'text_id': 'macocu.ca.5857784'},Prose/Lyrical,"[▁En, lla, ços, ▁ràpid, s, ▁Emilio, ▁i, ▁Rosa,...","[357, 1165, 53455, 138107, 7, 175208, 17, 2964..."
8,macocu.ca.919403,Prose/Lyrical,Esborranys d’una oda al prefaci de la primera ...,Drafts of an ode to the preface to the first e...,{'text_id': 'macocu.ca.919403'},Prose/Lyrical,"[▁Es, bor, ran, ys, ▁d, ’, una, ▁oda, ▁al, ▁pr...","[1184, 3422, 1603, 4778, 104, 26, 1946, 23571,..."
12,macocu.ca.1184407,Prose/Lyrical,Sardana: Colla Dansa Eterna Autor de la Música...,Sardana: Eternal dance group Author of music: ...,{'text_id': 'macocu.ca.1184407'},Prose/Lyrical,"[▁Sar, dana, :, ▁Coll, a, ▁Dans, a, ▁E, terna,...","[7374, 15586, 12, 88842, 11, 12612, 11, 241, 3..."


Number of all tokens: 4092
[('!', 27), ('!!', 1), ('!).', 1), ('!”', 8), ('"', 1), ("'", 65), (')', 3), (').', 1), (',', 193), ('-', 19), ('.', 131), ('...', 7), ('.«', 1), ('1981', 1), ('297', 1), (':', 18), (';', 3), ('?', 9), ('A', 2), ('ANY', 1)]
Number of all types: 1353
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,macocu.ca.2389428,Legal,"LLEI 15/2005, de 27 de desembre, de reforma pa...","Law 15/2005, of December 27, on partial reform...",{'text_id': 'macocu.ca.2389428'},Legal,"[▁L, LEI, ▁15, /2005, ,, ▁de, ▁27, ▁de, ▁desem...","[339, 96130, 423, 105523, 4, 8, 1438, 8, 40333..."
16,macocu.ca.2580070,Legal,22/04/2022 \n\nNomés se'n demanarà el seu ús e...,04/22/2022\n\nOnly use will be requested in ce...,{'text_id': 'macocu.ca.2580070'},Legal,"[▁22, /04/, 2022, ▁Només, ▁se, ', n, ▁demanar,...","[1039, 48409, 151159, 169332, 40, 25, 19, 1447..."
18,macocu.ca.2935961,Legal,Funcionament \n\nD'entre els membres dels CBAs...,Functioning\n\nAmong the members of the CBAS w...,{'text_id': 'macocu.ca.2935961'},Legal,"[▁Funciona, ment, ▁D, ', entre, ▁els, ▁membres...","[214678, 674, 391, 25, 22089, 1115, 35864, 232..."


Number of all tokens: 3336
[('",', 1), ("'", 55), (')', 9), ('),', 8), (').', 5), (',', 151), ('-', 24), ('-19', 2), ('-5', 1), ('.', 65), ('/', 27), ('/03/', 1), ('/04/', 1), ('/08', 1), ('/2000', 1), ('/2002', 3), ('/2003', 1), ('/2004', 1), ('/2005', 1), ('/2010', 2)]
Number of all types: 1095
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
11,macocu.ca.1102508,Promotion,Hacks per a dones emprenedores a Lleida \n\nLe...,Hacks for Entrepreneurial Women in Lleida\n\nE...,{'text_id': 'macocu.ca.1102508'},Promotion,"[▁Hack, s, ▁per, ▁a, ▁dones, ▁, emprenedor, es...","[52922, 7, 117, 10, 62251, 6, 242180, 90, 10, ..."
15,macocu.ca.2044996,Promotion,RECURSOS ELECTRòNICS DE L'àMBIT DE NàUTICA \n\...,Electronic Resources in the field of nautical\...,{'text_id': 'macocu.ca.2044996'},Promotion,"[▁REC, URS, OS, ▁E, LECT, R, ò, NIC, S, ▁DE, ▁...","[98166, 112045, 7285, 241, 144832, 1052, 2516,..."
23,macocu.ca.3741786,Promotion,Alchimiaweb.com posa al teu abast aquesta prem...,Alchimiaweb.com offers this professional elect...,{'text_id': 'macocu.ca.3741786'},Promotion,"[▁Al, chi, mia, web, ., com, ▁posa, ▁al, ▁teu,...","[884, 1861, 7605, 14051, 5, 277, 41471, 144, 1..."


Number of all tokens: 4231
[('"', 1), ('&', 1), ("'", 51), ('(', 1), (')', 4), ('),', 1), (').', 2), (',', 135), ('-', 11), ('.', 109), ('/', 5), ('/03/', 1), ('/05/20', 1), ('000.000', 1), ('1.4', 1), ('173', 1), ('2', 2), ('2015', 1), ('2017', 1), ('2022', 1)]
Number of all types: 1444
Processing mk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
18,CLASSLA-web.mk.138126,Information/Explanation,Широката распространетост на РЕЛ-заварувањето ...,The widespread distribution of the relief and ...,"CLASSLA-web.mk.138126', 'domain': 'emiter.com....",Information/Explanation,"[▁, Широк, ата, ▁рас, простран, ет, ост, ▁на, ...","[6, 184470, 1912, 3433, 115762, 1730, 7204, 29..."
22,CLASSLA-web.mk.18600,Information/Explanation,Топењето на мразот е посебен феномен од 21-от ...,Melting ice is a special 21st century phenomen...,"CLASSLA-web.mk.18600', 'domain': 'radar.mk'}",Information/Explanation,"[▁Топ, ењето, ▁на, ▁мраз, от, ▁е, ▁посебен, ▁ф...","[45805, 63156, 29, 193740, 512, 218, 186321, 1..."
32,CLASSLA-web.mk.313556,Information/Explanation,Сексуално преносливи болести\nАвтори:\n978-608...,Sexually Transmitted Diseases Authors: 978-608...,"CLASSLA-web.mk.313556', 'domain': '1000knigi.m...",Information/Explanation,"[▁Секс, у, ално, ▁пренос, ливи, ▁болести, ▁Авт...","[148406, 105, 20743, 92332, 36561, 45011, 3293..."


Number of all tokens: 3324
[('!', 3), ('"', 9), ('",', 2), ('".', 2), ('%', 1), (')', 5), (').', 2), (',', 118), ('-', 12), ('-24', 1), ('-30', 1), ('-8', 1), ('.', 109), ('.000', 1), ('.10.', 1), ('/', 3), ('/10', 1), ('1', 2), ('1%', 1), ('1974', 1)]
Number of all types: 1334
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44..."
7,CLASSLA-web.mk.1109768,News,Фотографијата на дете залепено на брзиномер е ...,The photo of a child glued to a speeder is old...,"CLASSLA-web.mk.1109768', 'domain': 'vistinomer...",News,"[▁Фотограф, ијата, ▁на, ▁дете, ▁за, леп, ено, ...","[170011, 24628, 29, 23744, 61, 17583, 7025, 29..."
8,CLASSLA-web.mk.1220253,News,Министерката Царовска нагласи дека првпат конк...,Minister Jarovska emphasized that for the firs...,"CLASSLA-web.mk.1220253', 'domain': 'novatv.mk'}",News,"[▁Министер, ката, ▁Цар, овска, ▁нагласи, ▁дека...","[133082, 4120, 42701, 95557, 172403, 2038, 228..."


Number of all tokens: 3778
[('"', 11), ('",', 5), ("'", 1), ('(', 1), (')', 1), ('),', 2), (').', 1), (',', 116), ('-', 7), ('.', 86), ('...', 1), ('..."', 1), ('.11.', 1), ('/', 1), ('/18', 1), ('02.', 1), ('05', 1), ('13', 1), ('190', 1), ('2', 1)]
Number of all types: 1447
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
2,CLASSLA-web.mk.1043814,Instruction,Најголем фактор на ризик за развој на проширен...,The biggest risk factor for the development of...,"CLASSLA-web.mk.1043814', 'domain': 'puls24.mk'}",Instruction,"[▁Најголем, ▁фактор, ▁на, ▁ризик, ▁за, ▁развој...","[238783, 25873, 29, 50641, 61, 25348, 29, 591,..."
4,CLASSLA-web.mk.1051753,Instruction,Запознавањето нови луѓе може да биде предизвик...,"Getting to know new people can be a challenge,...","CLASSLA-web.mk.1051753', 'domain': 'kafepauza....",Instruction,"[▁За, познавање, то, ▁нови, ▁луѓе, ▁може, ▁да,...","[829, 189619, 328, 9502, 27118, 1252, 69, 8814..."
47,CLASSLA-web.mk.400041,Instruction,За ојачување на имунитетот многу е важен видот...,The type and quality of the groceries we eat i...,"CLASSLA-web.mk.400041', 'domain': 'balansplus....",Instruction,"[▁За, ▁о, ја, чување, ▁на, ▁имунитет, от, ▁мно...","[829, 407, 1852, 91482, 29, 218047, 512, 9582,..."


Number of all tokens: 3271
[('!', 1), ('"', 1), ('%', 1), ('(', 1), (')', 2), (',', 113), ('-', 2), ('->', 9), ('.', 110), ('...', 1), ('/', 7), ('10', 2), ('8/', 2), (':', 6), ('?', 5), ('?"', 3), ('A', 1), ('Ap', 1), ('B', 1), ('Dis', 3)]
Number of all types: 1158
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
12,CLASSLA-web.mk.1247279,Opinion/Argumentation,"ФАЛБИ БЕЗ СРАМ!\nИмено, овој исклучително необ...","Falcades without shame! Namely, this extremely...","CLASSLA-web.mk.1247279', 'domain': 'proverkana...",Opinion/Argumentation,"[▁Ф, АЛ, БИ, ▁БЕЗ, ▁С, РАМ, !, ▁Име, но, ,, ▁о...","[4560, 23151, 53267, 149862, 589, 203826, 38, ..."
13,CLASSLA-web.mk.125691,Information/Explanation,Со зголемување на густината на радиосообраќајо...,With increasing radio traffic density and the ...,"CLASSLA-web.mk.125691', 'domain': 'emiter.com....",Opinion/Argumentation,"[▁Со, ▁зголемување, ▁на, ▁густ, ината, ▁на, ▁р...","[4056, 149598, 29, 122398, 87755, 29, 21220, 6..."
17,CLASSLA-web.mk.1373627,Opinion/Argumentation,На лузните треба да сме горди затоа што тие се...,We need to be proud of scars because they are ...,"CLASSLA-web.mk.1373627', 'domain': 'navalica.c...",Opinion/Argumentation,"[▁На, ▁лу, з, ните, ▁треба, ▁да, ▁сме, ▁горд, ...","[672, 38974, 1316, 1523, 4553, 69, 5063, 11638..."


Number of all tokens: 3007
[('!', 2), ('!"', 1), ('"', 14), ('",', 5), ('".', 1), (')', 3), (').', 2), ('*', 5), ('+', 1), (',', 111), ('-', 4), ('.', 70), ('...', 3), ('..."', 1), ('/', 3), ('13', 1), ('37', 1), ('5', 1), (':', 5), ('://', 1)]
Number of all types: 1237
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416..."
3,CLASSLA-web.mk.1050389,Forum,Провери си тука http://www.autobulbsdirect.co....,Check out here http://www.autobulbsdirect.co.u...,"CLASSLA-web.mk.1050389', 'domain': 'forum.carc...",Forum,"[▁Про, вер, и, ▁си, ▁тука, ▁http, ://, www, .,...","[2443, 3806, 89, 801, 72456, 1621, 696, 1574, ..."
16,CLASSLA-web.mk.1342417,Forum,Душички најмили имав едно кученце многу си го ...,I had my dearest my dearest I loved him very m...,"CLASSLA-web.mk.1342417', 'domain': 'forum.femi...",Forum,"[▁Душ, ички, ▁нај, ми, ли, ▁има, в, ▁едно, ▁ку...","[186606, 26613, 4308, 827, 546, 1698, 652, 807..."


Number of all tokens: 4072
[('!', 4), ('"', 8), ('",', 1), ('".', 1), ('&', 1), ("'", 2), ('(', 7), (')', 13), ('),', 1), (').', 2), (',', 136), ('-', 7), ('.', 112), ('...', 21), ('.4.', 1), ('/', 9), ('1', 1), ('18', 1), ('2', 1), ('2010', 1)]
Number of all types: 1536
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
10,CLASSLA-web.mk.1231279,Prose/Lyrical,"Вести\nсветиот свештеномаченик Елевтериј,\nсве...","News Holy Martyr Elevterius, St. The Church re...","CLASSLA-web.mk.1231279', 'domain': 'preminport...",Prose/Lyrical,"[▁Вести, ▁свети, от, ▁све, штен, ома, че, ник,...","[88598, 50247, 512, 3913, 102152, 31705, 1794,..."
11,CLASSLA-web.mk.1234991,Prose/Lyrical,Тропар\nТропар на светите Христови маченици Ки...,Tropar Tropar of the Holy Martyrs of Christ Ki...,"CLASSLA-web.mk.1234991', 'domain': 'preminport...",Prose/Lyrical,"[▁Тро, пар, ▁Тро, пар, ▁на, ▁свет, ите, ▁Христ...","[48002, 20141, 48002, 20141, 29, 8802, 742, 87..."
20,CLASSLA-web.mk.1389858,Prose/Lyrical,Сестра Еврозија: Смиреноста е надеж и надежта ...,Sister Eurosia: Comfort is hope and hope is ca...,"CLASSLA-web.mk.1389858', 'domain': 'katolici.mk'}",Prose/Lyrical,"[▁Се, стра, ▁Евро, зија, :, ▁С, мир, е, носта,...","[6891, 7133, 61282, 42325, 12, 589, 8688, 103,..."


Number of all tokens: 3892
[('!', 10), ('!!!', 1), ('!"', 2), ('",', 1), ('".', 1), ("'", 1), (')', 1), (',', 184), ('-', 2), ('.', 107), ('."', 1), ('...', 7), ('06.', 1), ('2012', 1), (':', 18), (';', 5), ('?', 4), ('a', 1), ('deli', 1), ('e', 61)]
Number of all types: 1245
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
14,CLASSLA-web.mk.1282476,Legal,Изјава за приватност\nДневни Новости ја почиту...,Privacy Statement Daily News respects the priv...,"CLASSLA-web.mk.1282476', 'domain': 'dnevninovo...",Legal,"[▁Изјава, ▁за, ▁приватност, ▁Д, не, вни, ▁Ново...","[182663, 61, 213350, 1729, 336, 47124, 97836, ..."
60,CLASSLA-web.mk.556171,Legal,Целта за носење Предлог – законот за градежнит...,The purpose of adopting the Draft Law on Const...,"CLASSLA-web.mk.556171', 'domain': 'pravdiko.mk'}",Legal,"[▁Целта, ▁за, ▁носе, ње, ▁Предлог, ▁–, ▁законо...","[147967, 61, 104412, 1412, 167404, 46, 153692,..."
63,CLASSLA-web.mk.609997,Legal,-Усвојување на Записникот од 27-та пленарна се...,-Uagement of the minutes of the 27th plenary s...,"CLASSLA-web.mk.609997', 'domain': 'kiselavoda....",Legal,"[▁-, У, свој, ување, ▁на, ▁За, пис, никот, ▁од...","[20, 3159, 181098, 6154, 29, 829, 15309, 65212..."


Number of all tokens: 3412
[('"', 5), (')', 5), ('),', 2), (').', 2), (');', 2), ('+3', 1), (',', 130), ('-', 5), ('-0', 8), ('-31', 1), ('.', 70), ('.000,00', 4), ('.04.20', 4), ('.12.', 1), ('/', 15), ('/05', 1), ('/06', 1), ('/07', 1), ('/08', 1), ('/10', 1)]
Number of all types: 1050
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
6,CLASSLA-web.mk.1107538,Promotion,"BOSCH WAN24063BY, Серија 4, Машина за перење а...","Bosch WAN24063BY, Series 4, Washing Machine wi...","CLASSLA-web.mk.1107538', 'domain': 'setec.mk'}",Promotion,"[▁BO, SCH, ▁, WAN, 240, 63, BY, ,, ▁Сер, ија, ...","[14216, 97692, 6, 53912, 62635, 15748, 36354, ..."
21,CLASSLA-web.mk.159292,Promotion,"Почитувани бруцоши, ни претставува особена чес...","Distinguished freshmen, it is a great honor fo...","CLASSLA-web.mk.159292', 'domain': 'finki.ukim....",Promotion,"[▁По, чит, увани, ▁, бру, цо, ши, ,, ▁ни, ▁пре...","[901, 34496, 119894, 6, 58703, 38140, 4774, 4,..."
23,CLASSLA-web.mk.203159,Opinion/Argumentation,"Main menu\nПребарување\nHacklab пријави\nВто, ...","Main Menu Search Hacklab Report Second, 09/01/...","CLASSLA-web.mk.203159', 'domain': 'slobodensof...",Promotion,"[▁Main, ▁menu, ▁Пре, бар, ување, ▁Hack, lab, ▁...","[12321, 8026, 9783, 8868, 6154, 52922, 6114, 1..."


Number of all tokens: 2883
[('!', 2), ('"', 5), ('",', 1), (')', 2), ('),', 1), ('++', 1), (',', 113), ('-', 8), ('.', 74), ('...', 1), ('.09.2015', 1), ('/01/', 1), ('02', 1), ('18', 2), ('2', 1), ('2009', 1), ('229', 1), ('240', 1), ('3', 1), ('3⁄4', 1)]
Number of all types: 1298
Processing hr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.hr.1033815,Promotion,"O proizvodu\nColor Transformer, za pametno i j...","About the Color Transformer product, for smart...","CLASSLA-web.hr.1033815', 'domain': 'hairshop.hr'}",Promotion,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, former, ,, ...","[180, 43170, 34, 51193, 11062, 82772, 4, 80, 6..."
2,CLASSLA-web.hr.1119579,Promotion,Sunčano selo / Sunny village\nNa obroncima Bil...,Sunshine / Sunny Village on the slopes of Bilo...,"CLASSLA-web.hr.1119579', 'domain': 'vikendi.com'}",Promotion,"[▁Sun, čan, o, ▁se, lo, ▁/, ▁Sunny, ▁village, ...","[7550, 17129, 31, 40, 365, 248, 151197, 54427,..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
3,CLASSLA-web.hr.1147303,Information/Explanation,Proteini sirutke korisni i ženama\nProteini si...,Cheese proteins also useful to women's whey pr...,"CLASSLA-web.hr.1147303', 'domain': 'vitamini.hr'}",Information/Explanation,"[▁Protein, i, ▁si, rut, ke, ▁koris, ni, ▁i, ▁ž...","[129228, 14, 78, 16014, 350, 107760, 93, 17, 2..."
10,CLASSLA-web.hr.1486486,Opinion/Argumentation,"SADRŽAJ\nDrama. Tridesete godine u Njemačkoj, ...",The contents of the drama. In the thirties in ...,"CLASSLA-web.hr.1486486', 'domain': 'tvprofil.c...",Information/Explanation,"[▁SAD, R, Ž, AJ, ▁Drama, ., ▁Tri, deset, e, ▁g...","[47854, 1052, 12373, 25582, 46406, 5, 4699, 64..."
14,CLASSLA-web.hr.160326,Information/Explanation,O klubu\nKlub za starije osobe „Mariška“ nepro...,"The club for the elderly ""Mariska"" club is a n...","CLASSLA-web.hr.160326', 'domain': 'klub-marisk...",Information/Explanation,"[▁O, ▁klubu, ▁Klub, ▁za, ▁starije, ▁osobe, ▁„,...","[180, 32140, 23512, 80, 198117, 42941, 137, 63..."


Number of all tokens: 4027
[('!', 3), ('".', 1), ("'", 3), ("''", 2), (')', 21), ('),', 7), (').', 7), ('):', 1), (',', 147), ('-', 15), ('-17', 2), ('-21', 1), ('-31', 9), ('-8', 1), ('.', 106), ('...', 1), ('.11.20', 4), ('.12.', 24), ('.4.', 1), ('.”', 1)]
Number of all types: 1646
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
32,CLASSLA-web.hr.265086,News,POČEO FESTIVAL\nVodeća imena svjetske glazbene...,The festival of the leading names of the World...,"CLASSLA-web.hr.265086', 'domain': 'metro-porta...",News,"[▁PO, ČE, O, ▁FESTIVAL, ▁Vod, e, ća, ▁imena, ▁...","[8574, 79881, 670, 222225, 47572, 13, 11309, 8..."
34,CLASSLA-web.hr.2707746,News,Pretraži sadržaj\nMUH – Protiv podizanja cijen...,Search Muh Content - Against Raising the price...,"CLASSLA-web.hr.2707746', 'domain': 'buz.hr'}",News,"[▁Pret, raži, ▁sadržaj, ▁MU, H, ▁–, ▁Pro, tiv,...","[80061, 69602, 44322, 11183, 841, 46, 1250, 13..."
41,CLASSLA-web.hr.3264482,News,Foto: PR Photos\nFrontmen grupe Guns N' Roses ...,Photo: PO Photos frontman Guns n 'Roses receiv...,"CLASSLA-web.hr.3264482', 'domain': 'mojtv.hr'}",News,"[▁Foto, :, ▁PR, ▁Photos, ▁Front, men, ▁grupe, ...","[5198, 12, 10865, 79632, 43643, 1055, 52760, 2..."


Number of all tokens: 3237
[('"', 6), ('",', 7), ('".', 1), ('&', 2), ("'", 7), (')', 2), ('),', 1), (',', 146), ('-', 9), ('-19', 1), ('.', 81), ('...', 1), ('/', 4), ('02.', 1), ('11', 1), ('169', 1), ('1971', 1), ('2.', 1), ('20', 1), ('2010', 1)]
Number of all types: 1447
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
48,CLASSLA-web.hr.3619805,Instruction,Priprema\nIzrežite govedinu i povrće na kockic...,Preparation cut beef and vegetables into cubes...,"CLASSLA-web.hr.3619805', 'domain': 'centarzdra...",Instruction,"[▁Pri, prema, ▁Iz, re, žite, ▁go, ved, inu, ▁i...","[2319, 109887, 3519, 107, 74460, 738, 4126, 42..."
51,CLASSLA-web.hr.377808,Instruction,Stvari koje u seksu nisu poželjne\nPreglasno u...,Things that are not desirable in sexual sighs ...,"CLASSLA-web.hr.377808', 'domain': 'cosmopolita...",Instruction,"[▁St, vari, ▁koje, ▁u, ▁seks, u, ▁nisu, ▁požel...","[2907, 21690, 3304, 75, 6530, 34, 16968, 20239..."
61,CLASSLA-web.hr.4211067,Instruction,Paleo i gluten free torta sa šljivama za kasno...,Paleo and gluten free plums with a late summer...,"CLASSLA-web.hr.4211067', 'domain': 'miss7zdrav...",Instruction,"[▁Pale, o, ▁i, ▁gluten, ▁free, ▁torta, ▁sa, ▁š...","[62814, 31, 17, 69065, 4092, 118341, 57, 9717,..."


Number of all tokens: 1701
[('"', 1), ('(', 1), (')', 2), (',', 42), ('-', 1), ('.', 59), ('9%', 1), (':', 6), ('@', 1), ('C', 1), ('Kad', 1), ('Mal', 1), ('O', 1), ('S', 2), ('Sta', 1), ('Tor', 1), ('a', 31), ('aj', 1), ('ak', 2), ('aka', 1)]
Number of all types: 777
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
15,CLASSLA-web.hr.1624888,Opinion/Argumentation,Gaženje principa kao jeftina medijska strategi...,Pushing principles as a cheap media strategy w...,"CLASSLA-web.hr.1624888', 'domain': 'muzika.hr'}",Opinion/Argumentation,"[▁Ga, ženje, ▁princip, a, ▁kao, ▁jeftin, a, ▁m...","[2902, 67180, 24702, 11, 2428, 209083, 11, 804..."
20,CLASSLA-web.hr.202668,Opinion/Argumentation,Guingamp: Više članova nego stanovnika\nPrije ...,Guingamp: More members than inhabitants less t...,"CLASSLA-web.hr.202668', 'domain': 'nogometplus...",Opinion/Argumentation,"[▁Gu, ing, amp, :, ▁Više, ▁članova, ▁nego, ▁st...","[6955, 214, 25133, 12, 82356, 122298, 9807, 20..."
26,CLASSLA-web.hr.229413,Opinion/Argumentation,Poezija koju ćete razumijeti: “Oda godinama”\n...,"The poetry you will understand: ""ODA years"" po...","CLASSLA-web.hr.229413', 'domain': 'zagrebonlin...",Opinion/Argumentation,"[▁Po, e, zija, ▁koju, ▁ćete, ▁razumije, ti, :,...","[663, 13, 35858, 15781, 44685, 130906, 118, 12..."


Number of all tokens: 3427
[('!', 5), ('!”', 1), (')', 6), (',', 121), ('-', 7), ('.', 85), ('.11.', 1), ('.”', 1), ('/', 1), ('003', 1), ('2.5', 1), ('29', 1), ('3', 1), ('374', 1), ('4.5', 1), ('5.9', 1), ('7.000', 1), (':', 5), ('?', 5), ('Bo', 1)]
Number of all types: 1460
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
5,CLASSLA-web.hr.1210319,Forum,Pa gdje ti zivis 😉 Kod mene sunce sija ko teps...,So where do you live 😉 The sun shines with me ...,"CLASSLA-web.hr.1210319', 'domain': 'poslovni.hr'}",Forum,"[▁Pa, ▁gdje, ▁ti, ▁zi, vis, ▁😉, ▁Kod, ▁mene, ▁...","[1342, 15731, 1053, 3546, 1824, 10154, 49717, ..."
6,CLASSLA-web.hr.1232985,Forum,Kada govorimo o dionicama i njihovim budućim v...,When it comes to shares and their future value...,"CLASSLA-web.hr.1232985', 'domain': 'poslovni.hr'}",Forum,"[▁Kada, ▁govori, mo, ▁o, ▁dio, nicama, ▁i, ▁nj...","[30613, 26670, 432, 36, 8314, 127513, 17, 1332..."
13,CLASSLA-web.hr.1532311,Instruction,Opseg glave kod novorođenčeta\nP: Sin je rođen...,Head circumference at the infant P: Son was bo...,"CLASSLA-web.hr.1532311', 'domain': 'roditelji....",Forum,"[▁Op, seg, ▁glave, ▁kod, ▁novo, ro, đen, če, t...","[2331, 25398, 166727, 8645, 5474, 516, 57310, ..."


Number of all tokens: 3811
[('!', 14), ('!!!', 1), ('!!!!!!!!!!!!', 1), ('"', 8), ('",', 1), ('".', 2), ('#', 1), (')', 13), ('),', 1), (').', 3), (',', 82), ('-', 14), ('-2000', 1), ('.', 110), ('...', 22), ('.....', 1), ('.........', 2), ('.05.', 1), ('/', 5), ('/15', 1)]
Number of all types: 1567
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,CLASSLA-web.hr.1203503,Opinion/Argumentation,Redatelj Walter Hill poslužio se scenarijem Ak...,Director Walter Hill used the screenplay of Ak...,"CLASSLA-web.hr.1203503', 'domain': 'filmski.net'}",Prose/Lyrical,"[▁Red, a, telj, ▁Walter, ▁Hill, ▁posluži, o, ▁...","[6096, 11, 55512, 80490, 37639, 192882, 31, 40..."
11,CLASSLA-web.hr.1497985,Prose/Lyrical,"Pa čekali te jadi, Pa raso putem trn, Moj braj...","Well, you waited for you, then Raso through th...","CLASSLA-web.hr.1497985', 'domain': 'poezija.in...",Prose/Lyrical,"[▁Pa, ▁čekal, i, ▁te, ▁jadi, ,, ▁Pa, ▁ras, o, ...","[1342, 209071, 14, 120, 4751, 4, 1342, 6534, 3..."
12,CLASSLA-web.hr.1504684,Prose/Lyrical,"Ujam\nPokraj bučna slapa, umilnu na dolu, Bliz...","The slope next to the noisy waterfall, welcome...","CLASSLA-web.hr.1504684', 'domain': 'poezija.in...",Prose/Lyrical,"[▁U, jam, ▁Po, kraj, ▁bu, čna, ▁slap, a, ,, ▁u...","[345, 5095, 663, 50140, 373, 22638, 57115, 11,..."


Number of all tokens: 2651
[('!', 3), ('!”', 2), ('",', 2), ("'", 3), (',', 117), ('.', 80), ('."', 3), ('...', 2), ('.”', 1), (':', 16), (';', 9), ('?', 2), ('?"', 1), ('?”', 1), ('A', 1), ('AST', 1), ('Amerika', 1), ('Bo', 1), ('C', 1), ('ICE', 1)]
Number of all types: 1250
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
23,CLASSLA-web.hr.2243345,Legal,Upravni odjel za opće poslove i ured gradonače...,The Department of General Affairs and the Offi...,"CLASSLA-web.hr.2243345', 'domain': 'vukovar.hr'}",Legal,"[▁U, prav, ni, ▁odjel, ▁za, ▁opće, ▁poslov, e,...","[345, 7707, 93, 145345, 80, 177653, 69271, 13,..."
36,CLASSLA-web.hr.2837762,Legal,Postoji osnovana sumnja da je 29-godišnjak oko...,There is a reasonable suspicion that the 29-ye...,"CLASSLA-web.hr.2837762', 'domain': 'dorh.hr'}",Legal,"[▁Post, oji, ▁osnova, na, ▁sum, nja, ▁da, ▁je,...","[2795, 18689, 106183, 76, 10554, 1648, 48, 55,..."
37,CLASSLA-web.hr.2846068,Legal,Predlaže se donošenje Preporuke HZJZ-a za rad ...,It is proposed to adopt the CIJZ recommendatio...,"CLASSLA-web.hr.2846068', 'domain': 'hrsk.hr'}",Legal,"[▁Pred, la, že, ▁se, ▁do, nošenje, ▁Pre, poru,...","[7145, 143, 1948, 40, 54, 171427, 1914, 30484,..."


Number of all tokens: 3490
[('"', 1), (')', 6), (').', 6), (',', 85), ('-', 27), ('-19', 5), ('-2', 4), ('-8', 2), ('.', 126), ('...', 2), ('.05.', 1), ('/', 9), ('/1', 2), ('/16', 1), ('/3', 2), ('002', 1), ('02.', 1), ('03', 1), ('05', 1), ('07', 1)]
Number of all types: 1303
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.hr.1033815,Promotion,"O proizvodu\nColor Transformer, za pametno i j...","About the Color Transformer product, for smart...","CLASSLA-web.hr.1033815', 'domain': 'hairshop.hr'}",Promotion,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, former, ,, ...","[180, 43170, 34, 51193, 11062, 82772, 4, 80, 6..."
2,CLASSLA-web.hr.1119579,Promotion,Sunčano selo / Sunny village\nNa obroncima Bil...,Sunshine / Sunny Village on the slopes of Bilo...,"CLASSLA-web.hr.1119579', 'domain': 'vikendi.com'}",Promotion,"[▁Sun, čan, o, ▁se, lo, ▁/, ▁Sunny, ▁village, ...","[7550, 17129, 31, 40, 365, 248, 151197, 54427,..."
8,CLASSLA-web.hr.1401439,Promotion,Popust s GOLDEN TOWER karticom za vožnje TAXI ...,The Golden Tower Taxi Discount Taxi Camme is n...,"CLASSLA-web.hr.1401439', 'domain': 'katalozi.n...",Promotion,"[▁Po, pust, ▁s, ▁GOLD, EN, ▁TO, WER, ▁karti, c...","[663, 71036, 91, 213865, 11572, 9926, 91227, 1..."


Number of all tokens: 4202
[('!', 8), ('!!', 1), ('!!!', 2), ("'", 2), (')', 2), (').', 3), ('*', 1), (',', 138), ('-', 9), ('-5', 1), ('.', 119), ('.....', 1), ('.11.', 1), ('.11.20', 1), ('/', 3), ('/4', 1), ('12', 1), ('15', 2), ('165', 1), ('18', 1)]
Number of all types: 1778
Processing sl


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."


Processing Information/Explanation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
9,CLASSLA-web.sl.1461565,Information/Explanation,Dvorec je morda naslednik enega od številnih v...,The mansion may be the successor to one of the...,"CLASSLA-web.sl.1461565', 'domain': 'gradovi.net'}",Information/Explanation,"[▁D, vor, ec, ▁je, ▁morda, ▁na, sled, nik, ▁en...","[391, 3104, 3240, 55, 72573, 24, 16895, 1881, ..."
17,CLASSLA-web.sl.1545419,Information/Explanation,Opis\nBazični prašek z aronijo iz dobro vpojni...,Description Basic powder with aronia from good...,"CLASSLA-web.sl.1545419', 'domain': 'vitalabo.si'}",Information/Explanation,"[▁Opis, ▁Baz, ični, ▁praš, ek, ▁z, ▁a, roni, j...","[83518, 36490, 44415, 46678, 343, 97, 10, 3492..."
22,CLASSLA-web.sl.181515,Information/Explanation,"Mavrica na tri plamene, Bog nas živi vse Slove...","Rainbow on three flames, God lives us all Slov...","CLASSLA-web.sl.181515', 'domain': 'buca.si'}",Information/Explanation,"[▁Ma, v, rica, ▁na, ▁tri, ▁pla, mene, ,, ▁Bog,...","[911, 334, 12056, 24, 1927, 3412, 13616, 4, 23..."


Number of all tokens: 3446
[('".', 1), ("'", 2), (')', 13), (')(', 1), (').', 3), (',', 140), ('-', 1), ('.', 82), ('...', 1), ('.4.', 1), ('150', 1), ('2.', 1), ('2015', 1), ('23', 1), ('25', 1), ('30', 1), ('36', 1), ('43', 1), ('46', 1), ('498', 1)]
Number of all types: 1444
Processing News


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
10,CLASSLA-web.sl.1462585,News,"Barbie in Ken\nBarbie in Ken\nFebruary 14, 200...","Barbie and Ken Barbie and Ken February 14, 200...","CLASSLA-web.sl.1462585', 'domain': 'ujusansa.si'}",News,"[▁Barbie, ▁in, ▁Ken, ▁Barbie, ▁in, ▁Ken, ▁Febr...","[85805, 23, 19825, 85805, 23, 19825, 22482, 61..."
29,CLASSLA-web.sl.2077617,News,Poslanec SD Jan Škoberne in strokovni sodelave...,SD MP Jan Škoberne and PS SD expert Gorazd Pra...,"CLASSLA-web.sl.2077617', 'domain': 'socialnide...",News,"[▁Poslan, ec, ▁SD, ▁Jan, ▁Š, ko, ber, ne, ▁in,...","[158165, 3240, 21864, 3342, 3608, 265, 1297, 8..."


Number of all tokens: 3452
[('!', 1), ('"', 3), ('".', 1), ("'", 1), (')', 5), ('),', 2), (').', 2), (',', 148), (',«', 2), ('-', 13), ('-09-', 2), ('-19', 1), ('-300', 1), ('.', 93), ('."', 2), ('...', 2), ('/', 6), ('/03/', 1), ('/09/', 1), ('/2016', 1)]
Number of all types: 1496
Processing Instruction


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."
18,CLASSLA-web.sl.1568350,Instruction,"Bodite pozorni na skrite znake, ki lahko pomen...",Pay attention to hidden signs that can mean he...,"CLASSLA-web.sl.1568350', 'domain': 'mojaleta.si'}",Instruction,"[▁Bodi, te, ▁pozor, ni, ▁na, ▁skrit, e, ▁znak,...","[89731, 67, 30710, 93, 24, 167422, 13, 40652, ..."
20,CLASSLA-web.sl.160891,Instruction,Pravica do odstopa od pogodbe - vračilo blaga\...,The right to withdraw from the contract - the ...,"CLASSLA-web.sl.160891', 'domain': 'chic.si'}",Instruction,"[▁Pravi, ca, ▁do, ▁odsto, pa, ▁od, ▁pogodbe, ▁...","[71495, 408, 54, 102109, 763, 229, 118941, 20,..."


Number of all tokens: 3395
[('!', 4), ('!!', 1), (')', 7), ('),', 10), (').', 3), ('**', 1), (',', 142), ('-', 2), ('.', 126), ('.03.2016', 1), ('/', 5), ('0,5', 1), ('1%', 2), ('1,5', 1), ('1,8', 1), ('2,5', 1), ('20', 1), ('98', 1), (':', 11), ('?', 5)]
Number of all types: 1306
Processing Opinion/Argumentation


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
3,CLASSLA-web.sl.1230602,Opinion/Argumentation,Danes\nJutri\nPojutrišnjem\nSo res pojedli 200...,Today tomorrow they really ate 200 kg of meats...,"CLASSLA-web.sl.1230602', 'domain': 'kr.trma.si'}",Opinion/Argumentation,"[▁Danes, ▁Ju, tri, ▁Po, ju, tri, šnje, m, ▁So,...","[85405, 3314, 3996, 663, 461, 3996, 69207, 39,..."
21,CLASSLA-web.sl.1794910,Opinion/Argumentation,Zeliščni vrt s čmrlji\nOb burji in mrazu prveg...,The herb garden with bumblebees along the stor...,"CLASSLA-web.sl.1794910', 'domain': 'dobra-pot....",Opinion/Argumentation,"[▁Ze, liš, čni, ▁vrt, ▁s, ▁č, mr, lji, ▁Ob, ▁b...","[3786, 68046, 12771, 23249, 91, 7658, 76492, 3..."
24,CLASSLA-web.sl.1829464,Opinion/Argumentation,Karmen Stavec z groteskno napolnjenimi ustnica...,"Karmen Stavec, with grotesquely filled lips Ka...","CLASSLA-web.sl.1829464', 'domain': 'lifestyle....",Opinion/Argumentation,"[▁Kar, men, ▁Sta, vec, ▁z, ▁grote, sk, no, ▁na...","[3423, 1055, 6512, 35259, 97, 14395, 1042, 157..."


Number of all tokens: 2471
[('!', 6), ('!"', 1), ('!),', 1), ('"...', 1), (')', 1), ('),', 1), (').', 1), (',', 123), ('-', 2), ('.', 68), ('..!', 1), ('...', 3), ('...”', 1), ('.”', 1), (':', 6), (';', 1), ('?', 13), ('?!', 2), ('BOL', 1), ('CI', 1)]
Number of all types: 1128
Processing Forum


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
13,CLASSLA-web.sl.1477606,Forum,"Biarritz, junij 1997\nBiarritz, junij 1997\nKJ...","Biarritz, June 1997 Biarritz, June 1997 Where?...","CLASSLA-web.sl.1477606', 'domain': 'ujusansa.si'}",Forum,"[▁Biar, ritz, ,, ▁juni, j, ▁1997, ▁Biar, ritz,...","[125705, 135427, 4, 13645, 170, 10586, 125705,..."
28,CLASSLA-web.sl.2051599,Forum,"Ja, Kelly se pa spomnim iz razstave, res je pr...","Yes, Kelly remembers from the exhibition, it's...","CLASSLA-web.sl.2051599', 'domain': 'maltezan.m...",Forum,"[▁Ja, ,, ▁Kelly, ▁se, ▁pa, ▁spomni, m, ▁iz, ▁r...","[823, 4, 95062, 40, 249, 174764, 39, 445, 1017..."
32,CLASSLA-web.sl.2133726,Forum,"Zdravo, prosila bi za pomoč pri nalogi iz niha...","Hi, I would ask for help with the oscillation ...","CLASSLA-web.sl.2133726', 'domain': 'forum.kvar...",Forum,"[▁Zdrav, o, ,, ▁prosi, la, ▁bi, ▁za, ▁pomoč, ▁...","[33934, 31, 4, 97032, 143, 333, 80, 38452, 494..."


Number of all tokens: 3970
[('!', 10), ('"', 5), ('&', 8), ('(', 2), (')', 8), ('),', 4), (').', 10), ('*', 1), ('*******', 1), ('+', 1), (',', 144), ('-', 5), ('-6', 1), ('.', 127), ('...', 18), ('...).', 1), ('.....', 2), ('.03.2015', 5), ('.11.20', 1), ('/', 10)]
Number of all types: 1519
Processing Prose/Lyrical


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
11,CLASSLA-web.sl.1463952,Prose/Lyrical,Ti ljubezen si\nIzvajalec: Werner Izvajalec: M...,You love you: Werner performer: Maja Blagdan A...,"CLASSLA-web.sl.1463952', 'domain': 'besedilo.si'}",Prose/Lyrical,"[▁Ti, ▁ljubezen, ▁si, ▁Iz, vaja, lec, :, ▁Wern...","[2371, 147642, 78, 3519, 38899, 8996, 12, 1727..."
12,CLASSLA-web.sl.1475509,Prose/Lyrical,"Ribič\nNad mestom se dan budi, galebi kričijo,...","The fisherman above the city is awakening, the...","CLASSLA-web.sl.1475509', 'domain': 'besedilo.si'}",Prose/Lyrical,"[▁Rib, ič, ▁Nad, ▁mesto, m, ▁se, ▁dan, ▁budi, ...","[59867, 29507, 31252, 22041, 39, 40, 123, 8946..."
14,CLASSLA-web.sl.1494196,Prose/Lyrical,Kdor je v srcu mlad\nSvet vrti se vedno v isto...,Anyone who is in the heart of the young world ...,"CLASSLA-web.sl.1494196', 'domain': 'besedilo.si'}",Prose/Lyrical,"[▁K, dor, ▁je, ▁v, ▁srcu, ▁mlad, ▁Svet, ▁vrt, ...","[341, 1846, 55, 81, 188148, 69092, 40562, 2324..."


Number of all tokens: 2909
[('!', 7), ("''", 1), (')', 2), ('*', 2), ('+', 31), (',', 187), ('.', 93), ('...', 1), ('.«', 1), ('2)', 1), ('25', 1), (':', 8), (':1', 1), (';', 7), ('?', 5), ('Kom', 1), ('M', 1), ('Ma', 1), ('Prav', 1), ('Pri', 1)]
Number of all types: 1131
Processing Legal


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
6,CLASSLA-web.sl.1320205,Legal,JEDRO: Verjetnost obstoja terjatve se presoja ...,Core: The likelihood of the existence of a cla...,"CLASSLA-web.sl.1320205', 'domain': 'sodisce.si'}",Legal,"[▁JE, DRO, :, ▁Ver, jet, nost, ▁obstoj, a, ▁te...","[17754, 122359, 12, 1446, 4249, 3110, 147265, ..."
7,CLASSLA-web.sl.1341965,Legal,JEDRO: V predlogih predstavljene okoliščine na...,Core: In proposals presented circumstances of ...,"CLASSLA-web.sl.1341965', 'domain': 'sodisce.si'}",Legal,"[▁JE, DRO, :, ▁V, ▁predlog, ih, ▁predstavljen,...","[17754, 122359, 12, 310, 68615, 1043, 128967, ..."
31,CLASSLA-web.sl.2114460,Legal,75SUB-EPPO19\nDatum objave:\nUpravičene osebe ...,75SUB-EPPO19 Date of publication: The eligible...,"CLASSLA-web.sl.2114460', 'domain': 'energetika...",Legal,"[▁75, S, UB, -, EP, PO, 19, ▁Datum, ▁objave, :...","[4948, 294, 32338, 9, 21290, 9698, 2947, 76887..."


Number of all tokens: 3446
[('"', 3), ('(1)', 1), (')', 7), ('),', 2), (').', 3), (',', 143), ('-', 7), ('---', 1), ('-1', 1), ('-1)', 1), ('-16', 1), ('.', 106), ('...).', 1), ('.11.', 3), ('.4.', 4), ('.5.', 1), ('/', 5), ('/12', 2), ('/2002', 2), ('/2003', 2)]
Number of all types: 1116
Processing Promotion


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
4,CLASSLA-web.sl.1258381,Promotion,"Umivalniki, ki jih nudimo v naši spletni proda...",The washbasins we offer in our online store ca...,"CLASSLA-web.sl.1258381', 'domain': 'emundia.si'}",Promotion,"[▁Um, ival, niki, ,, ▁ki, ▁jih, ▁nudi, mo, ▁v,...","[2793, 47898, 10926, 4, 200, 5008, 39488, 432,..."
5,CLASSLA-web.sl.1307286,Promotion,Commodore C64\nCommodore USA je začel s prodaj...,The Commodore C64 Commodore USA started sellin...,"CLASSLA-web.sl.1307286', 'domain': 'monitor.si'}",Promotion,"[▁Com, modo, re, ▁C, 64, ▁Com, modo, re, ▁USA,...","[3288, 74445, 107, 313, 13307, 3288, 74445, 10..."
8,CLASSLA-web.sl.1416741,Promotion,Po lepi modri Donavi z Ladjo. Čari starih evro...,After a beautiful blue Danube with a boat. Cha...,"CLASSLA-web.sl.1416741', 'domain': 'mmturist.si'}",Promotion,"[▁Po, ▁lepi, ▁mod, ri, ▁Dona, vi, ▁z, ▁Lad, jo...","[663, 110722, 2811, 416, 100127, 686, 97, 6433..."


Number of all tokens: 3203
[('!', 2), (')', 2), ('),', 2), (').', 1), (',', 127), ('-', 2), ('-20', 1), ('.', 110), ('...', 1), ('/', 1), ('150', 1), ('2', 2), ('64', 7), ('895', 1), ('95', 1), (':', 11), ('A', 2), ('AJ', 1), ('BL', 2), ('C', 3)]
Number of all types: 1371


In [50]:
pd.DataFrame(lang_results["mt"])

Unnamed: 0,token_count_dict,token_count,type_count
Information/Explanation,"{'""': 9, '"",': 6, ''': 68, ')': 3, '),': 2, ')...",6656,1446
News,"{'""': 3, '"",': 1, ''': 159, ')': 3, ',': 126, ...",6978,1614
Instruction,"{'!': 2, '""': 3, '"",': 1, '"".': 2, ''': 81, ')...",9288,1585
Opinion/Argumentation,"{'!': 4, '""': 1, '"",': 1, ''': 56, ')': 3, '),...",4096,1109
Forum,"{''': 10, ',': 11, '-': 48, '-21': 1, '.': 9, ...",512,240
Prose/Lyrical,"{''': 2, ')': 4, ',': 10, '-': 25, '-24': 1, '...",512,241
Legal,"{'#': 20, ''': 43, '(': 3, '(1)': 4, '(2)': 4,...",5120,1439
Promotion,"{'!': 3, '""': 18, '"",': 2, '"".': 4, ''': 43, '...",5878,1361


### 

In [39]:
# Open label-based train token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count.json", "r") as train_label_count_file:
	label_token_count_train = json.load(train_label_count_file)

label_token_count_train.keys()

dict_keys(['Information/Explanation', 'News', 'Instruction', 'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal', 'Promotion'])

In [40]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

list(main_dict["sl"]["token_overlap"]["token_count"].items())[:10]

[('!', 30),
 ('!!', 1),
 ('!"', 1),
 ('!),', 1),
 ('"', 11),
 ('".', 2),
 ('"...', 1),
 ('&', 8),
 ("'", 3),
 ("''", 1)]

In [None]:
cosine_sim = {}
vector_size = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")
	# Get token count for current lang
	current_lang_count = main_dict[lang]["token_overlap"]["token_count"]

	print(f"Number of token types for {lang}: {len(list(current_lang_count.keys()))}")

	# For each test set, create a vector of token counts. Take only tokens that are present either in train_df or test set.
	intersection_dict = {}

	# First, create a list of tokens that are present in either one or the other list
	intersection_keys = []
	intersection_keys.extend(list(current_lang_count.keys()))
	intersection_keys.extend(list(train_count.keys()))
	# Remove duplicated keys
	intersection_keys = list(set(intersection_keys))
	print(f"Number of intersecting types: {len(intersection_keys)}")

	# Then create a dictionary for 1) train df and 2) test df with counts of token types that occur in either of the datasets
	train_intersect_dict = {}
	test_intersect_dict = {}

	for i in intersection_keys:
		try:
			train_intersect_dict[i] = train_count[i]
		except:
			train_intersect_dict[i] = 0
		try:
			test_intersect_dict[i] = current_lang_count[i]
		except:
			test_intersect_dict[i] = 0


	# Create a df with intersecting keys
	intersect_df = pd.DataFrame({"train": train_intersect_dict, "test": test_intersect_dict})

	display(intersect_df.head(10))

	# Calculate cosine similarity
	current_cosine_sim = cosine_similarity(np.array(intersect_df["train"].to_list()), np.array(intersect_df["test"].to_list()))

	print(f"Cosine similarity for {lang}: {current_cosine_sim}")

	# Add to the main dictionary
	main_dict[lang]["token_overlap"]["cosine_similarity"] = current_cosine_sim
	main_dict[lang]["token_overlap"]["intersection_df"] = intersect_df.to_dict()
	main_dict[lang]["token_overlap"]["intersection_vector_size"] = len(intersection_keys)

	# Add to a dict of results
	cosine_sim[lang] = current_cosine_sim
	vector_size[lang] = len(intersection_keys)

Processing mt
Number of token types for mt: 4787
Number of intersecting types: 28226


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,1
▁Ż,0,7
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for mt: 0.4142476865826809
Processing el
Number of token types for el: 4751
Number of intersecting types: 30954


Unnamed: 0,train,test
▁Απριλίου,0,3
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
έλθει,0,1
▁Bran,9,0
NJE,3,0


Cosine similarity for el: 0.5273207128330722
Processing tr
Number of token types for tr: 6272
Number of intersecting types: 30846


Unnamed: 0,train,test
▁boot,11,1
▁Trend,2,0
set,47,0
▁Neden,0,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for tr: 0.5938472232210807
Processing sq
Number of token types for sq: 4891
Number of intersecting types: 29168


Unnamed: 0,train,test
▁boot,11,0
▁pavarur,0,1
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for sq: 0.43448915477115685
Processing is
Number of token types for is: 4615
Number of intersecting types: 29518


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0
TION,18,0


Cosine similarity for is: 0.5251354659540077
Processing uk
Number of token types for uk: 6507
Number of intersecting types: 33121


Unnamed: 0,train,test
та,0,33
▁boot,11,0
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
фи,0,1
NJE,3,0


Cosine similarity for uk: 0.5846121788244142
Processing ca
Number of token types for ca: 5314
Number of intersecting types: 29443


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,1
TION,18,0


Cosine similarity for ca: 0.5254392489246424
Processing mk
Number of token types for mk: 5468
Number of intersecting types: 31837


Unnamed: 0,train,test
та,0,206
▁boot,11,0
▁Јан,0,1
ензи,0,3
▁Trend,2,0
сметаат,0,5
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0


Cosine similarity for mk: 0.4225320009293607
Processing hr
Number of token types for hr: 6222
Number of intersecting types: 28864


Unnamed: 0,train,test
▁boot,11,0
▁Essen,0,2
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for hr: 0.5662098277228971
Processing sl
Number of token types for sl: 5763
Number of intersecting types: 27507


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,1
▁primordial,1,0
TION,18,0


Cosine similarity for sl: 0.6334513674796362


In [None]:
# Show results
cosine_sim_df = pd.DataFrame({"cosine_similarity": cosine_sim, "vector_size": vector_size}).sort_values(by="cosine_similarity", ascending=False)
print(cosine_sim_df.to_markdown())

|    |   cosine_similarity |   vector_size |
|:---|--------------------:|--------------:|
| sl |            0.633451 |         27507 |
| tr |            0.593847 |         30846 |
| uk |            0.584612 |         33121 |
| hr |            0.56621  |         28864 |
| el |            0.527321 |         30954 |
| ca |            0.525439 |         29443 |
| is |            0.525135 |         29518 |
| sq |            0.434489 |         29168 |
| mk |            0.422532 |         31837 |
| mt |            0.414248 |         28226 |


In [None]:
main_dict["sl"]["token_overlap"].keys()

dict_keys(['overlap_percentage', 'token_list', 'overlap_token_list', 'token_count', 'cosine_similarity', 'intersection_df', 'intersection_vector_size'])

In [None]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)