In [1]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=6

import pandas as pd
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
import re

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=6


  from .autonotebook import tqdm as notebook_tqdm


## Tokenize and count tokens for train_df

Code for tokenization (it is now already done):

In [18]:
# Import the train dataset
train = load_dataset("TajaKuzman/X-GENRE-multilingual-text-genre-dataset", "train")

# To open as Pandas DataFrame:
train_df = pd.DataFrame(train["train"])

display(train_df.head(2))

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the train dataset
tokens_train = []
integers_train = []
token_list_train = []

for text in tqdm(train_df.text.to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens_train, except the beginning (<s>) and end (</s>) token
	current_tokens_train = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	tokens_train.append(current_tokens_train)
	token_list_train.extend(current_tokens_train)
	integers_train.append(encoded_text.input_ids[1:-1])

train_df["tokens_train"] = tokens_train
train_df["token_ids"] = integers_train

#print(token_list_train[:10])
#print(len(token_list_train))

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

train_df.head(3)

# Save the tokenized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English


  0%|          | 0/1772 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1810 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1772/1772 [00:09<00:00, 189.74it/s]


699465


Code to count tokens:

In [6]:
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51..."


In [7]:
# Create a dictionary that counts all the token occurrences

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

# Create a dictionary which counts the occurrences of the words

word_dict_train = Counter(train_tokens_shortened)

# Sort the dictionary alphabetically based on keys
word_dict_train = dict(sorted(word_dict_train.items()))

print(list(word_dict_train.items())[:100])
print(len(word_dict_train))

699465
[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1), ('!!!!!!', 1), ('!!!!!!!', 1), ('!"', 14), ('!)', 10), ('!),', 1), ('!).', 2), ('"', 528), ('")', 7), ('"),', 6), ('").', 7), ('",', 56), ('".', 83), ('"...', 3), ('";', 3), ('"?', 6), ('#', 4), ('$', 8), ('%', 4), ('%)', 1), ('&', 46), ("'", 4517), ('(', 31), ('(1', 1), (')', 788), ('),', 242), (').', 297), ('):', 13), (');', 5), ('*', 19), ('**', 2), ('****', 1), ('+', 15), ('+5', 1), (',', 23447), (',«', 12), ('-', 2803), ('---', 3), ('------', 5), ('----------------', 41), ('-0', 3), ('-01', 1), ('-02', 2), ('-02-', 2), ('-03-', 4), ('-06', 1), ('-06-', 3), ('-09-', 1), ('-1', 15), ('-1)', 3), ('-10', 2), ('-10-', 1), ('-11', 6), ('-11-', 1), ('-12', 8), ('-13', 6), ('-14', 3), ('-15', 7), ('-16', 9), ('-17', 3), ('-18', 9), ('-19', 5), ('-2', 10), ('-20', 3), ('-2000', 3), ('-2005', 1), ('-2007', 8), ('-2009', 1), ('-2010', 1), ('-2011', 1), ('-2012', 1), ('-2014', 1), ('-2020', 1), ('-21', 4), ('-22', 3), ('-

In [6]:
len(set(train_tokens_shortened))

27025

The train dataset has 699.465 tokens and 27.025 unique words.

In [7]:
# See the most frequent tokens:

# Sort the dictionary by values (word counts) in descending order
sorted(word_dict_train.items(), key=lambda x: x[1], reverse=True)[:10]


[(',', 23447),
 ('.', 21407),
 ('▁', 19553),
 ('▁the', 18860),
 ('s', 14184),
 ('▁to', 10762),
 ('▁of', 9912),
 ('▁and', 9752),
 ('▁in', 9140),
 ('▁a', 8341)]

In [14]:
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "w") as train_count_file:
	json.dump(word_dict_train, train_count_file)

# Tokenize and count tokens for test sets & calculate percentage overlap

Code with which I tokenized the datasets:

In [4]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [None]:
# Code with which the test sets were tokenized

# Define an array of token overlap
token_overlap_results = {}

# Loop through the datasets and calculate token overlap
for lang in list(main_dict.keys()):
	df = pd.DataFrame(main_dict[lang]["dataset"])

	tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

	tokens = []
	integers = []
	token_list = []

	print("Tokenizing text.")

	for text in tqdm(df.text.to_list()):
		encoded_text = tokenizer(text)
		# Take all tokens, except the beginning (<s>) and end (</s>) token
		current_tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
		# Shorten the list to 512, as tokens after that were not observed by the classifier
		current_tokens = current_tokens[:512]
		tokens.append(current_tokens)
		token_list.extend(current_tokens)
		integers.append(encoded_text.input_ids[1:-1][:512])

	df["tokens"] = tokens
	df["token_ids"] = integers

	print(token_list[:10])
	print("All tokens:")
	print(len(token_list))

	# See how many tokens overlap
	overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_token_list = []

	# We calculate overlap by dividing the number of tokens that are shared in train and test set by the number of all tokens in both sets
	#overlap_counter = sum(1 for element in token_list if element in train_tokens_shortened)
	for token in tqdm(token_list):
		if token in train_tokens_shortened:
			# Add to the overlap counter number of all times the token appears in both datasets
			overlap_counter += 1
			overlap_token_list.append(token)


	# Out of all tokens in test set, how many overlap?
	overlap_per = overlap_counter/len(token_list)

	print(f"Number of tokens that overlap: {overlap_counter}")
	print(f"Percentage of overlap: {overlap_per}")

	# Update the dataset in the dictionary
	main_dict[lang]["dataset"] = df.to_dict()

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"] = {"overlap_percentage":overlap_per, "token_list": token_list, "overlap_token_list":overlap_token_list}

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list": overlap_token_list}

	# Convert tokens back to words
	#print(tokenizer.convert_tokens_to_string(tokens))

# Inspect the results
overlap_df = pd.DataFrame(token_overlap_results)

In [8]:
# Improved code to calculate token overlap - we can simply calculate which tokens do not overlap - meaning that they occur only in one of the dataset, and calculate the percentage overlap based on that
token_overlap_results = {}

# Loop through the datasets and calculate token overlap
for lang in list(main_dict.keys()):
	token_list = main_dict[lang]["token_overlap"]['token_list']
	print(token_list[:10])
	print("All tokens:")
	print(len(token_list))

	# See how many tokens do not overlap
	no_overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_token_list = []

	# We calculate overlap by counting how many tokens do not appear in both sets
	for token in tqdm(token_list):
		if token not in train_tokens_shortened:
			no_overlap_counter += 1
		else:
			overlap_token_list.append(token)

	# Out of all tokens in test set, how many do not overlap with train set?
	no_overlap_per = no_overlap_counter/len(token_list)

	# Calculate percentage of overlap based on that
	overlap_per = 1-no_overlap_per

	print(f"Number of tokens that overlap: {len(overlap_token_list)}")
	print(f"Number of different tokens that overlap: {len(set(overlap_token_list))}")
	print(f"Percentage of overlap: {overlap_per}")

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"]["overlap_percentage"] = overlap_per
	main_dict[lang]["token_overlap"]["overlap_token_list"] = overlap_token_list

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list_size": len(overlap_token_list), "overlap_set_size": len(set(overlap_token_list))}


# Inspect the results
overlap_df = pd.DataFrame(token_overlap_results)

['▁Angel', 'o', '▁Che', 't', 'cuti', ',', '▁se', '▁j', 'kun', '▁qe']
All tokens:
39040
Calculating overlap.


100%|██████████| 39040/39040 [01:28<00:00, 439.17it/s] 


Number of tokens that overlap: 31899
Number of different tokens that overlap: 3586
Percentage of overlap: 0.8170850409836066
['▁Ενημέρωση', '▁του', '▁Pegasus', '▁Esti', 'asi', '▁με', '▁τις', '▁εισ', 'ερ', 'χ']
All tokens:
31240
Calculating overlap.


100%|██████████| 31240/31240 [03:41<00:00, 141.06it/s]


Number of tokens that overlap: 5043
Number of different tokens that overlap: 822
Percentage of overlap: 0.1614276568501921
['▁A', 'Ö', 'L', '▁Der', 's', '▁Seçim', 'i', '▁ve', '▁Sınav', '▁Giriş']
All tokens:
29578
Calculating overlap.


100%|██████████| 29578/29578 [02:07<00:00, 231.09it/s]


Number of tokens that overlap: 15425
Number of different tokens that overlap: 2451
Percentage of overlap: 0.5215024680505782
['▁Blog', '▁“', 'U', 'në', '▁të', '▁kam', '▁dashur', '▁me', '▁një', '▁dashuri']
All tokens:
26769
Calculating overlap.


100%|██████████| 26769/26769 [01:50<00:00, 242.39it/s]


Number of tokens that overlap: 16216
Number of different tokens that overlap: 2748
Percentage of overlap: 0.605775337143711
['▁[', 'is', ']', '▁Því', '▁er', '▁við', '▁hæ', 'fi', '▁að', '▁reg']
All tokens:
29644
Calculating overlap.


100%|██████████| 29644/29644 [02:19<00:00, 212.04it/s]


Number of tokens that overlap: 15343
Number of different tokens that overlap: 2122
Percentage of overlap: 0.5175752260153825
['▁Не', 'стандарт', 'ний', '▁підхід', '▁для', '▁виготовлення', '▁Ак', 'вар', 'і', 'у']
All tokens:
31677
Calculating overlap.


100%|██████████| 31677/31677 [03:42<00:00, 142.62it/s]


Number of tokens that overlap: 4963
Number of different tokens that overlap: 411
Percentage of overlap: 0.15667519020109233
['▁P', 'à', 'gine', 's', '▁En', 'fei', 'nada', '▁Porto', '▁uns', '▁dies']
All tokens:
27544
Calculating overlap.


100%|██████████| 27544/27544 [01:12<00:00, 378.13it/s]


Number of tokens that overlap: 20517
Number of different tokens that overlap: 2896
Percentage of overlap: 0.7448809178042405
['▁Ек', 'шу', 'ли', ',', '▁T', 'CL', '▁ги', '▁прави', '▁смартфон', 'овите']
All tokens:
27639
Calculating overlap.


100%|██████████| 27639/27639 [03:22<00:00, 136.20it/s]


Number of tokens that overlap: 4035
Number of different tokens that overlap: 656
Percentage of overlap: 0.14598936285683273
['▁O', '▁proizvod', 'u', '▁Color', '▁Trans', 'former', ',', '▁za', '▁pamet', 'no']
All tokens:
26546
Calculating overlap.


100%|██████████| 26546/26546 [01:08<00:00, 389.67it/s]


Number of tokens that overlap: 21808
Number of different tokens that overlap: 4383
Percentage of overlap: 0.8215173660815189
['▁Kita', 'jsko', '▁mesto', '▁duhov', '▁V', '▁Notranj', 'i', '▁Mongol', 'iji', '▁raste']
All tokens:
26292
Calculating overlap.


100%|██████████| 26292/26292 [00:49<00:00, 527.18it/s]

Number of tokens that overlap: 25616
Number of different tokens that overlap: 5281
Percentage of overlap: 0.9742887570363609





In [9]:
overlap_df = pd.DataFrame(token_overlap_results).transpose()
overlap_df

Unnamed: 0,percentage,overlap_list_size,overlap_set_size
mt,0.817085,31899.0,3586.0
el,0.161428,5043.0,822.0
tr,0.521502,15425.0,2451.0
sq,0.605775,16216.0,2748.0
is,0.517575,15343.0,2122.0
uk,0.156675,4963.0,411.0
ca,0.744881,20517.0,2896.0
mk,0.145989,4035.0,656.0
hr,0.821517,21808.0,4383.0
sl,0.974289,25616.0,5281.0


In [10]:
overlap_df.sort_values(by="percentage", ascending=False)

Unnamed: 0,percentage,overlap_list_size,overlap_set_size
sl,0.974289,25616.0,5281.0
hr,0.821517,21808.0,4383.0
mt,0.817085,31899.0,3586.0
ca,0.744881,20517.0,2896.0
sq,0.605775,16216.0,2748.0
tr,0.521502,15425.0,2451.0
is,0.517575,15343.0,2122.0
el,0.161428,5043.0,822.0
uk,0.156675,4963.0,411.0
mk,0.145989,4035.0,656.0


In [11]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

Add token counts information

In [23]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [48]:
token_number = {}
type_number = {}

for lang in list(main_dict.keys()):
	#print(f"Creating token dict for {lang}")
	current_token_list = main_dict[lang]["token_overlap"]["token_list"]

	#print(f"No of tokens: {len(current_token_list)}")

	# Create a dictionary which counts the occurrences of the tokens

	#word_dict_test = Counter(current_token_list)

	# Sort the dictionary alphabetically based on keys
	#word_dict_test = dict(sorted(word_dict_test.items()))

	# Add information on no. of tokens and words to a dict
	token_number[lang] = len(current_token_list)
	#type_number[lang] = len(word_dict_test)
	type_number[lang] = len(set(current_token_list))

	#print(list(word_dict_test.items())[:100])
	#print(f"No of unique tokens: {len(word_dict_test)}")

	# Add the count of tokens to the main dictionary
	#main_dict[lang]["token_overlap"]["token_count"] = word_dict_test

# Create a dataframe for statistics
token_df = pd.DataFrame({"tokens": token_number, "types": type_number})
print(token_df.to_markdown())

|    |   tokens |   types |
|:---|---------:|--------:|
| mt |    39040 |    4787 |
| el |    31240 |    4751 |
| tr |    29578 |    6272 |
| sq |    26769 |    4891 |
| is |    29644 |    4615 |
| uk |    31677 |    6507 |
| ca |    27544 |    5314 |
| mk |    27639 |    5468 |
| hr |    26546 |    6222 |
| sl |    26292 |    5763 |


In [59]:
# Merge token_df and overlap_df
merged = pd.concat([overlap_df, token_df], axis=1)
merged

# Reorder columns

merged = merged[["percentage", "tokens", "overlap_list_size", "types","overlap_set_size"]]

# Rename columns

merged.columns = ["overlap_percentage", "all_tokens", "overlapping_tokens", "all_types", "overlapping_types"]

merged

Unnamed: 0,overlap_percentage,all_tokens,overlapping_tokens,all_types,overlapping_types
mt,0.817085,39040,31899.0,4787,3586.0
el,0.161428,31240,5043.0,4751,822.0
tr,0.521502,29578,15425.0,6272,2451.0
sq,0.605775,26769,16216.0,4891,2748.0
is,0.517575,29644,15343.0,4615,2122.0
uk,0.156675,31677,4963.0,6507,411.0
ca,0.744881,27544,20517.0,5314,2896.0
mk,0.145989,27639,4035.0,5468,656.0
hr,0.821517,26546,21808.0,6222,4383.0
sl,0.974289,26292,25616.0,5763,5281.0


In [61]:
print(merged.to_markdown())

|    |   overlap_percentage |   all_tokens |   overlapping_tokens |   all_types |   overlapping_types |
|:---|---------------------:|-------------:|---------------------:|------------:|--------------------:|
| mt |             0.817085 |        39040 |                31899 |        4787 |                3586 |
| el |             0.161428 |        31240 |                 5043 |        4751 |                 822 |
| tr |             0.521502 |        29578 |                15425 |        6272 |                2451 |
| sq |             0.605775 |        26769 |                16216 |        4891 |                2748 |
| is |             0.517575 |        29644 |                15343 |        4615 |                2122 |
| uk |             0.156675 |        31677 |                 4963 |        6507 |                 411 |
| ca |             0.744881 |        27544 |                20517 |        5314 |                2896 |
| mk |             0.145989 |        27639 |                 403

In [35]:
# Let's see the most frequent tokens
most_frequent = {}
# Sort the dictionary by values (word counts) in descending order
for lang in list(main_dict.keys()):
	most_frequent[lang] = (sorted(main_dict[lang]['token_overlap']['token_count'].items(), key=lambda x: x[1], reverse=True)[:10])

print(pd.DataFrame({"most_frequent_type": most_frequent}).to_markdown())

|    | most_frequent_type                                                                                                                       |
|:---|:-----------------------------------------------------------------------------------------------------------------------------------------|
| ca | [(',', 1080), ('▁de', 1014), ('.', 675), ('s', 648), ('▁i', 564), ('▁la', 559), ('▁a', 529), ('▁que', 438), ("'", 360), ('’', 334)]      |
| el | [('▁', 1017), ('.', 801), (',', 782), ('▁και', 553), ('ς', 525), ('▁να', 351), ('▁το', 321), ('▁του', 292), ('▁την', 268), ('▁με', 264)] |
| hr | [(',', 878), ('.', 766), ('▁i', 546), ('▁u', 430), ('a', 413), ('▁je', 350), ('▁na', 282), ('▁za', 253), ('▁se', 239), ('e', 219)]       |
| is | [('.', 1017), ('▁og', 628), ('▁að', 613), (',', 600), ('▁', 528), ('▁í', 434), ('▁á', 388), ('▁er', 357), ('s', 326), ('▁sem', 279)]     |
| mk | [(',', 1021), ('▁на', 983), ('.', 738), ('▁и', 619), ('▁за', 475), ('▁да', 378), ('▁во', 376), ('▁се', 365), ('▁', 34

In [36]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

### Cosine similarity

Create a dictionary with all tokens from the train set, iterate through the tokens of each test set and count how many times each ocurrs. Calculate cosine similarity.

Do this on label level as well to see whether this explains good performance on some of the labels for Maltese.

Cosine similarity is a metric used to measure the similarity of two vectors. Specifically, it measures the similarity in the direction or orientation of the vectors ignoring differences in their magnitude or scale. Both vectors need to be part of the same inner product space, meaning they must produce a scalar through inner product multiplication. The similarity of two vectors is measured by the cosine of the angle between them. The similarity can take values between -1 and +1. Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity. 

Cosine similarity ignores 0-0 matches. Counting 0-0 matches in sparse data would inflate similarity scores. Another commonly used metric that ignores 0-0 matches is Jaccard Similarity.

In [2]:
def cosine_similarity(x, y):
    
    # Ensure length of x and y are the same
    if len(x) != len(y) :
        return None
    
    # Compute the dot product between x and y
    dot_product = np.dot(x, y)
    
    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2)) 
    magnitude_y = np.sqrt(np.sum(y**2))
    
    # Compute the cosine similarity
    cosine_similarity = dot_product / (magnitude_x * magnitude_y)
    
    return cosine_similarity

In [3]:
# Import train token count
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "r") as train_count_file:
	train_count = json.load(train_count_file)

list(train_count.items())[:5]

[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1)]

In [4]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

list(main_dict["sl"]["token_overlap"]["token_count"].items())[:10]

[('!', 30),
 ('!!', 1),
 ('!"', 1),
 ('!),', 1),
 ('"', 11),
 ('".', 2),
 ('"...', 1),
 ('&', 8),
 ("'", 3),
 ("''", 1)]

In [6]:
cosine_sim = {}
vector_size = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")
	# Get token count for current lang
	current_lang_count = main_dict[lang]["token_overlap"]["token_count"]

	print(f"Number of token types for {lang}: {len(list(current_lang_count.keys()))}")

	# For each test set, create a vector of token counts. Take only tokens that are present in train_df.

	# First, create a list of tokens that are present in either one or the other list
	intersection_keys = list(train_count.keys())

	# Then create a dictionary for 1) train df and 2) test df with counts of token types that occur in train_df
	train_intersect_dict = train_count
	test_intersect_dict = {}

	for i in intersection_keys:
		try:
			test_intersect_dict[i] = current_lang_count[i]
		except:
			test_intersect_dict[i] = 0


	# Create a df with intersecting keys
	intersect_df = pd.DataFrame({"train": train_intersect_dict, "test": test_intersect_dict})

	display(intersect_df.head(10))

	# Calculate cosine similarity
	current_cosine_sim = cosine_similarity(np.array(intersect_df["train"].to_list()), np.array(intersect_df["test"].to_list()))

	print(f"Cosine similarity for {lang}: {current_cosine_sim}")

	# Add to the main dictionary
	main_dict[lang]["token_overlap"]["cosine_similarity"] = current_cosine_sim
	main_dict[lang]["token_overlap"]["intersection_df"] = intersect_df.to_dict()
	main_dict[lang]["token_overlap"]["intersection_vector_size"] = len(intersection_keys)

	# Add to a dict of results
	cosine_sim[lang] = current_cosine_sim
	vector_size[lang] = len(intersection_keys)

Processing mt
Number of token types for mt: 4787


Unnamed: 0,train,test
!,430,9
!!,23,0
!!!,14,0
!!!!,6,0
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,0
!),10,0
"!),",1,0


Cosine similarity for mt: 0.49192396040790515
Processing el
Number of token types for el: 4751


Unnamed: 0,train,test
!,430,23
!!,23,2
!!!,14,5
!!!!,6,0
!!!!!,1,0
!!!!!!,1,1
!!!!!!!,1,0
"!""",14,0
!),10,0
"!),",1,0


Cosine similarity for el: 0.7069651767644073
Processing tr
Number of token types for tr: 6272


Unnamed: 0,train,test
!,430,21
!!,23,2
!!!,14,1
!!!!,6,0
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,2
!),10,0
"!),",1,0


Cosine similarity for tr: 0.6152024824126724
Processing sq
Number of token types for sq: 4891


Unnamed: 0,train,test
!,430,29
!!,23,3
!!!,14,4
!!!!,6,0
!!!!!,1,1
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,0
!),10,0
"!),",1,0


Cosine similarity for sq: 0.5456891066673359
Processing is
Number of token types for is: 4615


Unnamed: 0,train,test
!,430,15
!!,23,0
!!!,14,2
!!!!,6,1
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,1
!),10,0
"!),",1,0


Cosine similarity for is: 0.6101806148315608
Processing uk
Number of token types for uk: 6507


Unnamed: 0,train,test
!,430,41
!!,23,1
!!!,14,6
!!!!,6,0
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,2
!),10,0
"!),",1,0


Cosine similarity for uk: 0.6908456237440821
Processing ca
Number of token types for ca: 5314


Unnamed: 0,train,test
!,430,60
!!,23,7
!!!,14,8
!!!!,6,0
!!!!!,1,0
!!!!!!,1,1
!!!!!!!,1,1
"!""",14,0
!),10,0
"!),",1,1


Cosine similarity for ca: 0.5290309314823628
Processing mk
Number of token types for mk: 5468


Unnamed: 0,train,test
!,430,22
!!,23,0
!!!,14,1
!!!!,6,0
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,3
!),10,0
"!),",1,0


Cosine similarity for mk: 0.689022808410823
Processing hr
Number of token types for hr: 6222


Unnamed: 0,train,test
!,430,33
!!,23,1
!!!,14,3
!!!!,6,0
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,0
!),10,0
"!),",1,0


Cosine similarity for hr: 0.5699962229590452
Processing sl
Number of token types for sl: 5763


Unnamed: 0,train,test
!,430,30
!!,23,1
!!!,14,0
!!!!,6,0
!!!!!,1,0
!!!!!!,1,0
!!!!!!!,1,0
"!""",14,1
!),10,0
"!),",1,1


Cosine similarity for sl: 0.6335818041038892


In [34]:
# Create a list of tokens in train_df with only tokens that have at least one letter character - so, we remove tokens that consist only of punctuation or numbers
train_count_lexical_list = [item for item in list(train_count.keys()) if any(char.isalpha() for char in item)]

# Additionally, we keep only tokens that are longer than 1 character

train_count_lexical = {x:train_count[x] for x in train_count_lexical_list if len(x.replace("▁", "")) > 1}



list(train_count_lexical.items())[:4]

[('AA', 10), ('AAN', 1), ('AAR', 1), ('AB', 1)]

In [35]:
len(list(train_count_lexical.keys()))

25225

In [36]:
# experiment with removing punctuation, white spaces and numbers from the token list

cosine_sim = {}
vector_size = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")
	# Get token count for current lang
	current_lang_count = main_dict[lang]["token_overlap"]["token_count"]

	print(f"Number of token types for {lang}: {len(list(current_lang_count.keys()))}")

	# For each test set, create a vector of token counts. Take only tokens that are present in train_df.

	# First, create a list of tokens that are present in either one or the other list

	intersection_keys = list(train_count_lexical.keys())

	# Then create a dictionary for 1) train df and 2) test df with counts of token types that occur in train_df
	train_intersect_dict = train_count_lexical
	test_intersect_dict = {}

	for i in intersection_keys:
		try:
			test_intersect_dict[i] = current_lang_count[i]
		except:
			test_intersect_dict[i] = 0


	# Create a df with intersecting keys
	intersect_df = pd.DataFrame({"train": train_intersect_dict, "test": test_intersect_dict})

	display(intersect_df.head(10))

	# Calculate cosine similarity
	current_cosine_sim = cosine_similarity(np.array(intersect_df["train"].to_list()), np.array(intersect_df["test"].to_list()))

	print(f"Cosine similarity for {lang}: {current_cosine_sim}")

	# Add to the main dictionary
	main_dict[lang]["token_overlap"]["cosine_similarity"] = current_cosine_sim
	main_dict[lang]["token_overlap"]["intersection_df"] = intersect_df.to_dict()
	main_dict[lang]["token_overlap"]["intersection_vector_size"] = len(intersection_keys)

	# Add to a dict of results
	cosine_sim[lang] = current_cosine_sim
	vector_size[lang] = len(intersection_keys)

Processing mt
Number of token types for mt: 4787


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,1
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for mt: 0.2842150848173066
Processing el
Number of token types for el: 4751


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,0
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for el: 0.12768966885809033
Processing tr
Number of token types for tr: 6272


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,1
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for tr: 0.07260964319933215
Processing sq
Number of token types for sq: 4891


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,0
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,1
ACH,3,0
ACI,2,0


Cosine similarity for sq: 0.13639871257526284
Processing is
Number of token types for is: 4615


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,1
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for is: 0.06621617873023707
Processing uk
Number of token types for uk: 6507


Unnamed: 0,train,test
AA,10,3
AAN,1,0
AAR,1,0
AB,1,0
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for uk: 0.011605931691160141
Processing ca
Number of token types for ca: 5314


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,3
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,1


Cosine similarity for ca: 0.04938153951776159
Processing mk
Number of token types for mk: 5468


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,0
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for mk: 0.22877625063792426
Processing hr
Number of token types for hr: 6222


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,0
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,0
ACI,2,0


Cosine similarity for hr: 0.2076741403073573
Processing sl
Number of token types for sl: 5763


Unnamed: 0,train,test
AA,10,0
AAN,1,0
AAR,1,0
AB,1,0
ABILI,1,0
ABLE,6,0
AC,3,0
ACE,8,0
ACH,3,1
ACI,2,0


Cosine similarity for sl: 0.331293659491296


In [9]:
# Experiment with normalizing the values

In [13]:
# Experiment what happens if we normalize the values using min max
# Normalize the values using the min-max normalization
# Min-max is a scaling technique where values are rescaled and shifted so that they range between 0 and 1 or between -1 and 1.

def normalize(list):
	data = np.array(list).reshape(-1, 1)
	scaler = MinMaxScaler()
	scaler.fit(data)
	norm_list = scaler.transform(data)

	return norm_list

In [14]:
df = pd.DataFrame(main_dict["uk"]["token_overlap"]["intersection_df"])
df.head(4)

Unnamed: 0,train,test
!,430,41
!!,23,1
!!!,14,6
!!!!,6,0


In [None]:
cosine_similarity(np.array(df["train"].to_list()), np.array(df["test"].to_list()))

In [None]:
cosine_similarity(np.array(normalize(df["train"].to_list())), np.array(normalize(df["test"].to_list())))

In [16]:
normalize(df["test"].to_list())

array([[0.03032544],
       [0.00073964],
       [0.00443787],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [8]:
# Show results
cosine_sim_df = pd.DataFrame({"cosine_similarity": cosine_sim, "vector_size": vector_size}).sort_values(by="cosine_similarity", ascending=False)
print(cosine_sim_df.to_markdown())

|    |   cosine_similarity |   vector_size |
|:---|--------------------:|--------------:|
| el |            0.706965 |         27025 |
| uk |            0.690846 |         27025 |
| mk |            0.689023 |         27025 |
| sl |            0.633582 |         27025 |
| tr |            0.615202 |         27025 |
| is |            0.610181 |         27025 |
| hr |            0.569996 |         27025 |
| sq |            0.545689 |         27025 |
| ca |            0.529031 |         27025 |
| mt |            0.491924 |         27025 |


In [11]:
overlap_per = {}
for lang in list(main_dict.keys()):
	overlap_per[lang] = main_dict[lang]["token_overlap"]["overlap_percentage"]

In [12]:
# Show results
cosine_sim_df = pd.DataFrame({"cosine_similarity": cosine_sim, "vector_size": vector_size, "overlap_percentage": overlap_per}).sort_values(by="cosine_similarity", ascending=False)
print(cosine_sim_df.to_markdown())

|    |   cosine_similarity |   vector_size |   overlap_percentage |
|:---|--------------------:|--------------:|---------------------:|
| el |            0.706965 |         27025 |             0.161428 |
| uk |            0.690846 |         27025 |             0.156675 |
| mk |            0.689023 |         27025 |             0.145989 |
| sl |            0.633582 |         27025 |             0.974289 |
| tr |            0.615202 |         27025 |             0.521502 |
| is |            0.610181 |         27025 |             0.517575 |
| hr |            0.569996 |         27025 |             0.821517 |
| sq |            0.545689 |         27025 |             0.605775 |
| ca |            0.529031 |         27025 |             0.744881 |
| mt |            0.491924 |         27025 |             0.817085 |


# Compare token overlap on label level 

### Create label-level token counts for train dataset

In [3]:
# Open the tokenized train df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51..."


In [4]:
train_df.labels.unique()

array(['Other', 'Information/Explanation', 'News', 'Instruction',
       'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal',
       'Promotion'], dtype=object)

In [None]:
# Separate the train df into label-based dfs

# Create lists of tokens that are label based
label_tokens = {}
token_count = {}
type_count = {}

for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
	print(f"Processing {label}")

	label_df = train_df[train_df["labels"] == label]
	display(label_df.head(3))

	# Create a list of tokens, where we take only the first 512 tokens
	train_tokens_shortened = []

	for i in label_df["tokens_train"].to_list():
		train_tokens_shortened.extend(i[:512])

	print(f"Number of all tokens: {len(train_tokens_shortened)}")

	# Add to dictionaries
	label_tokens[label] = train_tokens_shortened
	token_count[label] = len(train_tokens_shortened)
	type_count[label] = len(set(train_tokens_shortened))

In [8]:
# Calculate numbers for each label
label_results_train = pd.DataFrame({"token_count": token_count, "type_count": type_count})

print(label_results_train.to_markdown())

|                         |   token_count |   type_count |
|:------------------------|--------------:|-------------:|
| Information/Explanation |        124130 |        14678 |
| News                    |        136557 |        15319 |
| Instruction             |         83750 |         8929 |
| Opinion/Argumentation   |        103141 |        13088 |
| Forum                   |         58900 |         8555 |
| Prose/Lyrical           |         46860 |         5990 |
| Legal                   |         28496 |         4425 |
| Promotion               |         88626 |        12548 |


In [9]:
# Save the label token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count.json", "w") as train_label_count_file:
	json.dump(label_tokens, train_label_count_file)

### Create label-level token counts for test sets

In [12]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict["uk"].keys()

dict_keys(['accuracy', 'micro_f1', 'macro_f1', 'dataset', 'token_overlap'])

In [13]:
pd.DataFrame(main_dict["sl"]["dataset"]).head(2)

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."


In [14]:
# Do the same as with the dataset, but on every language
lang_results = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")

	label_token_dict = {}

	# Current df
	df = pd.DataFrame(main_dict[lang]["dataset"])
	display(df.head(2))

	for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
		print(f"Processing {label}")

		label_df = df[df["y_true"] == label]

		# Create a list of tokens
		token_list = []

		for i in label_df["tokens"].to_list():
			token_list.extend(i)

		# Add to dictionary
		label_token_dict[label] = token_list

	# Add to main dict
	main_dict[lang]["token_overlap"]["label_level_token_lists"] = label_token_dict


Processing mt


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262..."
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing el


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ..."
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing tr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.tr.15851513,Instruction,AÖL Ders Seçimi ve Sınav Giriş Merkezi Belirle...,AÖL warned of frequent negligence and errors o...,{'text_id': 'macocu.tr.15851513'},Instruction,"[▁A, Ö, L, ▁Der, s, ▁Seçim, i, ▁ve, ▁Sınav, ▁G...","[62, 8655, 866, 1310, 7, 166134, 14, 173, 1762..."
1,macocu.tr.12699738,Legal,Banka promosyonu ihalesinde uygulanacak kriter...,Criteria to be applied in the tender for bank ...,{'text_id': 'macocu.tr.12699738'},Legal,"[▁Banka, ▁promo, syon, u, ▁i, hale, sinde, ▁uy...","[81847, 8891, 10270, 34, 17, 50742, 19209, 633..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing sq


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
0,macocu.sq.1061396,Opinion/Argumentation,341.0,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",Opinion/Argumentation,"[▁Blog, ▁“, U, në, ▁të, ▁kam, ▁dashur, ▁me, ▁n...","[5061, 52, 1062, 3208, 134, 3840, 57168, 163, ..."
2,macocu.sq.183383,Legal,140.0,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...,"{'text_id': 'macocu.sq.183383', 'domain': 'euk...",Legal,"[▁Li, ria, ▁nga, ▁keq, t, raj, timi, ▁Kon, ven...","[1261, 1651, 817, 39184, 18, 10185, 20520, 369..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing is


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.is.639516,Legal,[is] Því er við hæfi að reglur verði settar in...,[IS] It is therefore appropriate that rules be...,{'text_id': 'macocu.is.639516'},Legal,"[▁[, is, ], ▁Því, ▁er, ▁við, ▁hæ, fi, ▁að, ▁re...","[378, 164, 268, 139806, 72, 1497, 33423, 1029,..."
1,macocu.is.1301366,Instruction,Sækja um fulla aðild \n\nKennitala * \n\nNetfa...,Apply full membership\n\nSocial Security numbe...,{'text_id': 'macocu.is.1301366'},Information/Explanation,"[▁Sæ, kja, ▁um, ▁full, a, ▁að, ild, ▁Kenn, ita...","[71595, 28643, 286, 4393, 11, 389, 38472, 5906..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing uk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307..."
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing ca


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.ca.2248072,Opinion/Argumentation,Pàgines \n\nEnfeinada \n\nPorto uns dies una m...,Pages\n\nCoined\n\nI have been a little busy f...,{'text_id': 'macocu.ca.2248072'},Forum,"[▁P, à, gine, s, ▁En, fei, nada, ▁Porto, ▁uns,...","[436, 1298, 63023, 7, 357, 51899, 28866, 24952..."
1,macocu.ca.756254,Information/Explanation,Info \n\nLa Casa nova dels Banys de Sant Vicen...,Info\n\nThe Casa Nova dels Banys de Sant Vicen...,{'text_id': 'macocu.ca.756254'},Information/Explanation,"[▁Info, ▁La, ▁Casa, ▁nova, ▁dels, ▁Ban, ys, ▁d...","[14048, 239, 8591, 4678, 2323, 5458, 4778, 8, ..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing mk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing hr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.hr.1033815,Promotion,"O proizvodu\nColor Transformer, za pametno i j...","About the Color Transformer product, for smart...","CLASSLA-web.hr.1033815', 'domain': 'hairshop.hr'}",Promotion,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, former, ,, ...","[180, 43170, 34, 51193, 11062, 82772, 4, 80, 6..."
2,CLASSLA-web.hr.1119579,Promotion,Sunčano selo / Sunny village\nNa obroncima Bil...,Sunshine / Sunny Village on the slopes of Bilo...,"CLASSLA-web.hr.1119579', 'domain': 'vikendi.com'}",Promotion,"[▁Sun, čan, o, ▁se, lo, ▁/, ▁Sunny, ▁village, ...","[7550, 17129, 31, 40, 365, 248, 151197, 54427,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing sl


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion


In [15]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

## Calculate overlap on label level

In [19]:
# Open label-based train token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count.json", "r") as train_label_count_file:
	label_token_count_train = json.load(train_label_count_file)

label_token_count_train.keys()

dict_keys(['Information/Explanation', 'News', 'Instruction', 'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal', 'Promotion'])

In [None]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

main_dict["sl"]["token_overlap"]["label_level_token_lists"]

In [18]:
# Improved code to calculate token overlap - we can simply calculate which tokens do not overlap - meaning that they occur only in one of the dataset, and calculate the percentage overlap based on that
token_overlap_label_results = {}
results = {}

# Loop through the datasets and labels and calculate token overlap
for lang in list(main_dict.keys()):
	print(lang)
	# Create a dict for labels
	label_overlap = {}
	label_overlap_tokens = {}

	# loop through labels
	for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
		print(label)
		token_list_test = main_dict[lang]["token_overlap"]["label_level_token_lists"][label]

		token_list_train = label_token_count_train[label]

		# See how many tokens do not overlap
		no_overlap_counter = 0

		# Save tokens that overlap for further inspection
		overlap_token_list = []

		# We calculate overlap by counting how many tokens do not appear in both sets
		for token in tqdm(token_list_test):
			if token not in token_list_train:
				no_overlap_counter += 1
			else:
				overlap_token_list.append(token)

		# Out of all tokens in test set, how many do not overlap with train set?
		no_overlap_per = no_overlap_counter/len(token_list_test)

		# Calculate percentage of overlap based on that
		overlap_per = 1-no_overlap_per

		print(f"Percentage of overlap: {overlap_per}")

		label_overlap[label] = overlap_per
		label_overlap_tokens[label] = overlap_token_list

		# Add to the results
		results[f"{lang}-{label}"] = overlap_per

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"]["label_overlap_percentage"] = label_overlap
	main_dict[lang]["token_overlap"]["label_overlap_token_list"] = label_overlap_tokens

mt
Information/Explanation


100%|██████████| 6656/6656 [00:03<00:00, 2037.47it/s]


Percentage of overlap: 0.7145432692307692
News


100%|██████████| 6978/6978 [00:03<00:00, 2046.50it/s]


Percentage of overlap: 0.7123817712811693
Instruction


100%|██████████| 9288/9288 [00:03<00:00, 2881.30it/s]


Percentage of overlap: 0.6194013781223083
Opinion/Argumentation


100%|██████████| 4096/4096 [00:01<00:00, 2549.66it/s]


Percentage of overlap: 0.68212890625
Forum


100%|██████████| 512/512 [00:00<00:00, 3719.44it/s]


Percentage of overlap: 0.583984375
Prose/Lyrical


100%|██████████| 512/512 [00:00<00:00, 4707.33it/s]


Percentage of overlap: 0.591796875
Legal


100%|██████████| 5120/5120 [00:00<00:00, 10036.83it/s]


Percentage of overlap: 0.6763671875
Promotion


100%|██████████| 5878/5878 [00:01<00:00, 3103.47it/s]


Percentage of overlap: 0.6578768288533514
el
Information/Explanation


100%|██████████| 5608/5608 [00:06<00:00, 907.70it/s] 


Percentage of overlap: 0.13908701854493577
News


100%|██████████| 3501/3501 [00:03<00:00, 943.57it/s] 


Percentage of overlap: 0.1162524992859183
Instruction


100%|██████████| 2971/2971 [00:01<00:00, 1662.94it/s]


Percentage of overlap: 0.16122517670817904
Opinion/Argumentation


100%|██████████| 5607/5607 [00:04<00:00, 1240.54it/s]


Percentage of overlap: 0.11699661137863382
Forum


100%|██████████| 5137/5137 [00:02<00:00, 2315.90it/s]


Percentage of overlap: 0.17033287911232242
Prose/Lyrical


100%|██████████| 2872/2872 [00:01<00:00, 2827.42it/s]


Percentage of overlap: 0.14589136490250698
Legal


100%|██████████| 3368/3368 [00:00<00:00, 4736.99it/s]


Percentage of overlap: 0.1395486935866983
Promotion


100%|██████████| 2176/2176 [00:01<00:00, 1570.86it/s]


Percentage of overlap: 0.15900735294117652
tr
Information/Explanation


100%|██████████| 2289/2289 [00:01<00:00, 1411.61it/s]


Percentage of overlap: 0.47924858016601135
News


100%|██████████| 3342/3342 [00:02<00:00, 1423.06it/s]


Percentage of overlap: 0.44105326152004787
Instruction


100%|██████████| 4049/4049 [00:01<00:00, 2115.87it/s]


Percentage of overlap: 0.38281057051123735
Opinion/Argumentation


100%|██████████| 5613/5613 [00:03<00:00, 1833.53it/s]


Percentage of overlap: 0.4332798859789774
Forum


100%|██████████| 3219/3219 [00:01<00:00, 3051.48it/s]


Percentage of overlap: 0.44330537433985706
Prose/Lyrical


100%|██████████| 3764/3764 [00:00<00:00, 3882.97it/s]


Percentage of overlap: 0.41339001062699254
Legal


100%|██████████| 3677/3677 [00:00<00:00, 5758.01it/s]


Percentage of overlap: 0.3146586891487626
Promotion


100%|██████████| 3625/3625 [00:01<00:00, 2183.09it/s]


Percentage of overlap: 0.39420689655172414
sq
Information/Explanation


100%|██████████| 5196/5196 [00:03<00:00, 1361.00it/s]


Percentage of overlap: 0.4844110854503464
News


100%|██████████| 2415/2415 [00:01<00:00, 1400.32it/s]


Percentage of overlap: 0.4650103519668737
Instruction


100%|██████████| 3204/3204 [00:01<00:00, 2169.89it/s]


Percentage of overlap: 0.4297752808988764
Opinion/Argumentation


100%|██████████| 4440/4440 [00:02<00:00, 1882.93it/s]


Percentage of overlap: 0.4747747747747748
Forum


100%|██████████| 3509/3509 [00:00<00:00, 3555.70it/s]


Percentage of overlap: 0.5950413223140496
Prose/Lyrical


100%|██████████| 3393/3393 [00:00<00:00, 4236.06it/s]


Percentage of overlap: 0.49277925139994105
Legal


100%|██████████| 2817/2817 [00:00<00:00, 6103.46it/s]


Percentage of overlap: 0.37841675541356057
Promotion


100%|██████████| 1795/1795 [00:00<00:00, 2553.77it/s]


Percentage of overlap: 0.5337047353760446
is
Information/Explanation


100%|██████████| 2151/2151 [00:01<00:00, 1357.65it/s]


Percentage of overlap: 0.45560204556020456
News


100%|██████████| 4149/4149 [00:03<00:00, 1272.20it/s]


Percentage of overlap: 0.4196191853458665
Instruction


100%|██████████| 3915/3915 [00:01<00:00, 1982.39it/s]


Percentage of overlap: 0.383397190293742
Opinion/Argumentation


100%|██████████| 4970/4970 [00:02<00:00, 1715.09it/s]


Percentage of overlap: 0.4191146881287726
Forum


100%|██████████| 2769/2769 [00:00<00:00, 2918.99it/s]


Percentage of overlap: 0.40195016251354276
Prose/Lyrical


100%|██████████| 4649/4649 [00:01<00:00, 3567.51it/s]


Percentage of overlap: 0.37491933749193374
Legal


100%|██████████| 4005/4005 [00:00<00:00, 6007.83it/s]


Percentage of overlap: 0.3468164794007491
Promotion


100%|██████████| 3036/3036 [00:01<00:00, 2110.08it/s]


Percentage of overlap: 0.43346508563899866
uk
Information/Explanation


100%|██████████| 4352/4352 [00:04<00:00, 931.01it/s] 


Percentage of overlap: 0.13304227941176472
News


100%|██████████| 3386/3386 [00:03<00:00, 971.51it/s] 


Percentage of overlap: 0.13112817483756645
Instruction


100%|██████████| 4411/4411 [00:02<00:00, 1589.06it/s]


Percentage of overlap: 0.11902063024257536
Opinion/Argumentation


100%|██████████| 5386/5386 [00:04<00:00, 1303.95it/s]


Percentage of overlap: 0.14611956925362046
Forum


100%|██████████| 3528/3528 [00:01<00:00, 2338.17it/s]


Percentage of overlap: 0.16638321995464855
Prose/Lyrical


100%|██████████| 4282/4282 [00:01<00:00, 2941.81it/s]


Percentage of overlap: 0.16184026156001863
Legal


100%|██████████| 3484/3484 [00:00<00:00, 5085.64it/s]


Percentage of overlap: 0.1702066590126292
Promotion


100%|██████████| 2848/2848 [00:01<00:00, 1586.98it/s]


Percentage of overlap: 0.1629213483146067
ca
Information/Explanation


100%|██████████| 4017/4017 [00:02<00:00, 1694.22it/s]


Percentage of overlap: 0.6069205875031118
News


100%|██████████| 2612/2612 [00:01<00:00, 1775.23it/s]


Percentage of overlap: 0.6125574272588055
Instruction


100%|██████████| 2133/2133 [00:00<00:00, 2979.27it/s]


Percentage of overlap: 0.6207219878105954
Opinion/Argumentation


100%|██████████| 3368/3368 [00:01<00:00, 2657.55it/s]


Percentage of overlap: 0.6339073634204275
Forum


100%|██████████| 3755/3755 [00:00<00:00, 4183.81it/s]


Percentage of overlap: 0.6399467376830892
Prose/Lyrical


100%|██████████| 4092/4092 [00:00<00:00, 4936.87it/s]


Percentage of overlap: 0.5916422287390029
Legal


100%|██████████| 3336/3336 [00:00<00:00, 7248.70it/s]


Percentage of overlap: 0.4787170263788969
Promotion


100%|██████████| 4231/4231 [00:01<00:00, 3027.25it/s]


Percentage of overlap: 0.616875443157646
mk
Information/Explanation


100%|██████████| 3324/3324 [00:03<00:00, 939.12it/s] 


Percentage of overlap: 0.16817087845968715
News


100%|██████████| 3778/3778 [00:04<00:00, 941.74it/s]


Percentage of overlap: 0.10534674430915825
Instruction


100%|██████████| 3271/3271 [00:02<00:00, 1618.28it/s]


Percentage of overlap: 0.1213696117395292
Opinion/Argumentation


100%|██████████| 3007/3007 [00:02<00:00, 1295.36it/s]


Percentage of overlap: 0.12337878284003989
Forum


100%|██████████| 4072/4072 [00:01<00:00, 2288.53it/s]


Percentage of overlap: 0.15446954813359526
Prose/Lyrical


100%|██████████| 3892/3892 [00:01<00:00, 2860.47it/s]


Percentage of overlap: 0.1500513874614594
Legal


100%|██████████| 3412/3412 [00:00<00:00, 4725.52it/s]


Percentage of overlap: 0.11371629542790151
Promotion


100%|██████████| 2883/2883 [00:01<00:00, 1501.35it/s]


Percentage of overlap: 0.13146028442594515
hr
Information/Explanation


100%|██████████| 4027/4027 [00:02<00:00, 1887.75it/s]


Percentage of overlap: 0.7047429848522473
News


100%|██████████| 3237/3237 [00:01<00:00, 1955.14it/s]


Percentage of overlap: 0.7194933580475749
Instruction


100%|██████████| 1701/1701 [00:00<00:00, 2705.21it/s]


Percentage of overlap: 0.6202233980011758
Opinion/Argumentation


100%|██████████| 3427/3427 [00:01<00:00, 2353.35it/s]


Percentage of overlap: 0.6807703530784943
Forum


100%|██████████| 3811/3811 [00:01<00:00, 3549.18it/s]


Percentage of overlap: 0.6452374704801889
Prose/Lyrical


100%|██████████| 2651/2651 [00:00<00:00, 4353.90it/s]


Percentage of overlap: 0.5620520558279894
Legal


100%|██████████| 3490/3490 [00:00<00:00, 7708.46it/s]


Percentage of overlap: 0.5272206303724929
Promotion


100%|██████████| 4202/4202 [00:01<00:00, 3649.87it/s]


Percentage of overlap: 0.6972870061875298
sl
Information/Explanation


100%|██████████| 3446/3446 [00:01<00:00, 2598.65it/s]


Percentage of overlap: 0.8682530470110272
News


100%|██████████| 3452/3452 [00:01<00:00, 2437.77it/s]


Percentage of overlap: 0.9122247972190035
Instruction


100%|██████████| 3395/3395 [00:01<00:00, 3087.92it/s]


Percentage of overlap: 0.7705449189985273
Opinion/Argumentation


100%|██████████| 2471/2471 [00:00<00:00, 2986.05it/s]


Percentage of overlap: 0.8887090246863618
Forum


100%|██████████| 3970/3970 [00:01<00:00, 3760.73it/s]


Percentage of overlap: 0.801007556675063
Prose/Lyrical


100%|██████████| 2909/2909 [00:00<00:00, 5512.87it/s]


Percentage of overlap: 0.6995531110347198
Legal


100%|██████████| 3446/3446 [00:00<00:00, 11590.15it/s]


Percentage of overlap: 0.7556587347649448
Promotion


100%|██████████| 3203/3203 [00:00<00:00, 6488.16it/s]

Percentage of overlap: 0.8741804558226662





In [21]:
results

{'mt-Information/Explanation': 0.7145432692307692,
 'mt-News': 0.7123817712811693,
 'mt-Instruction': 0.6194013781223083,
 'mt-Opinion/Argumentation': 0.68212890625,
 'mt-Forum': 0.583984375,
 'mt-Prose/Lyrical': 0.591796875,
 'mt-Legal': 0.6763671875,
 'mt-Promotion': 0.6578768288533514,
 'el-Information/Explanation': 0.13908701854493577,
 'el-News': 0.1162524992859183,
 'el-Instruction': 0.16122517670817904,
 'el-Opinion/Argumentation': 0.11699661137863382,
 'el-Forum': 0.17033287911232242,
 'el-Prose/Lyrical': 0.14589136490250698,
 'el-Legal': 0.1395486935866983,
 'el-Promotion': 0.15900735294117652,
 'tr-Information/Explanation': 0.47924858016601135,
 'tr-News': 0.44105326152004787,
 'tr-Instruction': 0.38281057051123735,
 'tr-Opinion/Argumentation': 0.4332798859789774,
 'tr-Forum': 0.44330537433985706,
 'tr-Prose/Lyrical': 0.41339001062699254,
 'tr-Legal': 0.3146586891487626,
 'tr-Promotion': 0.39420689655172414,
 'sq-Information/Explanation': 0.4844110854503464,
 'sq-News': 0.465

In [22]:
# Inspect the results
overlap_df = pd.DataFrame({"label": list(results.keys()), "overlap": list(results.values())})
overlap_df

Unnamed: 0,label,overlap
0,mt-Information/Explanation,0.714543
1,mt-News,0.712382
2,mt-Instruction,0.619401
3,mt-Opinion/Argumentation,0.682129
4,mt-Forum,0.583984
...,...,...
75,sl-Opinion/Argumentation,0.888709
76,sl-Forum,0.801008
77,sl-Prose/Lyrical,0.699553
78,sl-Legal,0.755659


In [35]:
# Save the label-level-overlap
with open("datasets/label-level-token-overlap.csv", "w") as file:
	overlap_df.to_csv(file)

In [23]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

# Calculate token overlap with transliterated texts

For languages which are in non-Latin script, let's experiment also with transliteration to see what is the effect on the results.

## Prepare transliterated and normalized versions

In [10]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [25]:
main_dict["sl"]["token_overlap"].keys()

dict_keys(['overlap_percentage', 'token_list', 'overlap_token_list', 'token_count', 'cosine_similarity', 'intersection_df', 'intersection_vector_size', 'label_level_token_lists'])

In [21]:
# Transliterate the Cyrillic scripts, using the cyrtranslit library: https://github.com/opendatakosovo/cyrillic-transliteration

In [26]:
def transliterate(text, lang):
	import cyrtranslit
	transl_dict = {"uk": "ua", "bg": "bg", "cnr": "me", "mk": "mk"}
	if lang == "sr":
		transl_text = cyrtranslit.to_latin(text)
	else:
		transl_text = cyrtranslit.to_latin(text, transl_dict[lang])
	return transl_text

for lang in list(main_dict.keys()):
	if lang in ["uk", "mk"]:
		# Open the df
		df = pd.DataFrame(main_dict[lang]["dataset"])
		display(df.head(2))
		# Transliterate and save text to new column
		df["text_latin"] = [transliterate(x, lang) for x in df["text"].to_list()]
		display(df.head(2))

		# Save the df
		main_dict[lang]["dataset"] = df.to_dict()

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307..."
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9..."


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_latin
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307...",Nestandartnyj pidxid dlja vyhotovlennja Akvari...
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9...",MUČENYKY BUČA-IRPIN' \n\nNe snylos' poljanam j...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44..."


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_latin
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416...","Ekšuli, TCL gi pravi smartfonovite, a TCL e sm..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44...",Red Valentino prognozira bura od printovi za s...


In [None]:
with pd.option_context('display.max_colwidth', 200):
	display(pd.DataFrame(main_dict["uk"]["dataset"])[["text", "text_latin"]].head(3))

In [None]:
with pd.option_context('display.max_colwidth', 200):
	display(pd.DataFrame(main_dict["el"]["dataset"])[["text", "text_latin"]].head(3))

In [27]:
# Transliterate greek script to Latin, using the transliterate library: https://pypi.org/project/transliterate/
def transliterate_greek(text):
	from transliterate import translit, get_available_language_codes
	transl_text = translit(u"{}".format(text), 'el', reversed=True)
	return transl_text

for lang in list(main_dict.keys()):
	if lang == "el":
		# Open the df
		df = pd.DataFrame(main_dict[lang]["dataset"])
		display(df.head(2))
		# Transliterate and save text to new column
		df["text_latin"] = [transliterate_greek(x) for x in df["text"].to_list()]
		display(df.head(2))

		# Save the df
		main_dict[lang]["dataset"] = df.to_dict()

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ..."
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,..."


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_latin
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ...",Enimerosi toy Pegasus Estiasi me tis eiserchom...
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,...",i timi tis ekdosis 8GB/ 128GB einai 1.299 euro...


In [28]:
# Furthermore, let's also "normalize" the characters in all the languages to get rid of special characters, using the unidecode library: https://pypi.org/project/Unidecode/, and also lowercase all characters

def normalize(text):
	# First, make the text lowercase
	text = text.lower()
	from unidecode import unidecode
	norm_text = unidecode(u"{}".format(text))
	return norm_text

for lang in list(main_dict.keys()):
	# Open the df
	df = pd.DataFrame(main_dict[lang]["dataset"])

	if "lang" in ["uk", "mk", "el"]:
		# Take the transliterated text, normalize and save text to new column
		df["text_norm"] = [normalize(x) for x in df["text_latin"].to_list()]
		display(df.head(2))
	else:
		# Normalize and save text to new column
		df["text_norm"] = [normalize(x) for x in df["text"].to_list()]
		display(df.head(2))
	# Save the df
	main_dict[lang]["dataset"] = df.to_dict()

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262...","angelo chetcuti, se jkun qed jiehu post bjorn ..."
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24...",poltergeist jirreferi ghal fenomeni ohra tal-m...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_latin,text_norm
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ...",Enimerosi toy Pegasus Estiasi me tis eiserchom...,enemerose tou pegasus estiasi me tis eiserkhom...
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,...",i timi tis ekdosis 8GB/ 128GB einai 1.299 euro...,e time tes ekdoses 8gb/ 128gb einai 1.299 euro...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm
0,macocu.tr.15851513,Instruction,AÖL Ders Seçimi ve Sınav Giriş Merkezi Belirle...,AÖL warned of frequent negligence and errors o...,{'text_id': 'macocu.tr.15851513'},Instruction,"[▁A, Ö, L, ▁Der, s, ▁Seçim, i, ▁ve, ▁Sınav, ▁G...","[62, 8655, 866, 1310, 7, 166134, 14, 173, 1762...",aol ders secimi ve sinav giris merkezi belirle...
1,macocu.tr.12699738,Legal,Banka promosyonu ihalesinde uygulanacak kriter...,Criteria to be applied in the tender for bank ...,{'text_id': 'macocu.tr.12699738'},Legal,"[▁Banka, ▁promo, syon, u, ▁i, hale, sinde, ▁uy...","[81847, 8891, 10270, 34, 17, 50742, 19209, 633...",banka promosyonu ihalesinde uygulanacak kriter...


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids,text_norm
0,macocu.sq.1061396,Opinion/Argumentation,341.0,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",Opinion/Argumentation,"[▁Blog, ▁“, U, në, ▁të, ▁kam, ▁dashur, ▁me, ▁n...","[5061, 52, 1062, 3208, 134, 3840, 57168, 163, ...","blog\n\n""une te kam dashur me nje dashuri te p..."
2,macocu.sq.183383,Legal,140.0,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...,"{'text_id': 'macocu.sq.183383', 'domain': 'euk...",Legal,"[▁Li, ria, ▁nga, ▁keq, t, raj, timi, ▁Kon, ven...","[1261, 1651, 817, 39184, 18, 10185, 20520, 369...",liria nga keqtrajtimi\n\nkonventa e te drejtav...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm
0,macocu.is.639516,Legal,[is] Því er við hæfi að reglur verði settar in...,[IS] It is therefore appropriate that rules be...,{'text_id': 'macocu.is.639516'},Legal,"[▁[, is, ], ▁Því, ▁er, ▁við, ▁hæ, fi, ▁að, ▁re...","[378, 164, 268, 139806, 72, 1497, 33423, 1029,...",[is] thvi er vid haefi ad reglur verdi settar ...
1,macocu.is.1301366,Instruction,Sækja um fulla aðild \n\nKennitala * \n\nNetfa...,Apply full membership\n\nSocial Security numbe...,{'text_id': 'macocu.is.1301366'},Information/Explanation,"[▁Sæ, kja, ▁um, ▁full, a, ▁að, ild, ▁Kenn, ita...","[71595, 28643, 286, 4393, 11, 389, 38472, 5906...",saekja um fulla adild \n\nkennitala * \n\nnetf...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_latin,text_norm
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307...",Nestandartnyj pidxid dlja vyhotovlennja Akvari...,nestandartnii pidkhid dlia vigotovlennia akvar...
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9...",MUČENYKY BUČA-IRPIN' \n\nNe snylos' poljanam j...,mucheniki bucha-irpin' \n\nne snilos' polianam...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm
0,macocu.ca.2248072,Opinion/Argumentation,Pàgines \n\nEnfeinada \n\nPorto uns dies una m...,Pages\n\nCoined\n\nI have been a little busy f...,{'text_id': 'macocu.ca.2248072'},Forum,"[▁P, à, gine, s, ▁En, fei, nada, ▁Porto, ▁uns,...","[436, 1298, 63023, 7, 357, 51899, 28866, 24952...",pagines \n\nenfeinada \n\nporto uns dies una m...
1,macocu.ca.756254,Information/Explanation,Info \n\nLa Casa nova dels Banys de Sant Vicen...,Info\n\nThe Casa Nova dels Banys de Sant Vicen...,{'text_id': 'macocu.ca.756254'},Information/Explanation,"[▁Info, ▁La, ▁Casa, ▁nova, ▁dels, ▁Ban, ys, ▁d...","[14048, 239, 8591, 4678, 2323, 5458, 4778, 8, ...",info \n\nla casa nova dels banys de sant vicen...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_latin,text_norm
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416...","Ekšuli, TCL gi pravi smartfonovite, a TCL e sm...","ekshuli, tcl gi pravi smartfonovite, a tcl e s..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44...",Red Valentino prognozira bura od printovi za s...,red valentino prognozira bura od printovi za s...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm
0,CLASSLA-web.hr.1033815,Promotion,"O proizvodu\nColor Transformer, za pametno i j...","About the Color Transformer product, for smart...","CLASSLA-web.hr.1033815', 'domain': 'hairshop.hr'}",Promotion,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, former, ,, ...","[180, 43170, 34, 51193, 11062, 82772, 4, 80, 6...","o proizvodu\ncolor transformer, za pametno i j..."
2,CLASSLA-web.hr.1119579,Promotion,Sunčano selo / Sunny village\nNa obroncima Bil...,Sunshine / Sunny Village on the slopes of Bilo...,"CLASSLA-web.hr.1119579', 'domain': 'vikendi.com'}",Promotion,"[▁Sun, čan, o, ▁se, lo, ▁/, ▁Sunny, ▁village, ...","[7550, 17129, 31, 40, 365, 248, 151197, 54427,...",suncano selo / sunny village\nna obroncima bil...


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ...",kitajsko mesto duhov\nv notranji mongoliji ras...
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414...","krompir skuhamo, olupimo in narezemo na tanke ..."


In [29]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

In [12]:
# Open and normalize also the training dataset
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁..."


In [34]:
# "normalize" the characters to get rid of special characters, using the unidecode library: https://pypi.org/project/Unidecode/, and also lowercase all characters

def normalize(text):
	# First, make the text lowercase
	text = text.lower()
	from unidecode import unidecode
	norm_text = unidecode(u"{}".format(text))
	return norm_text

# Normalize and save text to new column
train_df["text_norm"] = [normalize(x) for x in train_df["text"].to_list()]
display(train_df.head(2))

# Save the noramalized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...


In [37]:
train_df[train_df["language"] == "Slovenian"].tail(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm
1140,"SPREJEM GASILSKEGA VOZILA GVM-1 <p/> V soboto,...",News,GINCO,Slovenian,"[▁S, PRE, JEM, ▁GA, SIL, SK, EGA, ▁V, OZ, ILA,...","[159, 94632, 164923, 23749, 68785, 10762, 5037...","sprejem gasilskega vozila gvm-1 <p/> v soboto,..."
1141,Dodano k projektu: Simple Shop Poročilo kreira...,Forum,GINCO,Slovenian,"[▁Doda, no, ▁k, ▁projektu, :, ▁Simple, ▁Shop, ...","[66987, 157, 472, 13181, 12, 60552, 24211, 663...",dodano k projektu: simple shop porocilo kreira...


## Overlap on dataset level

First, tokenize the training data

In [13]:
# To open as Pandas DataFrame:
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

display(train_df.head(2))

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁..."


In [4]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the train dataset
tokens_train = []
token_list_train_norm = []

for text in tqdm(train_df["text_norm"].to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens_train, except the beginning (<s>) and end (</s>) token
	current_tokens_train = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	# Shorten the list to 512, as tokens after that were not observed by the classifier
	current_tokens_train = current_tokens_train[:512]
	tokens_train.append(current_tokens_train)
	token_list_train_norm.extend(current_tokens_train)

train_df["tokens_train_norm"] = tokens_train

train_df.head(3)

# Save the tokenized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

  0%|          | 0/1772 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1810 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1772/1772 [00:09<00:00, 189.67it/s]


In [17]:
# Create a token list of all tokens in train
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

token_list_train_norm = []

for i in train_df["tokens_train_norm"].to_list():
	token_list_train_norm.extend(i)

len(token_list_train_norm)


702133

Tokenize also the test sets

In [18]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [19]:
# Code with which the test sets were tokenized

# Define an array of token overlap
token_overlap_results = {}

# Loop through the datasets and calculate token overlap
for lang in list(main_dict.keys()):
	df = pd.DataFrame(main_dict[lang]["dataset"])

	tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

	tokens = []
	token_list = []

	print("Tokenizing text.")

	for text in tqdm(df["text_norm"].to_list()):
		encoded_text = tokenizer(text)
		# Take all tokens, except the beginning (<s>) and end (</s>) token
		current_tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
		# Shorten the list to 512, as tokens after that were not observed by the classifier
		current_tokens = current_tokens[:512]
		tokens.append(current_tokens)
		token_list.extend(current_tokens)

	df["tokens_norm"] = tokens

	# Update the dataset in the dictionary
	main_dict[lang]["dataset"] = df.to_dict()

	# See how many tokens do not overlap
	no_overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_token_list = []

	# We calculate overlap by counting how many tokens do not appear in both sets
	for token in tqdm(token_list):
		if token not in token_list_train_norm:
			no_overlap_counter += 1
		else:
			overlap_token_list.append(token)

	# Out of all tokens in test set, how many do not overlap with train set?
	no_overlap_per = no_overlap_counter/len(token_list)

	# Calculate percentage of overlap based on that
	overlap_per = 1-no_overlap_per

	print(f"Number of tokens that overlap: {len(overlap_token_list)}")
	print(f"Number of different tokens that overlap: {len(set(overlap_token_list))}")
	print(f"Percentage of overlap: {overlap_per}")

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"]["overlap_percentage_norm"] = overlap_per
	main_dict[lang]["token_overlap"]["overlap_token_list_norm"] = overlap_token_list

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list_size": len(overlap_token_list), "overlap_set_size": len(set(overlap_token_list))}


# Inspect the results
overlap_df = pd.DataFrame(token_overlap_results)

Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (685 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 394.94it/s]


Calculating overlap.


100%|██████████| 38674/38674 [00:50<00:00, 764.90it/s] 


Number of tokens that overlap: 35146
Number of different tokens that overlap: 3691
Percentage of overlap: 0.908775921807933
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 574.33it/s]


Calculating overlap.


100%|██████████| 32652/32652 [00:52<00:00, 621.16it/s]


Number of tokens that overlap: 29231
Number of different tokens that overlap: 2567
Percentage of overlap: 0.8952284699252726
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 529.31it/s]


Calculating overlap.


100%|██████████| 31002/31002 [01:20<00:00, 386.69it/s]


Number of tokens that overlap: 22623
Number of different tokens that overlap: 2468
Percentage of overlap: 0.7297271143797175
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 722.58it/s]


Calculating overlap.


100%|██████████| 27150/27150 [00:57<00:00, 468.12it/s]


Number of tokens that overlap: 21175
Number of different tokens that overlap: 2597
Percentage of overlap: 0.7799263351749539
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1006 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 614.11it/s]


Calculating overlap.


100%|██████████| 30388/30388 [01:00<00:00, 504.24it/s]


Number of tokens that overlap: 24586
Number of different tokens that overlap: 2447
Percentage of overlap: 0.8090693694879558
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1336 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 466.77it/s]


Calculating overlap.


100%|██████████| 35538/35538 [00:55<00:00, 643.77it/s]


Number of tokens that overlap: 31991
Number of different tokens that overlap: 2957
Percentage of overlap: 0.9001913444763352
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 678.99it/s]


Calculating overlap.


100%|██████████| 27492/27492 [00:48<00:00, 571.35it/s]


Number of tokens that overlap: 22290
Number of different tokens that overlap: 2666
Percentage of overlap: 0.8107813182016587
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (948 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 710.61it/s]


Calculating overlap.


100%|██████████| 30608/30608 [00:48<00:00, 633.56it/s]


Number of tokens that overlap: 27704
Number of different tokens that overlap: 3293
Percentage of overlap: 0.9051228437009932
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 714.09it/s]


Calculating overlap.


100%|██████████| 26993/26993 [00:49<00:00, 540.49it/s]


Number of tokens that overlap: 23588
Number of different tokens that overlap: 3917
Percentage of overlap: 0.8738561849368355
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (829 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 700.43it/s]


Calculating overlap.


100%|██████████| 26760/26760 [00:34<00:00, 774.76it/s]

Number of tokens that overlap: 26320
Number of different tokens that overlap: 4485
Percentage of overlap: 0.9835575485799701





In [20]:
overlap_df.transpose()

Unnamed: 0,percentage,overlap_list_size,overlap_set_size
mt,0.908776,35146.0,3691.0
el,0.895228,29231.0,2567.0
tr,0.729727,22623.0,2468.0
sq,0.779926,21175.0,2597.0
is,0.809069,24586.0,2447.0
uk,0.900191,31991.0,2957.0
ca,0.810781,22290.0,2666.0
mk,0.905123,27704.0,3293.0
hr,0.873856,23588.0,3917.0
sl,0.983558,26320.0,4485.0


In [21]:
print(overlap_df.transpose().sort_values(by="percentage", ascending=False).to_markdown())

|    |   percentage |   overlap_list_size |   overlap_set_size |
|:---|-------------:|--------------------:|-------------------:|
| sl |     0.983558 |               26320 |               4485 |
| mt |     0.908776 |               35146 |               3691 |
| mk |     0.905123 |               27704 |               3293 |
| uk |     0.900191 |               31991 |               2957 |
| el |     0.895228 |               29231 |               2567 |
| hr |     0.873856 |               23588 |               3917 |
| ca |     0.810781 |               22290 |               2666 |
| is |     0.809069 |               24586 |               2447 |
| sq |     0.779926 |               21175 |               2597 |
| tr |     0.729727 |               22623 |               2468 |


In [22]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

## Label level

In [2]:
# Open the tokenized train df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁..."


In [None]:
# Separate the train df into label-based dfs

# Create lists of tokens that are label based
label_tokens = {}
token_count = {}
type_count = {}

for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
	print(f"Processing {label}")

	label_df = train_df[train_df["labels"] == label]
	display(label_df.head(3))

	# Create a list of tokens
	train_tokens_norm = []

	for i in label_df["tokens_train_norm"].to_list():
		train_tokens_norm.extend(i)

	print(f"Number of all tokens: {len(train_tokens_norm)}")

	# Add to dictionaries
	label_tokens[label] = train_tokens_norm
	token_count[label] = len(train_tokens_norm)
	type_count[label] = len(set(train_tokens_norm))

In [4]:
# Calculate numbers for each label
label_results_train_norm = pd.DataFrame({"token_count": token_count, "type_count": type_count})

print(label_results_train_norm.to_markdown())

|                         |   token_count |   type_count |
|:------------------------|--------------:|-------------:|
| Information/Explanation |        124518 |        11708 |
| News                    |        138026 |        11961 |
| Instruction             |         83634 |         7220 |
| Opinion/Argumentation   |        103447 |        10672 |
| Forum                   |         58952 |         7187 |
| Prose/Lyrical           |         46921 |         5119 |
| Legal                   |         28497 |         3707 |
| Promotion               |         88989 |         9768 |


In [5]:
# Save the label token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count-normalized-text.json", "w") as train_label_count_file:
	json.dump(label_tokens, train_label_count_file)

In [6]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

In [7]:
pd.DataFrame(main_dict["mt"]["dataset"]).head(2)

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,text_norm,tokens_norm
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262...","angelo chetcuti, se jkun qed jiehu post bjorn ...","[▁angel, o, ▁che, t, cuti, ,, ▁se, ▁j, kun, ▁q..."
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24...",poltergeist jirreferi ghal fenomeni ohra tal-m...,"[▁pol, ter, geist, ▁jir, re, feri, ▁, ghal, ▁f..."


In [None]:
# Do the same as with the dataset, but on every language
lang_results = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")

	label_token_dict = {}

	# Current df
	df = pd.DataFrame(main_dict[lang]["dataset"])
	display(df.head(2))

	for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
		print(f"Processing {label}")

		label_df = df[df["y_true"] == label]

		# Create a list of tokens
		token_list = []

		for i in label_df["tokens_norm"].to_list():
			token_list.extend(i)

		# Add to dictionary
		label_token_dict[label] = token_list

	# Add to main dict
	main_dict[lang]["token_overlap"]["label_level_token_lists_norm"] = label_token_dict

In [13]:
# Improved code to calculate token overlap - we can simply calculate which tokens do not overlap - meaning that they occur only in one of the dataset, and calculate the percentage overlap based on that
token_overlap_label_results = {}
results = {}

# Loop through the datasets and labels and calculate token overlap
for lang in list(main_dict.keys()):
	print(lang)
	# Create a dict for labels
	label_overlap = {}
	label_overlap_tokens = {}

	# loop through labels
	for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
		print(label)
		token_list_test = main_dict[lang]["token_overlap"]["label_level_token_lists_norm"][label]

		token_list_train = label_tokens[label]

		# See how many tokens do not overlap
		no_overlap_counter = 0

		# Save tokens that overlap for further inspection
		overlap_token_list = []

		# We calculate overlap by counting how many tokens do not appear in both sets
		for token in tqdm(token_list_test):
			if token not in token_list_train:
				no_overlap_counter += 1
			else:
				overlap_token_list.append(token)

		# Out of all tokens in test set, how many do not overlap with train set?
		no_overlap_per = no_overlap_counter/len(token_list_test)

		# Calculate percentage of overlap based on that
		overlap_per = 1-no_overlap_per

		print(f"Percentage of overlap: {overlap_per}")

		label_overlap[label] = overlap_per
		label_overlap_tokens[label] = overlap_token_list

		# Add to the results
		results[f"{lang}-{label}"] = overlap_per

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"]["label_overlap_percentage_norm"] = label_overlap
	main_dict[lang]["token_overlap"]["label_overlap_token_list_norm"] = label_overlap_tokens

mt
Information/Explanation


100%|██████████| 6654/6654 [00:02<00:00, 2741.43it/s]


Percentage of overlap: 0.7790802524797115
News


100%|██████████| 6855/6855 [00:02<00:00, 2507.08it/s]


Percentage of overlap: 0.8023340627279358
Instruction


100%|██████████| 9087/9087 [00:02<00:00, 3212.65it/s]


Percentage of overlap: 0.6831737647188292
Opinion/Argumentation


100%|██████████| 4086/4086 [00:01<00:00, 3073.65it/s]


Percentage of overlap: 0.7689672050905532
Forum


100%|██████████| 512/512 [00:00<00:00, 3976.61it/s]


Percentage of overlap: 0.66796875
Prose/Lyrical


100%|██████████| 512/512 [00:00<00:00, 5692.75it/s]


Percentage of overlap: 0.68359375
Legal


100%|██████████| 5120/5120 [00:00<00:00, 11317.78it/s]


Percentage of overlap: 0.719921875
Promotion


100%|██████████| 5848/5848 [00:01<00:00, 3506.84it/s]


Percentage of overlap: 0.7187072503419973
el
Information/Explanation


100%|██████████| 5863/5863 [00:02<00:00, 2448.11it/s]


Percentage of overlap: 0.7734947978850417
News


100%|██████████| 3876/3876 [00:01<00:00, 2157.01it/s]


Percentage of overlap: 0.7590299277605779
Instruction


100%|██████████| 3105/3105 [00:01<00:00, 2820.88it/s]


Percentage of overlap: 0.6099838969404187
Opinion/Argumentation


100%|██████████| 5836/5836 [00:02<00:00, 2701.23it/s]


Percentage of overlap: 0.7217272104180945
Forum


100%|██████████| 5235/5235 [00:01<00:00, 3930.83it/s]


Percentage of overlap: 0.6286532951289399
Prose/Lyrical


100%|██████████| 2834/2834 [00:00<00:00, 4855.35it/s]


Percentage of overlap: 0.5938602681721947
Legal


100%|██████████| 3546/3546 [00:00<00:00, 6913.78it/s]


Percentage of overlap: 0.4873096446700508
Promotion


100%|██████████| 2357/2357 [00:00<00:00, 2954.69it/s]


Percentage of overlap: 0.6601612218922359
tr
Information/Explanation


100%|██████████| 2523/2523 [00:01<00:00, 2054.57it/s]


Percentage of overlap: 0.6500198176773682
News


100%|██████████| 3820/3820 [00:02<00:00, 1715.76it/s]


Percentage of overlap: 0.6403141361256545
Instruction


100%|██████████| 4174/4174 [00:01<00:00, 2536.26it/s]


Percentage of overlap: 0.5206037374221371
Opinion/Argumentation


100%|██████████| 5694/5694 [00:02<00:00, 2261.88it/s]


Percentage of overlap: 0.597646645591851
Forum


100%|██████████| 3326/3326 [00:00<00:00, 3528.79it/s]


Percentage of overlap: 0.5817799158147925
Prose/Lyrical


100%|██████████| 3795/3795 [00:00<00:00, 4573.34it/s]


Percentage of overlap: 0.5465085638998682
Legal


100%|██████████| 3879/3879 [00:00<00:00, 6329.25it/s]


Percentage of overlap: 0.3967517401392111
Promotion


100%|██████████| 3791/3791 [00:01<00:00, 2618.03it/s]


Percentage of overlap: 0.5560538116591929
sq
Information/Explanation


100%|██████████| 5286/5286 [00:02<00:00, 2396.77it/s]


Percentage of overlap: 0.6912599318955732
News


100%|██████████| 2480/2480 [00:01<00:00, 1942.27it/s]


Percentage of overlap: 0.669758064516129
Instruction


100%|██████████| 3291/3291 [00:01<00:00, 2728.31it/s]


Percentage of overlap: 0.6004254026131874
Opinion/Argumentation


100%|██████████| 4475/4475 [00:01<00:00, 2758.52it/s]


Percentage of overlap: 0.6782122905027933
Forum


100%|██████████| 3499/3499 [00:00<00:00, 3928.41it/s]


Percentage of overlap: 0.6356101743355245
Prose/Lyrical


100%|██████████| 3376/3376 [00:00<00:00, 5162.15it/s]


Percentage of overlap: 0.613744075829384
Legal


100%|██████████| 2918/2918 [00:00<00:00, 7582.79it/s]


Percentage of overlap: 0.521932830705963
Promotion


100%|██████████| 1825/1825 [00:00<00:00, 3219.46it/s]


Percentage of overlap: 0.6597260273972603
is
Information/Explanation


100%|██████████| 2260/2260 [00:01<00:00, 2240.22it/s]


Percentage of overlap: 0.6884955752212389
News


100%|██████████| 4358/4358 [00:02<00:00, 1802.14it/s]


Percentage of overlap: 0.6792106470858192
Instruction


100%|██████████| 3922/3922 [00:01<00:00, 2760.00it/s]


Percentage of overlap: 0.6361550229474757
Opinion/Argumentation


100%|██████████| 5022/5022 [00:01<00:00, 2576.61it/s]


Percentage of overlap: 0.6744324970131421
Forum


100%|██████████| 2865/2865 [00:00<00:00, 3912.44it/s]


Percentage of overlap: 0.6293193717277488
Prose/Lyrical


100%|██████████| 4666/4666 [00:01<00:00, 4566.21it/s]


Percentage of overlap: 0.5653664809258465
Legal


100%|██████████| 4070/4070 [00:00<00:00, 7138.44it/s]


Percentage of overlap: 0.4933660933660934
Promotion


100%|██████████| 3225/3225 [00:01<00:00, 2794.56it/s]


Percentage of overlap: 0.6353488372093024
uk
Information/Explanation


100%|██████████| 4662/4662 [00:01<00:00, 2746.29it/s]


Percentage of overlap: 0.7696267696267696
News


100%|██████████| 4115/4115 [00:01<00:00, 2360.18it/s]


Percentage of overlap: 0.7905224787363305
Instruction


100%|██████████| 4814/4814 [00:01<00:00, 3000.26it/s]


Percentage of overlap: 0.6884088076443706
Opinion/Argumentation


100%|██████████| 5790/5790 [00:01<00:00, 2944.51it/s]


Percentage of overlap: 0.7816925734024179
Forum


100%|██████████| 4007/4007 [00:00<00:00, 4039.49it/s]


Percentage of overlap: 0.7117544297479411
Prose/Lyrical


100%|██████████| 4481/4481 [00:00<00:00, 5241.08it/s]


Percentage of overlap: 0.6407051997322026
Legal


100%|██████████| 4309/4309 [00:00<00:00, 8003.77it/s]


Percentage of overlap: 0.5866790438616849
Promotion


100%|██████████| 3360/3360 [00:00<00:00, 3530.99it/s]


Percentage of overlap: 0.70625
ca
Information/Explanation


100%|██████████| 3997/3997 [00:01<00:00, 2739.61it/s]


Percentage of overlap: 0.7257943457593194
News


100%|██████████| 2627/2627 [00:01<00:00, 2308.43it/s]


Percentage of overlap: 0.7141225732775028
Instruction


100%|██████████| 2135/2135 [00:00<00:00, 3478.78it/s]


Percentage of overlap: 0.674473067915691
Opinion/Argumentation


100%|██████████| 3370/3370 [00:01<00:00, 3177.52it/s]


Percentage of overlap: 0.7115727002967359
Forum


100%|██████████| 3753/3753 [00:00<00:00, 4692.42it/s]


Percentage of overlap: 0.6930455635491607
Prose/Lyrical


100%|██████████| 4059/4059 [00:00<00:00, 5903.18it/s]


Percentage of overlap: 0.6661739344666173
Legal


100%|██████████| 3315/3315 [00:00<00:00, 8332.07it/s]


Percentage of overlap: 0.5562594268476622
Promotion


100%|██████████| 4236/4236 [00:01<00:00, 3444.30it/s]


Percentage of overlap: 0.67233238904627
mk
Information/Explanation


100%|██████████| 3779/3779 [00:01<00:00, 2867.75it/s]


Percentage of overlap: 0.7898915056893359
News


100%|██████████| 4299/4299 [00:01<00:00, 2390.74it/s]


Percentage of overlap: 0.8053035589672017
Instruction


100%|██████████| 3695/3695 [00:01<00:00, 3063.65it/s]


Percentage of overlap: 0.691745602165088
Opinion/Argumentation


100%|██████████| 3205/3205 [00:01<00:00, 2841.11it/s]


Percentage of overlap: 0.778783151326053
Forum


100%|██████████| 4204/4204 [00:00<00:00, 4241.89it/s]


Percentage of overlap: 0.7514272121788772
Prose/Lyrical


100%|██████████| 4111/4111 [00:00<00:00, 5622.35it/s]


Percentage of overlap: 0.6801264899051326
Legal


100%|██████████| 3904/3904 [00:00<00:00, 8844.28it/s]


Percentage of overlap: 0.6116803278688525
Promotion


100%|██████████| 3411/3411 [00:00<00:00, 4367.32it/s]


Percentage of overlap: 0.7692758721782469
hr
Information/Explanation


100%|██████████| 4073/4073 [00:01<00:00, 2801.76it/s]


Percentage of overlap: 0.7795236926098699
News


100%|██████████| 3353/3353 [00:01<00:00, 2289.79it/s]


Percentage of overlap: 0.7948106173575902
Instruction


100%|██████████| 1744/1744 [00:00<00:00, 2952.26it/s]


Percentage of overlap: 0.6771788990825688
Opinion/Argumentation


100%|██████████| 3502/3502 [00:01<00:00, 2769.39it/s]


Percentage of overlap: 0.7569960022844089
Forum


100%|██████████| 3824/3824 [00:00<00:00, 3911.68it/s]


Percentage of overlap: 0.7086820083682008
Prose/Lyrical


100%|██████████| 2637/2637 [00:00<00:00, 5171.00it/s]


Percentage of overlap: 0.6344330678801668
Legal


100%|██████████| 3544/3544 [00:00<00:00, 8610.70it/s]


Percentage of overlap: 0.583803611738149
Promotion


100%|██████████| 4316/4316 [00:00<00:00, 4586.24it/s]


Percentage of overlap: 0.7669138090824837
sl
Information/Explanation


100%|██████████| 3535/3535 [00:00<00:00, 3928.21it/s]


Percentage of overlap: 0.9125884016973126
News


100%|██████████| 3484/3484 [00:01<00:00, 2930.73it/s]


Percentage of overlap: 0.9357060849598163
Instruction


100%|██████████| 3433/3433 [00:01<00:00, 3426.07it/s]


Percentage of overlap: 0.8065831634139237
Opinion/Argumentation


100%|██████████| 2498/2498 [00:00<00:00, 3508.63it/s]


Percentage of overlap: 0.923138510808647
Forum


100%|██████████| 4006/4006 [00:00<00:00, 4210.42it/s]


Percentage of overlap: 0.8419870194707938
Prose/Lyrical


100%|██████████| 2943/2943 [00:00<00:00, 6398.45it/s]


Percentage of overlap: 0.7410805300713558
Legal


100%|██████████| 3563/3563 [00:00<00:00, 13023.67it/s]


Percentage of overlap: 0.8004490597810834
Promotion


100%|██████████| 3298/3298 [00:00<00:00, 8642.27it/s]

Percentage of overlap: 0.9129775621588841





In [14]:
results

{'mt-Information/Explanation': 0.7790802524797115,
 'mt-News': 0.8023340627279358,
 'mt-Instruction': 0.6831737647188292,
 'mt-Opinion/Argumentation': 0.7689672050905532,
 'mt-Forum': 0.66796875,
 'mt-Prose/Lyrical': 0.68359375,
 'mt-Legal': 0.719921875,
 'mt-Promotion': 0.7187072503419973,
 'el-Information/Explanation': 0.7734947978850417,
 'el-News': 0.7590299277605779,
 'el-Instruction': 0.6099838969404187,
 'el-Opinion/Argumentation': 0.7217272104180945,
 'el-Forum': 0.6286532951289399,
 'el-Prose/Lyrical': 0.5938602681721947,
 'el-Legal': 0.4873096446700508,
 'el-Promotion': 0.6601612218922359,
 'tr-Information/Explanation': 0.6500198176773682,
 'tr-News': 0.6403141361256545,
 'tr-Instruction': 0.5206037374221371,
 'tr-Opinion/Argumentation': 0.597646645591851,
 'tr-Forum': 0.5817799158147925,
 'tr-Prose/Lyrical': 0.5465085638998682,
 'tr-Legal': 0.3967517401392111,
 'tr-Promotion': 0.5560538116591929,
 'sq-Information/Explanation': 0.6912599318955732,
 'sq-News': 0.66975806451612

In [15]:
# Inspect the results
overlap_df = pd.DataFrame({"label": list(results.keys()), "overlap": list(results.values())})
overlap_df

Unnamed: 0,label,overlap
0,mt-Information/Explanation,0.779080
1,mt-News,0.802334
2,mt-Instruction,0.683174
3,mt-Opinion/Argumentation,0.768967
4,mt-Forum,0.667969
...,...,...
75,sl-Opinion/Argumentation,0.923139
76,sl-Forum,0.841987
77,sl-Prose/Lyrical,0.741081
78,sl-Legal,0.800449


In [16]:
# Save the label-level-overlap
with open("datasets/label-level-token-overlap-normalized.csv", "w") as file:
	overlap_df.to_csv(file)

In [17]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)