In [1]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=6

import pandas as pd
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=6


  from .autonotebook import tqdm as notebook_tqdm


Calculate token distribution - vector of values per tokens (how many times does each token occur) - create a dictionary with all tokens from train set and specific test set, iterate through the tokens and count how many times each ocurrs. Calculate cosine similarity.

Do this on label level as well to see whether this explains good performance on some of the labels for Maltese.

## Tokenize and count tokens for train_df

Code for tokenization (it is now already done):

In [18]:
# Import the train dataset
train = load_dataset("TajaKuzman/X-GENRE-multilingual-text-genre-dataset", "train")

# To open as Pandas DataFrame:
train_df = pd.DataFrame(train["train"])

display(train_df.head(2))

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the train dataset
tokens_train = []
integers_train = []
token_list_train = []

for text in tqdm(train_df.text.to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens_train, except the beginning (<s>) and end (</s>) token
	current_tokens_train = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	tokens_train.append(current_tokens_train)
	token_list_train.extend(current_tokens_train)
	integers_train.append(encoded_text.input_ids[1:-1])

train_df["tokens_train"] = tokens_train
train_df["token_ids"] = integers_train

#print(token_list_train[:10])
#print(len(token_list_train))

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

train_df.head(3)

# Save the tokenized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English


  0%|          | 0/1772 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1810 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1772/1772 [00:09<00:00, 189.74it/s]


699465


Code to count tokens:

In [2]:
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51..."


In [11]:
# Create a dictionary that counts all the token occurrences

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

# Create a dictionary which counts the occurrences of the words

word_dict_train = Counter(train_tokens_shortened)

# Sort the dictionary alphabetically based on keys
word_dict_train = dict(sorted(word_dict_train.items()))

print(list(word_dict_train.items())[:100])
print(len(word_dict_train))

699465
[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1), ('!!!!!!', 1), ('!!!!!!!', 1), ('!"', 14), ('!)', 10), ('!),', 1), ('!).', 2), ('"', 528), ('")', 7), ('"),', 6), ('").', 7), ('",', 56), ('".', 83), ('"...', 3), ('";', 3), ('"?', 6), ('#', 4), ('$', 8), ('%', 4), ('%)', 1), ('&', 46), ("'", 4517), ('(', 31), ('(1', 1), (')', 788), ('),', 242), (').', 297), ('):', 13), (');', 5), ('*', 19), ('**', 2), ('****', 1), ('+', 15), ('+5', 1), (',', 23447), (',«', 12), ('-', 2803), ('---', 3), ('------', 5), ('----------------', 41), ('-0', 3), ('-01', 1), ('-02', 2), ('-02-', 2), ('-03-', 4), ('-06', 1), ('-06-', 3), ('-09-', 1), ('-1', 15), ('-1)', 3), ('-10', 2), ('-10-', 1), ('-11', 6), ('-11-', 1), ('-12', 8), ('-13', 6), ('-14', 3), ('-15', 7), ('-16', 9), ('-17', 3), ('-18', 9), ('-19', 5), ('-2', 10), ('-20', 3), ('-2000', 3), ('-2005', 1), ('-2007', 8), ('-2009', 1), ('-2010', 1), ('-2011', 1), ('-2012', 1), ('-2014', 1), ('-2020', 1), ('-21', 4), ('-22', 3), ('-

The train dataset has 699.465 tokens and 27.025 unique words.

In [12]:
# See the most frequent tokens:

# Sort the dictionary by values (word counts) in descending order
sorted(word_dict_train.items(), key=lambda x: x[1], reverse=True)[:10]


[(',', 23447),
 ('.', 21407),
 ('▁', 19553),
 ('▁the', 18860),
 ('s', 14184),
 ('▁to', 10762),
 ('▁of', 9912),
 ('▁and', 9752),
 ('▁in', 9140),
 ('▁a', 8341)]

In [14]:
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "w") as train_count_file:
	json.dump(word_dict_train, train_count_file)

# Tokenize and count tokens for test sets

Code with which I tokenized the datasets:

In [15]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [None]:
# Define an array of token overlap
token_overlap_results = {}

# Loop through the datasets and calculate token overlap
for lang in list(main_dict.keys()):
	df = pd.DataFrame(main_dict[lang]["dataset"])

	tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

	tokens = []
	integers = []
	token_list = []

	print("Tokenizing text.")

	for text in tqdm(df.text.to_list()):
		encoded_text = tokenizer(text)
		# Take all tokens, except the beginning (<s>) and end (</s>) token
		current_tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
		# Shorten the list to 512, as tokens after that were not observed by the classifier
		current_tokens = current_tokens[:512]
		tokens.append(current_tokens)
		token_list.extend(current_tokens)
		integers.append(encoded_text.input_ids[1:-1][:512])

	df["tokens"] = tokens
	df["token_ids"] = integers

	print(token_list[:10])
	print("All tokens:")
	print(len(token_list))

	# See how many tokens overlap
	overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_token_list = []

	#overlap_counter = sum(1 for element in token_list if element in train_tokens_shortened)
	for token in tqdm(token_list):
		if token in train_tokens_shortened:
			overlap_counter += 1
			overlap_token_list.append(token)

	# Out of all tokens, how many overlap?
	overlap_per = overlap_counter/len(token_list)

	print(f"Number of tokens that overlap: {overlap_counter}")
	print(f"Percentage of overlap: {overlap_per}")

	# Update the dataset in the dictionary
	main_dict[lang]["dataset"] = df.to_dict()

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"] = {"overlap_percentage":overlap_per, "token_list": token_list, "overlap_token_list":overlap_token_list}

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list": overlap_token_list}

	# Convert tokens back to words
	#print(tokenizer.convert_tokens_to_string(tokens))

# Inspect the results
overlap_df = pd.DataFrame(token_overlap_results)
	

In [26]:
overlap_df = pd.DataFrame(token_overlap_results).transpose()
overlap_df

Unnamed: 0,percentage,overlap_list
mt,0.817085,"[▁Angel, o, ▁Che, t, ,, ▁se, ▁j, kun, d, u, ▁p..."
el,0.161428,"[asi, asi, ,, ,, ▁Re, ception, ., ▁driver, ▁es..."
tr,0.521502,"[▁A, L, ▁Der, s, i, ▁ve, ▁Beli, r, leme, ▁S, h..."
sq,0.605775,"[▁Blog, ▁“, U, ▁kam, ▁me, jet, .”, ▁Jer, ▁31, ..."
is,0.517575,"[▁[, is, ], ▁er, fi, ▁reg, ▁sett, ar, lag, sin..."
uk,0.156675,"[., ,, ,, ?, ▁-, ., ?, ▁-, ', ., ,, ., ▁, ▁(, ..."
ca,0.744881,"[▁P, à, gine, s, ▁En, nada, ▁Porto, ▁uns, ▁die..."
mk,0.145989,"[,, ▁T, CL, ,, ▁T, CL, :, ▁Alca, tel, ▁Mobile,..."
hr,0.821517,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, ,, ▁za, ▁pa..."
sl,0.974289,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,..."


In [36]:
for token in overlap_df["overlap_list"].to_list()[0][:2]:
	if len(token) > 1:
		non_short_counter += 1

▁Angel


In [37]:
# Inspect how many tokens are one character long

counters = []
pers = []

for token_list_el in overlap_df["overlap_list"].to_list():
	non_short_counter = 0
	for token in token_list_el:
		if len(token) > 1:
			non_short_counter += 1
	
	counters.append(non_short_counter)
	pers.append(non_short_counter/len(token_list_el))

# Add to df
overlap_df["non_short"] = counters

# Calculate the percentage of overlap tokens that have more than 1 character
overlap_df["non_short_per"] = pers

overlap_df


Unnamed: 0,percentage,overlap_list,non_short,non_short_per
mt,0.817085,"[▁Angel, o, ▁Che, t, ,, ▁se, ▁j, kun, d, u, ▁p...",23083,0.723628
el,0.161428,"[asi, asi, ,, ,, ▁Re, ception, ., ▁driver, ▁es...",1883,0.373389
tr,0.521502,"[▁A, L, ▁Der, s, i, ▁ve, ▁Beli, r, leme, ▁S, h...",11141,0.722269
sq,0.605775,"[▁Blog, ▁“, U, ▁kam, ▁me, jet, .”, ▁Jer, ▁31, ...",12459,0.768315
is,0.517575,"[▁[, is, ], ▁er, fi, ▁reg, ▁sett, ar, lag, sin...",10915,0.711399
uk,0.156675,"[., ,, ,, ?, ▁-, ., ?, ▁-, ', ., ,, ., ▁, ▁(, ...",1258,0.253476
ca,0.744881,"[▁P, à, gine, s, ▁En, nada, ▁Porto, ▁uns, ▁die...",15657,0.763123
mk,0.145989,"[,, ▁T, CL, ,, ▁T, CL, :, ▁Alca, tel, ▁Mobile,...",1270,0.314746
hr,0.821517,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, ,, ▁za, ▁pa...",17678,0.81062
sl,0.974289,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...",21567,0.841935


In [49]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

Add token counts information

In [23]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [25]:
token_number = {}
type_number = {}

for lang in list(main_dict.keys()):
	print(f"Creating token dict for {lang}")
	current_token_list = main_dict[lang]["token_overlap"]["token_list"]

	print(f"No of tokens: {len(current_token_list)}")

	# Create a dictionary which counts the occurrences of the tokens

	word_dict_test = Counter(current_token_list)

	# Sort the dictionary alphabetically based on keys
	word_dict_test = dict(sorted(word_dict_test.items()))

	# Add information on no. of tokens and words to a dict
	token_number[lang] = len(current_token_list)
	type_number[lang] = len(word_dict_test)

	print(list(word_dict_test.items())[:100])
	print(f"No of unique tokens: {len(word_dict_test)}")

	# Add the count of tokens to the main dictionary
	main_dict[lang]["token_overlap"]["token_count"] = word_dict_test

# Create a dataframe for statistics
token_df = pd.DataFrame({"tokens": token_number, "types": type_number})
print(token_df.to_markdown(index=False))

Creating token dict for mt
No of tokens: 39040
[('!', 9), ('"', 34), ('",', 11), ('".', 6), ('#', 20), ("'", 462), ('(', 3), ('(1)', 4), ('(2)', 4), ('(3)', 4), (')', 50), ('),', 15), (').', 15), (');', 1), (',', 884), ('-', 2119), ('-0', 1), ('-1', 1), ('-11', 1), ('-12-', 2), ('-16', 5), ('-17', 1), ('-18', 2), ('-19', 1), ('-2', 5), ('-20', 1), ('-200', 1), ('-2000', 1), ('-2007', 1), ('-2009', 1), ('-2014', 1), ('-2015', 1), ('-2016', 2), ('-2017', 2), ('-21', 1), ('-22', 1), ('-23', 1), ('-24', 2), ('-28', 1), ('-30', 4), ('-35', 1), ('-4', 1), ('-5', 1), ('-500', 1), ('-6', 1), ('-7', 1), ('-90', 2), ('.', 724), ('."', 5), ('...', 1), ('.”', 3), ('/', 11), ('/08', 3), ('/13', 1), ('/14', 3), ('/19', 4), ('/24', 2), ('/3', 6), ('/4', 2), ('/5', 2), ('/7', 2), ('0.7', 1), ('00', 1), ('016', 1), ('02', 2), ('04', 2), ('05', 1), ('050', 1), ('09.', 2), ('1', 5), ('100', 1), ('112', 1), ('12', 2), ('135', 2), ('14', 1), ('152', 1), ('164', 1), ('19', 5), ('1962', 1), ('1998', 1), ('2'

In [26]:
print(token_df.to_markdown())

|    |   tokens |   types |
|:---|---------:|--------:|
| mt |    39040 |    4787 |
| el |    31240 |    4751 |
| tr |    29578 |    6272 |
| sq |    26769 |    4891 |
| is |    29644 |    4615 |
| uk |    31677 |    6507 |
| ca |    27544 |    5314 |
| mk |    27639 |    5468 |
| hr |    26546 |    6222 |
| sl |    26292 |    5763 |


In [35]:
# Let's see the most frequent tokens
most_frequent = {}
# Sort the dictionary by values (word counts) in descending order
for lang in list(main_dict.keys()):
	most_frequent[lang] = (sorted(main_dict[lang]['token_overlap']['token_count'].items(), key=lambda x: x[1], reverse=True)[:10])

print(pd.DataFrame({"most_frequent_type": most_frequent}).to_markdown())

|    | most_frequent_type                                                                                                                       |
|:---|:-----------------------------------------------------------------------------------------------------------------------------------------|
| ca | [(',', 1080), ('▁de', 1014), ('.', 675), ('s', 648), ('▁i', 564), ('▁la', 559), ('▁a', 529), ('▁que', 438), ("'", 360), ('’', 334)]      |
| el | [('▁', 1017), ('.', 801), (',', 782), ('▁και', 553), ('ς', 525), ('▁να', 351), ('▁το', 321), ('▁του', 292), ('▁την', 268), ('▁με', 264)] |
| hr | [(',', 878), ('.', 766), ('▁i', 546), ('▁u', 430), ('a', 413), ('▁je', 350), ('▁na', 282), ('▁za', 253), ('▁se', 239), ('e', 219)]       |
| is | [('.', 1017), ('▁og', 628), ('▁að', 613), (',', 600), ('▁', 528), ('▁í', 434), ('▁á', 388), ('▁er', 357), ('s', 326), ('▁sem', 279)]     |
| mk | [(',', 1021), ('▁на', 983), ('.', 738), ('▁и', 619), ('▁за', 475), ('▁да', 378), ('▁во', 376), ('▁се', 365), ('▁', 34

In [36]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

# Compare train df and test set overlap

Create a dictionary with all tokens from train set and specific test set, iterate through the tokens and count how many times each ocurrs. Calculate cosine similarity.

Do this on label level as well to see whether this explains good performance on some of the labels for Maltese.

Cosine similarity is a metric used to measure the similarity of two vectors. Specifically, it measures the similarity in the direction or orientation of the vectors ignoring differences in their magnitude or scale. Both vectors need to be part of the same inner product space, meaning they must produce a scalar through inner product multiplication. The similarity of two vectors is measured by the cosine of the angle between them. The similarity can take values between -1 and +1. Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity. 

Cosine similarity ignores 0-0 matches. Counting 0-0 matches in sparse data would inflate similarity scores. Another commonly used metric that ignores 0-0 matches is Jaccard Similarity.

In [2]:
def cosine_similarity(x, y):
    
    # Ensure length of x and y are the same
    if len(x) != len(y) :
        return None
    
    # Compute the dot product between x and y
    dot_product = np.dot(x, y)
    
    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2)) 
    magnitude_y = np.sqrt(np.sum(y**2))
    
    # Compute the cosine similarity
    cosine_similarity = dot_product / (magnitude_x * magnitude_y)
    
    return cosine_similarity

In [3]:
# Import train token count
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "r") as train_count_file:
	train_count = json.load(train_count_file)

list(train_count.items())[:5]

[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1)]

In [4]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

list(main_dict["sl"]["token_overlap"]["token_count"].items())[:10]

[('!', 30),
 ('!!', 1),
 ('!"', 1),
 ('!),', 1),
 ('"', 11),
 ('".', 2),
 ('"...', 1),
 ('&', 8),
 ("'", 3),
 ("''", 1)]

In [8]:
cosine_sim = {}
vector_size = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")
	# Get token count for current lang
	current_lang_count = main_dict[lang]["token_overlap"]["token_count"]

	print(f"Number of token types for {lang}: {len(list(current_lang_count.keys()))}")

	# For each test set, create a vector of token counts. Take only tokens that are present either in train_df or test set.
	intersection_dict = {}

	# First, create a list of tokens that are present in either one or the other list
	intersection_keys = []
	intersection_keys.extend(list(current_lang_count.keys()))
	intersection_keys.extend(list(train_count.keys()))
	# Remove duplicated keys
	intersection_keys = list(set(intersection_keys))
	print(f"Number of intersecting types: {len(intersection_keys)}")

	# Then create a dictionary for 1) train df and 2) test df with counts of token types that occur in either of the datasets
	train_intersect_dict = {}
	test_intersect_dict = {}

	for i in intersection_keys:
		try:
			train_intersect_dict[i] = train_count[i]
		except:
			train_intersect_dict[i] = 0
		try:
			test_intersect_dict[i] = current_lang_count[i]
		except:
			test_intersect_dict[i] = 0


	# Create a df with intersecting keys
	intersect_df = pd.DataFrame({"train": train_intersect_dict, "test": test_intersect_dict})

	display(intersect_df.head(10))

	# Calculate cosine similarity
	current_cosine_sim = cosine_similarity(np.array(intersect_df["train"].to_list()), np.array(intersect_df["test"].to_list()))

	print(f"Cosine similarity for {lang}: {current_cosine_sim}")

	# Add to the main dictionary
	main_dict[lang]["token_overlap"]["cosine_similarity"] = current_cosine_sim
	main_dict[lang]["token_overlap"]["intersection_df"] = intersect_df.to_dict()
	main_dict[lang]["token_overlap"]["intersection_vector_size"] = len(intersection_keys)

	# Add to a dict of results
	cosine_sim[lang] = current_cosine_sim
	vector_size[lang] = len(intersection_keys)

Processing mt
Number of token types for mt: 4787
Number of intersecting types: 28226


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,1
▁Ż,0,7
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for mt: 0.4142476865826809
Processing el
Number of token types for el: 4751
Number of intersecting types: 30954


Unnamed: 0,train,test
▁Απριλίου,0,3
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
έλθει,0,1
▁Bran,9,0
NJE,3,0


Cosine similarity for el: 0.5273207128330722
Processing tr
Number of token types for tr: 6272
Number of intersecting types: 30846


Unnamed: 0,train,test
▁boot,11,1
▁Trend,2,0
set,47,0
▁Neden,0,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for tr: 0.5938472232210807
Processing sq
Number of token types for sq: 4891
Number of intersecting types: 29168


Unnamed: 0,train,test
▁boot,11,0
▁pavarur,0,1
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for sq: 0.43448915477115685
Processing is
Number of token types for is: 4615
Number of intersecting types: 29518


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0
TION,18,0


Cosine similarity for is: 0.5251354659540077
Processing uk
Number of token types for uk: 6507
Number of intersecting types: 33121


Unnamed: 0,train,test
та,0,33
▁boot,11,0
▁Trend,2,0
set,47,1
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
фи,0,1
NJE,3,0


Cosine similarity for uk: 0.5846121788244142
Processing ca
Number of token types for ca: 5314
Number of intersecting types: 29443


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,1
TION,18,0


Cosine similarity for ca: 0.5254392489246424
Processing mk
Number of token types for mk: 5468
Number of intersecting types: 31837


Unnamed: 0,train,test
та,0,206
▁boot,11,0
▁Јан,0,1
ензи,0,3
▁Trend,2,0
сметаат,0,5
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0


Cosine similarity for mk: 0.4225320009293607
Processing hr
Number of token types for hr: 6222
Number of intersecting types: 28864


Unnamed: 0,train,test
▁boot,11,0
▁Essen,0,2
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,0
▁primordial,1,0


Cosine similarity for hr: 0.5662098277228971
Processing sl
Number of token types for sl: 5763
Number of intersecting types: 27507


Unnamed: 0,train,test
▁boot,11,0
▁Trend,2,0
set,47,0
▁skills,38,0
▁ER,2,0
▁Install,1,0
▁Bran,9,0
NJE,3,1
▁primordial,1,0
TION,18,0


Cosine similarity for sl: 0.6334513674796362


In [12]:
# Show results
cosine_sim_df = pd.DataFrame({"cosine_similarity": cosine_sim, "vector_size": vector_size}).sort_values(by="cosine_similarity", ascending=False)
print(cosine_sim_df.to_markdown())

|    |   cosine_similarity |   vector_size |
|:---|--------------------:|--------------:|
| sl |            0.633451 |         27507 |
| tr |            0.593847 |         30846 |
| uk |            0.584612 |         33121 |
| hr |            0.56621  |         28864 |
| el |            0.527321 |         30954 |
| ca |            0.525439 |         29443 |
| is |            0.525135 |         29518 |
| sq |            0.434489 |         29168 |
| mk |            0.422532 |         31837 |
| mt |            0.414248 |         28226 |


In [19]:
main_dict["sl"]["token_overlap"].keys()

dict_keys(['overlap_percentage', 'token_list', 'overlap_token_list', 'token_count', 'cosine_similarity', 'intersection_df', 'intersection_vector_size'])

In [18]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)