In [1]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

import pandas as pd
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
import re

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


  from .autonotebook import tqdm as notebook_tqdm


## Tokenize and count tokens for train_df

Code for tokenization (it is now already done):

In [18]:
# Import the train dataset
train = load_dataset("TajaKuzman/X-GENRE-multilingual-text-genre-dataset", "train")

# To open as Pandas DataFrame:
train_df = pd.DataFrame(train["train"])

display(train_df.head(2))

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the train dataset
tokens_train = []
integers_train = []
token_list_train = []

for text in tqdm(train_df.text.to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens_train, except the beginning (<s>) and end (</s>) token
	current_tokens_train = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	tokens_train.append(current_tokens_train)
	token_list_train.extend(current_tokens_train)
	integers_train.append(encoded_text.input_ids[1:-1])

train_df["tokens_train"] = tokens_train
train_df["token_ids"] = integers_train

#print(token_list_train[:10])
#print(len(token_list_train))

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

train_df.head(3)

# Save the tokenized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English


  0%|          | 0/1772 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1810 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1772/1772 [00:09<00:00, 189.74it/s]


699465


Code to count tokens:

In [2]:
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁..."


In [3]:
# Create a dictionary that counts all the token occurrences

# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:512])

print(len(train_tokens_shortened))

# Create a dictionary which counts the occurrences of the words

word_dict_train = Counter(train_tokens_shortened)

# Sort the dictionary alphabetically based on keys
word_dict_train = dict(sorted(word_dict_train.items()))

print(list(word_dict_train.items())[:100])
print(len(word_dict_train))

699465
[('!', 430), ('!!', 23), ('!!!', 14), ('!!!!', 6), ('!!!!!', 1), ('!!!!!!', 1), ('!!!!!!!', 1), ('!"', 14), ('!)', 10), ('!),', 1), ('!).', 2), ('"', 528), ('")', 7), ('"),', 6), ('").', 7), ('",', 56), ('".', 83), ('"...', 3), ('";', 3), ('"?', 6), ('#', 4), ('$', 8), ('%', 4), ('%)', 1), ('&', 46), ("'", 4517), ('(', 31), ('(1', 1), (')', 788), ('),', 242), (').', 297), ('):', 13), (');', 5), ('*', 19), ('**', 2), ('****', 1), ('+', 15), ('+5', 1), (',', 23447), (',«', 12), ('-', 2803), ('---', 3), ('------', 5), ('----------------', 41), ('-0', 3), ('-01', 1), ('-02', 2), ('-02-', 2), ('-03-', 4), ('-06', 1), ('-06-', 3), ('-09-', 1), ('-1', 15), ('-1)', 3), ('-10', 2), ('-10-', 1), ('-11', 6), ('-11-', 1), ('-12', 8), ('-13', 6), ('-14', 3), ('-15', 7), ('-16', 9), ('-17', 3), ('-18', 9), ('-19', 5), ('-2', 10), ('-20', 3), ('-2000', 3), ('-2005', 1), ('-2007', 8), ('-2009', 1), ('-2010', 1), ('-2011', 1), ('-2012', 1), ('-2014', 1), ('-2020', 1), ('-21', 4), ('-22', 3), ('-

In [5]:
len(set(train_tokens_shortened))

27025

The train dataset has 699.465 tokens and 27.025 unique words.

In [4]:
# See the most frequent tokens:

# Sort the dictionary by values (word counts) in descending order
sorted(word_dict_train.items(), key=lambda x: x[1], reverse=True)[:10]


[(',', 23447),
 ('.', 21407),
 ('▁', 19553),
 ('▁the', 18860),
 ('s', 14184),
 ('▁to', 10762),
 ('▁of', 9912),
 ('▁and', 9752),
 ('▁in', 9140),
 ('▁a', 8341)]

In [14]:
# Save the dictionary of tokens
with open("datasets/tokenized_datasets/X-GENRE-train-token-count.json", "w") as train_count_file:
	json.dump(word_dict_train, train_count_file)

# Tokenize and count tokens for test sets & calculate percentage overlap

Code with which I tokenized the datasets:

In [5]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [7]:
# Code with which the test sets were tokenized
# Improved code to calculate token overlap - we can simply calculate which tokens do not overlap - meaning that they occur only in one of the dataset, and calculate the percentage overlap based on that

# Define an array of token overlap
token_overlap_results = {}

# Loop through the datasets and calculate token overlap
for lang in list(main_dict.keys()):
	df = pd.DataFrame(main_dict[lang]["dataset"])

	tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

	tokens = []
	integers = []
	token_list = []

	print("Tokenizing text.")

	for text in tqdm(df.text.to_list()):
		encoded_text = tokenizer(text)
		# Take all tokens, except the beginning (<s>) and end (</s>) token
		current_tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
		# Shorten the list to 512, as tokens after that were not observed by the classifier
		current_tokens = current_tokens[:512]
		tokens.append(current_tokens)
		token_list.extend(current_tokens)
		integers.append(encoded_text.input_ids[1:-1][:512])

	df["tokens"] = tokens
	df["token_ids"] = integers

	print(token_list[:10])
	print("All tokens:")
	print(len(token_list))

	token_overlap_results = {}

	# See how many tokens do not overlap
	no_overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_token_list = []

	# We calculate overlap by counting how many tokens do not appear in both sets
	for token in tqdm(token_list):
		if token not in train_tokens_shortened:
			no_overlap_counter += 1
		else:
			overlap_token_list.append(token)

	# Out of all tokens in test set, how many do not overlap with train set?
	no_overlap_per = no_overlap_counter/len(token_list)

	# Calculate percentage of overlap based on that
	overlap_per = 1-no_overlap_per

	print(f"Number of tokens that overlap: {len(overlap_token_list)}")
	print(f"Number of different tokens that overlap: {len(set(overlap_token_list))}")
	print(f"Percentage of overlap: {overlap_per}")

	# Update the dataset in the dictionary
	main_dict[lang]["dataset"] = df.to_dict()

	# Add the list of all tokens to the dictionary
	main_dict[lang]["token_overlap"] = {"overlap_percentage":overlap_per, "token_list": token_list, "overlap_token_list":overlap_token_list}

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list_size": len(overlap_token_list), "overlap_set_size": len(set(overlap_token_list))}

tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<00:00, 3.22kB/s]


Tokenizing text.


  0%|          | 0/70 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 70/70 [00:00<00:00, 320.22it/s]


['▁Angel', 'o', '▁Che', 't', 'cuti', ',', '▁se', '▁j', 'kun', '▁qe']
All tokens:
33697
Calculating overlap.


100%|██████████| 33697/33697 [01:10<00:00, 477.93it/s]


Number of tokens that overlap: 27035
Number of different tokens that overlap: 2771
Percentage of overlap: 0.8022969403804493
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (875 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 494.24it/s]


['▁Ενημέρωση', '▁του', '▁Pegasus', '▁Esti', 'asi', '▁με', '▁τις', '▁εισ', 'ερ', 'χ']
All tokens:
31240
Calculating overlap.


100%|██████████| 31240/31240 [03:56<00:00, 131.83it/s]


Number of tokens that overlap: 5043
Number of different tokens that overlap: 822
Percentage of overlap: 0.1614276568501921
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 503.68it/s]


['▁A', 'Ö', 'L', '▁Der', 's', '▁Seçim', 'i', '▁ve', '▁Sınav', '▁Giriş']
All tokens:
29681
Calculating overlap.


100%|██████████| 29681/29681 [02:36<00:00, 189.96it/s]


Number of tokens that overlap: 15460
Number of different tokens that overlap: 2412
Percentage of overlap: 0.5208719382770122
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 687.89it/s]


['▁Blog', '▁“', 'U', 'në', '▁të', '▁kam', '▁dashur', '▁me', '▁një', '▁dashuri']
All tokens:
26596
Calculating overlap.


100%|██████████| 26596/26596 [02:02<00:00, 217.00it/s]


Number of tokens that overlap: 16060
Number of different tokens that overlap: 2732
Percentage of overlap: 0.6038502030380508
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (920 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 588.10it/s]


['▁Sæ', 'kja', '▁um', '▁full', 'a', '▁að', 'ild', '▁Kenn', 'ita', 'la']
All tokens:
29647
Calculating overlap.


100%|██████████| 29647/29647 [02:36<00:00, 188.84it/s]


Number of tokens that overlap: 15209
Number of different tokens that overlap: 2017
Percentage of overlap: 0.5130030019900833
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1002 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 414.92it/s]


['▁Не', 'стандарт', 'ний', '▁підхід', '▁для', '▁виготовлення', '▁Ак', 'вар', 'і', 'у']
All tokens:
31540
Calculating overlap.


100%|██████████| 31540/31540 [04:23<00:00, 119.75it/s]


Number of tokens that overlap: 4941
Number of different tokens that overlap: 408
Percentage of overlap: 0.15665821179454664
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 574.78it/s]


['▁P', 'à', 'gine', 's', '▁En', 'fei', 'nada', '▁Porto', '▁uns', '▁dies']
All tokens:
27544
Calculating overlap.


100%|██████████| 27544/27544 [01:21<00:00, 336.32it/s]


Number of tokens that overlap: 20516
Number of different tokens that overlap: 2897
Percentage of overlap: 0.7448446122567529
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (816 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 590.23it/s]


['▁Ек', 'шу', 'ли', ',', '▁T', 'CL', '▁ги', '▁прави', '▁смартфон', 'овите']
All tokens:
27639
Calculating overlap.


100%|██████████| 27639/27639 [03:58<00:00, 115.96it/s]


Number of tokens that overlap: 4035
Number of different tokens that overlap: 656
Percentage of overlap: 0.14598936285683273
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 662.96it/s]


['▁O', '▁proizvod', 'u', '▁Color', '▁Trans', 'former', ',', '▁za', '▁pamet', 'no']
All tokens:
26546
Calculating overlap.


100%|██████████| 26546/26546 [01:22<00:00, 323.47it/s]


Number of tokens that overlap: 21808
Number of different tokens that overlap: 4383
Percentage of overlap: 0.8215173660815189
Tokenizing text.


  0%|          | 0/80 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (804 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 654.96it/s]


['▁Kita', 'jsko', '▁mesto', '▁duhov', '▁V', '▁Notranj', 'i', '▁Mongol', 'iji', '▁raste']
All tokens:
26292
Calculating overlap.


100%|██████████| 26292/26292 [00:54<00:00, 481.40it/s]

Number of tokens that overlap: 25616
Number of different tokens that overlap: 5281
Percentage of overlap: 0.9742887570363609





In [6]:
token_overlap_results = {}

for lang in list(main_dict.keys()):
	overlap_token_list = main_dict[lang]["token_overlap"]["overlap_token_list"]
	# Add to the results
	token_overlap_results[lang] = {"percentage": main_dict[lang]["token_overlap"]["overlap_percentage"], "overlap_list_size": len(overlap_token_list), "overlap_set_size": len(set(overlap_token_list))}

In [7]:
token_overlap_results

{'mt': {'percentage': 0.8022969403804493,
  'overlap_list_size': 27035,
  'overlap_set_size': 2771},
 'el': {'percentage': 0.1614276568501921,
  'overlap_list_size': 5043,
  'overlap_set_size': 822},
 'tr': {'percentage': 0.5208719382770122,
  'overlap_list_size': 15460,
  'overlap_set_size': 2412},
 'sq': {'percentage': 0.6038502030380508,
  'overlap_list_size': 16060,
  'overlap_set_size': 2732},
 'is': {'percentage': 0.5130030019900833,
  'overlap_list_size': 15209,
  'overlap_set_size': 2017},
 'uk': {'percentage': 0.15665821179454664,
  'overlap_list_size': 4941,
  'overlap_set_size': 408},
 'ca': {'percentage': 0.7448446122567529,
  'overlap_list_size': 20516,
  'overlap_set_size': 2897},
 'mk': {'percentage': 0.14598936285683273,
  'overlap_list_size': 4035,
  'overlap_set_size': 656},
 'hr': {'percentage': 0.8215173660815189,
  'overlap_list_size': 21808,
  'overlap_set_size': 4383},
 'sl': {'percentage': 0.9742887570363609,
  'overlap_list_size': 25616,
  'overlap_set_size': 5

In [8]:
overlap_df = pd.DataFrame(token_overlap_results).transpose().sort_values(by="percentage", ascending=False)
overlap_df

Unnamed: 0,percentage,overlap_list_size,overlap_set_size
sl,0.974289,25616.0,5281.0
hr,0.821517,21808.0,4383.0
mt,0.802297,27035.0,2771.0
ca,0.744845,20516.0,2897.0
sq,0.60385,16060.0,2732.0
tr,0.520872,15460.0,2412.0
is,0.513003,15209.0,2017.0
el,0.161428,5043.0,822.0
uk,0.156658,4941.0,408.0
mk,0.145989,4035.0,656.0


In [10]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

Add token counts information

In [11]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [9]:
token_number = {}
type_number = {}

for lang in list(main_dict.keys()):
	#print(f"Creating token dict for {lang}")
	current_token_list = main_dict[lang]["token_overlap"]["token_list"]

	#print(f"No of tokens: {len(current_token_list)}")

	# Create a dictionary which counts the occurrences of the tokens

	#word_dict_test = Counter(current_token_list)

	# Sort the dictionary alphabetically based on keys
	#word_dict_test = dict(sorted(word_dict_test.items()))

	# Add information on no. of tokens and words to a dict
	token_number[lang] = len(current_token_list)
	#type_number[lang] = len(word_dict_test)
	type_number[lang] = len(set(current_token_list))

	#print(list(word_dict_test.items())[:100])
	#print(f"No of unique tokens: {len(word_dict_test)}")

	# Add the count of tokens to the main dictionary
	#main_dict[lang]["token_overlap"]["token_count"] = word_dict_test

# Create a dataframe for statistics
token_df = pd.DataFrame({"tokens": token_number, "types": type_number})
print(token_df.to_markdown())

|    |   tokens |   types |
|:---|---------:|--------:|
| mt |    33697 |    3874 |
| el |    31240 |    4751 |
| tr |    29681 |    6231 |
| sq |    26596 |    4871 |
| is |    29647 |    4522 |
| uk |    31540 |    6463 |
| ca |    27544 |    5314 |
| mk |    27639 |    5468 |
| hr |    26546 |    6222 |
| sl |    26292 |    5763 |


In [10]:
# Merge token_df and overlap_df
merged = pd.concat([overlap_df, token_df], axis=1)
merged

# Reorder columns

merged = merged[["percentage", "tokens", "overlap_list_size", "types","overlap_set_size"]]

# Rename columns

merged.columns = ["overlap_percentage", "all_tokens", "overlapping_tokens", "all_types", "overlapping_types"]

merged

Unnamed: 0,overlap_percentage,all_tokens,overlapping_tokens,all_types,overlapping_types
sl,0.974289,26292,25616.0,5763,5281.0
hr,0.821517,26546,21808.0,6222,4383.0
mt,0.802297,33697,27035.0,3874,2771.0
ca,0.744845,27544,20516.0,5314,2897.0
sq,0.60385,26596,16060.0,4871,2732.0
tr,0.520872,29681,15460.0,6231,2412.0
is,0.513003,29647,15209.0,4522,2017.0
el,0.161428,31240,5043.0,4751,822.0
uk,0.156658,31540,4941.0,6463,408.0
mk,0.145989,27639,4035.0,5468,656.0


In [13]:
print(merged.to_markdown())

|    |   overlap_percentage |   all_tokens |   overlapping_tokens |   all_types |   overlapping_types |
|:---|---------------------:|-------------:|---------------------:|------------:|--------------------:|
| sl |             0.974289 |        26292 |                25616 |        5763 |                5281 |
| hr |             0.821517 |        26546 |                21808 |        6222 |                4383 |
| mt |             0.802297 |        33697 |                27035 |        3874 |                2771 |
| ca |             0.744845 |        27544 |                20516 |        5314 |                2897 |
| sq |             0.60385  |        26596 |                16060 |        4871 |                2732 |
| tr |             0.520872 |        29681 |                15460 |        6231 |                2412 |
| is |             0.513003 |        29647 |                15209 |        4522 |                2017 |
| el |             0.161428 |        31240 |                 504

In [21]:
# Let's see the most frequent tokens
most_frequent = {}

# Sort the dictionary by values (word counts) in descending order
for lang in list(main_dict.keys()):
	token_list = main_dict[lang]["token_overlap"]["overlap_token_list"]

	word_dict = Counter(token_list)

	most_frequent[lang] = word_dict.most_common(10)

print(pd.DataFrame({"most_frequent_type": most_frequent}).to_markdown())

|    | most_frequent_type                                                                                                                     |
|:---|:---------------------------------------------------------------------------------------------------------------------------------------|
| ca | [(',', 1079), ('▁de', 1013), ('.', 675), ('s', 651), ('▁i', 566), ('▁la', 560), ('▁a', 530), ('▁que', 439), ("'", 357), ('’', 334)]    |
| el | [('▁', 1017), ('.', 801), (',', 782), ('▁η', 159), ('▁"', 98), ('▁(', 83), (':', 62), (')', 60), ('"', 55), ('-', 45)]                 |
| hr | [(',', 878), ('.', 766), ('▁i', 546), ('▁u', 430), ('a', 413), ('▁je', 350), ('▁na', 282), ('▁za', 253), ('▁se', 239), ('e', 219)]     |
| is | [('.', 1021), ('▁og', 640), (',', 598), ('▁', 532), ('▁er', 357), ('s', 316), ('▁sem', 282), ('a', 272), ('i', 272), ('▁til', 261)]    |
| mk | [(',', 1021), ('.', 738), ('▁', 341), ('o', 126), ('e', 96), ('▁"', 85), (':', 64), ('"', 53), ('-', 47), ('▁-', 42)]            

In [11]:
# Let's see the most frequent tokens
most_frequent = {}
# Sort the dictionary by values (word counts) in descending order
for lang in list(main_dict.keys()):
	most_frequent[lang] = (sorted(main_dict[lang]['token_overlap']['token_count'].items(), key=lambda x: x[1], reverse=True)[:10])

print(pd.DataFrame({"most_frequent_type": most_frequent}).to_markdown())

KeyError: 'token_count'

In [36]:
# Save the main dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

# Compare token overlap on label level 

### Create label-level token counts for train dataset

In [12]:
# Open the tokenized train df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁..."


In [13]:
train_df.labels.unique()

array(['Other', 'Information/Explanation', 'News', 'Instruction',
       'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal',
       'Promotion'], dtype=object)

In [14]:
# Separate the train df into label-based dfs

# Create lists of tokens that are label based
label_tokens = {}
token_count = {}
type_count = {}

for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
	print(f"Processing {label}")

	label_df = train_df[train_df["labels"] == label]
	display(label_df.head(3))

	# Create a list of tokens, where we take only the first 512 tokens
	train_tokens_shortened = []

	for i in label_df["tokens_train"].to_list():
		train_tokens_shortened.extend(i[:512])

	print(f"Number of all tokens: {len(train_tokens_shortened)}")

	# Add to dictionaries
	label_tokens[label] = train_tokens_shortened
	token_count[label] = len(train_tokens_shortened)
	type_count[label] = len(set(train_tokens_shortened))

Processing Information/Explanation


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,"[▁Abstract, ▁Object, ive, :, ▁Report, ing, ▁bi...","[233973, 134549, 5844, 12, 34798, 214, 333, 16...",abstract objective: reporting bias due to soci...,"[▁abstract, ▁objective, :, ▁report, ing, ▁bi, ..."
3,In 2009 the song was the focus of a successful...,Information/Explanation,CORE,English,"[▁In, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ...","[360, 1877, 70, 11531, 509, 70, 32153, 111, 10...",in 2009 the song was the focus of a successful...,"[▁in, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ..."
39,Story: Whaling Page 4 -- M?ori and whaling Wha...,Information/Explanation,CORE,English,"[▁Story, :, ▁W, hal, ing, ▁Page, ▁4, ▁--, ▁M, ...","[30575, 12, 601, 4200, 214, 14231, 201, 4210, ...",story: whaling page 4 -- m?ori and whaling wha...,"[▁story, :, ▁w, hal, ing, ▁page, ▁4, ▁--, ▁m, ..."


Number of all tokens: 124130
Processing News


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
4,QuotW This was the week when neither rumours o...,News,CORE,English,"[▁Quo, t, W, ▁This, ▁was, ▁the, ▁week, ▁when, ...","[43851, 18, 1456, 3293, 509, 70, 5895, 3229, 2...",quotw this was the week when neither rumours o...,"[▁quo, tw, ▁this, ▁was, ▁the, ▁week, ▁when, ▁n..."
5,KaZaA claims it can't stop users sharing music...,News,CORE,English,"[▁Ka, Za, A, ▁claims, ▁it, ▁can, ', t, ▁stop, ...","[1136, 16737, 284, 140526, 442, 831, 25, 18, 7...",kazaa claims it can't stop users sharing music...,"[▁kaza, a, ▁claims, ▁it, ▁can, ', t, ▁stop, ▁u..."
9,Nebraska fans checking out airfare for a trip ...,News,CORE,English,"[▁Ne, bra, ska, ▁fans, ▁checking, ▁out, ▁air, ...","[799, 2844, 937, 35992, 175199, 1810, 1831, 44...",nebraska fans checking out airfare for a trip ...,"[▁ne, bra, ska, ▁fans, ▁checking, ▁out, ▁air, ..."


Number of all tokens: 136557
Processing Instruction


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
6,When you first sign up with an online casino a...,Instruction,CORE,English,"[▁When, ▁you, ▁first, ▁sign, ▁up, ▁with, ▁an, ...","[14847, 398, 5117, 24092, 1257, 678, 142, 1118...",when you first sign up with an online casino a...,"[▁when, ▁you, ▁first, ▁sign, ▁up, ▁with, ▁an, ..."
7,How to be the BEST Workplace Supervisor A work...,Instruction,CORE,English,"[▁How, ▁to, ▁be, ▁the, ▁BEST, ▁Work, place, ▁S...","[11249, 47, 186, 70, 121300, 27985, 23935, 426...",how to be the best workplace supervisor a work...,"[▁how, ▁to, ▁be, ▁the, ▁best, ▁work, place, ▁s..."
29,I am hungry and now have an hour with a tobler...,Instruction,CORE,English,"[▁I, ▁am, ▁hun, gry, ▁and, ▁now, ▁have, ▁an, ▁...","[87, 444, 1926, 47285, 136, 5036, 765, 142, 56...",i am hungry and now have an hour with a tobler...,"[▁i, ▁am, ▁hun, gry, ▁and, ▁now, ▁have, ▁an, ▁..."


Number of all tokens: 83750
Processing Opinion/Argumentation


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
8,popular themes AllMusic relies heavily on Java...,Opinion/Argumentation,CORE,English,"[▁popular, ▁them, es, ▁All, Music, ▁reli, es, ...","[5700, 2856, 90, 3164, 158257, 28702, 90, 1730...",popular themes allmusic relies heavily on java...,"[▁popular, ▁them, es, ▁all, music, ▁reli, es, ..."
10,"I was just recalling how, about a year ago, my...",Opinion/Argumentation,CORE,English,"[▁I, ▁was, ▁just, ▁recall, ing, ▁how, ,, ▁abou...","[87, 509, 1660, 189232, 214, 3642, 4, 1672, 10...","i was just recalling how, about a year ago, my...","[▁i, ▁was, ▁just, ▁recall, ing, ▁how, ,, ▁abou..."
32,Combining our love of shiny things with some t...,Opinion/Argumentation,CORE,English,"[▁Combi, ning, ▁our, ▁love, ▁of, ▁shi, ny, ▁th...","[106935, 592, 2446, 5161, 111, 6544, 299, 8966...",combining our love of shiny things with some t...,"[▁com, bi, ning, ▁our, ▁love, ▁of, ▁shi, ny, ▁..."


Number of all tokens: 103141
Processing Forum


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
13,Quote ryan mead: I would like something that g...,Forum,CORE,English,"[▁Quote, ▁ry, an, ▁me, ad, :, ▁I, ▁would, ▁lik...","[109216, 5535, 66, 163, 712, 12, 87, 2806, 188...",quote ryan mead: i would like something that g...,"[▁quote, ▁ry, an, ▁me, ad, :, ▁i, ▁would, ▁lik..."
15,Changing ISP re Broadband - what happens to em...,Forum,CORE,English,"[▁Chang, ing, ▁I, SP, ▁re, ▁Bro, ad, band, ▁-,...","[108193, 214, 87, 9434, 456, 13177, 712, 8262,...",changing isp re broadband - what happens to em...,"[▁changing, ▁is, p, ▁re, ▁broad, band, ▁-, ▁wh..."
16,Comments for Post (25) I'm there with you. I'v...,Forum,CORE,English,"[▁Comments, ▁for, ▁Post, ▁(25), ▁I, ', m, ▁the...","[11427, 100, 2795, 59791, 87, 25, 39, 2685, 67...",comments for post (25) i'm there with you. i'v...,"[▁comments, ▁for, ▁post, ▁(25), ▁i, ', m, ▁the..."


Number of all tokens: 58900
Processing Prose/Lyrical


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
14,One Dance Too Many The night I first met Ziegl...,Prose/Lyrical,CORE,English,"[▁One, ▁Dance, ▁Too, ▁Many, ▁The, ▁night, ▁I, ...","[6561, 67022, 56374, 52455, 581, 17431, 87, 51...",one dance too many the night i first met ziegl...,"[▁one, ▁dance, ▁too, ▁many, ▁the, ▁night, ▁i, ..."
57,"Household Tales, by Brothers Grimm The Story o...",Prose/Lyrical,CORE,English,"[▁House, hold, ▁Tale, s, ,, ▁by, ▁Brother, s, ...","[13038, 16200, 59144, 7, 4, 390, 67921, 7, 106...","household tales, by brothers grimm the story o...","[▁household, ▁tales, ,, ▁by, ▁brother, s, ▁gri..."
72,Empire! Empire! I Would Have Stolen You A Whol...,Prose/Lyrical,CORE,English,"[▁Empire, !, ▁Empire, !, ▁I, ▁Would, ▁Have, ▁S...","[145359, 38, 145359, 38, 87, 154559, 31901, 73...",empire! empire! i would have stolen you a whol...,"[▁em, pire, !, ▁em, pire, !, ▁i, ▁would, ▁have..."


Number of all tokens: 46860
Processing Legal


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
35,Full Terms and Conditions Eligibility to enter...,Legal,CORE,English,"[▁Full, ▁Terms, ▁and, ▁Condi, tions, ▁E, ligi,...","[9312, 165504, 136, 46347, 5256, 241, 7883, 83...",full terms and conditions eligibility to enter...,"[▁full, ▁terms, ▁and, ▁conditions, ▁e, ligi, b..."
110,Commonwealth Consolidated Acts INCOME TAX ASSE...,Legal,CORE,English,"[▁Common, we, al, th, ▁Con, solid, ated, ▁Act,...","[151301, 1177, 289, 927, 1657, 97281, 27686, 2...",commonwealth consolidated acts income tax asse...,"[▁common, we, al, th, ▁consolida, ted, ▁act, s..."
122,"This Terms of Use Agreement (""Agreement"") is b...",Legal,CORE,English,"[▁This, ▁Terms, ▁of, ▁Use, ▁Agreement, ▁("", A,...","[3293, 165504, 111, 36836, 186670, 24073, 284,...","this terms of use agreement (""agreement"") is b...","[▁this, ▁terms, ▁of, ▁use, ▁agreement, ▁("", a,..."


Number of all tokens: 28496
Processing Promotion


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
79,Post navigation Citizen Fish are back on the r...,Promotion,CORE,English,"[▁Post, ▁navigation, ▁Citizen, ▁Fish, ▁are, ▁b...","[2795, 134470, 193223, 104796, 621, 4420, 98, ...",post navigation citizen fish are back on the r...,"[▁post, ▁navigation, ▁citizen, ▁fish, ▁are, ▁b..."
130,Win yourself a FREE copy of the BradyGames off...,Promotion,CORE,English,"[▁Win, ▁yourself, ▁a, ▁FREE, ▁copy, ▁of, ▁the,...","[17686, 31949, 10, 86697, 43658, 111, 70, 5859...",win yourself a free copy of the bradygames off...,"[▁win, ▁yourself, ▁a, ▁free, ▁copy, ▁of, ▁the,..."
179,Do You Want To Know The Quick Secret for On Pa...,Promotion,CORE,English,"[▁Do, ▁You, ▁Want, ▁To, ▁Know, ▁The, ▁Quick, ▁...","[984, 2583, 42335, 717, 70829, 581, 89038, 390...",do you want to know the quick secret for on pa...,"[▁do, ▁you, ▁want, ▁to, ▁know, ▁the, ▁quick, ▁..."


Number of all tokens: 88626


In [15]:
# Calculate numbers for each label
label_results_train = pd.DataFrame({"token_count": token_count, "type_count": type_count})

print(label_results_train.to_markdown())

|                         |   token_count |   type_count |
|:------------------------|--------------:|-------------:|
| Information/Explanation |        124130 |        14678 |
| News                    |        136557 |        15319 |
| Instruction             |         83750 |         8929 |
| Opinion/Argumentation   |        103141 |        13088 |
| Forum                   |         58900 |         8555 |
| Prose/Lyrical           |         46860 |         5990 |
| Legal                   |         28496 |         4425 |
| Promotion               |         88626 |        12548 |


In [16]:
# Save the label token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count.json", "w") as train_label_count_file:
	json.dump(label_tokens, train_label_count_file)

### Create label-level token counts for test sets

In [17]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict["uk"].keys()

dict_keys(['accuracy', 'micro_f1', 'macro_f1', 'label_scores', 'dataset', 'token_overlap'])

In [18]:
pd.DataFrame(main_dict["sl"]["dataset"]).head(2)

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."


In [19]:
# Do the same as with the dataset, but on every language
lang_results = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")

	label_token_dict = {}

	# Current df
	df = pd.DataFrame(main_dict[lang]["dataset"])
	display(df.head(2))

	for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
		print(f"Processing {label}")

		label_df = df[df["y_true"] == label]

		# Create a list of tokens
		token_list = []

		for i in label_df["tokens"].to_list():
			token_list.extend(i)

		# Add to dictionary
		label_token_dict[label] = token_list

	# Add to main dict
	main_dict[lang]["token_overlap"]["label_level_token_lists"] = label_token_dict


Processing mt


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262..."
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing el


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ..."
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing tr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.tr.15851513,Instruction,AÖL Ders Seçimi ve Sınav Giriş Merkezi Belirle...,AÖL warned of frequent negligence and errors o...,{'text_id': 'macocu.tr.15851513'},Instruction,"[▁A, Ö, L, ▁Der, s, ▁Seçim, i, ▁ve, ▁Sınav, ▁G...","[62, 8655, 866, 1310, 7, 166134, 14, 173, 1762..."
1,macocu.tr.12699738,Legal,Banka promosyonu ihalesinde uygulanacak kriter...,Criteria to be applied in the tender for bank ...,{'text_id': 'macocu.tr.12699738'},Legal,"[▁Banka, ▁promo, syon, u, ▁i, hale, sinde, ▁uy...","[81847, 8891, 10270, 34, 17, 50742, 19209, 633..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing sq


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids
0,macocu.sq.1061396,Opinion/Argumentation,341.0,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",Opinion/Argumentation,"[▁Blog, ▁“, U, në, ▁të, ▁kam, ▁dashur, ▁me, ▁n...","[5061, 52, 1062, 3208, 134, 3840, 57168, 163, ..."
1,macocu.sq.183383,Legal,140.0,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...,"{'text_id': 'macocu.sq.183383', 'domain': 'euk...",Legal,"[▁Li, ria, ▁nga, ▁keq, t, raj, timi, ▁Kon, ven...","[1261, 1651, 817, 39184, 18, 10185, 20520, 369..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing is


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.is.1301366,Instruction,Sækja um fulla aðild \n\nKennitala * \n\nNetfa...,Apply full membership\n\nSocial Security numbe...,{'text_id': 'macocu.is.1301366'},Information/Explanation,"[▁Sæ, kja, ▁um, ▁full, a, ▁að, ild, ▁Kenn, ita...","[71595, 28643, 286, 4393, 11, 389, 38472, 5906..."
1,macocu.is.1528713,Information/Explanation,Inngangur Íslenskur landbúnaður hefur þróast ö...,Introduction Icelandic agriculture has evolved...,{'text_id': 'macocu.is.1528713'},Information/Explanation,"[▁Inn, gangur, ▁Íslensk, ur, ▁land, búnað, ur,...","[11151, 160409, 122022, 474, 3551, 74026, 474,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing uk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307..."
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing ca


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,macocu.ca.2248072,Opinion/Argumentation,Pàgines \n\nEnfeinada \n\nPorto uns dies una m...,Pages\n\nCoined\n\nI have been a little busy f...,{'text_id': 'macocu.ca.2248072'},Forum,"[▁P, à, gine, s, ▁En, fei, nada, ▁Porto, ▁uns,...","[436, 1298, 63023, 7, 357, 51899, 28866, 24952..."
1,macocu.ca.756254,Information/Explanation,Info \n\nLa Casa nova dels Banys de Sant Vicen...,Info\n\nThe Casa Nova dels Banys de Sant Vicen...,{'text_id': 'macocu.ca.756254'},Information/Explanation,"[▁Info, ▁La, ▁Casa, ▁nova, ▁dels, ▁Ban, ys, ▁d...","[14048, 239, 8591, 4678, 2323, 5458, 4778, 8, ..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing mk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing hr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.hr.1033815,Promotion,"O proizvodu\nColor Transformer, za pametno i j...","About the Color Transformer product, for smart...","CLASSLA-web.hr.1033815', 'domain': 'hairshop.hr'}",Promotion,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, former, ,, ...","[180, 43170, 34, 51193, 11062, 82772, 4, 80, 6..."
2,CLASSLA-web.hr.1119579,Promotion,Sunčano selo / Sunny village\nNa obroncima Bil...,Sunshine / Sunny Village on the slopes of Bilo...,"CLASSLA-web.hr.1119579', 'domain': 'vikendi.com'}",Promotion,"[▁Sun, čan, o, ▁se, lo, ▁/, ▁Sunny, ▁village, ...","[7550, 17129, 31, 40, 365, 248, 151197, 54427,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing sl


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion


In [25]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

In [27]:
main_dict["sl"]['token_overlap'].keys()

dict_keys(['overlap_percentage', 'token_list', 'overlap_token_list', 'label_level_token_lists'])

In [24]:
main_dict["sl"]["token_overlap"]["label_level_token_lists"]

{'Information/Explanation': ['▁D',
  'vor',
  'ec',
  '▁je',
  '▁morda',
  '▁na',
  'sled',
  'nik',
  '▁enega',
  '▁od',
  '▁številnih',
  '▁v',
  '▁vir',
  'ih',
  '▁iz',
  'pri',
  'ča',
  'nih',
  '▁srednje',
  've',
  'ških',
  '▁dvor',
  'ov',
  '▁na',
  '▁šir',
  'š',
  'em',
  '▁območju',
  '▁Vip',
  'ave',
  '.',
  '▁Prvi',
  'č',
  '▁je',
  '▁bil',
  '▁omen',
  'jen',
  '▁leta',
  '▁16',
  '30',
  '▁v',
  '▁popis',
  'u',
  '▁za',
  'pušč',
  'ine',
  '▁Balta',
  'zar',
  'ja',
  '▁pl',
  '.',
  '▁Raum',
  'sch',
  'üs',
  's',
  'la',
  ',',
  '▁gospod',
  'a',
  '▁na',
  '▁Bel',
  'ne',
  'ku',
  '▁pri',
  '▁Morav',
  'ča',
  'h',
  '▁na',
  '▁Gor',
  'en',
  'jskem',
  '▁in',
  '▁v',
  '▁Vrh',
  'pol',
  'ju',
  '▁pri',
  '▁Vip',
  'avi',
  '.',
  '▁Zaradi',
  '▁lege',
  '▁ob',
  '▁vodi',
  '▁je',
  '▁do',
  'bil',
  '▁nem',
  'ško',
  '▁ime',
  '▁Schön',
  'au',
  ',',
  '▁ki',
  '▁pomeni',
  '▁Le',
  'pi',
  '▁log',
  '.',
  '▁Popis',
  '▁njegove',
  '▁za',
  'pušč',
  '

## Calculate overlap on label level

In [28]:
# Open label-based train token count
with open("datasets/tokenized_datasets/X-GENRE-train-label-token-count.json", "r") as train_label_count_file:
	label_token_count_train = json.load(train_label_count_file)

label_token_count_train.keys()

dict_keys(['Information/Explanation', 'News', 'Instruction', 'Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal', 'Promotion'])

In [29]:
# Import the main dict for test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "r") as file:
	main_dict = json.load(file)

main_dict["sl"]["token_overlap"]["label_level_token_lists"]

{'Information/Explanation': ['▁D',
  'vor',
  'ec',
  '▁je',
  '▁morda',
  '▁na',
  'sled',
  'nik',
  '▁enega',
  '▁od',
  '▁številnih',
  '▁v',
  '▁vir',
  'ih',
  '▁iz',
  'pri',
  'ča',
  'nih',
  '▁srednje',
  've',
  'ških',
  '▁dvor',
  'ov',
  '▁na',
  '▁šir',
  'š',
  'em',
  '▁območju',
  '▁Vip',
  'ave',
  '.',
  '▁Prvi',
  'č',
  '▁je',
  '▁bil',
  '▁omen',
  'jen',
  '▁leta',
  '▁16',
  '30',
  '▁v',
  '▁popis',
  'u',
  '▁za',
  'pušč',
  'ine',
  '▁Balta',
  'zar',
  'ja',
  '▁pl',
  '.',
  '▁Raum',
  'sch',
  'üs',
  's',
  'la',
  ',',
  '▁gospod',
  'a',
  '▁na',
  '▁Bel',
  'ne',
  'ku',
  '▁pri',
  '▁Morav',
  'ča',
  'h',
  '▁na',
  '▁Gor',
  'en',
  'jskem',
  '▁in',
  '▁v',
  '▁Vrh',
  'pol',
  'ju',
  '▁pri',
  '▁Vip',
  'avi',
  '.',
  '▁Zaradi',
  '▁lege',
  '▁ob',
  '▁vodi',
  '▁je',
  '▁do',
  'bil',
  '▁nem',
  'ško',
  '▁ime',
  '▁Schön',
  'au',
  ',',
  '▁ki',
  '▁pomeni',
  '▁Le',
  'pi',
  '▁log',
  '.',
  '▁Popis',
  '▁njegove',
  '▁za',
  'pušč',
  '

In [31]:
# Improved code to calculate token overlap - we can simply calculate which tokens do not overlap - meaning that they occur only in one of the dataset, and calculate the percentage overlap based on that
token_overlap_label_results = {}
results = {}

# Loop through the datasets and labels and calculate token overlap
for lang in list(main_dict.keys()):
	print(lang)
	# Create a dict for labels
	label_overlap = {}
	label_overlap_tokens = {}

	# loop through labels
	if lang != "mt":
		for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
			print(label)
			token_list_test = main_dict[lang]["token_overlap"]["label_level_token_lists"][label]

			token_list_train = label_token_count_train[label]

			# See how many tokens do not overlap
			no_overlap_counter = 0

			# Save tokens that overlap for further inspection
			overlap_token_list = []

			# We calculate overlap by counting how many tokens do not appear in both sets
			for token in tqdm(token_list_test):
				if token not in token_list_train:
					no_overlap_counter += 1
				else:
					overlap_token_list.append(token)

			# Out of all tokens in test set, how many do not overlap with train set?
			no_overlap_per = no_overlap_counter/len(token_list_test)

			# Calculate percentage of overlap based on that
			overlap_per = 1-no_overlap_per

			print(f"Percentage of overlap: {overlap_per}")

			label_overlap[label] = overlap_per
			label_overlap_tokens[label] = overlap_token_list

			# Add to the results
			results[f"{lang}-{label}"] = overlap_per

		# Add the list of all tokens to the dictionary
		main_dict[lang]["token_overlap"]["label_overlap_percentage"] = label_overlap
		main_dict[lang]["token_overlap"]["label_overlap_token_list"] = label_overlap_tokens
	else:
		# Skip "Legal" for Maltese, because we do not have this label there
		for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Promotion']:
			print(label)
			token_list_test = main_dict[lang]["token_overlap"]["label_level_token_lists"][label]

			token_list_train = label_token_count_train[label]

			# See how many tokens do not overlap
			no_overlap_counter = 0

			# Save tokens that overlap for further inspection
			overlap_token_list = []

			# We calculate overlap by counting how many tokens do not appear in both sets
			for token in tqdm(token_list_test):
				if token not in token_list_train:
					no_overlap_counter += 1
				else:
					overlap_token_list.append(token)

			# Out of all tokens in test set, how many do not overlap with train set?
			no_overlap_per = no_overlap_counter/len(token_list_test)

			# Calculate percentage of overlap based on that
			overlap_per = 1-no_overlap_per

			print(f"Percentage of overlap: {overlap_per}")

			label_overlap[label] = overlap_per
			label_overlap_tokens[label] = overlap_token_list

			# Add to the results
			results[f"{lang}-{label}"] = overlap_per

		# Add the list of all tokens to the dictionary
		main_dict[lang]["token_overlap"]["label_overlap_percentage"] = label_overlap
		main_dict[lang]["token_overlap"]["label_overlap_token_list"] = label_overlap_tokens		

mt
Information/Explanation


100%|██████████| 6656/6656 [00:03<00:00, 2089.15it/s]


Percentage of overlap: 0.7145432692307692
News


100%|██████████| 6755/6755 [00:03<00:00, 1823.71it/s]


Percentage of overlap: 0.7083641746854182
Instruction


100%|██████████| 9288/9288 [00:03<00:00, 2583.06it/s]


Percentage of overlap: 0.6194013781223083
Opinion/Argumentation


100%|██████████| 4096/4096 [00:01<00:00, 2236.85it/s]


Percentage of overlap: 0.68212890625
Forum


100%|██████████| 512/512 [00:00<00:00, 3429.26it/s]


Percentage of overlap: 0.583984375
Prose/Lyrical


100%|██████████| 512/512 [00:00<00:00, 4594.55it/s]


Percentage of overlap: 0.591796875
Promotion


100%|██████████| 5878/5878 [00:02<00:00, 2763.78it/s]


Percentage of overlap: 0.6578768288533514
el
Information/Explanation


100%|██████████| 5608/5608 [00:06<00:00, 919.72it/s] 


Percentage of overlap: 0.13908701854493577
News


100%|██████████| 3501/3501 [00:04<00:00, 817.25it/s]


Percentage of overlap: 0.1162524992859183
Instruction


100%|██████████| 2971/2971 [00:02<00:00, 1432.48it/s]


Percentage of overlap: 0.16122517670817904
Opinion/Argumentation


100%|██████████| 5607/5607 [00:05<00:00, 1088.26it/s]


Percentage of overlap: 0.11699661137863382
Forum


100%|██████████| 5137/5137 [00:02<00:00, 1956.24it/s]


Percentage of overlap: 0.17033287911232242
Prose/Lyrical


100%|██████████| 2872/2872 [00:01<00:00, 2394.08it/s]


Percentage of overlap: 0.14589136490250698
Legal


100%|██████████| 3368/3368 [00:00<00:00, 4214.30it/s]


Percentage of overlap: 0.1395486935866983
Promotion


100%|██████████| 2176/2176 [00:01<00:00, 1295.93it/s]


Percentage of overlap: 0.15900735294117652
tr
Information/Explanation


100%|██████████| 2289/2289 [00:01<00:00, 1421.64it/s]


Percentage of overlap: 0.47924858016601135
News


100%|██████████| 3342/3342 [00:02<00:00, 1198.24it/s]


Percentage of overlap: 0.44105326152004787
Instruction


100%|██████████| 4152/4152 [00:02<00:00, 1831.12it/s]


Percentage of overlap: 0.3827071290944123
Opinion/Argumentation


100%|██████████| 5613/5613 [00:03<00:00, 1579.42it/s]


Percentage of overlap: 0.4332798859789774
Forum


100%|██████████| 3219/3219 [00:01<00:00, 2511.47it/s]


Percentage of overlap: 0.44330537433985706
Prose/Lyrical


100%|██████████| 3764/3764 [00:01<00:00, 3241.65it/s]


Percentage of overlap: 0.41339001062699254
Legal


100%|██████████| 3677/3677 [00:00<00:00, 5061.44it/s]


Percentage of overlap: 0.3146586891487626
Promotion


100%|██████████| 3625/3625 [00:02<00:00, 1809.04it/s]


Percentage of overlap: 0.39420689655172414
sq
Information/Explanation


100%|██████████| 5196/5196 [00:03<00:00, 1375.55it/s]


Percentage of overlap: 0.4844110854503464
News


100%|██████████| 2415/2415 [00:02<00:00, 1201.60it/s]


Percentage of overlap: 0.4650103519668737
Instruction


100%|██████████| 3204/3204 [00:01<00:00, 1857.93it/s]


Percentage of overlap: 0.4297752808988764
Opinion/Argumentation


100%|██████████| 4859/4859 [00:02<00:00, 1628.48it/s]


Percentage of overlap: 0.49104754064622347
Forum


100%|██████████| 2917/2917 [00:00<00:00, 2988.08it/s]


Percentage of overlap: 0.5786767226602674
Prose/Lyrical


100%|██████████| 3393/3393 [00:00<00:00, 3617.70it/s]


Percentage of overlap: 0.49277925139994105
Legal


100%|██████████| 2817/2817 [00:00<00:00, 5087.34it/s]


Percentage of overlap: 0.37841675541356057
Promotion


100%|██████████| 1795/1795 [00:00<00:00, 2055.45it/s]


Percentage of overlap: 0.5337047353760446
is
Information/Explanation


100%|██████████| 1917/1917 [00:01<00:00, 1257.46it/s]


Percentage of overlap: 0.4324465310380803
News


100%|██████████| 4149/4149 [00:03<00:00, 1108.60it/s]


Percentage of overlap: 0.4196191853458665
Instruction


100%|██████████| 3915/3915 [00:02<00:00, 1692.97it/s]


Percentage of overlap: 0.383397190293742
Opinion/Argumentation


100%|██████████| 4970/4970 [00:03<00:00, 1441.05it/s]


Percentage of overlap: 0.4191146881287726
Forum


100%|██████████| 2769/2769 [00:01<00:00, 2455.28it/s]


Percentage of overlap: 0.40195016251354276
Prose/Lyrical


100%|██████████| 4649/4649 [00:01<00:00, 3022.96it/s]


Percentage of overlap: 0.37491933749193374
Legal


100%|██████████| 3830/3830 [00:00<00:00, 5120.35it/s]


Percentage of overlap: 0.3227154046997389
Promotion


100%|██████████| 3448/3448 [00:01<00:00, 1798.93it/s]


Percentage of overlap: 0.43271461716937354
uk
Information/Explanation


100%|██████████| 4352/4352 [00:04<00:00, 949.88it/s] 


Percentage of overlap: 0.13304227941176472
News


100%|██████████| 3386/3386 [00:04<00:00, 840.27it/s]


Percentage of overlap: 0.13112817483756645
Instruction


100%|██████████| 4411/4411 [00:03<00:00, 1397.40it/s]


Percentage of overlap: 0.11902063024257536
Opinion/Argumentation


100%|██████████| 5386/5386 [00:04<00:00, 1127.78it/s]


Percentage of overlap: 0.14611956925362046
Forum


100%|██████████| 3391/3391 [00:01<00:00, 2000.34it/s]


Percentage of overlap: 0.16661751695664995
Prose/Lyrical


100%|██████████| 4282/4282 [00:01<00:00, 2535.14it/s]


Percentage of overlap: 0.16184026156001863
Legal


100%|██████████| 3484/3484 [00:00<00:00, 4390.06it/s]


Percentage of overlap: 0.1702066590126292
Promotion


100%|██████████| 2848/2848 [00:02<00:00, 1374.72it/s]


Percentage of overlap: 0.1629213483146067
ca
Information/Explanation


100%|██████████| 4017/4017 [00:02<00:00, 1825.06it/s]


Percentage of overlap: 0.6069205875031118
News


100%|██████████| 2612/2612 [00:01<00:00, 1654.66it/s]


Percentage of overlap: 0.6125574272588055
Instruction


100%|██████████| 2133/2133 [00:00<00:00, 2622.26it/s]


Percentage of overlap: 0.6207219878105954
Opinion/Argumentation


100%|██████████| 3368/3368 [00:01<00:00, 2275.48it/s]


Percentage of overlap: 0.6339073634204275
Forum


100%|██████████| 3755/3755 [00:01<00:00, 3441.22it/s]


Percentage of overlap: 0.6399467376830892
Prose/Lyrical


100%|██████████| 4092/4092 [00:00<00:00, 4218.61it/s]


Percentage of overlap: 0.5916422287390029
Legal


100%|██████████| 3336/3336 [00:00<00:00, 6564.14it/s]


Percentage of overlap: 0.4787170263788969
Promotion


100%|██████████| 4231/4231 [00:01<00:00, 2577.50it/s]


Percentage of overlap: 0.616875443157646
mk
Information/Explanation


100%|██████████| 3324/3324 [00:03<00:00, 944.22it/s] 


Percentage of overlap: 0.16817087845968715
News


100%|██████████| 3778/3778 [00:04<00:00, 809.54it/s]


Percentage of overlap: 0.10534674430915825
Instruction


100%|██████████| 3271/3271 [00:02<00:00, 1375.21it/s]


Percentage of overlap: 0.1213696117395292
Opinion/Argumentation


100%|██████████| 3007/3007 [00:02<00:00, 1098.78it/s]


Percentage of overlap: 0.12337878284003989
Forum


100%|██████████| 4072/4072 [00:02<00:00, 1922.76it/s]


Percentage of overlap: 0.15446954813359526
Prose/Lyrical


100%|██████████| 3892/3892 [00:01<00:00, 2490.15it/s]


Percentage of overlap: 0.1500513874614594
Legal


100%|██████████| 3412/3412 [00:00<00:00, 4115.97it/s]


Percentage of overlap: 0.11371629542790151
Promotion


100%|██████████| 2883/2883 [00:02<00:00, 1304.65it/s]


Percentage of overlap: 0.13146028442594515
hr
Information/Explanation


100%|██████████| 4027/4027 [00:02<00:00, 1902.84it/s]


Percentage of overlap: 0.7047429848522473
News


100%|██████████| 3237/3237 [00:01<00:00, 1655.67it/s]


Percentage of overlap: 0.7194933580475749
Instruction


100%|██████████| 1701/1701 [00:00<00:00, 2290.16it/s]


Percentage of overlap: 0.6202233980011758
Opinion/Argumentation


100%|██████████| 3427/3427 [00:01<00:00, 2041.19it/s]


Percentage of overlap: 0.6807703530784943
Forum


100%|██████████| 3811/3811 [00:01<00:00, 2991.44it/s]


Percentage of overlap: 0.6452374704801889
Prose/Lyrical


100%|██████████| 2651/2651 [00:00<00:00, 3864.15it/s]


Percentage of overlap: 0.5620520558279894
Legal


100%|██████████| 3490/3490 [00:00<00:00, 7021.13it/s]


Percentage of overlap: 0.5272206303724929
Promotion


100%|██████████| 4202/4202 [00:01<00:00, 3161.80it/s]


Percentage of overlap: 0.6972870061875298
sl
Information/Explanation


100%|██████████| 3446/3446 [00:01<00:00, 2706.09it/s]


Percentage of overlap: 0.8682530470110272
News


100%|██████████| 3452/3452 [00:01<00:00, 2134.62it/s]


Percentage of overlap: 0.9122247972190035
Instruction


100%|██████████| 3395/3395 [00:01<00:00, 2940.55it/s]


Percentage of overlap: 0.7705449189985273
Opinion/Argumentation


100%|██████████| 2471/2471 [00:00<00:00, 2757.88it/s]


Percentage of overlap: 0.8887090246863618
Forum


100%|██████████| 3970/3970 [00:01<00:00, 3632.31it/s]


Percentage of overlap: 0.801007556675063
Prose/Lyrical


100%|██████████| 2909/2909 [00:00<00:00, 5453.02it/s]


Percentage of overlap: 0.6995531110347198
Legal


100%|██████████| 3446/3446 [00:00<00:00, 11372.51it/s]


Percentage of overlap: 0.7556587347649448
Promotion


100%|██████████| 3203/3203 [00:00<00:00, 6392.10it/s]

Percentage of overlap: 0.8741804558226662





In [32]:
results

{'mt-Information/Explanation': 0.7145432692307692,
 'mt-News': 0.7083641746854182,
 'mt-Instruction': 0.6194013781223083,
 'mt-Opinion/Argumentation': 0.68212890625,
 'mt-Forum': 0.583984375,
 'mt-Prose/Lyrical': 0.591796875,
 'mt-Promotion': 0.6578768288533514,
 'el-Information/Explanation': 0.13908701854493577,
 'el-News': 0.1162524992859183,
 'el-Instruction': 0.16122517670817904,
 'el-Opinion/Argumentation': 0.11699661137863382,
 'el-Forum': 0.17033287911232242,
 'el-Prose/Lyrical': 0.14589136490250698,
 'el-Legal': 0.1395486935866983,
 'el-Promotion': 0.15900735294117652,
 'tr-Information/Explanation': 0.47924858016601135,
 'tr-News': 0.44105326152004787,
 'tr-Instruction': 0.3827071290944123,
 'tr-Opinion/Argumentation': 0.4332798859789774,
 'tr-Forum': 0.44330537433985706,
 'tr-Prose/Lyrical': 0.41339001062699254,
 'tr-Legal': 0.3146586891487626,
 'tr-Promotion': 0.39420689655172414,
 'sq-Information/Explanation': 0.4844110854503464,
 'sq-News': 0.4650103519668737,
 'sq-Instruct

In [33]:
# Inspect the results
overlap_df = pd.DataFrame({"label": list(results.keys()), "overlap": list(results.values())})
overlap_df

Unnamed: 0,label,overlap
0,mt-Information/Explanation,0.714543
1,mt-News,0.708364
2,mt-Instruction,0.619401
3,mt-Opinion/Argumentation,0.682129
4,mt-Forum,0.583984
...,...,...
74,sl-Opinion/Argumentation,0.888709
75,sl-Forum,0.801008
76,sl-Prose/Lyrical,0.699553
77,sl-Legal,0.755659


In [34]:
# Save the label-level-overlap
with open("datasets/label-level-token-overlap.csv", "w") as file:
	overlap_df.to_csv(file)

In [35]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)

# English word overlap

## Prepare the train dataset

In [114]:
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized-old.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁..."


In [115]:
# Add lists of English words
token_list = train_df["tokens_train"].to_list()

all_text_word_list = []

for text in token_list:

	word_list = []

	# Take only the first 512 tokens
	for word in text[:512]:
		for punct in [".", '"', "!", "?", ":", ";", ",", "/", ")", "(", "[", "]"]:
			word = word.replace(punct, "").lower()

		if len(word) > 0:
			if word[0] == "▁":
				word_list.append(word.replace("▁", ""))
			else:
				previous_word = word_list[-1]
				word_list[-1] = previous_word + word

	all_text_word_list.append(word_list)

train_df["word_list"] = all_text_word_list
train_df.head()


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,word_list
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁...","[seeking, all, things, brilliant, i, want, peo..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁...","[meet, orchid, du, bois, i, first, met, hayley..."
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,"[▁Abstract, ▁Object, ive, :, ▁Report, ing, ▁bi...","[233973, 134549, 5844, 12, 34798, 214, 333, 16...",abstract objective: reporting bias due to soci...,"[▁abstract, ▁objective, :, ▁report, ing, ▁bi, ...","[abstract, objective, reporting, bias, due, to..."
3,In 2009 the song was the focus of a successful...,Information/Explanation,CORE,English,"[▁In, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ...","[360, 1877, 70, 11531, 509, 70, 32153, 111, 10...",in 2009 the song was the focus of a successful...,"[▁in, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ...","[in, 2009, the, song, was, the, focus, of, a, ..."
4,QuotW This was the week when neither rumours o...,News,CORE,English,"[▁Quo, t, W, ▁This, ▁was, ▁the, ▁week, ▁when, ...","[43851, 18, 1456, 3293, 509, 70, 5895, 3229, 2...",quotw this was the week when neither rumours o...,"[▁quo, tw, ▁this, ▁was, ▁the, ▁week, ▁when, ▁n...","[quotw, this, was, the, week, when, neither, r..."


Add the MT-ed translation of GINCO

In [116]:
# Add the MT English translation of GINCO
mt_ginco = pd.read_csv("datasets/GINCO-MT-GINCO-keeptext-with-all-information.csv", sep="\t", index_col=0)
mt_ginco = mt_ginco[['Slovene_text', 'MT_text',]]
mt_ginco.rename(columns={"Slovene_text":"text"}, inplace=True)
mt_ginco

Unnamed: 0,text,MT_text
0,"Šport <p/> Zimska liga malega nogometa sobota,...",Sport <p/> Winter Little League Football Satur...
1,JEDILNIK <p/> Iskalnik <p/> Poglavitni cilj pr...,JEDILNIK <p/> Search <p/> The main objective o...
2,Projekt INNOVAge in zavod Oreli <p/> Zavod Ore...,Project INNOVAge and the Oreli Institute <p/> ...
3,"V novembru, mesecu preprečevanja odvisnosti, b...","In November, the month of addiction prevention..."
4,Selena Gomez ponudila v poslušanje novi album ...,Selena Gomez launches new album <p/> 16.07.201...
...,...,...
997,Projektne novine <p/> Promocijski projektni ča...,Project News <p/> Promotional project newspape...
998,V raznoliki ponudbi tušev izberite popolno raz...,Choose the perfect shower to match your taste ...
999,"O izdelku Za znamko Dame stojita dve ženski, z...",About the product There are two women behind t...
1000,Razprava pogosto potegne na plano najprej tist...,The debate often brings to the surface first t...


In [117]:
# Create a list of English words
# Add lists of English words - to do this properly, we would need to tokenize the English translation
token_list = mt_ginco["MT_text"].to_list()

all_text_word_list = []

for text in token_list:

	text = text.replace("<p/>", "")

	text_list = text.split(" ")

	word_list = []

	# Take only the first 512 words
	for word in text_list[:512]:
		for punct in [".", '"', "!", "?", ":", ";", ",", "/", ")", "(", "[", "]"]:
			word = word.replace(punct, "").lower()

		if len(word) > 0:
			word_list.append(word)

	all_text_word_list.append(word_list)

mt_ginco["word_list"] = all_text_word_list
mt_ginco.head()

Unnamed: 0,text,MT_text,word_list
0,"Šport <p/> Zimska liga malega nogometa sobota,...",Sport <p/> Winter Little League Football Satur...,"[sport, winter, little, league, football, satu..."
1,JEDILNIK <p/> Iskalnik <p/> Poglavitni cilj pr...,JEDILNIK <p/> Search <p/> The main objective o...,"[jedilnik, search, the, main, objective, of, t..."
2,Projekt INNOVAge in zavod Oreli <p/> Zavod Ore...,Project INNOVAge and the Oreli Institute <p/> ...,"[project, innovage, and, the, oreli, institute..."
3,"V novembru, mesecu preprečevanja odvisnosti, b...","In November, the month of addiction prevention...","[in, november, the, month, of, addiction, prev..."
4,Selena Gomez ponudila v poslušanje novi album ...,Selena Gomez launches new album <p/> 16.07.201...,"[selena, gomez, launches, new, album, 16072013..."


In [118]:
train_df_merged = pd.merge(left=train_df, right= mt_ginco, on="text", how="left")
train_df_merged[train_df_merged["dataset"] == "GINCO"].head(3)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,word_list_x,MT_text,word_list_y
607,Žakardska tkanina je splošna oznaka za vzorčas...,Information/Explanation,GINCO,Slovenian,"[▁Ža, kar, d, ska, ▁tkanin, a, ▁je, ▁spl, oš, ...","[78724, 1153, 71, 937, 124424, 11, 55, 27095, ...",zakardska tkanina je splosna oznaka za vzorcas...,"[▁zak, ard, ska, ▁tkanin, a, ▁je, ▁spl, os, na...","[žakardska, tkanina, je, splošna, oznaka, za, ...",Jacquard fabric is a general term for patterne...,"[jacquard, fabric, is, a, general, term, for, ..."
608,Kako nastane očka? <p/> Očetovstvo in materins...,Opinion/Argumentation,GINCO,Slovenian,"[▁Kako, ▁nastane, ▁o, čka, ?, ▁<, p, /, >, ▁O,...","[8438, 165103, 36, 5536, 32, 4426, 254, 64, 27...",kako nastane ocka? <p/> ocetovstvo in materins...,"[▁kako, ▁nastane, ▁o, cka, ?, ▁<, p, /, >, ▁o,...","[kako, nastane, očka, <p>, očetovstvo, in, mat...",How does Daddy come into being? <p/>Fatherhood...,"[how, does, daddy, come, into, being, fatherho..."
609,NOGOMETNO IGRIŠČE <p/> Goooolll ... Naši vodij...,Promotion,GINCO,Slovenian,"[▁NO, GO, MET, NO, ▁I, GRI, Š, ČE, ▁<, p, /, >...","[9520, 19930, 33677, 8575, 87, 191357, 6200, 7...",nogometno igrisce <p/> goooolll ... nasi vodij...,"[▁nogomet, no, ▁igri, s, ce, ▁<, p, /, >, ▁go,...","[nogometno, igrišče, <p>, goooolll, , naši, vo...",FOOTBALL PLAYING FIELD <p/> Goooolll ... Our t...,"[football, playing, field, goooolll, our, team..."


In [123]:
# Merge wordlists
train_df_merged.fillna(value=0, inplace=True)

train_df_merged["words"] = np.where(train_df_merged["word_list_y"] == 0, train_df_merged["word_list_x"], train_df_merged["word_list_y"])

display(train_df_merged.head(3))

display(train_df_merged[train_df_merged["dataset"] == "GINCO"].head(3))

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,word_list_x,MT_text,word_list_y,words
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁...","[seeking, all, things, brilliant, i, want, peo...",0,0,"[seeking, all, things, brilliant, i, want, peo..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁...","[meet, orchid, du, bois, i, first, met, hayley...",0,0,"[meet, orchid, du, bois, i, first, met, hayley..."
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,"[▁Abstract, ▁Object, ive, :, ▁Report, ing, ▁bi...","[233973, 134549, 5844, 12, 34798, 214, 333, 16...",abstract objective: reporting bias due to soci...,"[▁abstract, ▁objective, :, ▁report, ing, ▁bi, ...","[abstract, objective, reporting, bias, due, to...",0,0,"[abstract, objective, reporting, bias, due, to..."


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,word_list_x,MT_text,word_list_y,words
607,Žakardska tkanina je splošna oznaka za vzorčas...,Information/Explanation,GINCO,Slovenian,"[▁Ža, kar, d, ska, ▁tkanin, a, ▁je, ▁spl, oš, ...","[78724, 1153, 71, 937, 124424, 11, 55, 27095, ...",zakardska tkanina je splosna oznaka za vzorcas...,"[▁zak, ard, ska, ▁tkanin, a, ▁je, ▁spl, os, na...","[žakardska, tkanina, je, splošna, oznaka, za, ...",Jacquard fabric is a general term for patterne...,"[jacquard, fabric, is, a, general, term, for, ...","[jacquard, fabric, is, a, general, term, for, ..."
608,Kako nastane očka? <p/> Očetovstvo in materins...,Opinion/Argumentation,GINCO,Slovenian,"[▁Kako, ▁nastane, ▁o, čka, ?, ▁<, p, /, >, ▁O,...","[8438, 165103, 36, 5536, 32, 4426, 254, 64, 27...",kako nastane ocka? <p/> ocetovstvo in materins...,"[▁kako, ▁nastane, ▁o, cka, ?, ▁<, p, /, >, ▁o,...","[kako, nastane, očka, <p>, očetovstvo, in, mat...",How does Daddy come into being? <p/>Fatherhood...,"[how, does, daddy, come, into, being, fatherho...","[how, does, daddy, come, into, being, fatherho..."
609,NOGOMETNO IGRIŠČE <p/> Goooolll ... Naši vodij...,Promotion,GINCO,Slovenian,"[▁NO, GO, MET, NO, ▁I, GRI, Š, ČE, ▁<, p, /, >...","[9520, 19930, 33677, 8575, 87, 191357, 6200, 7...",nogometno igrisce <p/> goooolll ... nasi vodij...,"[▁nogomet, no, ▁igri, s, ce, ▁<, p, /, >, ▁go,...","[nogometno, igrišče, <p>, goooolll, , naši, vo...",FOOTBALL PLAYING FIELD <p/> Goooolll ... Our t...,"[football, playing, field, goooolll, our, team...","[football, playing, field, goooolll, our, team..."


In [125]:
# Remove unnecessary columns
train_df_merged.drop(columns=["word_list_x", "word_list_y"], inplace=True)

train_df_merged.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁...",0,"[seeking, all, things, brilliant, i, want, peo..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁...",0,"[meet, orchid, du, bois, i, first, met, hayley..."


In [126]:
# Save the enriched dataset
train_df_merged.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")

## Calculate word overlap

These are just initial experiments - to do this properly, we would need to tokenize the English translation of GINCO and of all translations of the test datasets, and then calculate the overlap in the list of words extracted from the first 512 tokens.

In [2]:
# Open the tokenized df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁...",0,"[seeking, all, things, brilliant, i, want, peo..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁...",0,"[meet, orchid, du, bois, i, first, met, hayley..."


In [9]:
# List of all train words
all_train_words = []
for text_list in train_df["words"].to_list():
	all_train_words.extend(text_list)

all_unique_train_words = list(set(all_train_words))

print(len(all_train_words))
print(len(all_unique_train_words))

527009
36548


In [3]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr', 'sl'])

In [4]:
df = pd.DataFrame(main_dict["sl"]["dataset"])
df.head(3)

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414..."
3,CLASSLA-web.sl.1230602,Opinion/Argumentation,Danes\nJutri\nPojutrišnjem\nSo res pojedli 200...,Today tomorrow they really ate 200 kg of meats...,"CLASSLA-web.sl.1230602', 'domain': 'kr.trma.si'}",Opinion/Argumentation,"[▁Danes, ▁Ju, tri, ▁Po, ju, tri, šnje, m, ▁So,...","[85405, 3314, 3996, 663, 461, 3996, 69207, 39,..."


In [12]:
# Define an array of token overlap
token_overlap_results = {}

# Loop through the datasets and calculate English word overlap
for lang in list(main_dict.keys()):
	df = pd.DataFrame(main_dict[lang]["dataset"])

	# Add lists of English words - to do this properly, we would need to tokenize the English translation - these are just preliminary experiments
	texts = df["translation"].to_list()

	all_text_word_list = []
	all_words = []

	for text in texts:
		text = text.replace("\n", " ")

		text_list = text.split(" ")

		word_list = []

		# Take only the first 512 words
		for word in text_list[:512]:
			for punct in [".", '"', "!", "?", ":", ";", ",", "/", ")", "(", "[", "]"]:
				word = word.replace(punct, "").lower()

			if len(word) > 0:
				word_list.append(word)
				all_words.append(word)

		all_text_word_list.append(word_list)

	df["word_list"] = all_text_word_list

	print(all_words[:10])
	print("All words:")
	print(len(all_words))

	token_overlap_results = {}

	# See how many tokens do not overlap
	no_overlap_counter = 0

	print("Calculating overlap.")

	# Save tokens that overlap for further inspection
	overlap_all_words = []

	# We calculate overlap by counting how many tokens do not appear in both sets
	for token in tqdm(all_words):
		if token not in all_unique_train_words:
			no_overlap_counter += 1
		else:
			overlap_all_words.append(token)

	# Out of all tokens in test set, how many do not overlap with train set?
	no_overlap_per = no_overlap_counter/len(all_words)

	# Calculate percentage of overlap based on that
	overlap_per = 1-no_overlap_per

	print(f"Number of English words that overlap: {len(overlap_all_words)}")
	print(f"Number of different English words that overlap: {len(set(overlap_all_words))}")
	print(f"Percentage of overlap: {overlap_per}")

	# Update the dataset in the dictionary
	main_dict[lang]["dataset"] = df.to_dict()

	# Add the list of all tokens to the dictionary
	main_dict[lang]["English_word_overlap"] = {"overlap_percentage":overlap_per, "all_words": all_words, "overlap_words":overlap_all_words}

	# Add to the results
	token_overlap_results[lang] = {"percentage": overlap_per, "overlap_list_size": len(overlap_all_words), "overlap_set_size": len(set(overlap_all_words))}

['angelo', 'chetcuti', 'will', 'be', 'replacing', 'bjorn', 'vassallo', 'as', 'secretary-general', 'of']
All words:
26488
Calculating overlap.


100%|██████████| 26488/26488 [00:06<00:00, 4091.82it/s]


Number of English words that overlap: 24087
Number of different English words that overlap: 4276
Percentage of overlap: 0.9093551797040169
['update', 'pegasus', 'estiasi', 'with', 'incoming', 'calls', 'pegasus', 'estiasi', 'has', 'the']
All words:
22165
Calculating overlap.


100%|██████████| 22165/22165 [00:05<00:00, 4070.62it/s]


Number of English words that overlap: 19967
Number of different English words that overlap: 3910
Percentage of overlap: 0.900834649221746
['aöl', 'warned', 'of', 'frequent', 'negligence', 'and', 'errors', 'on', 'the', 'selection']
All words:
23379
Calculating overlap.


100%|██████████| 23379/23379 [00:05<00:00, 4102.23it/s]


Number of English words that overlap: 21144
Number of different English words that overlap: 3585
Percentage of overlap: 0.9044013858591043
['blog', 'i', 'loved', 'you', 'with', 'eternal', 'lovejer', '31', '3', "let's"]
All words:
16615
Calculating overlap.


100%|██████████| 16615/16615 [00:04<00:00, 4071.98it/s]


Number of English words that overlap: 14985
Number of different English words that overlap: 3121
Percentage of overlap: 0.9018958772193801
['apply', 'full', 'membership', 'social', 'security', 'number', '*', 'email', '*', 'job']
All words:
23259
Calculating overlap.


100%|██████████| 23259/23259 [00:05<00:00, 4088.95it/s]


Number of English words that overlap: 20995
Number of different English words that overlap: 3567
Percentage of overlap: 0.9026613353970506
['a', 'non', '-standard', 'approach', 'for', 'making', 'aquariumor', 'what', 'else', 'except']
All words:
25036
Calculating overlap.


100%|██████████| 25036/25036 [00:06<00:00, 4171.44it/s]


Number of English words that overlap: 22791
Number of different English words that overlap: 4054
Percentage of overlap: 0.9103291260584758
['pages', 'coined', 'i', 'have', 'been', 'a', 'little', 'busy', 'for', 'a']
All words:
20085
Calculating overlap.


100%|██████████| 20085/20085 [00:04<00:00, 4096.64it/s]


Number of English words that overlap: 18140
Number of different English words that overlap: 3360
Percentage of overlap: 0.9031615633557382
['ekshui', 'tcl', 'makes', 'smartphones', 'and', 'tcl', 'is', 'housed', 'in', 'china']
All words:
18331
Calculating overlap.


100%|██████████| 18331/18331 [00:04<00:00, 4220.69it/s]


Number of English words that overlap: 17236
Number of different English words that overlap: 3574
Percentage of overlap: 0.9402651246522284
['about', 'the', 'color', 'transformer', 'product', 'for', 'smart', 'and', 'easy', 'coloring']
All words:
19381
Calculating overlap.


100%|██████████| 19381/19381 [00:04<00:00, 4148.59it/s]


Number of English words that overlap: 18112
Number of different English words that overlap: 3718
Percentage of overlap: 0.934523502399257
['the', 'chinese', 'ghost', 'city', 'in', 'inner', 'mongolia', 'is', 'growing', 'a']
All words:
19260
Calculating overlap.


100%|██████████| 19260/19260 [00:04<00:00, 4210.30it/s]

Number of English words that overlap: 18047
Number of different English words that overlap: 3677
Percentage of overlap: 0.9370197300103842





In [14]:
token_overlap_results = {}

for lang in list(main_dict.keys()):
	overlap_token_list = main_dict[lang]["English_word_overlap"]["all_words"]
	# Add to the results
	token_overlap_results[lang] = {"percentage": main_dict[lang]["English_word_overlap"]["overlap_percentage"], "overlap_list_size": len(overlap_token_list), "overlap_set_size": len(set(overlap_token_list))}

In [16]:
overlap_df = pd.DataFrame(token_overlap_results).transpose().sort_values(by="percentage", ascending=False)
overlap_df

Unnamed: 0,percentage,overlap_list_size,overlap_set_size
mk,0.940265,18331.0,4426.0
sl,0.93702,19260.0,4635.0
hr,0.934524,19381.0,4672.0
uk,0.910329,25036.0,5978.0
mt,0.909355,26488.0,6270.0
tr,0.904401,23379.0,5413.0
ca,0.903162,20085.0,4939.0
is,0.902661,23259.0,5366.0
sq,0.901896,16615.0,4382.0
el,0.900835,22165.0,5685.0


In [17]:
# Save the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as main_file:
	json.dump(main_dict,main_file)

### Label level

In [18]:
# Open the tokenized train df
train_df = pd.read_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")
train_df.head(2)

Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ...","seeking all things brilliant ""i want people to...","[▁seeking, ▁all, ▁things, ▁brilliant, ▁"", i, ▁...",0,"[seeking, all, things, brilliant, i, want, peo..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51...",meet orchid du bois i first met hayley mowday ...,"[▁meet, ▁or, ch, id, ▁du, ▁bois, ▁i, ▁first, ▁...",0,"[meet, orchid, du, bois, i, first, met, hayley..."


In [19]:
# Separate the train df into label-based dfs

# Create lists of tokens that are label based
label_tokens = {}
token_count = {}
type_count = {}

for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
	print(f"Processing {label}")

	label_df = train_df[train_df["labels"] == label]
	display(label_df.head(3))

	train_tokens_shortened = []

	for i in label_df["words"].to_list():
		train_tokens_shortened.extend(i)

	print(f"Number of all tokens: {len(train_tokens_shortened)}")

	# Add to dictionaries
	label_tokens[label] = train_tokens_shortened
	token_count[label] = len(train_tokens_shortened)
	type_count[label] = len(set(train_tokens_shortened))

Processing Information/Explanation


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,"[▁Abstract, ▁Object, ive, :, ▁Report, ing, ▁bi...","[233973, 134549, 5844, 12, 34798, 214, 333, 16...",abstract objective: reporting bias due to soci...,"[▁abstract, ▁objective, :, ▁report, ing, ▁bi, ...",0,"[abstract, objective, reporting, bias, due, to..."
3,In 2009 the song was the focus of a successful...,Information/Explanation,CORE,English,"[▁In, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ...","[360, 1877, 70, 11531, 509, 70, 32153, 111, 10...",in 2009 the song was the focus of a successful...,"[▁in, ▁2009, ▁the, ▁song, ▁was, ▁the, ▁focus, ...",0,"[in, 2009, the, song, was, the, focus, of, a, ..."
39,Story: Whaling Page 4 -- M?ori and whaling Wha...,Information/Explanation,CORE,English,"[▁Story, :, ▁W, hal, ing, ▁Page, ▁4, ▁--, ▁M, ...","[30575, 12, 601, 4200, 214, 14231, 201, 4210, ...",story: whaling page 4 -- m?ori and whaling wha...,"[▁story, :, ▁w, hal, ing, ▁page, ▁4, ▁--, ▁m, ...",0,"[story, whaling, page, 4, --, mori, and, whali..."


Number of all tokens: 90496
Processing News


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
4,QuotW This was the week when neither rumours o...,News,CORE,English,"[▁Quo, t, W, ▁This, ▁was, ▁the, ▁week, ▁when, ...","[43851, 18, 1456, 3293, 509, 70, 5895, 3229, 2...",quotw this was the week when neither rumours o...,"[▁quo, tw, ▁this, ▁was, ▁the, ▁week, ▁when, ▁n...",0,"[quotw, this, was, the, week, when, neither, r..."
5,KaZaA claims it can't stop users sharing music...,News,CORE,English,"[▁Ka, Za, A, ▁claims, ▁it, ▁can, ', t, ▁stop, ...","[1136, 16737, 284, 140526, 442, 831, 25, 18, 7...",kazaa claims it can't stop users sharing music...,"[▁kaza, a, ▁claims, ▁it, ▁can, ', t, ▁stop, ▁u...",0,"[kazaa, claims, it, can't, stop, users, sharin..."
9,Nebraska fans checking out airfare for a trip ...,News,CORE,English,"[▁Ne, bra, ska, ▁fans, ▁checking, ▁out, ▁air, ...","[799, 2844, 937, 35992, 175199, 1810, 1831, 44...",nebraska fans checking out airfare for a trip ...,"[▁ne, bra, ska, ▁fans, ▁checking, ▁out, ▁air, ...",0,"[nebraska, fans, checking, out, airfare, for, ..."


Number of all tokens: 100620
Processing Instruction


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
6,When you first sign up with an online casino a...,Instruction,CORE,English,"[▁When, ▁you, ▁first, ▁sign, ▁up, ▁with, ▁an, ...","[14847, 398, 5117, 24092, 1257, 678, 142, 1118...",when you first sign up with an online casino a...,"[▁when, ▁you, ▁first, ▁sign, ▁up, ▁with, ▁an, ...",0,"[when, you, first, sign, up, with, an, online,..."
7,How to be the BEST Workplace Supervisor A work...,Instruction,CORE,English,"[▁How, ▁to, ▁be, ▁the, ▁BEST, ▁Work, place, ▁S...","[11249, 47, 186, 70, 121300, 27985, 23935, 426...",how to be the best workplace supervisor a work...,"[▁how, ▁to, ▁be, ▁the, ▁best, ▁work, place, ▁s...",0,"[how, to, be, the, best, workplace, supervisor..."
29,I am hungry and now have an hour with a tobler...,Instruction,CORE,English,"[▁I, ▁am, ▁hun, gry, ▁and, ▁now, ▁have, ▁an, ▁...","[87, 444, 1926, 47285, 136, 5036, 765, 142, 56...",i am hungry and now have an hour with a tobler...,"[▁i, ▁am, ▁hun, gry, ▁and, ▁now, ▁have, ▁an, ▁...",0,"[i, am, hungry, and, now, have, an, hour, with..."


Number of all tokens: 64195
Processing Opinion/Argumentation


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
8,popular themes AllMusic relies heavily on Java...,Opinion/Argumentation,CORE,English,"[▁popular, ▁them, es, ▁All, Music, ▁reli, es, ...","[5700, 2856, 90, 3164, 158257, 28702, 90, 1730...",popular themes allmusic relies heavily on java...,"[▁popular, ▁them, es, ▁all, music, ▁reli, es, ...",0,"[popular, themes, allmusic, relies, heavily, o..."
10,"I was just recalling how, about a year ago, my...",Opinion/Argumentation,CORE,English,"[▁I, ▁was, ▁just, ▁recall, ing, ▁how, ,, ▁abou...","[87, 509, 1660, 189232, 214, 3642, 4, 1672, 10...","i was just recalling how, about a year ago, my...","[▁i, ▁was, ▁just, ▁recall, ing, ▁how, ,, ▁abou...",0,"[i, was, just, recalling, how, about, a, year,..."
32,Combining our love of shiny things with some t...,Opinion/Argumentation,CORE,English,"[▁Combi, ning, ▁our, ▁love, ▁of, ▁shi, ny, ▁th...","[106935, 592, 2446, 5161, 111, 6544, 299, 8966...",combining our love of shiny things with some t...,"[▁com, bi, ning, ▁our, ▁love, ▁of, ▁shi, ny, ▁...",0,"[combining, our, love, of, shiny, things, with..."


Number of all tokens: 81309
Processing Forum


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
13,Quote ryan mead: I would like something that g...,Forum,CORE,English,"[▁Quote, ▁ry, an, ▁me, ad, :, ▁I, ▁would, ▁lik...","[109216, 5535, 66, 163, 712, 12, 87, 2806, 188...",quote ryan mead: i would like something that g...,"[▁quote, ▁ry, an, ▁me, ad, :, ▁i, ▁would, ▁lik...",0,"[quote, ryan, mead, i, would, like, something,..."
15,Changing ISP re Broadband - what happens to em...,Forum,CORE,English,"[▁Chang, ing, ▁I, SP, ▁re, ▁Bro, ad, band, ▁-,...","[108193, 214, 87, 9434, 456, 13177, 712, 8262,...",changing isp re broadband - what happens to em...,"[▁changing, ▁is, p, ▁re, ▁broad, band, ▁-, ▁wh...",0,"[changing, isp, re, broadband, -, what, happen..."
16,Comments for Post (25) I'm there with you. I'v...,Forum,CORE,English,"[▁Comments, ▁for, ▁Post, ▁(25), ▁I, ', m, ▁the...","[11427, 100, 2795, 59791, 87, 25, 39, 2685, 67...",comments for post (25) i'm there with you. i'v...,"[▁comments, ▁for, ▁post, ▁(25), ▁i, ', m, ▁the...",0,"[comments, for, post, 25, i'm, there, with, yo..."


Number of all tokens: 44728
Processing Prose/Lyrical


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
14,One Dance Too Many The night I first met Ziegl...,Prose/Lyrical,CORE,English,"[▁One, ▁Dance, ▁Too, ▁Many, ▁The, ▁night, ▁I, ...","[6561, 67022, 56374, 52455, 581, 17431, 87, 51...",one dance too many the night i first met ziegl...,"[▁one, ▁dance, ▁too, ▁many, ▁the, ▁night, ▁i, ...",0,"[one, dance, too, many, the, night, i, first, ..."
57,"Household Tales, by Brothers Grimm The Story o...",Prose/Lyrical,CORE,English,"[▁House, hold, ▁Tale, s, ,, ▁by, ▁Brother, s, ...","[13038, 16200, 59144, 7, 4, 390, 67921, 7, 106...","household tales, by brothers grimm the story o...","[▁household, ▁tales, ,, ▁by, ▁brother, s, ▁gri...",0,"[household, tales, by, brothers, grimm, the, s..."
72,Empire! Empire! I Would Have Stolen You A Whol...,Prose/Lyrical,CORE,English,"[▁Empire, !, ▁Empire, !, ▁I, ▁Would, ▁Have, ▁S...","[145359, 38, 145359, 38, 87, 154559, 31901, 73...",empire! empire! i would have stolen you a whol...,"[▁em, pire, !, ▁em, pire, !, ▁i, ▁would, ▁have...",0,"[empire, empire, i, would, have, stolen, you, ..."


Number of all tokens: 35162
Processing Legal


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
35,Full Terms and Conditions Eligibility to enter...,Legal,CORE,English,"[▁Full, ▁Terms, ▁and, ▁Condi, tions, ▁E, ligi,...","[9312, 165504, 136, 46347, 5256, 241, 7883, 83...",full terms and conditions eligibility to enter...,"[▁full, ▁terms, ▁and, ▁conditions, ▁e, ligi, b...",0,"[full, terms, and, conditions, eligibility, to..."
110,Commonwealth Consolidated Acts INCOME TAX ASSE...,Legal,CORE,English,"[▁Common, we, al, th, ▁Con, solid, ated, ▁Act,...","[151301, 1177, 289, 927, 1657, 97281, 27686, 2...",commonwealth consolidated acts income tax asse...,"[▁common, we, al, th, ▁consolida, ted, ▁act, s...",0,"[commonwealth, consolidated, acts, income, tax..."
122,"This Terms of Use Agreement (""Agreement"") is b...",Legal,CORE,English,"[▁This, ▁Terms, ▁of, ▁Use, ▁Agreement, ▁("", A,...","[3293, 165504, 111, 36836, 186670, 24073, 284,...","this terms of use agreement (""agreement"") is b...","[▁this, ▁terms, ▁of, ▁use, ▁agreement, ▁("", a,...",0,"[this, terms, of, use, agreement, agreement, i..."


Number of all tokens: 22944
Processing Promotion


Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids,text_norm,tokens_train_norm,MT_text,words
79,Post navigation Citizen Fish are back on the r...,Promotion,CORE,English,"[▁Post, ▁navigation, ▁Citizen, ▁Fish, ▁are, ▁b...","[2795, 134470, 193223, 104796, 621, 4420, 98, ...",post navigation citizen fish are back on the r...,"[▁post, ▁navigation, ▁citizen, ▁fish, ▁are, ▁b...",0,"[post, navigation, citizen, fish, are, back, o..."
130,Win yourself a FREE copy of the BradyGames off...,Promotion,CORE,English,"[▁Win, ▁yourself, ▁a, ▁FREE, ▁copy, ▁of, ▁the,...","[17686, 31949, 10, 86697, 43658, 111, 70, 5859...",win yourself a free copy of the bradygames off...,"[▁win, ▁yourself, ▁a, ▁free, ▁copy, ▁of, ▁the,...",0,"[win, yourself, a, free, copy, of, the, bradyg..."
179,Do You Want To Know The Quick Secret for On Pa...,Promotion,CORE,English,"[▁Do, ▁You, ▁Want, ▁To, ▁Know, ▁The, ▁Quick, ▁...","[984, 2583, 42335, 717, 70829, 581, 89038, 390...",do you want to know the quick secret for on pa...,"[▁do, ▁you, ▁want, ▁to, ▁know, ▁the, ▁quick, ▁...",0,"[do, you, want, to, know, the, quick, secret, ..."


Number of all tokens: 65152


In [20]:
# Calculate numbers for each label
label_results_train = pd.DataFrame({"token_count": token_count, "type_count": type_count})

print(label_results_train.to_markdown())

|                         |   token_count |   type_count |
|:------------------------|--------------:|-------------:|
| Information/Explanation |         90496 |        13430 |
| News                    |        100620 |        14100 |
| Instruction             |         64195 |         7790 |
| Opinion/Argumentation   |         81309 |        11486 |
| Forum                   |         44728 |         7068 |
| Prose/Lyrical           |         35162 |         5509 |
| Legal                   |         22944 |         3202 |
| Promotion               |         65152 |         9694 |


### Create label-level token counts for test sets

In [21]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict["uk"].keys()

dict_keys(['accuracy', 'micro_f1', 'macro_f1', 'label_scores', 'dataset', 'token_overlap', 'English_word_overlap'])

In [23]:
# Do the same as with the dataset, but on every language
lang_results = {}

for lang in list(main_dict.keys()):
	print(f"Processing {lang}")

	label_token_dict = {}

	# Current df
	df = pd.DataFrame(main_dict[lang]["dataset"])
	display(df.head(2))

	if lang != "mt":
		for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
			print(f"Processing {label}")

			label_df = df[df["y_true"] == label]

			# Create a list of tokens
			token_list = []

			for i in label_df["word_list"].to_list():
				token_list.extend(i)

			# Add to dictionary
			label_token_dict[label] = token_list

	elif lang == "mt":
		for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Promotion']:
			print(f"Processing {label}")

			label_df = df[df["y_true"] == label]

			# Create a list of tokens
			token_list = []

			for i in label_df["word_list"].to_list():
				token_list.extend(i)

			# Add to dictionary
			label_token_dict[label] = token_list

	# Add to main dict
	main_dict[lang]["English_word_overlap"]["label_level_word_lists"] = label_token_dict


Processing mt


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.mt.402244,News,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...","Angelo Chetcuti, will be replacing Bjorn Vassa...",{'text_id': 'macocu.mt.402244'},News,"[▁Angel, o, ▁Che, t, cuti, ,, ▁se, ▁j, kun, ▁q...","[26902, 31, 5024, 18, 64969, 4, 40, 1647, 6262...","[angelo, chetcuti, will, be, replacing, bjorn,..."
1,macocu.mt.377203,Prose/Lyrical,Poltergeist jirreferi għal fenomeni oħra tal-m...,"Poltergeist refers to other woman's phenomena,...",{'text_id': 'macocu.mt.377203'},Opinion/Argumentation,"[▁Pol, ter, geist, ▁jir, re, feri, ▁g, ħ, al, ...","[9017, 720, 178490, 52826, 107, 26926, 706, 24...","[poltergeist, refers, to, other, woman's, phen..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Promotion
Processing el


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.el.1525713,Instruction,Ενημέρωση του Pegasus Estiasi με τις εισερχόμε...,Update Pegasus Estiasi with Incoming Calls\n\n...,{'text_id': 'macocu.el.1525713'},Instruction,"[▁Ενημέρωση, ▁του, ▁Pegasus, ▁Esti, asi, ▁με, ...","[236422, 385, 241060, 60271, 1544, 558, 1713, ...","[update, pegasus, estiasi, with, incoming, cal..."
1,macocu.el.3525724,Forum,Η τιμή της έκδοσης 8GB/ 128GB είναι 1.299 ευρώ...,"The price of 8GB/ 128GB is € 1,299, of the 12G...",{'text_id': 'macocu.el.3525724'},Forum,"[▁Η, ▁τιμή, ▁της, ▁έκδοση, ς, ▁8, GB, /, ▁128,...","[1700, 77118, 463, 110873, 235, 382, 8359, 64,...","[the, price, of, 8gb, 128gb, is, €, 1299, of, ..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing tr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.tr.15851513,Instruction,AÖL Ders Seçimi ve Sınav Giriş Merkezi Belirle...,AÖL warned of frequent negligence and errors o...,{'text_id': 'macocu.tr.15851513'},Instruction,"[▁A, Ö, L, ▁Der, s, ▁Seçim, i, ▁ve, ▁Sınav, ▁G...","[62, 8655, 866, 1310, 7, 166134, 14, 173, 1762...","[aöl, warned, of, frequent, negligence, and, e..."
1,macocu.tr.12699738,Legal,Banka promosyonu ihalesinde uygulanacak kriter...,Criteria to be applied in the tender for bank ...,{'text_id': 'macocu.tr.12699738'},Legal,"[▁Banka, ▁promo, syon, u, ▁i, hale, sinde, ▁uy...","[81847, 8891, 10270, 34, 17, 50742, 19209, 633...","[criteria, to, be, applied, in, the, tender, f..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing sq


Unnamed: 0,text_id,y_pred,text_length,domain,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.sq.1061396,Opinion/Argumentation,341.0,fjalaejetes.org,Blog\n\n“Unë të kam dashur me një dashuri të p...,"Blog\n\n""I loved you with eternal love.""Jer 31...","{'text_id': 'macocu.sq.1061396', 'domain': 'fj...",Opinion/Argumentation,"[▁Blog, ▁“, U, në, ▁të, ▁kam, ▁dashur, ▁me, ▁n...","[5061, 52, 1062, 3208, 134, 3840, 57168, 163, ...","[blog, i, loved, you, with, eternal, lovejer, ..."
1,macocu.sq.183383,Legal,140.0,eukos.org,Liria nga keqtrajtimi\n\nKonventa e të Drejtav...,Freedom from mistreatment\n\nStudent Rights Co...,"{'text_id': 'macocu.sq.183383', 'domain': 'euk...",Legal,"[▁Li, ria, ▁nga, ▁keq, t, raj, timi, ▁Kon, ven...","[1261, 1651, 817, 39184, 18, 10185, 20520, 369...","[freedom, from, mistreatment, student, rights,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing is


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.is.1301366,Instruction,Sækja um fulla aðild \n\nKennitala * \n\nNetfa...,Apply full membership\n\nSocial Security numbe...,{'text_id': 'macocu.is.1301366'},Information/Explanation,"[▁Sæ, kja, ▁um, ▁full, a, ▁að, ild, ▁Kenn, ita...","[71595, 28643, 286, 4393, 11, 389, 38472, 5906...","[apply, full, membership, social, security, nu..."
1,macocu.is.1528713,Information/Explanation,Inngangur Íslenskur landbúnaður hefur þróast ö...,Introduction Icelandic agriculture has evolved...,{'text_id': 'macocu.is.1528713'},Information/Explanation,"[▁Inn, gangur, ▁Íslensk, ur, ▁land, búnað, ur,...","[11151, 160409, 122022, 474, 3551, 74026, 474,...","[introduction, icelandic, agriculture, has, ev..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing uk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.uk.419381,Instruction,Нестандартний підхід для виготовлення Акваріум...,A non -standard approach for making aquarium.O...,{'text_id': 'macocu.uk.419381'},Instruction,"[▁Не, стандарт, ний, ▁підхід, ▁для, ▁виготовле...","[1087, 159257, 1394, 205827, 518, 166156, 1307...","[a, non, -standard, approach, for, making, aqu..."
1,macocu.uk.16993168,Prose/Lyrical,МУЧЕНИКИ БУЧА-ІРПІНЬ \n\nНе снилось полянам й ...,The martyrs of Bucha-Irpin\n\nThe glades and t...,{'text_id': 'macocu.uk.16993168'},Prose/Lyrical,"[▁М, УЧ, ЕНИ, КИ, ▁, БУ, ЧА, -, ІР, П, ІН, Ь, ...","[1435, 87706, 78591, 38682, 6, 39932, 75333, 9...","[the, martyrs, of, bucha-irpin, the, glades, a..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing ca


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,macocu.ca.2248072,Opinion/Argumentation,Pàgines \n\nEnfeinada \n\nPorto uns dies una m...,Pages\n\nCoined\n\nI have been a little busy f...,{'text_id': 'macocu.ca.2248072'},Forum,"[▁P, à, gine, s, ▁En, fei, nada, ▁Porto, ▁uns,...","[436, 1298, 63023, 7, 357, 51899, 28866, 24952...","[pages, coined, i, have, been, a, little, busy..."
1,macocu.ca.756254,Information/Explanation,Info \n\nLa Casa nova dels Banys de Sant Vicen...,Info\n\nThe Casa Nova dels Banys de Sant Vicen...,{'text_id': 'macocu.ca.756254'},Information/Explanation,"[▁Info, ▁La, ▁Casa, ▁nova, ▁dels, ▁Ban, ys, ▁d...","[14048, 239, 8591, 4678, 2323, 5458, 4778, 8, ...","[info, the, casa, nova, dels, banys, de, sant,..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing mk


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416...","[ekshui, tcl, makes, smartphones, and, tcl, is..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44...","[red, valentino, predicts, a, storm, of, print..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing hr


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
0,CLASSLA-web.hr.1033815,Promotion,"O proizvodu\nColor Transformer, za pametno i j...","About the Color Transformer product, for smart...","CLASSLA-web.hr.1033815', 'domain': 'hairshop.hr'}",Promotion,"[▁O, ▁proizvod, u, ▁Color, ▁Trans, former, ,, ...","[180, 43170, 34, 51193, 11062, 82772, 4, 80, 6...","[about, the, color, transformer, product, for,..."
2,CLASSLA-web.hr.1119579,Promotion,Sunčano selo / Sunny village\nNa obroncima Bil...,Sunshine / Sunny Village on the slopes of Bilo...,"CLASSLA-web.hr.1119579', 'domain': 'vikendi.com'}",Promotion,"[▁Sun, čan, o, ▁se, lo, ▁/, ▁Sunny, ▁village, ...","[7550, 17129, 31, 40, 365, 248, 151197, 54427,...","[sunshine, sunny, village, on, the, slopes, of..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion
Processing sl


Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids,word_list
1,CLASSLA-web.sl.1087171,Opinion/Argumentation,Kitajsko mesto duhov\nV Notranji Mongoliji ras...,The Chinese ghost city in Inner Mongolia is gr...,"CLASSLA-web.sl.1087171', 'domain': 'mladina.si'}",News,"[▁Kita, jsko, ▁mesto, ▁duhov, ▁V, ▁Notranj, i,...","[12992, 30878, 22041, 87909, 310, 240213, 14, ...","[the, chinese, ghost, city, in, inner, mongoli..."
2,CLASSLA-web.sl.1215246,Instruction,"Krompir skuhamo, olupimo in narežemo na tanke ...","Cook the potatoes, peel and cut into thin ring...","CLASSLA-web.sl.1215246', 'domain': 'emorje.com'}",Instruction,"[▁Krom, pir, ▁s, kuha, mo, ,, ▁olup, imo, ▁in,...","[128458, 21738, 91, 60863, 432, 4, 37663, 2414...","[cook, the, potatoes, peel, and, cut, into, th..."


Processing Information/Explanation
Processing News
Processing Instruction
Processing Opinion/Argumentation
Processing Forum
Processing Prose/Lyrical
Processing Legal
Processing Promotion


In [24]:
main_dict["sl"]["English_word_overlap"]["label_level_word_lists"]

{'Information/Explanation': ['the',
  'mansion',
  'may',
  'be',
  'the',
  'successor',
  'to',
  'one',
  'of',
  'the',
  'many',
  'sources',
  'of',
  'medieval',
  'courts',
  'in',
  'the',
  'wider',
  'vipava',
  'area',
  'it',
  'was',
  'first',
  'mentioned',
  'in',
  '1630',
  'in',
  'the',
  'census',
  'of',
  "balthazar's",
  'leg',
  'raumschüssla',
  'the',
  'lord',
  'at',
  'belnek',
  'near',
  'moravce',
  'in',
  'the',
  'gorenjska',
  'region',
  'and',
  'in',
  'vrhpolje',
  'near',
  'vipava',
  'due',
  'to',
  'its',
  'location',
  'by',
  'the',
  'water',
  'it',
  'got',
  'the',
  'german',
  'name',
  'schönau',
  'which',
  'means',
  'a',
  'beautiful',
  'log',
  'the',
  'census',
  'of',
  'his',
  'legacy',
  'shows',
  'that',
  'the',
  'ramschüssl',
  'lineage',
  'had',
  'an',
  'estate',
  'near',
  'vrhpolje',
  'in',
  '1571',
  'in',
  'the',
  'village',
  'of',
  'dolenje',
  'vipava',
  'supposedly',
  'vrhpolje',
  'like',
  '

In [26]:
token_overlap_label_results = {}
results = {}

# Loop through the datasets and labels and calculate token overlap
for lang in list(main_dict.keys()):
	print(lang)
	# Create a dict for labels
	label_overlap = {}
	label_overlap_tokens = {}

	# loop through labels
	if lang != "mt":
		for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Legal','Promotion']:
			print(label)
			token_list_test = main_dict[lang]["English_word_overlap"]["label_level_word_lists"][label]

			token_list_train = label_tokens[label]

			# See how many tokens do not overlap
			no_overlap_counter = 0

			# Save tokens that overlap for further inspection
			overlap_token_list = []

			# We calculate overlap by counting how many tokens do not appear in both sets
			for token in tqdm(token_list_test):
				if token not in token_list_train:
					no_overlap_counter += 1
				else:
					overlap_token_list.append(token)

			# Out of all tokens in test set, how many do not overlap with train set?
			no_overlap_per = no_overlap_counter/len(token_list_test)

			# Calculate percentage of overlap based on that
			overlap_per = 1-no_overlap_per

			print(f"Percentage of overlap: {overlap_per}")

			label_overlap[label] = overlap_per
			label_overlap_tokens[label] = overlap_token_list

			# Add to the results
			results[f"{lang}-{label}"] = overlap_per

		# Add the list of all tokens to the dictionary
		main_dict[lang]["English_word_overlap"]["label_overlap_percentage"] = label_overlap
		main_dict[lang]["English_word_overlap"]["label_overlap_token_list"] = label_overlap_tokens
	else:
		# Skip "Legal" for Maltese, because we do not have this label there
		for label in ['Information/Explanation', 'News', 'Instruction','Opinion/Argumentation', 'Forum', 'Prose/Lyrical', 'Promotion']:
			print(label)
			token_list_test = main_dict[lang]["English_word_overlap"]["label_level_word_lists"][label]

			token_list_train = label_tokens[label]

			# See how many tokens do not overlap
			no_overlap_counter = 0

			# Save tokens that overlap for further inspection
			overlap_token_list = []

			# We calculate overlap by counting how many tokens do not appear in both sets
			for token in tqdm(token_list_test):
				if token not in token_list_train:
					no_overlap_counter += 1
				else:
					overlap_token_list.append(token)

			# Out of all tokens in test set, how many do not overlap with train set?
			no_overlap_per = no_overlap_counter/len(token_list_test)

			# Calculate percentage of overlap based on that
			overlap_per = 1-no_overlap_per

			print(f"Percentage of overlap: {overlap_per}")

			label_overlap[label] = overlap_per
			label_overlap_tokens[label] = overlap_token_list

			# Add to the results
			results[f"{lang}-{label}"] = overlap_per

		# Add the list of all tokens to the dictionary
		main_dict[lang]["English_word_overlap"]["label_overlap_percentage"] = label_overlap
		main_dict[lang]["English_word_overlap"]["label_overlap_token_list"] = label_overlap_tokens		

mt
Information/Explanation


100%|██████████| 6194/6194 [00:01<00:00, 6009.29it/s]


Percentage of overlap: 0.8609945108169196
News


100%|██████████| 3667/3667 [00:00<00:00, 6472.59it/s]


Percentage of overlap: 0.869375511317153
Instruction


100%|██████████| 7402/7402 [00:00<00:00, 8205.57it/s]


Percentage of overlap: 0.83977303431505
Opinion/Argumentation


100%|██████████| 3445/3445 [00:00<00:00, 6807.18it/s]


Percentage of overlap: 0.8409288824383164
Forum


100%|██████████| 509/509 [00:00<00:00, 8099.85it/s]


Percentage of overlap: 0.7367387033398821
Prose/Lyrical


100%|██████████| 492/492 [00:00<00:00, 11256.93it/s]


Percentage of overlap: 0.8028455284552846
Promotion


100%|██████████| 4779/4779 [00:00<00:00, 7598.25it/s]


Percentage of overlap: 0.8367859384808537
el
Information/Explanation


100%|██████████| 3799/3799 [00:00<00:00, 5295.11it/s]


Percentage of overlap: 0.8315346143722032
News


100%|██████████| 2358/2358 [00:00<00:00, 5838.38it/s]


Percentage of overlap: 0.8443596268023749
Instruction


100%|██████████| 1901/1901 [00:00<00:00, 6656.94it/s]


Percentage of overlap: 0.823250920568122
Opinion/Argumentation


100%|██████████| 4642/4642 [00:00<00:00, 7147.92it/s]


Percentage of overlap: 0.8459715639810427
Forum


100%|██████████| 3610/3610 [00:00<00:00, 10720.04it/s]


Percentage of overlap: 0.8088642659279779
Prose/Lyrical


100%|██████████| 1739/1739 [00:00<00:00, 15250.39it/s]


Percentage of overlap: 0.8516388729154687
Legal


100%|██████████| 2583/2583 [00:00<00:00, 14466.52it/s]


Percentage of overlap: 0.7653890824622532
Promotion


100%|██████████| 1533/1533 [00:00<00:00, 6886.90it/s]


Percentage of overlap: 0.8212654924983692
tr
Information/Explanation


100%|██████████| 1584/1584 [00:00<00:00, 5204.28it/s]


Percentage of overlap: 0.8175505050505051
News


100%|██████████| 2510/2510 [00:00<00:00, 5574.99it/s]


Percentage of overlap: 0.8557768924302789
Instruction


100%|██████████| 3555/3555 [00:00<00:00, 8019.95it/s]


Percentage of overlap: 0.8413502109704641
Opinion/Argumentation


100%|██████████| 4918/4918 [00:00<00:00, 7587.71it/s]


Percentage of overlap: 0.8635624237494917
Forum


100%|██████████| 2359/2359 [00:00<00:00, 10600.54it/s]


Percentage of overlap: 0.8151759220008479
Prose/Lyrical


100%|██████████| 2473/2473 [00:00<00:00, 13429.55it/s]


Percentage of overlap: 0.8123736352608169
Legal


100%|██████████| 3071/3071 [00:00<00:00, 16637.68it/s]


Percentage of overlap: 0.7867144252686421
Promotion


100%|██████████| 2909/2909 [00:00<00:00, 8290.48it/s]


Percentage of overlap: 0.8576830525953936
sq
Information/Explanation


100%|██████████| 3122/3122 [00:00<00:00, 5875.32it/s]


Percentage of overlap: 0.8414477898782832
News


100%|██████████| 1459/1459 [00:00<00:00, 5735.57it/s]


Percentage of overlap: 0.8588074023303632
Instruction


100%|██████████| 1825/1825 [00:00<00:00, 8065.94it/s]


Percentage of overlap: 0.8471232876712329
Opinion/Argumentation


100%|██████████| 3429/3429 [00:00<00:00, 7003.51it/s]


Percentage of overlap: 0.8404782735491397
Forum


100%|██████████| 1954/1954 [00:00<00:00, 10588.61it/s]


Percentage of overlap: 0.8096212896622313
Prose/Lyrical


100%|██████████| 2125/2125 [00:00<00:00, 11472.80it/s]


Percentage of overlap: 0.8028235294117647
Legal


100%|██████████| 1799/1799 [00:00<00:00, 17804.56it/s]


Percentage of overlap: 0.8054474708171206
Promotion


100%|██████████| 902/902 [00:00<00:00, 7437.98it/s]


Percentage of overlap: 0.8259423503325942
is
Information/Explanation


100%|██████████| 1475/1475 [00:00<00:00, 6619.00it/s]


Percentage of overlap: 0.8847457627118644
News


100%|██████████| 3172/3172 [00:00<00:00, 5843.87it/s]


Percentage of overlap: 0.862547288776797
Instruction


100%|██████████| 3458/3458 [00:00<00:00, 8152.72it/s]


Percentage of overlap: 0.8389242336610758
Opinion/Argumentation


100%|██████████| 4477/4477 [00:00<00:00, 7885.98it/s]


Percentage of overlap: 0.8646415010051374
Forum


100%|██████████| 1651/1651 [00:00<00:00, 10637.07it/s]


Percentage of overlap: 0.8188976377952756
Prose/Lyrical


100%|██████████| 3764/3764 [00:00<00:00, 13236.36it/s]


Percentage of overlap: 0.825451647183847
Legal


100%|██████████| 3273/3273 [00:00<00:00, 14407.82it/s]


Percentage of overlap: 0.7222731439046746
Promotion


100%|██████████| 1989/1989 [00:00<00:00, 8617.37it/s]


Percentage of overlap: 0.8547008547008547
uk
Information/Explanation


100%|██████████| 3624/3624 [00:00<00:00, 6439.22it/s]


Percentage of overlap: 0.8730684326710817
News


100%|██████████| 2327/2327 [00:00<00:00, 5867.41it/s]


Percentage of overlap: 0.857327030511388
Instruction


100%|██████████| 3592/3592 [00:00<00:00, 7532.24it/s]


Percentage of overlap: 0.8285077951002227
Opinion/Argumentation


100%|██████████| 4823/4823 [00:00<00:00, 6628.73it/s]


Percentage of overlap: 0.8397263114244247
Forum


100%|██████████| 2536/2536 [00:00<00:00, 10733.42it/s]


Percentage of overlap: 0.819006309148265
Prose/Lyrical


100%|██████████| 3510/3510 [00:00<00:00, 11616.43it/s]


Percentage of overlap: 0.8122507122507122
Legal


100%|██████████| 2781/2781 [00:00<00:00, 16677.57it/s]


Percentage of overlap: 0.7756202804746494
Promotion


100%|██████████| 1843/1843 [00:00<00:00, 6787.76it/s]


Percentage of overlap: 0.834508952794357
ca
Information/Explanation


100%|██████████| 2420/2420 [00:00<00:00, 5052.29it/s]


Percentage of overlap: 0.8252066115702479
News


100%|██████████| 1959/1959 [00:00<00:00, 5777.49it/s]


Percentage of overlap: 0.8427769270035732
Instruction


100%|██████████| 1534/1534 [00:00<00:00, 9157.44it/s]


Percentage of overlap: 0.8559322033898304
Opinion/Argumentation


100%|██████████| 2942/2942 [00:00<00:00, 7784.01it/s]


Percentage of overlap: 0.8633582596872875
Forum


100%|██████████| 3294/3294 [00:00<00:00, 11828.24it/s]


Percentage of overlap: 0.851244687310261
Prose/Lyrical


100%|██████████| 2863/2863 [00:00<00:00, 13176.93it/s]


Percentage of overlap: 0.8075445337059028
Legal


100%|██████████| 2403/2403 [00:00<00:00, 15905.44it/s]


Percentage of overlap: 0.7682064086558469
Promotion


100%|██████████| 2670/2670 [00:00<00:00, 7331.54it/s]


Percentage of overlap: 0.8404494382022472
mk
Information/Explanation


100%|██████████| 1951/1951 [00:00<00:00, 6045.95it/s]


Percentage of overlap: 0.8713480266529985
News


100%|██████████| 2430/2430 [00:00<00:00, 7087.73it/s]


Percentage of overlap: 0.9037037037037037
Instruction


100%|██████████| 2175/2175 [00:00<00:00, 10901.21it/s]


Percentage of overlap: 0.9006896551724138
Opinion/Argumentation


100%|██████████| 1965/1965 [00:00<00:00, 7777.22it/s]


Percentage of overlap: 0.8661577608142493
Forum


100%|██████████| 3239/3239 [00:00<00:00, 11968.82it/s]


Percentage of overlap: 0.8437789441185551
Prose/Lyrical


100%|██████████| 2542/2542 [00:00<00:00, 16192.04it/s]


Percentage of overlap: 0.8509047993705743
Legal


100%|██████████| 2302/2302 [00:00<00:00, 17752.25it/s]


Percentage of overlap: 0.8153779322328409
Promotion


100%|██████████| 1727/1727 [00:00<00:00, 8590.12it/s]


Percentage of overlap: 0.8702953097857556
hr
Information/Explanation


100%|██████████| 2652/2652 [00:00<00:00, 5238.48it/s]


Percentage of overlap: 0.8367269984917044
News


100%|██████████| 2365/2365 [00:00<00:00, 6209.62it/s]


Percentage of overlap: 0.8875264270613108
Instruction


100%|██████████| 1084/1084 [00:00<00:00, 8188.09it/s]


Percentage of overlap: 0.8809963099630996
Opinion/Argumentation


100%|██████████| 2889/2889 [00:00<00:00, 8750.65it/s]


Percentage of overlap: 0.893042575285566
Forum


100%|██████████| 2958/2958 [00:00<00:00, 11609.39it/s]


Percentage of overlap: 0.8279242731575389
Prose/Lyrical


100%|██████████| 1842/1842 [00:00<00:00, 12517.06it/s]


Percentage of overlap: 0.8029315960912052
Legal


100%|██████████| 2645/2645 [00:00<00:00, 15448.54it/s]


Percentage of overlap: 0.7776937618147448
Promotion


100%|██████████| 2946/2946 [00:00<00:00, 8671.04it/s]


Percentage of overlap: 0.8699932111337407
sl
Information/Explanation


100%|██████████| 2227/2227 [00:00<00:00, 5727.85it/s]


Percentage of overlap: 0.8495734171531208
News


100%|██████████| 2738/2738 [00:00<00:00, 6176.80it/s]


Percentage of overlap: 0.877282688093499
Instruction


100%|██████████| 2254/2254 [00:00<00:00, 8680.48it/s]


Percentage of overlap: 0.8708961845607808
Opinion/Argumentation


100%|██████████| 1865/1865 [00:00<00:00, 7973.48it/s]


Percentage of overlap: 0.8879356568364611
Forum


100%|██████████| 3296/3296 [00:00<00:00, 10378.51it/s]


Percentage of overlap: 0.8143203883495146
Prose/Lyrical


100%|██████████| 2033/2033 [00:00<00:00, 13273.01it/s]


Percentage of overlap: 0.8396458435809149
Legal


100%|██████████| 2644/2644 [00:00<00:00, 20064.88it/s]


Percentage of overlap: 0.8252647503782149
Promotion


100%|██████████| 2203/2203 [00:00<00:00, 8371.60it/s]

Percentage of overlap: 0.8583749432591921





In [27]:
results

{'mt-Information/Explanation': 0.8609945108169196,
 'mt-News': 0.869375511317153,
 'mt-Instruction': 0.83977303431505,
 'mt-Opinion/Argumentation': 0.8409288824383164,
 'mt-Forum': 0.7367387033398821,
 'mt-Prose/Lyrical': 0.8028455284552846,
 'mt-Promotion': 0.8367859384808537,
 'el-Information/Explanation': 0.8315346143722032,
 'el-News': 0.8443596268023749,
 'el-Instruction': 0.823250920568122,
 'el-Opinion/Argumentation': 0.8459715639810427,
 'el-Forum': 0.8088642659279779,
 'el-Prose/Lyrical': 0.8516388729154687,
 'el-Legal': 0.7653890824622532,
 'el-Promotion': 0.8212654924983692,
 'tr-Information/Explanation': 0.8175505050505051,
 'tr-News': 0.8557768924302789,
 'tr-Instruction': 0.8413502109704641,
 'tr-Opinion/Argumentation': 0.8635624237494917,
 'tr-Forum': 0.8151759220008479,
 'tr-Prose/Lyrical': 0.8123736352608169,
 'tr-Legal': 0.7867144252686421,
 'tr-Promotion': 0.8576830525953936,
 'sq-Information/Explanation': 0.8414477898782832,
 'sq-News': 0.8588074023303632,
 'sq-Inst

In [28]:
# Inspect the results
overlap_df = pd.DataFrame({"label": list(results.keys()), "overlap": list(results.values())})
overlap_df

Unnamed: 0,label,overlap
0,mt-Information/Explanation,0.860995
1,mt-News,0.869376
2,mt-Instruction,0.839773
3,mt-Opinion/Argumentation,0.840929
4,mt-Forum,0.736739
...,...,...
74,sl-Opinion/Argumentation,0.887936
75,sl-Forum,0.814320
76,sl-Prose/Lyrical,0.839646
77,sl-Legal,0.825265


In [29]:
# Save the label-level-overlap
with open("datasets/label-level-English-world-overlap.csv", "w") as file:
	overlap_df.to_csv(file)

In [None]:
# Save the extended json dict
with open("manual-annotations/multilingual-genre-annotated-test-set.json", "w") as file:
	json.dump(main_dict, file)