In [11]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

import pandas as pd
import json
from tqdm import tqdm

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
# Import the final dataset with test sets
with open("manual-annotations/multilingual-genre-annotated-test-set.json") as main_file:
	main_dict = json.load(main_file)

main_dict.keys()

dict_keys(['mt', 'el', 'tr', 'sq', 'is', 'uk', 'ca', 'mk', 'hr'])

In [37]:
# Test the approach
df = pd.DataFrame(main_dict["mk"]["dataset"])
df.head(2)

Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News


In [38]:
# Apply the XLM-R-base tokenizer over the dataset
from transformers import AutoTokenizer, XLMRobertaModel
import torch

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

tokens = []
integers = []
token_list = []

for text in tqdm(df.text.to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens, except the beginning (<s>) and end (</s>) token
	current_tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	tokens.append(current_tokens)
	token_list.extend(current_tokens)
	integers.append(encoded_text.input_ids[1:-1])

df["tokens"] = tokens
df["token_ids"] = integers

print(token_list[:10])
print(len(token_list))

df.head(3)

# Convert tokens back to words
#print(tokenizer.convert_tokens_to_string(tokens))

Token indices sequence length is longer than the specified maximum sequence length for this model (816 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 80/80 [00:00<00:00, 568.59it/s]

['▁Ек', 'шу', 'ли', ',', '▁T', 'CL', '▁ги', '▁прави', '▁смартфон', 'овите']
30387





Unnamed: 0,text_id,y_pred,text,translation,metadata,y_true,tokens,token_ids
0,CLASSLA-web.mk.1000486,Forum,"Екшули, TCL ги прави смартфоновите, а TCL е см...","Ekshui, TCL makes smartphones, and TCL is hous...","CLASSLA-web.mk.1000486', 'domain': 'forum.carc...",Forum,"[▁Ек, шу, ли, ,, ▁T, CL, ▁ги, ▁прави, ▁смартфо...","[75430, 12213, 546, 4, 384, 37486, 1670, 10416..."
1,CLASSLA-web.mk.1009071,News,Red Valentino прогнозира бура од принтови за с...,Red Valentino predicts a storm of prints for n...,"CLASSLA-web.mk.1009071', 'domain': 'fashionel....",News,"[▁Red, ▁Valentino, ▁прогноз, ира, ▁бур, а, ▁од...","[6096, 166361, 45404, 6790, 21623, 59, 338, 44..."
2,CLASSLA-web.mk.1043814,Instruction,Најголем фактор на ризик за развој на проширен...,The biggest risk factor for the development of...,"CLASSLA-web.mk.1043814', 'domain': 'puls24.mk'}",Instruction,"[▁Најголем, ▁фактор, ▁на, ▁ризик, ▁за, ▁развој...","[238783, 25873, 29, 50641, 61, 25348, 29, 591,..."


In [23]:
# Tokenize also the train dataset
from datasets import load_dataset

train = load_dataset("TajaKuzman/X-GENRE-multilingual-text-genre-dataset", "train")

# To open as Pandas DataFrame:
train_df = pd.DataFrame(train["train"])

train_df

Downloading readme: 100%|██████████| 14.2k/14.2k [00:00<00:00, 1.38MB/s]


Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English
3,In 2009 the song was the focus of a successful...,Information/Explanation,CORE,English
4,QuotW This was the week when neither rumours o...,News,CORE,English
...,...,...,...,...
1767,Sound Pillow represents another way in which t...,Promotion,FTD,English
1768,Night vision scopes have been a quite signific...,Instruction,FTD,English
1769,Personal stories - Leigh I was diagnosed over ...,Opinion/Argumentation,FTD,English
1770,"A few days ago , in a galaxy far , far away .....",Prose/Lyrical,FTD,English


In [26]:
# Tokenize the train dataset as well
tokens_train = []
integers_train = []
token_list_train = []

for text in tqdm(train_df.text.to_list()):
	encoded_text = tokenizer(text)
	# Take all tokens_train, except the beginning (<s>) and end (</s>) token
	current_tokens_train = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)[1:-1]
	tokens_train.append(current_tokens_train)
	token_list_train.extend(current_tokens_train)
	integers_train.append(encoded_text.input_ids[1:-1])

train_df["tokens_train"] = tokens_train
train_df["token_ids"] = integers_train

print(token_list_train[:10])
print(len(token_list_train))

train_df.head(3)

100%|██████████| 1772/1772 [00:09<00:00, 186.50it/s]

['▁See', 'king', '▁All', '▁Things', '▁Br', 'illian', 't', '▁"', 'I', '▁want']
2710819





Unnamed: 0,text,labels,dataset,language,tokens_train,token_ids
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,"[▁See, king, ▁All, ▁Things, ▁Br, illian, t, ▁""...","[6872, 6048, 3164, 119175, 13008, 162076, 18, ..."
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,"[▁Meet, ▁Or, ch, id, ▁du, ▁Bo, is, ▁I, ▁first,...","[72626, 3347, 206, 532, 115, 2460, 164, 87, 51..."
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,"[▁Abstract, ▁Object, ive, :, ▁Report, ing, ▁bi...","[233973, 134549, 5844, 12, 34798, 214, 333, 16..."


In [34]:
# Create a list of tokens, where we take only the first 512 tokens
train_tokens_shortened = []

for i in train_df["tokens_train"].to_list():
	train_tokens_shortened.extend(i[:513])

len(train_tokens_shortened)

700411

In [39]:
# See how many tokens overlap
overlap_counter = 0

for token in token_list:
	if token in train_tokens_shortened:
		overlap_counter += 1

overlap_counter

4508

In [40]:
# Out of all tokens, how many overlap?
overlap_counter/len(token_list)

0.1483529140750979

For Croatian, 82% of tokens overlap with train_df, for Macedonian 15%. 

In [29]:
# Save the tokenized version
train_df.to_json("datasets/tokenized_datasets/X-GENRE-train-tokenized.json")