In [1]:
# Enumerate GPUs in PCI bus order and expose only device 1 to this kernel.
# These are set in the first cell, before torch is imported in the next one;
# the single visible device is what later cells address as "cuda:0".
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import json
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the two test sets (JSON Lines files) from the copies of their GitHub repositories.
# Access to those repositories is obtained by request to the AGILE repository owner.
en_ginco = pd.read_json(
	"../../datasets/EN-GINCO-test-dataset/EN-GINCO.jsonl",
	lines=True,
)
x_ginco = pd.read_json(
	"../../datasets/X-GINCO-test-set/X-GINCO.jsonl",
	lines=True,
)

# Sanity check: show (rows, columns) of each frame on one line.
print(*(frame.shape for frame in (en_ginco, x_ginco)))

(272, 4) (790, 6)


In [11]:
def map_labels(predicted_labels):
	"""Map CORE register predictions to X-GENRE labels.

	Args:
		predicted_labels: iterable with one entry per text; each entry is a
			sequence of CORE label strings predicted for that text (possibly empty).

	Returns:
		A list with one X-GENRE label (str) per entry, in the same order:
		- no label predicted            -> "Mix"
		- exactly one label             -> its mapped X-GENRE label
		- two or more labels            -> the mapping of the LAST label if it is a
		  CORE subcategory, otherwise "Mix"

	Raises:
		KeyError: if a text has exactly one predicted label and that label is
			not a known CORE label.

	Note:
		The multi-label rule only inspects the last label, so it relies on the
		caller listing a subcategory after its parent category.
		NOTE(review): presumably guaranteed by the model's label id order - confirm.
	"""

	# Mapping based on mappings presented in https://arxiv.org/pdf/2406.19892v2
	core_to_xgenre_mapping = {
		# machine-translated
		'MT': "Other",
		# Lyrical
		'LY': 'Prose/Lyrical',
		# Spoken
		'SP': "Other",
		# Interview
		'it': "Other",
		# Interactive Discussion
		'ID': "Forum",
		# Narrative
		'NA': "News",
		# news report
		'ne': "News",
		# sports report
		'sr': "News",
		# narrative blog
		'nb': "Opinion/Argumentation",
		# How-to /Instructions
		'HI': "Instruction",
		# Recipe
		're': "Instruction",
		# Informational description
		'IN': 'Information/Explanation',
		# Encyclopedia article
		'en': 'Information/Explanation',
		# Research article
		'ra': 'Information/Explanation',
		# Description of a thing or person
		'dtp': 'Information/Explanation',
		# Frequently asked questions
		'fi': "Instruction",
		# Legal terms and conditions
		'lt': "Legal",
		# Opinion
		'OP': 'Opinion/Argumentation',
		# Review
		'rv': 'Opinion/Argumentation',
		# Opinion blog
		'ob': 'Opinion/Argumentation',
		# Denominational religious blog or sermon
		'rs': "Prose/Lyrical",
		# Advice
		'av': 'Opinion/Argumentation',
		# Informational persuasion
		'IP': "Promotion",
		# Description with intent to sell
		'ds': "Promotion",
		# News & opinion blog or editorial
		'ed': 'Opinion/Argumentation'
	}

	# CORE subcategory labels; kept next to the mapping so the two stay in sync.
	# A set gives constant-time membership tests and is built once per call
	# instead of once per text.
	subcategories = frozenset([
		'it', 'ne', 'sr', 'nb', 're', 'en', 'ra', 'dtp',
		'fi', 'lt', 'rv', 'ob', 'rs', 'av', 'ds', 'ed',
	])

	mapped_labels = []

	for labels in predicted_labels:
		n_labels = len(labels)
		if n_labels == 0:
			# Nothing passed the threshold
			mapped_labels.append("Mix")
		elif n_labels == 1:
			mapped_labels.append(core_to_xgenre_mapping[labels[0]])
		else:
			# Several labels: accept only "category + subcategory" style predictions,
			# identified by a subcategory in last position; everything else is a mix.
			last_label = labels[-1]
			if last_label in subcategories:
				mapped_labels.append(core_to_xgenre_mapping[last_label])
			else:
				mapped_labels.append("Mix")

	return mapped_labels

In [13]:
def predict(df_test_name, threshold=0.5, device=None):
	"""Classify one test set with the CORE register classifier and save the predictions.

	The texts of the selected test set are classified one at a time, the predicted
	CORE labels are converted with map_labels(), and the result is written to
	"submissions/submission-CORE-<df_test_name>.json". Progress information is printed.

	Args:
		df_test_name: "en-ginco" or "x-ginco"; selects the module-level DataFrame
			(en_ginco / x_ginco) whose "text" column is classified.
		threshold: a label is predicted when its sigmoid probability is strictly
			greater than this value.
		device: torch device string for the model and inputs. When None, "cuda:0"
			is used if CUDA is available, otherwise "cpu".

	Returns:
		None. The output is the JSON file and the printed summary.

	Raises:
		KeyError: if df_test_name is not one of the known test set names.

	Note:
		The "submissions" directory must already exist; open() does not create it.
	"""
	dfs = {
		"en-ginco": en_ginco,
		"x-ginco": x_ginco
	}

	df = dfs[df_test_name]

	texts = df["text"].to_list()

	# Fall back to CPU instead of crashing on machines without a GPU
	if device is None:
		device = "cuda:0" if torch.cuda.is_available() else "cpu"

	model_id = "TurkuNLP/web-register-classification-multilingual"

	# Load model and tokenizer
	model = AutoModelForSequenceClassification.from_pretrained(model_id)
	model.to(device)
	# from_pretrained is documented to return the model in evaluation mode already;
	# the explicit call just makes the inference-only intent visible.
	model.eval()

	# The tokenizer comes from the "xlm-roberta-large" checkpoint, not from model_id.
	# NOTE(review): presumably the base model the classifier was fine-tuned from - confirm.
	tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

	# Loop-invariant: index -> readable label lookup
	id2label = model.config.id2label

	predicted_labels = []

	for text in texts:
		# Tokenize text (a batch of one, truncated to 512 tokens)
		inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

		with torch.no_grad():
			outputs = model(**inputs)

		# Apply sigmoid to the logits to get independent per-label probabilities.
		# squeeze(0) drops only the batch axis, so the label axis is kept
		# whatever its size.
		probabilities = torch.sigmoid(outputs.logits).squeeze(0)

		# Indices of all labels whose probability exceeds the threshold
		predicted_label_indices = (probabilities > threshold).nonzero(as_tuple=True)[0]

		# Extract readable labels using id2label
		predicted_labels.append([id2label[idx.item()] for idx in predicted_label_indices])

	mapped_labels = map_labels(predicted_labels)

	# Quick sanity output: which labels occur, and that no text was lost
	print(set(mapped_labels))

	print(len(texts))
	print(len(mapped_labels))

	current_results = {
		"system": "CORE register classifier",
		"predictions": [
			{
			"train": "Multilingual CORE corpora",
			"test": "{}".format(df_test_name),
			"predictions": mapped_labels,
			}
		]
		}

	# Save the results as a new json
	with open("submissions/submission-{}-{}.json".format("CORE", df_test_name), "w") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format("CORE", df_test_name))


In [14]:
# Classify the EN-GINCO test set; predictions are written to submissions/submission-CORE-en-ginco.json
predict("en-ginco")

{'Opinion/Argumentation', 'Instruction', 'Prose/Lyrical', 'Legal', 'Mix', 'News', 'Other', 'Forum', 'Information/Explanation', 'Promotion'}
272
272
Classification with CORE on en-ginco finished.


In [15]:
# Classify the X-GINCO test set; predictions are written to submissions/submission-CORE-x-ginco.json
predict("x-ginco")

{'Opinion/Argumentation', 'Instruction', 'Prose/Lyrical', 'Legal', 'Other', 'News', 'Mix', 'Forum', 'Information/Explanation', 'Promotion'}
790
790
Classification with CORE on x-ginco finished.
