In [1]:
# GPU selection for this kernel: order devices by PCI bus ID and expose only device 1.
# Must run before any CUDA context is created (i.e. before torch touches the GPU).
# NOTE(review): with a single visible device, code in this process presumably
# addresses it as index 0 ("cuda:0") - confirm against the CUDA documentation.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [5]:
from transformers import AutoTokenizer
import sys
import torch
import json
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
import argparse
import pandas as pd

In [6]:
# Read the EN-GINCO and X-GINCO test sets (JSON Lines files) from local clones of
# their GitHub repositories. Access to the repositories has to be requested from
# the AGILE repository owner.
en_ginco = pd.read_json(
    "../../datasets/EN-GINCO-test-dataset/EN-GINCO.jsonl",
    lines=True,
)
x_ginco = pd.read_json(
    "../../datasets/X-GINCO-test-set/X-GINCO.jsonl",
    lines=True,
)

# Sanity check: (rows, columns) of each frame.
print(f"{en_ginco.shape} {x_ginco.shape}")

(272, 4) (790, 6)


In [7]:
# Preview the first two EN-GINCO rows to check the column layout.
en_ginco.head(n=2)

Unnamed: 0,text,labels,dataset,language
0,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English
1,Why graft thrives in postconflict zones <p> A ...,News,EN-GINCO,English


In [9]:
def predict_genre(df_test_name, batch_size=8, threshold=0.8):
    """Classify every text of a test set with the X-GENRE model and save the predictions.

    The texts of the selected notebook-level dataframe are tokenized (truncated to
    512 tokens) and run through the
    ``classla/xlm-roberta-base-multilingual-text-genre-classifier`` model in
    mini-batches. Each text receives the most probable label if its softmax
    probability reaches ``threshold``; otherwise it is labelled ``"Mix"``.
    The predictions are written to
    ``submissions/submission-X-GENRE-<df_test_name>.json``.

    Args:
        df_test_name: Which test set to use, ``"en-ginco"`` or ``"x-ginco"``.
        batch_size: Number of texts sent through the model at once. Processing the
            whole dataset as a single batch (the previous behaviour) can exhaust
            GPU memory, so the texts are processed in chunks of this size.
        threshold: Minimum softmax probability of the top label for it to be
            accepted; below it the prediction is ``"Mix"``.

    Returns:
        None. The results are saved to disk and a completion message is printed.

    Raises:
        KeyError: If ``df_test_name`` is not one of the known test sets.
        ValueError: If ``batch_size`` is smaller than 1, or if the number of model
            outputs does not match the hard-coded label list.
    """
    import os  # local import: only needed to create the output directory

    dfs = {
        "en-ginco": en_ginco,
        "x-ginco": x_ginco
    }

    if df_test_name not in dfs:
        raise KeyError(
            "Unknown test set {!r}; expected one of {}".format(df_test_name, sorted(dfs))
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got {}".format(batch_size))

    df = dfs[df_test_name]
    texts = df["text"].to_list()

    model_name = "classla/xlm-roberta-base-multilingual-text-genre-classifier"

    # Fall back to the CPU so the notebook still runs on machines without a GPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)
    model.eval()  # inference only: make sure dropout is disabled

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Label order must correspond to the model's output indices.
    labels = ["Other", "Information/Explanation", "News", "Instruction", "Opinion/Argumentation", "Forum", "Prose/Lyrical", "Legal", "Promotion"]

    # A mismatch would otherwise be silently hidden when labels and scores are paired.
    if model.config.num_labels != len(labels):
        raise ValueError(
            "Model has {} outputs but {} labels are defined".format(model.config.num_labels, len(labels))
        )

    def transcode(logit):
        """Map one row of logits to a label name, or 'Mix' if the top probability is below the threshold."""
        probs = softmax(logit)
        # argmax returns the first maximum, i.e. ties resolve to the earliest label.
        best = int(probs.argmax())
        if probs[best] >= threshold:
            return labels[best]
        return 'Mix'

    prediction_list = []

    with torch.no_grad():
        # Process the texts in mini-batches; padding=True pads only to the longest
        # text of the current batch, which also keeps the tensors small.
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = tokenizer(batch_texts, max_length=512, truncation=True, padding=True, return_tensors="pt").to(device)
            logits = model(**inputs).logits
            prediction_list.extend(transcode(row) for row in logits.tolist())

    # Create a json with results

    current_results = {
        "system": "X-GENRE classifier",
        "predictions": [
            {
            "train": "X-GENRE (train split)",
            "test": "{}".format(df_test_name),
            "predictions": prediction_list,
            }
        ]
        }

    # Save the results as a new json (create the output directory on first use)
    os.makedirs("submissions", exist_ok=True)
    with open("submissions/submission-{}-{}.json".format("X-GENRE", df_test_name), "w") as file:
        json.dump(current_results, file)

    print("Classification with {} on {} finished.".format("X-GENRE", df_test_name))

In [10]:
# Run the classifier on the English test set and write its submission file.
predict_genre(df_test_name="en-ginco")

Classification with X-GENRE on en-ginco finished.


In [11]:
# Run the classifier on the multilingual test set and write its submission file.
predict_genre(df_test_name="x-ginco")

Classification with X-GENRE on x-ginco finished.
