In [None]:
# Install the Hugging Face `datasets` package (it provides `load_dataset`, imported below)
# into the kernel's environment.
%pip install datasets

In [1]:
import json
import pandas as pd
import os
from sklearn.dummy import DummyClassifier
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the X-GENRE dataset from Hugging Face, once per split name.
# The split name is passed as the second argument of `load_dataset`;
# the loads happen in the order train -> test -> dev.
train, test, dev = (
    load_dataset("TajaKuzman/X-GENRE-text-genre-dataset", split_name)
    for split_name in ("train", "test", "dev")
)

In [3]:
# Convert the loaded train and test data into Pandas DataFrames.
# Each loaded object exposes its rows under the "train" key.
train_df, test_df = map(pd.DataFrame, (train["train"], test["train"]))

# Quick sanity check of the (rows, columns) of both frames.
print(train_df.shape, test_df.shape)

(1772, 4) (592, 4)


In [2]:
# Read the EN-GINCO and X-GINCO test sets from local JSON Lines files.
# The files come from GitHub repositories; access to them is obtained by
# request to the AGILE repository owner.
en_ginco, x_ginco = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../../datasets/EN-GINCO-test-dataset/EN-GINCO.jsonl",
        "../../datasets/X-GINCO-test-set/X-GINCO.jsonl",
    )
)

# Quick sanity check of the (rows, columns) of both frames.
print(en_ginco.shape, x_ginco.shape)

(272, 4) (790, 6)


In [5]:
def dummy(train_df, test_df, test_df_name, random_state=42, output_dir="submissions"):
    """Run scikit-learn dummy baselines and save their predictions as JSON.

    For each of the "stratified" and "most_frequent" strategies, a
    ``DummyClassifier`` is fitted on the training texts and labels and then
    used to predict one label per test text. Each strategy's predictions are
    written to ``<output_dir>/submission-dummy-<strategy>-<test_df_name>.json``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must have ``text`` and ``labels`` columns.
    test_df : pandas.DataFrame
        Test data; must have ``text`` and ``labels`` columns. The test labels
        are only used for the printed summary, not for prediction.
    test_df_name : str
        Name of the test set; used in the output file name, in the JSON
        payload, and in the progress messages.
    random_state : int or None, default 42
        Seed handed to ``DummyClassifier`` so that the randomised
        "stratified" baseline produces the same predictions on every run.
        Pass ``None`` to get unseeded (non-reproducible) predictions.
    output_dir : str, default "submissions"
        Directory that receives the JSON files; created if it does not exist.

    Returns
    -------
    None
        Results are written to disk and progress is printed.
    """
    # Texts and labels of the training split, as plain lists for scikit-learn.
    X_train = list(train_df.text)
    Y_train = list(train_df.labels)

    # Texts and labels of the test split.
    X_test = list(test_df.text)
    Y_test = list(test_df.labels)

    print(len(X_train), len(Y_train), len(X_test), len(Y_test))

    # Distinct labels present in the test split (informational only).
    labels = list(test_df.labels.unique())
    print("Labels: {}".format(labels))

    # Make sure the target directory exists; otherwise open(..., "w") below
    # fails with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    for strategy in ["stratified", "most_frequent"]:
        model = f"dummy-{strategy}"

        # Seed the classifier so the "stratified" strategy is reproducible.
        classifier = DummyClassifier(strategy=strategy, random_state=random_state)

        # Train the model
        classifier.fit(X_train, Y_train)

        # Get the predictions; .tolist() converts NumPy scalars to native
        # Python values so that json.dump can serialise any label type.
        y_pred = classifier.predict(X_test).tolist()

        # Payload describing the system and its predictions on this test set.
        current_results = {
            "system": model,
            "predictions": [
                {
                    "train": "X-GENRE (train split)",
                    "test": "{}".format(test_df_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new json
        output_path = os.path.join(
            output_dir, "submission-{}-{}.json".format(model, test_df_name)
        )
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model, test_df_name))


In [6]:
# Dummy baselines fitted on the X-GENRE train frame, predicting on EN-GINCO.
dummy(train_df=train_df, test_df=en_ginco, test_df_name="en-ginco")

1772 1772 272 272
Labels: ['Information/Explanation', 'News', 'Promotion', 'Opinion/Argumentation', 'Instruction', 'Forum', 'Other', 'Legal', 'Prose/Lyrical']
Classification with dummy-stratified on en-ginco finished.
Classification with dummy-most_frequent on en-ginco finished.


In [7]:
# Dummy baselines fitted on the X-GENRE train frame, predicting on X-GINCO.
dummy(train_df=train_df, test_df=x_ginco, test_df_name="x-ginco")

1772 1772 790 790
Labels: ['News', 'Opinion/Argumentation', 'Instruction', 'Information/Explanation', 'Promotion', 'Forum', 'Prose/Lyrical', 'Legal']
Classification with dummy-stratified on x-ginco finished.
Classification with dummy-most_frequent on x-ginco finished.
