In [1]:
import json
import pandas as pd
import os
from sklearn.dummy import DummyClassifier
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the EN and HR test sets; both are JSON Lines files (one record per line)
test_en, test_hr = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../../datasets/ParlaCAP-EN-test/ParlaMint-EN-CAP-test-dataset.jsonl",
        "../../datasets/ParlaCAP-HR-test/ParlaMint-HR-CAP-test-dataset.jsonl",
    )
)

# Sanity check: (rows, columns) of each test set
print(test_en.shape, test_hr.shape)

(440, 9) (869, 9)


In [4]:
# Read the ParlaCAP training set (JSON Lines: one record per line)
train_df = pd.read_json(
    path_or_buf="../../datasets/ParlaCAP-train/ParlaCAP-train.jsonl",
    lines=True,
)

# Report the (rows, columns) shape of the training data
print(train_df.shape)

# Preview the first two rows
train_df.head(n=2)

(29779, 13)


Unnamed: 0,ID,Text_ID,text,Date,Speaker_role,Speaker_name,length,lang,labels,split,eval,keyword,public-lands-candidate-instance
0,ParlaMint-ES-PV_2019-11-15.u95,ParlaMint-ES-PV_2019-11-15,"Sailburu anderea, prozesu guztia esplikatu did...",2019-11-15,Regular,"Ubera Aranzeta, Rebeka",227,ES-PV,Education,train,no,,
1,ParlaMint-AT_2005-12-06-022-XXII-NRSITZ-00129_...,ParlaMint-AT_2005-12-06-022-XXII-NRSITZ-00129,Herr Präsident! Frau Bundesministerin! Sehr ge...,2005-12-06,Regular,"Praßl, Michael",222,AT,Technology,train,no,,


In [5]:
def dummy(train_df, test_df, test_df_name, random_state=42, output_dir="submissions"):
    """Run scikit-learn dummy baselines and save their predictions as JSON.

    For each of the "stratified" and "most_frequent" strategies, a
    ``DummyClassifier`` is fitted on the training split, a label is predicted
    for every row of ``test_df`` and the predictions are written to
    ``<output_dir>/submission-dummy-<strategy>-<test_df_name>.json``.

    Args:
        train_df: DataFrame with ``text`` and ``labels`` columns (training split).
        test_df: DataFrame with ``text`` and ``labels`` columns (test split).
        test_df_name: Name of the test set; stored in the JSON payload and
            used in the output file name.
        random_state: Seed handed to ``DummyClassifier`` so that the
            "stratified" predictions are reproducible across kernel restarts.
            Pass ``None`` to get unseeded (non-deterministic) predictions.
        output_dir: Directory the submission files are written to; created
            if it does not exist yet.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Texts and labels of the training split
    X_train = list(train_df.text)
    Y_train = list(train_df.labels)

    # Texts and labels of the test split
    X_test = list(test_df.text)
    Y_test = list(test_df.labels)

    print(len(X_train), len(Y_train), len(X_test), len(Y_test))

    # Labels that occur in the test split (informational only)
    labels = list(test_df.labels.unique())
    print("Labels: {}".format(labels))

    # Make sure the target directory exists before any file is opened;
    # otherwise open(..., "w") fails on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    for strategy in ["stratified", "most_frequent"]:
        model = f"dummy-{strategy}"

        # The seed only matters for the random "stratified" strategy, but
        # passing it unconditionally keeps the call uniform.
        classifier = DummyClassifier(strategy=strategy, random_state=random_state)

        # Train the model
        classifier.fit(X_train, Y_train)

        # Get the predictions; tolist() converts NumPy scalars to native
        # Python objects so that json.dump can always serialise them.
        y_pred = classifier.predict(X_test).tolist()

        # Create a json with results
        current_results = {
            "system": model,
            "predictions": [
                {
                    "train": "ParlaCAP-train",
                    "test": "{}".format(test_df_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new json
        output_path = os.path.join(
            output_dir, "submission-{}-{}.json".format(model, test_df_name)
        )
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model, test_df_name))


In [6]:
# Run the dummy baselines against the HR test set
dummy(train_df=train_df, test_df=test_hr, test_df_name="ParlaCAP-HR-test")

29779 29779 869 869
Labels: ['Government Operations', 'Defense', 'Environment', 'Culture', 'Law and Crime', 'Education', 'Labor', 'Transportation', 'Domestic Commerce', 'Macroeconomics', 'Health', 'International Affairs', 'Other', 'Civil Rights', 'Immigration', 'Agriculture', 'Energy', 'Housing', 'Foreign Trade', 'Social Welfare', 'Public Lands', 'Technology']
Classification with dummy-stratified on ParlaCAP-HR-test finished.
Classification with dummy-most_frequent on ParlaCAP-HR-test finished.


In [7]:
# Run the dummy baselines against the EN test set
dummy(train_df=train_df, test_df=test_en, test_df_name="ParlaCAP-EN-test")

29779 29779 440 440
Labels: ['Government Operations', 'Foreign Trade', 'Energy', 'Social Welfare', 'Defense', 'Environment', 'Culture', 'Labor', 'Transportation', 'Civil Rights', 'International Affairs', 'Other', 'Health', 'Domestic Commerce', 'Housing', 'Agriculture', 'Education', 'Macroeconomics', 'Law and Crime', 'Technology', 'Immigration', 'Public Lands']
Classification with dummy-stratified on ParlaCAP-EN-test finished.
Classification with dummy-most_frequent on ParlaCAP-EN-test finished.
