In [1]:
import json
import pandas as pd
import os
from sklearn.dummy import DummyClassifier
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the four ParlaMint CAP test sets (JSON Lines, one record per line)
# in the order EN, HR, BA, RS, then report the shape of each frame.

test_en, test_hr, test_ba, test_sr = [
    pd.read_json(path, lines=True)
    for path in (
        "../../datasets/ParlaMint-EN-CAP-test-dataset.jsonl",
        "../../datasets/ParlaMint-HR-CAP-test-dataset.jsonl",
        "../../datasets/ParlaMint-BA-CAP-test-dataset.jsonl",
        "../../datasets/ParlaMint-RS-CAP-test-dataset.jsonl",
    )
]

print(*(frame.shape for frame in (test_en, test_hr, test_ba, test_sr)))

(876, 6) (869, 6) (824, 6) (874, 6)


In [3]:
# Show the first two rows of every test set (HR, BA, RS, EN) as a sanity check.
display(
    test_hr.head(2),
    test_ba.head(2),
    test_sr.head(2),
    test_en.head(2),
)

Unnamed: 0,id,lang,text_id,text,speaker_role,labels
0,ParlaMint-HR_2017-12-13-0.u64787,HR,ParlaMint-HR_2017-12-13-0,"Da završimo u istom tonu. Dakle, nitko ne dvoj...",Regular,Government Operations
1,ParlaMint-HR_2022-05-26-0.u112651,HR,ParlaMint-HR_2022-05-26-0,Prelazimo na sljedeću točku. Poštovane zastupn...,Chairperson,Defense


Unnamed: 0,id,lang,text_id,text,speaker_role,labels
0,ParlaMint-BA_2013-11-20-0.u18158,BA,ParlaMint-BA_2013-11-20-0,Gospodine predsjedavajući. Na moje pitanje koj...,Regular,Agriculture
1,ParlaMint-BA_2005-02-18-0.u16720,BA,ParlaMint-BA_2005-02-18-0,"Evo, glasat ćemo odmah poslije ovog. Prije sve...",Regular,Health


Unnamed: 0,id,lang,text_id,text,speaker_role,labels
0,ParlaMint-RS_2009-03-25-0.u16134,RS,ParlaMint-RS_2009-03-25-0,"Dame i gospodo narodni poslanici, poštovana pr...",Regular,Civil Rights
1,ParlaMint-RS_2010-11-17-0.u46983,RS,ParlaMint-RS_2010-11-17-0,"Zašto Kraljevo, a ne Niš? Zašto Kraljevo, a ne...",Regular,Domestic Commerce


Unnamed: 0,id,lang,text_id,text,speaker_role,labels
0,ParlaMint-GB_2016-02-08-lords.u159,GB,ParlaMint-GB_2016-02-08-lords,I think that that will be in place for 2019 an...,Regular,Government Operations
1,ParlaMint-GB_2017-12-13-commons.u600,GB,ParlaMint-GB_2017-12-13-commons,The hon. Gentleman needs to answer the questio...,Regular,Foreign Trade


In [4]:
# Read the ParlaCAP training split (JSON Lines), report its shape and
# preview the first two rows.

train_df = pd.read_json(
    "../../datasets/ParlaCAP-train/ParlaCAP-train.jsonl",
    lines=True,
)

print(train_df.shape)

train_df.head(n=2)

(29779, 13)


Unnamed: 0,ID,Text_ID,text,Date,Speaker_role,Speaker_name,length,lang,labels,split,eval,keyword,public-lands-candidate-instance
0,ParlaMint-ES-PV_2019-11-15.u95,ParlaMint-ES-PV_2019-11-15,"Sailburu anderea, prozesu guztia esplikatu did...",2019-11-15,Regular,"Ubera Aranzeta, Rebeka",227,ES-PV,Education,train,no,,
1,ParlaMint-AT_2005-12-06-022-XXII-NRSITZ-00129_...,ParlaMint-AT_2005-12-06-022-XXII-NRSITZ-00129,Herr Präsident! Frau Bundesministerin! Sehr ge...,2005-12-06,Regular,"Praßl, Michael",222,AT,Technology,train,no,,


In [5]:
def dummy(train_df, test_df, test_df_name, random_state=42, output_dir="submissions"):
    """Run scikit-learn dummy baselines and save their predictions as JSON.

    For each of the "stratified" and "most_frequent" strategies a
    ``DummyClassifier`` is fitted on the training labels, asked for one
    prediction per row of ``test_df``, and the predictions are written to
    ``<output_dir>/submission-dummy-<strategy>-<test_df_name>.json``.

    Args:
        train_df: Training split; its ``text`` and ``labels`` attributes are read.
        test_df: Test split; its ``text`` and ``labels`` attributes are read.
        test_df_name: Name of the test set. Stored in the JSON payload and
            used in the output file name.
        random_state: Seed handed to ``DummyClassifier``. The "stratified"
            strategy samples labels at random, so a fixed seed is needed for
            the saved submission to be reproducible. Pass ``None`` for
            unseeded sampling.
        output_dir: Directory the submission files are written to. It is
            created if it does not exist yet.

    Returns:
        None. The results are written to disk and progress is printed.
    """
    # Texts and labels of the training split, in the list form used for scikit-learn
    X_train = list(train_df.text)
    Y_train = list(train_df.labels)

    # Texts and labels of the test split
    X_test = list(test_df.text)
    Y_test = list(test_df.labels)

    print(len(X_train), len(Y_train), len(X_test), len(Y_test))

    # Distinct labels present in the test split (printed for information only)
    labels = list(test_df.labels.unique())
    print("Labels: {}".format(labels))

    # Make sure the target directory exists; otherwise open() below raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    for strategy in ["stratified", "most_frequent"]:
        model = f"dummy-{strategy}"

        classifier = DummyClassifier(strategy=strategy, random_state=random_state)

        # Train the model
        classifier.fit(X_train, Y_train)

        # Get the predictions; .tolist() converts NumPy scalars to plain
        # Python values so that json.dump can serialise any label type.
        y_pred = classifier.predict(X_test).tolist()

        # Payload in the submission format
        current_results = {
            "system": model,
            "predictions": [
                {
                    "train": "ParlaCAP-train",
                    "test": "{}".format(test_df_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new json
        submission_path = os.path.join(
            output_dir, "submission-{}-{}.json".format(model, test_df_name)
        )
        with open(submission_path, "w") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model, test_df_name))


In [6]:
# Dummy baselines on the Croatian (HR) test set.
dummy(train_df=train_df, test_df=test_hr, test_df_name="ParlaCAP-HR-test")

29779 29779 869 869
Labels: ['Government Operations', 'Defense', 'Environment', 'Culture', 'Law and Crime', 'Education', 'Labor', 'Transportation', 'Domestic Commerce', 'Macroeconomics', 'Health', 'International Affairs', 'Other', 'Civil Rights', 'Immigration', 'Agriculture', 'Energy', 'Housing', 'Foreign Trade', 'Social Welfare', 'Public Lands', 'Technology']
Classification with dummy-stratified on ParlaCAP-HR-test finished.
Classification with dummy-most_frequent on ParlaCAP-HR-test finished.


In [7]:
# Dummy baselines on the English (EN) test set.
dummy(train_df=train_df, test_df=test_en, test_df_name="ParlaCAP-EN-test")

29779 29779 876 876
Labels: ['Government Operations', 'Foreign Trade', 'Energy', 'Social Welfare', 'Defense', 'Environment', 'Culture', 'Labor', 'Transportation', 'Civil Rights', 'International Affairs', 'Other', 'Health', 'Domestic Commerce', 'Housing', 'Agriculture', 'Education', 'Macroeconomics', 'Law and Crime', 'Technology', 'Immigration', 'Public Lands']
Classification with dummy-stratified on ParlaCAP-EN-test finished.
Classification with dummy-most_frequent on ParlaCAP-EN-test finished.


In [8]:
# Dummy baselines on the Serbian (RS) test set.
dummy(train_df=train_df, test_df=test_sr, test_df_name="ParlaCAP-RS-test")

29779 29779 874 874
Labels: ['Civil Rights', 'Domestic Commerce', 'Education', 'International Affairs', 'Public Lands', 'Labor', 'Law and Crime', 'Environment', 'Housing', 'Technology', 'Agriculture', 'Defense', 'Health', 'Other', 'Immigration', 'Government Operations', 'Energy', 'Macroeconomics', 'Culture', 'Foreign Trade', 'Social Welfare', 'Transportation']
Classification with dummy-stratified on ParlaCAP-RS-test finished.
Classification with dummy-most_frequent on ParlaCAP-RS-test finished.


In [9]:
# Dummy baselines on the Bosnian (BA) test set.
dummy(train_df=train_df, test_df=test_ba, test_df_name="ParlaCAP-BA-test")

29779 29779 824 824
Labels: ['Agriculture', 'Health', 'Government Operations', 'Energy', 'International Affairs', 'Macroeconomics', 'Civil Rights', 'Defense', 'Technology', 'Public Lands', 'Foreign Trade', 'Labor', 'Other', 'Education', 'Law and Crime', 'Social Welfare', 'Transportation', 'Domestic Commerce', 'Housing', 'Environment', 'Immigration', 'Culture']
Classification with dummy-stratified on ParlaCAP-BA-test finished.
Classification with dummy-most_frequent on ParlaCAP-BA-test finished.
