# Experiments with Baseline Classifiers

I train traditional non-neural classifiers (dummy classifier, Naive Bayes, Logistic Regression, SVM and others) on the training data and apply them to the test data.

In [1]:
import pandas as pd
import numpy as np
import json
import sklearn.model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.feature_extraction
from sklearn.svm import SVC
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the four test sets (JSON Lines: one record per line) into separate
# DataFrames. Note that the RS file is kept under the name `test_sr`.
test_en, test_hr, test_ba, test_sr = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../../datasets/ParlaMint-EN-CAP-test-dataset.jsonl",
        "../../datasets/ParlaMint-HR-CAP-test-dataset.jsonl",
        "../../datasets/ParlaMint-BA-CAP-test-dataset.jsonl",
        "../../datasets/ParlaMint-RS-CAP-test-dataset.jsonl",
    )
)

# Sanity check: (rows, columns) of every test set
print(test_en.shape, test_hr.shape, test_ba.shape, test_sr.shape)

(876, 6) (869, 6) (824, 6) (874, 6)


In [3]:
# Read the training set (JSON Lines: one record per line) into a DataFrame
df_train = pd.read_json(
    "../../datasets/ParlaCAP-train/ParlaCAP-train.jsonl",
    lines=True,
)

# Report (rows, columns), then preview the first two records
print(df_train.shape)

df_train.head(2)

(29779, 13)


Unnamed: 0,ID,Text_ID,text,Date,Speaker_role,Speaker_name,length,lang,labels,split,eval,keyword,public-lands-candidate-instance
0,ParlaMint-ES-PV_2019-11-15.u95,ParlaMint-ES-PV_2019-11-15,"Sailburu anderea, prozesu guztia esplikatu did...",2019-11-15,Regular,"Ubera Aranzeta, Rebeka",227,ES-PV,Education,train,no,,
1,ParlaMint-AT_2005-12-06-022-XXII-NRSITZ-00129_...,ParlaMint-AT_2005-12-06-022-XXII-NRSITZ-00129,Herr Präsident! Frau Bundesministerin! Sehr ge...,2005-12-06,Regular,"Praßl, Michael",222,AT,Technology,train,no,,


In [4]:
# Build the training inputs for scikit-learn from the training DataFrame
# Texts of the training instances
X_train = df_train["text"].tolist()
# Labels of the same instances, in the same row order
Y_train = df_train["labels"].tolist()

# Both lists are expected to have the same length
print(len(X_train), len(Y_train))

29779 29779


In [5]:
# Collect the distinct label names, in order of first appearance in the training set
labels = df_train["labels"].unique().tolist()
labels

['Education',
 'Technology',
 'Health',
 'Environment',
 'Housing',
 'Labor',
 'Defense',
 'Government Operations',
 'Social Welfare',
 'Other',
 'Macroeconomics',
 'Domestic Commerce',
 'Civil Rights',
 'International Affairs',
 'Transportation',
 'Immigration',
 'Law and Crime',
 'Agriculture',
 'Foreign Trade',
 'Culture',
 'Public Lands',
 'Energy']

## Creating scikit-learn classifiers

In [6]:
# Create a TF-IDF representation of the text
def data_iterator(f):
    """Lazily yield the items of the iterable ``f`` one at a time, in order."""
    yield from f


def tokenizer(txt):
    """Split ``txt`` into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or
    whitespace-only string produces an empty list. No lowercasing or
    punctuation handling is done here.
    """
    tokens = txt.split()
    return tokens

In [7]:
# Stream the training texts into the vectorizer
iterator = data_iterator(X_train)

# TF-IDF features over whitespace tokens; with a float min_df, terms that occur
# in less than that proportion (0.5%) of the training documents are dropped
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tokenizer,
    use_idf=True,
    min_df=0.005,
)

# Learn the vocabulary and IDF weights from the training texts and vectorize them
d = vectorizer.fit_transform(iterator)



In [8]:
# Wrap every model to be tried in its own single-step pipeline.
# Alternatives listed here earlier (currently disabled): DummyClassifier
# ("most_frequent" and "stratified"), DecisionTreeClassifier, MultinomialNB,
# LogisticRegression, SVC with default settings and RandomForestClassifier.
pipelines = [
    make_pipeline(model)
    for model in (ComplementNB(), SVC(kernel="linear", C=2))
]

In [9]:
# Train each pipeline on the TF-IDF training matrix and the training labels
for pipeline in pipelines:
    pipeline.fit(d, Y_train)

In [10]:
def classify(df_test, df_test_name, pipelines = pipelines):
    """Predict labels for one test set with every pipeline and save the results.

    The texts in ``df_test.text`` are vectorized with the module-level
    ``vectorizer`` (which must already be fitted), each pipeline predicts a
    label per text, and the predictions are written to
    ``submissions/submission-<MODEL>-<df_test_name>.json``.

    Parameters
    ----------
    df_test : DataFrame with a ``text`` and a ``labels`` column.
    df_test_name : str
        Name of the test set; stored in the JSON and used in the file name.
    pipelines : list of fitted scikit-learn pipelines
        Defaults to the module-level ``pipelines`` list as it existed when
        this function was defined.

    Returns
    -------
    None. The function prints progress and writes one JSON file per pipeline.
    """
    # Local import: only needed to make sure the output directory exists
    import os

    # List of texts in test split
    X_test = list(df_test.text)
    # List of labels in test split (only used for the size check printed below)
    Y_test = list(df_test.labels)

    # Reuse the vocabulary and IDF weights learned on the training data
    d_test = vectorizer.transform(X_test)

    print(len(X_test), len(Y_test))

    # Create the output directory up front so that open() below does not fail
    # with FileNotFoundError on a fresh checkout
    os.makedirs("submissions", exist_ok=True)

    for pipeline in pipelines:
        # .tolist() converts NumPy scalars to plain Python values, which keeps
        # the predictions JSON-serializable for any label type
        y_pred = np.asarray(pipeline.predict(d_test)).tolist()

        # The name of the pipeline's first step identifies the model
        model_name = pipeline.steps[0][0].upper()
        model_name = model_name.split("(")[0]

        # Create a json with results
        current_results = {
            "system": model_name,
            "predictions": [
                {
                    "train": "ParlaCAP-train",
                    "test": "{}".format(df_test_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new json
        with open("submissions/submission-{}-{}.json".format(model_name, df_test_name), "w") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model_name, df_test_name))


In [11]:
# Run all trained pipelines on each test set; classify() writes one
# submission JSON per model and test set
for test_df, test_name in (
    (test_en, "ParlaCAP-EN-test"),
    (test_hr, "ParlaCAP-HR-test"),
    (test_ba, "ParlaCAP-BA-test"),
    (test_sr, "ParlaCAP-RS-test"),
):
    classify(test_df, test_name, pipelines)

876 876
Classification with COMPLEMENTNB on ParlaCAP-EN-test finished.
Classification with SVC on ParlaCAP-EN-test finished.
869 869
Classification with COMPLEMENTNB on ParlaCAP-HR-test finished.
Classification with SVC on ParlaCAP-HR-test finished.
824 824
Classification with COMPLEMENTNB on ParlaCAP-BA-test finished.
Classification with SVC on ParlaCAP-BA-test finished.
874 874
Classification with COMPLEMENTNB on ParlaCAP-RS-test finished.
Classification with SVC on ParlaCAP-RS-test finished.
