# Experiments with Baseline Classifiers

I train traditional non-neural classifiers on the training data and apply them to the test data: dummy classifier, Naive Bayes classifier, Logistic Regression, SVM and others.

In [1]:
import pandas as pd
import numpy as np
import json
import sklearn.model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.feature_extraction
from sklearn.svm import SVC
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the EMMediaTopic dataset, keep only the rows of the "train" split and
# rename the "GPT-IPTC-label" column to "labels"
df_train = (
    pd.read_json(
        "../../datasets/EMMediaTopic-training-dataset/EMMediaTopic-1.0.jsonl",
        orient="records",
        lines=True,
    )
    .loc[lambda frame: frame["split"] == "train"]
    .rename(columns={"GPT-IPTC-label": "labels"})
)
print(df_train.shape)

df_train.head(2)

(20000, 5)


Unnamed: 0,document_id,lang,text,labels,split
1,CLASSLA-web.hr.2821678,hr,U organizaciji Lige protiv raka Koprivničko-kr...,health,train
2,CLASSLA-web.hr.2508108,hr,[VIDEO] Rimac: Ova pobjeda u Beogradu znači pu...,sport,train


In [3]:
# Load the IPTC test set (the test datasets come from GitHub repositories;
# access to them is obtained by request to the AGILE repository owner) and
# rename its "label" column to "labels"
iptc_test = pd.read_json(
    "../../datasets/IPTC-top-test/IPTC-top-test.jsonl", lines=True
).rename(columns={"label": "labels"})
print(iptc_test.shape)

iptc_test.head(2)

(1129, 5)


Unnamed: 0,document_id,lang,text,labels,multilabel
0,CLASSLA-web.hr.3842863,hr,Iza kamere\n\nFilmsku družinu ZAG posjetila je...,"arts, culture, entertainment and media","[arts, culture, entertainment and media]"
1,CLASSLA-web.hr.4597011,hr,Učenici osmih razreda nedavno su u sklopu tere...,education,[education]


In [4]:
# Summary statistics (count, unique, top, freq) for all columns of the training split
df_train.describe(include="all")

Unnamed: 0,document_id,lang,text,labels,split
count,20000,20000,20000,20000,20000
unique,20000,4,20000,17,1
top,CLASSLA-web.hr.2821678,hr,U organizaciji Lige protiv raka Koprivničko-kr...,sport,train
freq,1,5000,1,3066,20000


In [5]:
# Plain Python lists of the texts and of the labels in the training split;
# these are the inputs passed to scikit-learn below
X_train = df_train["text"].tolist()
Y_train = df_train["labels"].tolist()

print(len(X_train), len(Y_train))

20000 20000


In [6]:
# Distinct labels of the training split, in order of first appearance
labels = df_train["labels"].unique().tolist()
labels

['health',
 'sport',
 'education',
 'labour',
 'human interest',
 'religion',
 'society',
 'crime, law and justice',
 'disaster, accident and emergency incident',
 'arts, culture, entertainment and media',
 'politics',
 'economy, business and finance',
 'lifestyle and leisure',
 'science and technology',
 'environment',
 'weather',
 'conflict, war and peace']

## Creating scikit-learn classifiers

In [7]:
# Helper functions used to create a TF-IDF representation of the text
def data_iterator(f):
    """Lazily yield the items of the iterable ``f`` one by one, in order."""
    yield from f


def tokenizer(txt):
    """Tokenize ``txt`` by splitting it on runs of whitespace.

    Returns the list of non-empty tokens; an empty or all-whitespace string
    gives an empty list.
    """
    tokens = txt.split()
    return tokens

In [8]:
# Fit a TF-IDF vectorizer on the training texts; `d` is the resulting
# document-term matrix that the classifiers are trained on.
iterator = data_iterator(X_train)

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tokenizer,
    # A custom tokenizer is supplied, so the default token_pattern would be
    # ignored; setting it to None makes that explicit and avoids
    # scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    use_idf=True,
    # A float min_df is a proportion of documents: terms that occur in fewer
    # than 0.5% of the training documents are dropped from the vocabulary.
    min_df=0.005,
)
d = vectorizer.fit_transform(iterator)



In [12]:
# Wrap each model to be tried in its own single-step pipeline.
# Alternative model lists (commented out; DummyClassifier is not imported in the imports cell):
#   [DummyClassifier(strategy="most_frequent"), DummyClassifier(strategy="stratified"), DecisionTreeClassifier(), MultinomialNB(), ComplementNB(), LogisticRegression(), SVC(), RandomForestClassifier()]
#   [ComplementNB(), LogisticRegression(penalty=None), SVC(kernel="linear", C=2)]
pipelines = [
    make_pipeline(model)
    for model in (ComplementNB(), SVC(kernel="linear", C=2))
]

In [13]:
# Train every pipeline on the TF-IDF matrix of the training texts
for pipeline in pipelines:
    pipeline.fit(d, Y_train)

In [14]:
def classify(df_test, df_test_name, pipelines = pipelines):
    """Predict labels for a test set with every pipeline and save them as JSON.

    The test texts are vectorized with the module-level ``vectorizer``. For
    each pipeline, the predictions are written to
    ``submissions/submission-<MODEL>-<df_test_name>.json``; the ``submissions``
    directory is created if it does not exist yet.

    Args:
        df_test: DataFrame with a ``text`` and a ``labels`` column.
        df_test_name: Name of the test set; used in the output file name and
            stored in the JSON under ``"test"``.
        pipelines: Pipelines to predict with; the name of each pipeline's
            first step is used as the system name. Defaults to the
            module-level ``pipelines`` object bound when this function is
            defined.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    import os

    # List of texts in test split
    X_test = list(df_test.text)
    # List of labels in test split (only its length is reported below)
    Y_test = list(df_test.labels)

    d_test = vectorizer.transform(data_iterator(X_test))

    print(len(X_test), len(Y_test))

    # Writing the result files fails if the output directory is missing,
    # e.g. on a fresh checkout, so create it up front.
    os.makedirs("submissions", exist_ok=True)

    for pipeline in pipelines:
        y_pred = list(pipeline.predict(d_test))
        # Upper-cased name of the first pipeline step, e.g. "SVC"
        model_name = pipeline.steps[0][0].upper()
        model_name = model_name.split("(")[0]

        # Create a json with results
        current_results = {
            "system": model_name,
            "predictions": [
                {
                    "train": "EMMediaTopic",
                    "test": "{}".format(df_test_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new json
        output_path = "submissions/submission-{}-{}.json".format(model_name, df_test_name)
        with open(output_path, "w") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model_name, df_test_name))


In [15]:
# Predict on the IPTC test set with every trained pipeline and save the predictions
classify(iptc_test, "IPTC-test", pipelines)

1129 1129
Classification with COMPLEMENTNB on IPTC-test finished.
Classification with SVC on IPTC-test finished.
