# Experiments with Baseline Classifiers

In this notebook, traditional non-neural classifiers (a dummy classifier, Naive Bayes, Logistic Regression, SVM and others) are trained on the X-GENRE training split and then applied to the test datasets.

In [15]:
import pandas as pd
import numpy as np
import json
import sklearn.model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.feature_extraction
from sklearn.svm import SVC
from datasets import load_dataset

In [6]:
# Load the X-GENRE dataset from Hugging Face
# (the second argument, "train", is passed as the dataset configuration name)
train = load_dataset("TajaKuzman/X-GENRE-text-genre-dataset", "train")


# Convert the "train" split of the loaded dataset into a pandas DataFrame
df_train = pd.DataFrame(train["train"])

# Print (rows, columns) as a quick sanity check of the loaded data
print(df_train.shape)

(1772, 4)


In [5]:
# Load the test datasets from local copies of the GitHub repositories
# (access to them is obtained by request to the AGILE repository owner).
# lines=True makes pandas read each line of the .jsonl files as a separate JSON record.

en_ginco = pd.read_json("../../datasets/EN-GINCO-test-dataset/EN-GINCO.jsonl", lines=True)
x_ginco = pd.read_json("../../datasets/X-GINCO-test-set/X-GINCO.jsonl", lines=True)

# Print the (rows, columns) shape of both test sets
print(en_ginco.shape, x_ginco.shape)

(272, 4) (790, 6)


In [17]:
# Preview the first row of EN-GINCO to inspect its columns
en_ginco.head(1)

Unnamed: 0,text,labels,dataset,language
0,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English


In [7]:
# Summarise every column of the training data: count, number of unique values,
# most frequent value and its frequency
df_train.describe(include="all")

Unnamed: 0,text,labels,dataset,language
count,1772,1772,1772,1772
unique,1772,9,3,2
top,Harry Potter and the Philosopher's Stone - J. ...,News,FTD,English
freq,1,344,630,1237


In [8]:
# Split the training DataFrame into the two parallel lists used with scikit-learn:
# X_train holds the texts, Y_train holds the label of each text.
X_train = df_train["text"].tolist()
Y_train = df_train["labels"].tolist()

# Both lists should have the same length
print(len(X_train), len(Y_train))

1772 1772


In [9]:
# Collect the distinct labels found in the training data
# (pandas returns unique values in order of first appearance)
labels = df_train["labels"].unique().tolist()
labels

['Other',
 'Information/Explanation',
 'News',
 'Instruction',
 'Opinion/Argumentation',
 'Forum',
 'Prose/Lyrical',
 'Legal',
 'Promotion']

## Creating scikit-learn classifiers

In [10]:
# Create a TF-IDF representation of the text
def data_iterator(f):
    """Lazily yield the items of the iterable ``f`` one at a time, in order."""
    yield from f


def tokenizer(txt):
    """Split ``txt`` on runs of whitespace and return the resulting list of tokens."""
    tokens = txt.split()
    return tokens

In [11]:
# Stream the training texts to the vectorizer
iterator = data_iterator(X_train)

# TF-IDF features over whitespace tokens. min_df=0.005 drops terms that occur in
# fewer than 0.5% of the training documents.
# token_pattern=None: the custom tokenizer replaces the default regex tokenization;
# leaving the default pattern in place makes scikit-learn warn that it is unused.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    use_idf=True,
    min_df=0.005,
)

# Learn the vocabulary and IDF weights on the training texts and build the
# document-term matrix used to fit the classifiers
d = vectorizer.fit_transform(iterator)



In [12]:
# Build one single-step pipeline for each model to be evaluated.

pipelines = []

# Wider model list, currently disabled (DummyClassifier is not imported in the import cell):
#for model in [DummyClassifier(strategy="most_frequent"), DummyClassifier(strategy="stratified"), DecisionTreeClassifier(), MultinomialNB(), ComplementNB(), LogisticRegression(), SVC(),RandomForestClassifier()]:
for model in (
    ComplementNB(),
    LogisticRegression(penalty=None),
    SVC(kernel="linear", C=2),
):
    pipelines.append(make_pipeline(model))

In [13]:
# Fit every pipeline on the TF-IDF matrix of the training texts and their labels
for pipeline in pipelines:
    pipeline.fit(d, Y_train)

In [22]:
def classify(df_test, df_test_name, pipelines = pipelines):
    """Predict labels for a test set with every pipeline and save them as JSON.

    The test texts are vectorized with the module-level ``vectorizer`` (fitted
    on the training texts), each pipeline predicts one label per text, and the
    predictions are written to
    ``submissions/submission-<MODEL>-<df_test_name>.json``.

    Args:
        df_test: DataFrame with a ``text`` column (the documents to classify)
            and a ``labels`` column (only its length is printed here).
        df_test_name: Name of the test set; stored in the JSON record and used
            in the output file name.
        pipelines: Fitted pipelines to apply. The default is bound to the
            module-level ``pipelines`` list object at definition time.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    import os  # local import so this cell does not depend on an edit to the import cell

    # List of texts in test split
    X_test = list(df_test.text)
    # List of labels in test split
    Y_test = list(df_test.labels)

    # Transform only: reuse the vocabulary and IDF weights learned on the training data
    d_test = vectorizer.transform(data_iterator(X_test))

    print(len(X_test), len(Y_test))

    # Make sure the output directory exists; otherwise open() below raises FileNotFoundError
    os.makedirs("submissions", exist_ok=True)

    for pipeline in pipelines:
        y_pred = list(pipeline.predict(d_test))

        # The name of the first pipeline step identifies the model (e.g. "COMPLEMENTNB");
        # anything from a "(" onwards is stripped from the name.
        model_name = pipeline.steps[0][0].upper()
        model_name = model_name.split("(")[0]

        # Create a json with results
        current_results = {
            "system": model_name,
            "predictions": [
                {
                    "train": "X-GENRE (train split)",
                    "test": "{}".format(df_test_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new json
        with open("submissions/submission-{}-{}.json".format(model_name, df_test_name), "w") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model_name, df_test_name))


In [24]:
# Run all fitted pipelines on the EN-GINCO test set and save their predictions
classify(en_ginco, "en-ginco", pipelines)

272 272
Classification with COMPLEMENTNB on en-ginco finished.
Classification with LOGISTICREGRESSION on en-ginco finished.
Classification with SVC on en-ginco finished.


In [25]:
# Run all fitted pipelines on the X-GINCO test set and save their predictions
classify(x_ginco, "x-ginco", pipelines)

790 790
Classification with COMPLEMENTNB on x-ginco finished.
Classification with LOGISTICREGRESSION on x-ginco finished.
Classification with SVC on x-ginco finished.
