# Experiments with Baseline Classifiers

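I train traditional non-neural classifiers on the ParlaSent training set and apply them to the English and BCS test sets. The candidates explored are a dummy classifier, Naive Bayes classifiers, Logistic Regression, SVM and others; the current run uses Complement Naive Bayes and a linear SVM.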

In [1]:
import pandas as pd
import numpy as np
import json
import sklearn.model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import sklearn.feature_extraction
from sklearn.svm import SVC
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the English and BCS test sets (JSON Lines, one record per line).
# NOTE(review): the BCS directory is spelled "ParlaSent-BSC-test" while the file
# inside it is "ParlaSent_BCS_test.jsonl" — confirm this matches the on-disk layout.
test_en = pd.read_json(
    "../../datasets/ParlaSent-EN-test/ParlaSent_EN_test.jsonl",
    lines=True,
)
test_bcs = pd.read_json(
    "../../datasets/ParlaSent-BSC-test/ParlaSent_BCS_test.jsonl",
    lines=True,
)

# Show (rows, columns) of both frames on one line.
print(*(frame.shape for frame in (test_en, test_bcs)))

(2600, 14) (2600, 14)


In [4]:
# Read the training set (JSON Lines, one record per line).
df_train = pd.read_json(
    "../../datasets/ParlaSent-train/ParlaSent_training.jsonl",
    lines=True,
)

# Report (rows, columns), then preview the first two records as the cell output.
print(df_train.shape)

df_train.head(n=2)

(13000, 16)


Unnamed: 0,text,country,annotator1,annotator2,reconciliation,labels,document_id,sentence_id,term,date,name,party,gender,birth_year,split,ruling
0,Da li je pošteno da se ukida prethodna stopa i...,HR,N_Neutral,Negative,M_Negative,Negative,17023,2536,9.0,2016-11-15,"Pupovac, Milorad",SDSS,M,1955.0,train,Opposition
1,Znam pouzdano da su među specijalnim snagama b...,SRB,Negative,Negative,Negative,Negative,14362,1648,9.0,2013-03-28,"Halimi, Riza",,M,1947.0,train,


In [5]:
# Split the training frame into two parallel lists for scikit-learn:
# X_train holds the texts, Y_train the corresponding labels.
X_train, Y_train = list(df_train["text"]), list(df_train["labels"])

# Both lists must have the same length.
print(len(X_train), len(Y_train))

13000 13000


In [6]:
# Distinct label values, in order of first appearance in the training data
labels = [*df_train["labels"].unique()]
labels

['Negative', 'Neutral', 'Positive']

## Creating scikit-learn classifiers

In [7]:
# Helpers used to create a TF-IDF representation of the text
def data_iterator(f):
    """Lazily yield every item of the iterable ``f``, one at a time."""
    yield from f


def tokenizer(txt):
    """Split ``txt`` on runs of whitespace and return the list of tokens."""
    tokens = txt.split()
    return tokens

In [8]:
# Stream the training texts into the vectorizer.
iterator = data_iterator(X_train)

# TF-IDF features over whitespace tokens; min_df=0.005 sets the minimum
# document frequency a term needs in order to be kept in the vocabulary.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tokenizer,
    use_idf=True,
    min_df=0.005,
)

# Learn the vocabulary and produce the training matrix in one pass.
d = vectorizer.fit_transform(iterator)



In [9]:
# Wrap every model to be tried in its own single-step pipeline.
# The current run uses Complement Naive Bayes and a linear-kernel SVM (C=2).
# Earlier iterations also listed DummyClassifier, DecisionTreeClassifier,
# MultinomialNB, LogisticRegression and RandomForestClassifier as candidates;
# add them to the tuple below to re-enable them (DummyClassifier is not among
# the imports in the first cell and would need importing from sklearn.dummy).
pipelines = [
    make_pipeline(model)
    for model in (ComplementNB(), SVC(kernel="linear", C=2))
]

In [10]:
# Train each pipeline on the TF-IDF training matrix and the training labels.
for pipeline in pipelines:
    pipeline.fit(d, Y_train)

In [11]:
def classify(df_test, df_test_name, pipelines = pipelines):
    """Predict labels for a test set with every pipeline and save them as JSON.

    The test texts are transformed with the module-level ``vectorizer`` and
    passed to each pipeline's ``predict``. For each pipeline one file named
    ``submissions/submission-<MODEL>-<df_test_name>.json`` is written.

    Args:
        df_test: DataFrame with a ``text`` column (model input) and a
            ``labels`` column (only its length is reported here).
        df_test_name: Name of the test set; stored in the JSON payload and
            used in the output file name.
        pipelines: Pipelines to run. Defaults to the module-level
            ``pipelines`` list as bound when this function was defined.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Local import: only needed to make sure the output directory exists.
    import os

    # Texts and labels of the test split
    X_test = list(df_test.text)
    Y_test = list(df_test.labels)

    # Reuse the already fitted vectorizer; transform() does not refit it.
    d_test = vectorizer.transform(data_iterator(X_test))

    print(len(X_test), len(Y_test))

    # A fresh checkout may not contain the output directory yet.
    os.makedirs("submissions", exist_ok=True)

    for pipeline in pipelines:
        # tolist() yields plain Python values, so json.dump also accepts
        # predictions whose labels are NumPy numeric scalars.
        y_pred = np.asarray(pipeline.predict(d_test)).tolist()

        # Name of the pipeline's first step, upper-cased (make_pipeline
        # derives step names from the estimator class name).
        model_name = pipeline.steps[0][0].upper()
        model_name = model_name.split("(")[0]

        # Results payload for this model / test-set combination
        current_results = {
            "system": model_name,
            "predictions": [
                {
                    "train": "ParlaSent",
                    "test": "{}".format(df_test_name),
                    "predictions": y_pred,
                }
            ],
        }

        # Save the results as a new JSON file
        out_path = os.path.join(
            "submissions",
            "submission-{}-{}.json".format(model_name, df_test_name),
        )
        with open(out_path, "w", encoding="utf-8") as file:
            json.dump(current_results, file)

        print("Classification with {} on {} finished.".format(model_name, df_test_name))


In [12]:
# Run every fitted pipeline on the English test set.
classify(df_test=test_en, df_test_name="ParlaSent-EN-test", pipelines=pipelines)

2600 2600
Classification with COMPLEMENTNB on ParlaSent-EN-test finished.
Classification with SVC on ParlaSent-EN-test finished.


In [13]:
# Run every fitted pipeline on the BCS test set.
classify(df_test=test_bcs, df_test_name="ParlaSent-BCS-test", pipelines=pipelines)

2600 2600
Classification with COMPLEMENTNB on ParlaSent-BCS-test finished.
Classification with SVC on ParlaSent-BCS-test finished.
