In [1]:
# Number GPUs by PCI bus ID and expose only physical device 4 to this kernel.
# The single visible GPU is then addressed as "cuda:0" by the code below.
# NOTE: these variables have to be set before CUDA is initialised, so keep
# this cell ahead of the imports.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=4

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=4


In [2]:
import argparse
import json
import os
import sys

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification, TextClassificationPipeline, AutoTokenizer, AutoConfig


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the EN and HR ParlaCAP test sets (JSON Lines files, one record per
# line) into DataFrames, in that order.
test_en, test_hr = (
	pd.read_json(dataset_path, lines=True)
	for dataset_path in (
		"../../datasets/ParlaCAP-EN-test/ParlaMint-EN-CAP-test-dataset.jsonl",
		"../../datasets/ParlaCAP-HR-test/ParlaMint-HR-CAP-test-dataset.jsonl",
	)
)

# Show the (rows, columns) shape of each test set as a quick sanity check.
print(test_en.shape, test_hr.shape)

(440, 9) (869, 9)


In [4]:
def _split_into_batches(lst, batch_size=600):
	"""Split ``lst`` into consecutive slices of at most ``batch_size`` items."""
	return [lst[i:i + batch_size] for i in range(0, len(lst), batch_size)]


def _label_from_logits(logit, cap_labels, confidence_threshold=0.6):
	"""Turn one row of raw logits into a label, or 'Mix' when confidence is low."""
	probabilities = softmax(logit)
	max_idx = int(np.argmax(probabilities))
	if probabilities[max_idx] >= confidence_threshold:
		return cap_labels[max_idx]
	# If classifier's confidence is lower, output label "Mix"
	return 'Mix'


def apply_classifier(test_df, df_test_name, batch_size=600, confidence_threshold=0.6):
	"""Classify every text in ``test_df`` and save the predictions as JSON.

	Loads the "classla/ParlaCAP-Topic-Classifier" model and tokenizer, runs
	batched inference over ``test_df["text"]``, and writes the predicted
	labels to ``submissions/submission-ParlaCAP-<df_test_name>.json``.

	Args:
		test_df: DataFrame with a "text" column holding the inputs to classify.
		df_test_name: Name of the test set; stored in the JSON and used in the
			output file name.
		batch_size: Number of texts per forward pass. If there is a CUDA out
			of memory issue, make the batch size smaller.
		confidence_threshold: Minimum softmax probability of the top class for
			its label to be emitted; below it the prediction is "Mix".

	Returns:
		None. The predictions are written to disk.

	Raises:
		ValueError: If ``batch_size`` is smaller than 1.
	"""
	if batch_size < 1:
		raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))

	print("Loading the model ...")

	# Use the (single visible) GPU when present, otherwise fall back to CPU
	# instead of failing on the hard-coded device name.
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	cap_model = AutoModelForSequenceClassification.from_pretrained("classla/ParlaCAP-Topic-Classifier")
	cap_model.to(device)
	# Make inference mode explicit (no dropout).
	cap_model.eval()

	cap_tokenizer = AutoTokenizer.from_pretrained("classla/ParlaCAP-Topic-Classifier")
	print("Model loaded.")

	# NOTE(review): the position of each label is assumed to match the model's
	# output index order -- confirm against the model config (id2label).
	cap_labels = ["Education", "Technology", "Health", "Environment", "Housing", "Labor", "Defense", "Government Operations", "Social Welfare", "Other", "Macroeconomics", "Domestic Commerce", "Civil Rights", "International Affairs", "Transportation", "Immigration", "Law and Crime", "Agriculture", "Foreign Trade", "Culture", "Public Lands", "Energy"]

	texts = test_df["text"].to_list()
	preds = []

	print("Prediction started.")

	# Pass batch_size through explicitly so that changing it really changes
	# the size of the batches sent to the model.
	for batch in _split_into_batches(texts, batch_size):
		inputs = cap_tokenizer(batch, max_length=512, truncation=True, padding=True, return_tensors="pt").to(device)
		with torch.no_grad():
			logits = cap_model(**inputs).logits
		# One device-to-host transfer per batch rather than one per row.
		for current_logit in logits.cpu().tolist():
			preds.append(_label_from_logits(current_logit, cap_labels, confidence_threshold))

	# Create a json with results
	current_results = {
		"system": "ParlaCAP-classifier",
		"predictions": [
			{
			"train": "ParlaCAP-train",
			"test": "{}".format(df_test_name),
			"predictions": preds,
			}
		]
		}

	# Make sure the output directory exists so the run does not fail after
	# all predictions have already been computed.
	os.makedirs("submissions", exist_ok=True)

	# Save the results as a new json
	with open("submissions/submission-{}-{}.json".format("ParlaCAP", df_test_name), "w") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format("ParlaCAP", df_test_name))


In [5]:
# Classify the EN test set; the predictions are written to the submissions folder.
apply_classifier(test_df=test_en, df_test_name="ParlaCAP-EN-test")

Loading the model ...
Model loaded.
Prediction started.
Classification with ParlaCAP on ParlaCAP-EN-test finished.


In [5]:
# Classify the HR test set; the predictions are written to the submissions folder.
apply_classifier(test_df=test_hr, df_test_name="ParlaCAP-HR-test")

Loading the model ...
Model loaded.
Prediction started.
Classification with ParlaCAP on ParlaCAP-HR-test finished.
