Import all necessary libraries and install everything you need for applying the classifiers:

In [1]:
# Import the libraries necessary for data wrangling, prediction and result analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from numba import cuda
from itertools import islice
import time
from tqdm import tqdm

In [2]:
# Install transformers
# (this needs to be done on Kaggle each time you start the session)
#!pip install -q transformers

# Install the simpletransformers
#!pip install -q simpletransformers
from simpletransformers.classification import ClassificationModel

# Install wandb
#!pip install -q wandb
import wandb

# Login to wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtajak[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
import os
# Switch off parallelism in the tokenizers library through its environment variable.
# NOTE(review): presumably set to avoid the tokenizers fork warning during prediction - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Import the MaCoCu sample
# The file is tab-separated and its first column is used as the row index.
# The texts that the classifiers are applied to are in the "text" column.

dataset = pd.read_csv("MaCoCu-sl-sample2.csv", sep="\t", index_col = 0)
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124


In [5]:
dataset.describe(include="all")

Unnamed: 0,domain,url,text,doc,length
count,10000,10000,10000,10000,10000.0
unique,1000,9974,10000,10000,
top,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",
freq,10,3,1,1,
mean,,,,,431.5055
std,,,,,1188.840319
min,,,,,76.0
25%,,,,,126.0
50%,,,,,221.0
75%,,,,,424.0


## Apply classifier

Models used:
- FTD classifier - original FTD data (except multi-labeled texts and non-text texts) - 10 categories, 849 instances in training data
- GINCO-downcast classifier - used primary_level_4 downcasted GINCO labels - 9 labels. It was trained on 601 texts.
- CORE-main classifier - main categories only - 9 labels. All texts with multiple labels were discarded. It was trained on 10256 instances.
- GINCO X-GENRE classifier - 9 X-GENRE labels. It was trained on 535 texts (10% texts discarded - belonging to "discarded" labels)
- FTD X-GENRE classifier - 7 X-GENRE labels. It was trained on 630 texts (23% texts were discarded).
- CORE X-GENRE classifier - 9 X-GENRE labels. It was trained on 607 texts - large changes to the dataset were performed (change of distribution, taking only a sample to have a similar size as FTD and GINCO).
- X-GENRE classifier - 9 X-GENRE labels. Trained on the training splits of all of the X-GENRE datasets mentioned above: 1772 instances in the training dataset.


### Functions

In [4]:
def define_model(model_name):
	"""
	Download one of the fine-tuned genre classifiers from Wandb and load it.

	Calling this function starts a Wandb run (wandb.init) in the project the
	classifier belongs to and downloads the model artifact to the local disk.

	Args:
	- model_name: choose from "FTD", "GINCO", "CORE", "GINCO-X-GENRE", "FTD-X-GENRE", "CORE-X-GENRE", "X-GENRE"

	Returns:
	- the loaded ClassificationModel ("xlmroberta" architecture)

	Raises:
	- ValueError: if model_name is not one of the supported names
	  (raised before any Wandb run is started or anything is downloaded).
	"""
	# For each classifier: (Wandb project, artifact name with version)
	model_sources = {
		"FTD": ("FTD-learning-manual-hyperparameter-search", "FTD-classifier:v1"),
		"GINCO": ("GINCO-hyperparameter-search", "GINCO-downcast-classifier:v0"),
		"CORE": ("CORE-hyperparameter-search", "CORE-main-classifier:v0"),
		"GINCO-X-GENRE": ("X-GENRE classifiers", "SI-GINCO-X-GENRE-classifier:v0"),
		"FTD-X-GENRE": ("X-GENRE classifiers", "FTD-X-GENRE-classifier:v0"),
		"CORE-X-GENRE": ("X-GENRE classifiers", "CORE-X-GENRE-classifier:v0"),
		"X-GENRE": ("X-GENRE classifiers", "X-GENRE-classifier:v0"),
	}

	if model_name not in model_sources:
		raise ValueError(
			"Unknown model name {!r}. Choose from: {}.".format(model_name, ", ".join(model_sources)))

	project, artifact_name = model_sources[model_name]

	# Initialize Wandb:
	run = wandb.init(project=project, entity="tajak", name="applying-predictions-on-MaCoCu")

	# Load the model from Wandb (the artifact path is entity/project/artifact):
	artifact = run.use_artifact(f"tajak/{project}/{artifact_name}", type='model')
	artifact_dir = artifact.download()

	# Loading a local save
	model = ClassificationModel(
		"xlmroberta", artifact_dir)

	return model

In [5]:
def predict(dataset, model, model_name, batch_size=8):
	"""
	This function takes the dataset and applies the trained model on it to infer predictions.
	It saves the results to a new tab-separated file.

	The dataset is modified in place: three columns are added to it
	("<model_name>", "<model_name>_label_distribution", "<model_name>_confidence").
	The extended dataset is also written to the file
	"MaCoCu-sl-sample-prediction-<model_name>" in the current working directory.

	Args:
	- dataset (pandas DataFrame): dataset to apply prediction on. The text we want to predict should be in the column "text".
	- model: the model to use. It has to provide args.labels_list, args.silent and
	  predict(list_of_texts), which returns the predicted labels as the first
	  element and the raw outputs (one row of scores per text) as the second one.
	- model_name: the name of the classifier (used for the column names and the file name)
	- batch_size (int): number of texts sent to the model in one call (default: 8)

	Returns:
	- the same DataFrame object, with the three prediction columns added

	Raises:
	- ValueError: if batch_size is smaller than 1
	"""
	if batch_size < 1:
		raise ValueError("batch_size must be at least 1, got {}".format(batch_size))

	# Silence the model
	model.args.silent = True

	labels = model.args.labels_list

	# Split the texts into batches
	texts = list(dataset["text"])
	batches = [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]

	# An empty dataset produces no batches at all, so report 0 instead of failing on batches[0]
	first_batch_size = len(batches[0]) if batches else 0
	print("The dataset is split into {} batches of {} texts.".format(len(batches), first_batch_size))

	# Apply softmax to the raw outputs
	def softmax(scores):
		# Compute softmax values for one row of scores.
		# The maximum is subtracted first: the result is mathematically the same,
		# but np.exp can no longer overflow to inf (which would give nan probabilities).
		scores = np.asarray(scores)
		exp_scores = np.exp(scores - np.max(scores))
		return exp_scores / np.sum(exp_scores)

	y_pred = []
	y_distr = []
	most_probable = []

	print(f"Prediction with model {model_name} started.")
	start_time = time.time()

	for batch in tqdm(batches):
		output = model.predict(batch)
		batch_pred = output[0]
		batch_raw_outputs = output[1]

		y_pred.extend(batch_pred)

		for raw_scores in batch_raw_outputs:
			distr = softmax(raw_scores)
			# Probability of each label, in the order of the model's label list
			y_distr.append({label: round(prob, 4) for label, prob in zip(labels, distr)})
			# Also add the information for the softmax of the most probable category ("certainty")
			most_probable.append(distr.max())

	prediction_time = round((time.time() - start_time)/60,2)

	n_instances = dataset.shape[0]
	# Avoid a division by zero when the dataset is empty
	time_per_instance = prediction_time/n_instances if n_instances else 0.0

	print("Prediction with model {} completed. It took {} minutes for {} instances - {} minutes per one instance.".format(model_name, prediction_time, n_instances, time_per_instance))

	dataset[f"{model_name}"] = y_pred
	dataset[f"{model_name}_label_distribution"] = y_distr
	dataset[f"{model_name}_confidence"] = most_probable

	# Save the new dataframe which contains the y_pred values as well.
	# The file name has no extension on purpose: the intermediate results are read back under exactly this name.
	dataset.to_csv("MaCoCu-sl-sample-prediction-{}".format(model_name), sep="\t")

	return dataset

### Prediction

#### FTD

In [8]:
FTD = define_model("FTD")

# Define labels
# Numeric ids of the ten FTD classes.
# NOTE(review): the order matches the key order shown in the FTD_label_distribution output, so this is presumably the label order of the model - confirm.
FTD_labels = [7, 8, 0, 1, 6, 5, 2, 4, 3, 9]

# FTD category name -> numeric id
ftd_mapping = {'A1 (argumentative)': 0, 'A11 (personal)': 1, 'A12 (promotion)': 2, 'A14 (academic)': 3, 'A16 (information)': 4, 'A17 (review)': 5, 'A4 (fiction)': 6, 'A7 (instruction)': 7, 'A8 (news)': 8, 'A9 (legal)': 9}

# Numeric id -> FTD category name (used to turn predicted ids into readable labels)
ftd_mapping_reverse = {label_id: label_name for label_name, label_id in ftd_mapping.items()}

[34m[1mwandb[0m: Downloading large artifact FTD-classifier:v1, 1081.90MB. 8 files... Done. 0:0:0


In [10]:
# Predict FTD to the dataset
dataset = predict(dataset, FTD, "FTD")

The dataset is split into 1250 batches of 8 texts.
Prediction with model FTD started.


100%|██████████| 1250/1250 [23:12<00:00,  1.11s/it]


Prediction with model FTD completed. It took 23.21 minutes for 10000 instances - 0.002321 minutes per one instance.


In [11]:
dataset.head(3)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,8,"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,2,"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781
2,fsp.si,http://fsp.si/,"Vsaka talna obloga ima svoje značilnosti, zara...","<doc id=""macocu.si.446"" title=""FSP Poslovne No...",1951,2,"{7: 0.029, 8: 0.014, 0: 0.0226, 1: 0.0084, 6: ...",0.835622


In [12]:
# Map the FTD labels to names
# Values that are not numeric ids from the mapping (e.g. names written by an earlier run of this cell)
# are kept unchanged, so the cell can be re-run without raising a KeyError.
# Only the "FTD" column is converted; the keys in "FTD_label_distribution" remain numeric ids.
dataset["FTD"] = dataset["FTD"].map(lambda label: ftd_mapping_reverse.get(label, label))

dataset.head(3)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781
2,fsp.si,http://fsp.si/,"Vsaka talna obloga ima svoje značilnosti, zara...","<doc id=""macocu.si.446"" title=""FSP Poslovne No...",1951,A12 (promotion),"{7: 0.029, 8: 0.014, 0: 0.0226, 1: 0.0084, 6: ...",0.835622


In [6]:
# At the end of each prediction and before downloading the next model, delete the previous one from the folder to release space.

# Remove previous classifier
# shutil.rmtree is used instead of the %rm magic so that the cell does not depend on a POSIX shell alias;
# ignore_errors=True keeps the "rm -rf" behaviour of not failing when a folder does not exist.
import shutil

for leftover_dir in ("wandb", "artifacts"):
	shutil.rmtree(leftover_dir, ignore_errors=True)

In [12]:
# Save the dataset with results
dataset.to_csv("MaCoCu-sl_with_predictions.csv", sep="\t")

#### Loop through other classifiers

In [11]:
# Open the dataset
# This is the intermediate file that predict() wrote after the CORE run (tab-separated, no file extension),
# so it already contains the FTD, GINCO and CORE predictions.
# NOTE: after the round-trip through the file, the *_label_distribution columns hold the text form of the dictionaries, not dict objects.
dataset = pd.read_csv("MaCoCu-sl-sample-prediction-CORE", sep="\t", index_col = 0)
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",0.921079,Narrative,"{'Interactive Discussion': 0.003, 'Narrative':...",0.617481
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196


In [13]:
def prediction_pipeline(model_name, dataset):
	"""
	Download one classifier, apply it to the dataset and delete the downloaded files again.

	Args:
	- model_name: choose from ["GINCO", "CORE", "GINCO-X-GENRE", "FTD-X-GENRE", "CORE-X-GENRE", "X-GENRE"]
	- dataset (pandas DataFrame): dataset to apply prediction on, with the texts in the column "text"

	Returns:
	- None. The results are added to the passed dataset in place by predict(),
	  which also saves them to a file.
	"""
	# Imported here so that the function does not depend on another cell having imported it
	import shutil

	# Define the model
	model = define_model(model_name)

	# Predict genre labels to the dataset (predict() adds the new columns to `dataset` itself)
	predict(dataset, model, model_name)

	# Before downloading the next model, delete the previous one to release space.
	# shutil.rmtree replaces the %rm magic, which exists only in IPython on POSIX systems;
	# ignore_errors=True keeps the "rm -rf" behaviour of not failing when a folder does not exist.
	for leftover_dir in ("wandb", "artifacts"):
		shutil.rmtree(leftover_dir, ignore_errors=True)

In [9]:
prediction_pipeline("GINCO", dataset)

[34m[1mwandb[0m: Downloading large artifact GINCO-downcast-classifier:v0, 1081.89MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model GINCO started.


100%|██████████| 1250/1250 [23:44<00:00,  1.14s/it]


Prediction with model GINCO completed. It took 23.74 minutes for 10000 instances - 0.002374 minutes per one instance.


In [10]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",0.921079
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409


In [11]:
dataset["GINCO"].value_counts()

Promotion                     4044
Information/Explanation       1428
Opinion/Argumentation         1202
News/Reporting                1202
List of Summaries/Excerpts     805
Instruction                    746
Other                          270
Forum                          223
Legal/Regulation                80
Name: GINCO, dtype: int64

In [13]:
prediction_pipeline("CORE", dataset)

[34m[1mwandb[0m: Downloading large artifact CORE-main-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model CORE started.


100%|██████████| 1250/1250 [22:32<00:00,  1.08s/it]


Prediction with model CORE completed. It took 22.53 minutes for 10000 instances - 0.0022530000000000002 minutes per one instance.


In [14]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",0.921079,Narrative,"{'Interactive Discussion': 0.003, 'Narrative':...",0.617481
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196


In [15]:
dataset["CORE"].value_counts()

Informational Description/Explanation    6476
Informational Persuasion                 1159
Narrative                                1108
How-To/Instructional                      535
Opinion                                   382
Interactive Discussion                    239
Spoken                                     59
Lyrical                                    42
Name: CORE, dtype: int64

In [14]:
prediction_pipeline("GINCO-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact SI-GINCO-X-GENRE-classifier:v0, 1081.88MB. 44 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model GINCO-X-GENRE started.


100%|██████████| 1250/1250 [19:43<00:00,  1.06it/s]


Prediction with model GINCO-X-GENRE completed. It took 19.73 minutes for 10000 instances - 0.001973 minutes per one instance.


In [15]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",0.921079,Narrative,"{'Interactive Discussion': 0.003, 'Narrative':...",0.617481,News,"{'Information/Explanation': 0.0008, 'Opinion/A...",0.99063
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196,Promotion,"{'Information/Explanation': 0.0029, 'Opinion/A...",0.984429


In [16]:
dataset["GINCO-X-GENRE"].value_counts(normalize=True)

Promotion                  0.4467
Information/Explanation    0.1467
News                       0.1352
Opinion/Argumentation      0.0993
Instruction                0.0816
Other                      0.0615
Forum                      0.0199
Legal                      0.0078
Prose/Lyrical              0.0013
Name: GINCO-X-GENRE, dtype: float64

In [18]:
prediction_pipeline("FTD-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact FTD-X-GENRE-classifier:v0, 1081.88MB. 44 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model FTD-X-GENRE started.


100%|██████████| 1250/1250 [22:34<00:00,  1.08s/it]


Prediction with model FTD-X-GENRE completed. It took 22.57 minutes for 10000 instances - 0.002257 minutes per one instance.


In [19]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",0.921079,Narrative,"{'Interactive Discussion': 0.003, 'Narrative':...",0.617481,News,"{'Information/Explanation': 0.0008, 'Opinion/A...",0.99063,News,"{'Promotion': 0.2129, 'Opinion/Argumentation':...",0.551021
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196,Promotion,"{'Information/Explanation': 0.0029, 'Opinion/A...",0.984429,Promotion,"{'Promotion': 0.4754, 'Opinion/Argumentation':...",0.475421


In [20]:
dataset["FTD-X-GENRE"].value_counts(normalize=True)

Promotion                  0.6282
Information/Explanation    0.1693
News                       0.0789
Instruction                0.0605
Opinion/Argumentation      0.0320
Legal                      0.0227
Prose/Lyrical              0.0084
Name: FTD-X-GENRE, dtype: float64

In [22]:
prediction_pipeline("CORE-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact CORE-X-GENRE-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model CORE-X-GENRE started.


100%|██████████| 1250/1250 [22:37<00:00,  1.09s/it]


Prediction with model CORE-X-GENRE completed. It took 22.62 minutes for 10000 instances - 0.002262 minutes per one instance.


In [23]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,...,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence,CORE-X-GENRE,CORE-X-GENRE_label_distribution,CORE-X-GENRE_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",...,0.617481,News,"{'Information/Explanation': 0.0008, 'Opinion/A...",0.99063,News,"{'Promotion': 0.2129, 'Opinion/Argumentation':...",0.551021,News,"{'Other': 0.0365, 'Information/Explanation': 0...",0.680891
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",...,0.650196,Promotion,"{'Information/Explanation': 0.0029, 'Opinion/A...",0.984429,Promotion,"{'Promotion': 0.4754, 'Opinion/Argumentation':...",0.475421,Opinion/Argumentation,"{'Other': 0.0831, 'Information/Explanation': 0...",0.521125


In [24]:
dataset["CORE-X-GENRE"].value_counts(normalize=True)

Information/Explanation    0.4085
Instruction                0.2019
Opinion/Argumentation      0.1444
News                       0.1343
Forum                      0.0698
Prose/Lyrical              0.0237
Other                      0.0174
Name: CORE-X-GENRE, dtype: float64

In [26]:
prediction_pipeline("X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact X-GENRE-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model X-GENRE started.


100%|██████████| 1250/1250 [22:39<00:00,  1.09s/it]


Prediction with model X-GENRE completed. It took 22.65 minutes for 10000 instances - 0.0022649999999999997 minutes per one instance.


In [27]:
# View the final dataset
dataset.describe(include="all")

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,...,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence,CORE-X-GENRE,CORE-X-GENRE_label_distribution,CORE-X-GENRE_confidence,X-GENRE,X-GENRE_label_distribution,X-GENRE_confidence
count,10000,10000,10000,10000,10000.0,10000,10000,10000.0,10000,10000,...,10000.0,10000,10000,10000.0,10000,10000,10000.0,10000,10000,10000.0
unique,1000,9974,10000,10000,,10,10000,,9,10000,...,,7,10000,,7,10000,,9,5121,
top,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",,A12 (promotion),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",,Promotion,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",...,,Promotion,"{'Promotion': 0.2129, 'Opinion/Argumentation':...",,Information/Explanation,"{'Other': 0.0365, 'Information/Explanation': 0...",,Promotion,"{'Other': 0.0002, 'Information/Explanation': 0...",
freq,10,3,1,1,,5891,1,,4044,1,...,,6282,1,,4085,1,,3875,762,
mean,,,,,431.5055,,,0.698041,,,...,0.938409,,,0.773492,,,0.560823,,,0.969197
std,,,,,1188.840319,,,0.234183,,,...,0.117327,,,0.191455,,,0.2349,,,0.091506
min,,,,,76.0,,,0.156952,,,...,0.255282,,,0.188668,,,0.149268,,,0.332393
25%,,,,,126.0,,,0.486665,,,...,0.959605,,,0.634583,,,0.347263,,,0.995489
50%,,,,,221.0,,,0.776443,,,...,0.988922,,,0.86319,,,0.537455,,,0.998672
75%,,,,,424.0,,,0.916415,,,...,0.992147,,,0.928335,,,0.783557,,,0.998982


In [30]:
dataset["X-GENRE"].value_counts()

Promotion                  3875
Information/Explanation    1810
News                       1474
Opinion/Argumentation      1185
Instruction                 860
Other                       327
Forum                       246
Legal                       165
Prose/Lyrical                58
Name: X-GENRE, dtype: int64

In [29]:
dataset.columns

Index(['domain', 'url', 'text', 'doc', 'length', 'FTD',
       'FTD_label_distribution', 'FTD_confidence', 'GINCO',
       'GINCO_label_distribution', 'GINCO_confidence', 'CORE',
       'CORE_label_distribution', 'CORE_confidence', 'GINCO-X-GENRE',
       'GINCO-X-GENRE_label_distribution', 'GINCO-X-GENRE_confidence',
       'FTD-X-GENRE', 'FTD-X-GENRE_label_distribution',
       'FTD-X-GENRE_confidence', 'CORE-X-GENRE',
       'CORE-X-GENRE_label_distribution', 'CORE-X-GENRE_confidence', 'X-GENRE',
       'X-GENRE_label_distribution', 'X-GENRE_confidence'],
      dtype='object')

In [31]:
# Save the final dataset with results
dataset.to_csv("MaCoCu-sl_sample2_with_predictions.csv", sep="\t")