Import all necessary libraries and install everything you need for prediction:

In [1]:
# Import the libraries necessary for data wrangling, prediction and result analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from numba import cuda
from itertools import islice
import time
from tqdm import tqdm

In [2]:
# Install transformers
# (this needs to be done on Kaggle each time you start the session)
#!pip install -q transformers

# Install the simpletransformers
#!pip install -q simpletransformers
from simpletransformers.classification import ClassificationModel

# Install wandb
#!pip install -q wandb
import wandb

# Login to wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtajak[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
import os
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process,
# before any model is loaded or any prediction is run.
# NOTE(review): presumably this silences the tokenizers parallelism/fork warning during prediction - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Load the MaCoCu-mk sample: a tab-separated file whose first column is used as the index.
# The texts to classify are in the "text" column.

dataset = pd.read_csv("MaCoCu-mk-sample.csv", sep="\t", index_col = 0)
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054


In [5]:
dataset.describe(include="all")

Unnamed: 0,domain,url,text,doc,length
count,10000,10000,10000,10000,10000.0
unique,1000,10000,10000,10000,
top,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",
freq,10,1,1,1,
mean,,,,,407.2226
std,,,,,867.246509
min,,,,,76.0
25%,,,,,128.0
50%,,,,,213.0
75%,,,,,408.0


## Apply classifier

Models used:
- FTD classifier - original FTD data (except multi-labeled texts and non-text texts) - 10 categories, 849 instances in training data
- GINCO-downcast classifier - used primary_level_4 downcasted GINCO labels - 9 labels. It was trained on 601 texts.
- CORE-main classifier - main categories only - 9 labels. All texts with multiple labels were discarded. It was trained on 10256 instances.
- GINCO X-GENRE classifier - 9 X-GENRE labels. It was trained on 535 texts (10% texts discarded - belonging to "discarded" labels)
- FTD X-GENRE classifier - 7 X-GENRE labels. It was trained on 630 texts (23% texts were discarded).
- CORE X-GENRE classifier - 9 X-GENRE labels. It was trained on 607 texts - large changes to the dataset were performed (change of distribution, taking only a sample to have a similar size as FTD and GINCO).
- X-GENRE classifier - 9 X-GENRE labels. Trained on the training splits of all of the X-GENRE datasets mentioned above: 1772 instances in the training dataset.


### Functions

In [6]:
def define_model(model_name):
	"""
	Download the chosen classifier from Weights & Biases and load it.

	A Wandb run is started in the project that holds the classifier, the model
	artifact is downloaded to a local folder and the local save is loaded as an
	"xlmroberta" ClassificationModel.

	Args:
	- model_name: choose from "FTD", "GINCO", "CORE", "GINCO-X-GENRE", "FTD-X-GENRE", "CORE-X-GENRE", "X-GENRE"

	Returns:
	- the loaded ClassificationModel

	Raises:
	- ValueError: if model_name is not one of the supported names.
	"""
	# model_name -> (Wandb project, artifact name with its version)
	model_registry = {
		"FTD": ("FTD-learning-manual-hyperparameter-search", "FTD-classifier:v1"),
		"GINCO": ("GINCO-hyperparameter-search", "GINCO-downcast-classifier:v0"),
		"CORE": ("CORE-hyperparameter-search", "CORE-main-classifier:v0"),
		"GINCO-X-GENRE": ("X-GENRE classifiers", "SI-GINCO-X-GENRE-classifier:v0"),
		"FTD-X-GENRE": ("X-GENRE classifiers", "FTD-X-GENRE-classifier:v0"),
		"CORE-X-GENRE": ("X-GENRE classifiers", "CORE-X-GENRE-classifier:v0"),
		"X-GENRE": ("X-GENRE classifiers", "X-GENRE-classifier:v0"),
	}

	# Fail early and clearly, before any Wandb run is started or anything is downloaded.
	if model_name not in model_registry:
		raise ValueError(
			"Unknown model_name {!r}. Choose from: {}.".format(model_name, ", ".join(model_registry)))

	project, artifact_name = model_registry[model_name]

	# Initialize Wandb:
	run = wandb.init(project=project, entity="tajak", name="applying-predictions-on-MaCoCu")

	# Load the model from Wandb:
	artifact = run.use_artifact("tajak/{}/{}".format(project, artifact_name), type='model')
	artifact_dir = artifact.download()

	# Loading a local save
	model = ClassificationModel(
		"xlmroberta", artifact_dir)

	return model

In [7]:
def predict(dataset, model, model_name, batch_size=8, output_path=None):
	"""
	This function takes the dataset and applies the trained model on it to infer predictions.
	It saves the results to a new tab-separated file.

	The dataset is modified in place: three columns are added to it, named after the model:
	- "<model_name>": the predicted label of each text,
	- "<model_name>_label_distribution": a dictionary {label: softmax score rounded to 4 decimals},
	- "<model_name>_confidence": the softmax score of the most probable label.

	Args:
	- dataset (pandas DataFrame): dataset to apply prediction on. The text we want to predict should be in the column "text".
	- model: the model to use
	- model_name: the name of the classifier
	- batch_size: number of texts sent to the model at once (default: 8)
	- output_path: where to save the results. If None, the file "MaCoCu-sl-sample-prediction-<model_name>" is used.

	Returns:
	- the same dataset object, with the three new columns

	Raises:
	- ValueError: if batch_size is smaller than 1.
	"""
	if batch_size < 1:
		raise ValueError("batch_size must be at least 1, got {!r}.".format(batch_size))

	# Silence the model
	model.args.silent = True

	labels = model.args.labels_list

	# Split the texts into batches
	texts = list(dataset.text)
	batches = [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]

	# An empty dataset gives no batches; report 0 instead of failing on batches[0].
	first_batch_size = len(batches[0]) if batches else 0
	print("The dataset is split into {} batches of {} texts.".format(len(batches), first_batch_size))

	# Apply softmax to the raw outputs
	def softmax(x):
		# Compute softmax values for a set of scores in x.
		# The maximum is subtracted first so that np.exp cannot overflow to inf (which would give nan);
		# the result is mathematically the same.
		x = np.asarray(x)
		exponentials = np.exp(x - np.max(x, axis=0))
		return exponentials / np.sum(exponentials, axis=0)

	y_pred = []
	y_distr = []
	most_probable = []

	print(f"Prediction with model {model_name} started.")
	start_time = time.time()

	for batch in tqdm(batches):
		output = model.predict(batch)
		# output[0]: predicted labels, output[1]: raw outputs - one set of scores per text
		y_pred.extend(output[0])

		for raw_scores in output[1]:
			distr = softmax(raw_scores)
			y_distr.append({label: round(score, 4) for label, score in zip(labels, distr)})
			# Also add the information for the softmax of the most probable category ("certainty")
			most_probable.append(distr.max())

	prediction_time = round((time.time() - start_time)/60,2)

	# Guard the per-instance time against division by zero on an empty dataset.
	instance_count = dataset.shape[0]
	time_per_instance = prediction_time/instance_count if instance_count else 0

	print("Prediction with model {} completed. It took {} minutes for {} instances - {} minutes per one instance.".format(model_name, prediction_time, instance_count, time_per_instance))

	dataset[f"{model_name}"] = y_pred
	dataset[f"{model_name}_label_distribution"] = y_distr
	dataset[f"{model_name}_confidence"] = most_probable

	# Save the new dataframe which contains the y_pred values as well
	# NOTE(review): the default name says "sl" and has no file extension, although this notebook
	# processes the "mk" sample - kept unchanged for backward compatibility; pass output_path to override.
	if output_path is None:
		output_path = "MaCoCu-sl-sample-prediction-{}".format(model_name)
	dataset.to_csv(output_path, sep="\t")

	return dataset

### Prediction

#### FTD

In [8]:
# Download and load the FTD classifier
FTD = define_model("FTD")

# Define labels
# NOTE(review): this list is not used anywhere else in the visible notebook; its order matches
# the order of the keys shown in the FTD_label_distribution column - confirm whether it is still needed.
FTD_labels = [7, 8, 0, 1, 6, 5, 2, 4, 3, 9]

# Label name -> numeric label id
ftd_mapping = {
	'A1 (argumentative)': 0,
	'A11 (personal)': 1,
	'A12 (promotion)': 2,
	'A14 (academic)': 3,
	'A16 (information)': 4,
	'A17 (review)': 5,
	'A4 (fiction)': 6,
	'A7 (instruction)': 7,
	'A8 (news)': 8,
	'A9 (legal)': 9,
}

# Numeric label id -> label name (the inverse of ftd_mapping)
ftd_mapping_reverse = {label_id: label_name for label_name, label_id in ftd_mapping.items()}

[34m[1mwandb[0m: Downloading large artifact FTD-classifier:v1, 1081.90MB. 8 files... Done. 0:0:0


In [9]:
# Predict FTD to the dataset
dataset = predict(dataset, FTD, "FTD")

The dataset is split into 1250 batches of 8 texts.
Prediction with model FTD started.


100%|██████████| 1250/1250 [23:58<00:00,  1.15s/it]


Prediction with model FTD completed. It took 23.98 minutes for 10000 instances - 0.002398 minutes per one instance.


In [10]:
dataset.head(3)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,5,"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,2,"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037
2,in2.mk,http://in2.mk/,"Од крајот на месец Ноември 2009 година, своето...","<doc id=""macocu.mk.25"" title=""IN2 - ПOЧЕTHA CT...",162,2,"{7: 0.0102, 8: 0.0147, 0: 0.0178, 1: 0.0055, 6...",0.902866


In [11]:
# Map the FTD labels to names
dataset["FTD"] = [ftd_mapping_reverse[x] for x in dataset["FTD"]]

dataset.head(3)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,A17 (review),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,A12 (promotion),"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037
2,in2.mk,http://in2.mk/,"Од крајот на месец Ноември 2009 година, своето...","<doc id=""macocu.mk.25"" title=""IN2 - ПOЧЕTHA CT...",162,A12 (promotion),"{7: 0.0102, 8: 0.0147, 0: 0.0178, 1: 0.0055, 6...",0.902866


In [12]:
# At the end of each prediction and before downloading the next model, delete the previous one from the folder to release space.

# Remove previous classifier
%rm -rf wandb
%rm -rf artifacts

In [13]:
dataset["FTD"].value_counts()

A12 (promotion)       4226
A8 (news)             1970
A1 (argumentative)    1343
A16 (information)     1199
A7 (instruction)       447
A17 (review)           388
A9 (legal)             208
A11 (personal)         114
A14 (academic)          58
A4 (fiction)            47
Name: FTD, dtype: int64

In [14]:
# Save the dataset with results
dataset.to_csv("MaCoCu-mk_with_predictions.csv", sep="\t")

#### Loop through other classifiers

In [11]:
# Open the dataset
dataset = pd.read_csv("MaCoCu-mk_with_predictions.csv", sep="\t", index_col = 0)
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence
0,zsc.si,http://www.zsc.si/,"Nacionalna varnost je preresna stvar, da bi jo...","<doc id=""macocu.si.283"" title=""ZSC - Zveza slo...",200,A8 (news),"{7: 0.0329, 8: 0.285, 0: 0.1495, 1: 0.0462, 6:...",0.285015,List of Summaries/Excerpts,"{'Forum': 0.0038, 'News/Reporting': 0.0289, 'O...",0.921079,Narrative,"{'Interactive Discussion': 0.003, 'Narrative':...",0.617481
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196


In [15]:
def prediction_pipeline(model_name, dataset):
	"""
	Download a classifier, apply it to the dataset and delete the downloaded files.

	The predictions are added to the dataset in place by predict(), so the dataset
	passed in contains the new columns afterwards; it is also returned.

	Args:
	- model_name: choose from ["GINCO", "CORE", "GINCO-X-GENRE", "FTD-X-GENRE", "CORE-X-GENRE", "X-GENRE"]
	- dataset (pandas DataFrame): dataset to apply prediction on

	Returns:
	- the dataset with the prediction columns of the chosen classifier
	"""
	# Local import, so that this function does not depend on another notebook cell for it.
	import shutil

	# Define the model
	model = define_model(model_name)

	# Predict genre labels for the texts in the dataset
	dataset = predict(dataset, model, model_name)

	# Before downloading the next model, delete the previous one to release space.
	# shutil.rmtree replaces the "%rm -rf" magics: it also works outside IPython and on all platforms,
	# and ignore_errors=True keeps the "-f" behaviour (a missing folder is not an error).
	for leftover_folder in ("wandb", "artifacts"):
		shutil.rmtree(leftover_folder, ignore_errors=True)

	return dataset

In [17]:
prediction_pipeline("GINCO", dataset)

[34m[1mwandb[0m: Downloading large artifact GINCO-downcast-classifier:v0, 1081.89MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model GINCO started.


100%|██████████| 1250/1250 [27:43<00:00,  1.33s/it]


Prediction with model GINCO completed. It took 27.72 minutes for 10000 instances - 0.0027719999999999997 minutes per one instance.


In [18]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,A17 (review),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622,List of Summaries/Excerpts,"{'Forum': 0.0036, 'News/Reporting': 0.0147, 'O...",0.919893
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,A12 (promotion),"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037,List of Summaries/Excerpts,"{'Forum': 0.0037, 'News/Reporting': 0.0111, 'O...",0.780482


In [19]:
dataset["GINCO"].value_counts()

News/Reporting                3136
Promotion                     2439
Information/Explanation       1772
List of Summaries/Excerpts     835
Opinion/Argumentation          827
Instruction                    488
Other                          322
Forum                          108
Legal/Regulation                73
Name: GINCO, dtype: int64

In [21]:
prediction_pipeline("CORE", dataset)

[34m[1mwandb[0m: Downloading large artifact CORE-main-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model CORE started.


100%|██████████| 1250/1250 [22:33<00:00,  1.08s/it]


Prediction with model CORE completed. It took 22.56 minutes for 10000 instances - 0.0022559999999999998 minutes per one instance.


In [22]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,A17 (review),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622,List of Summaries/Excerpts,"{'Forum': 0.0036, 'News/Reporting': 0.0147, 'O...",0.919893,Opinion,"{'Interactive Discussion': 0.0085, 'Narrative'...",0.541413
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,A12 (promotion),"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037,List of Summaries/Excerpts,"{'Forum': 0.0037, 'News/Reporting': 0.0111, 'O...",0.780482,Informational Description/Explanation,"{'Interactive Discussion': 0.001, 'Narrative':...",0.950632


In [23]:
dataset["CORE"].value_counts()

Informational Description/Explanation    5809
Narrative                                2376
Informational Persuasion                  736
Opinion                                   448
How-To/Instructional                      417
Interactive Discussion                     92
Spoken                                     82
Lyrical                                    40
Name: CORE, dtype: int64

In [25]:
prediction_pipeline("GINCO-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact SI-GINCO-X-GENRE-classifier:v0, 1081.88MB. 44 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model GINCO-X-GENRE started.


100%|██████████| 1250/1250 [23:19<00:00,  1.12s/it]


Prediction with model GINCO-X-GENRE completed. It took 23.32 minutes for 10000 instances - 0.002332 minutes per one instance.


In [26]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,A17 (review),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622,List of Summaries/Excerpts,"{'Forum': 0.0036, 'News/Reporting': 0.0147, 'O...",0.919893,Opinion,"{'Interactive Discussion': 0.0085, 'Narrative'...",0.541413,Opinion/Argumentation,"{'Information/Explanation': 0.0055, 'Opinion/A...",0.861316
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,A12 (promotion),"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037,List of Summaries/Excerpts,"{'Forum': 0.0037, 'News/Reporting': 0.0111, 'O...",0.780482,Informational Description/Explanation,"{'Interactive Discussion': 0.001, 'Narrative':...",0.950632,Promotion,"{'Information/Explanation': 0.0014, 'Opinion/A...",0.99089


In [27]:
dataset["GINCO-X-GENRE"].value_counts(normalize=True)

News                       0.3051
Promotion                  0.2900
Information/Explanation    0.1865
Other                      0.0756
Opinion/Argumentation      0.0632
Instruction                0.0576
Legal                      0.0112
Forum                      0.0098
Prose/Lyrical              0.0010
Name: GINCO-X-GENRE, dtype: float64

In [29]:
prediction_pipeline("FTD-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact FTD-X-GENRE-classifier:v0, 1081.88MB. 44 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model FTD-X-GENRE started.


100%|██████████| 1250/1250 [22:47<00:00,  1.09s/it]


Prediction with model FTD-X-GENRE completed. It took 22.8 minutes for 10000 instances - 0.00228 minutes per one instance.


In [30]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,A17 (review),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622,List of Summaries/Excerpts,"{'Forum': 0.0036, 'News/Reporting': 0.0147, 'O...",0.919893,Opinion,"{'Interactive Discussion': 0.0085, 'Narrative'...",0.541413,Opinion/Argumentation,"{'Information/Explanation': 0.0055, 'Opinion/A...",0.861316,Promotion,"{'Promotion': 0.526, 'Opinion/Argumentation': ...",0.525988
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,A12 (promotion),"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037,List of Summaries/Excerpts,"{'Forum': 0.0037, 'News/Reporting': 0.0111, 'O...",0.780482,Informational Description/Explanation,"{'Interactive Discussion': 0.001, 'Narrative':...",0.950632,Promotion,"{'Information/Explanation': 0.0014, 'Opinion/A...",0.99089,Promotion,"{'Promotion': 0.9479, 'Opinion/Argumentation':...",0.947886


In [31]:
dataset["FTD-X-GENRE"].value_counts(normalize=True)

Promotion                  0.4376
News                       0.2644
Information/Explanation    0.1938
Instruction                0.0479
Legal                      0.0312
Opinion/Argumentation      0.0165
Prose/Lyrical              0.0086
Name: FTD-X-GENRE, dtype: float64

In [33]:
prediction_pipeline("CORE-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact CORE-X-GENRE-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model CORE-X-GENRE started.


100%|██████████| 1250/1250 [23:10<00:00,  1.11s/it]


Prediction with model CORE-X-GENRE completed. It took 23.17 minutes for 10000 instances - 0.002317 minutes per one instance.


In [34]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,...,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence,CORE-X-GENRE,CORE-X-GENRE_label_distribution,CORE-X-GENRE_confidence
0,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",864,A17 (review),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",0.643622,List of Summaries/Excerpts,"{'Forum': 0.0036, 'News/Reporting': 0.0147, 'O...",...,0.541413,Opinion/Argumentation,"{'Information/Explanation': 0.0055, 'Opinion/A...",0.861316,Promotion,"{'Promotion': 0.526, 'Opinion/Argumentation': ...",0.525988,Opinion/Argumentation,"{'Other': 0.0953, 'Information/Explanation': 0...",0.419943
1,qs.mk,https://www.qs.mk/,"Парадата на гордоста се враќа под слоганот ""Во...","<doc id=""macocu.mk.5"" title=""КВИР СКВЕР Скопје...",1054,A12 (promotion),"{7: 0.0056, 8: 0.0162, 0: 0.0263, 1: 0.0047, 6...",0.904037,List of Summaries/Excerpts,"{'Forum': 0.0037, 'News/Reporting': 0.0111, 'O...",...,0.950632,Promotion,"{'Information/Explanation': 0.0014, 'Opinion/A...",0.99089,Promotion,"{'Promotion': 0.9479, 'Opinion/Argumentation':...",0.947886,News,"{'Other': 0.1212, 'Information/Explanation': 0...",0.533915


In [35]:
dataset["CORE-X-GENRE"].value_counts(normalize=True)

Information/Explanation    0.3621
News                       0.3090
Instruction                0.1286
Opinion/Argumentation      0.1270
Forum                      0.0430
Prose/Lyrical              0.0161
Other                      0.0142
Name: CORE-X-GENRE, dtype: float64

In [37]:
prediction_pipeline("X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact X-GENRE-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1250 batches of 8 texts.
Prediction with model X-GENRE started.


100%|██████████| 1250/1250 [23:09<00:00,  1.11s/it]


Prediction with model X-GENRE completed. It took 23.15 minutes for 10000 instances - 0.002315 minutes per one instance.


In [38]:
# View the final dataset
dataset.describe(include="all")

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,...,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence,CORE-X-GENRE,CORE-X-GENRE_label_distribution,CORE-X-GENRE_confidence,X-GENRE,X-GENRE_label_distribution,X-GENRE_confidence
count,10000,10000,10000,10000,10000.0,10000,10000,10000.0,10000,10000,...,10000.0,10000,10000,10000.0,10000,10000,10000.0,10000,10000,10000.0
unique,1000,10000,10000,10000,,10,10000,,9,10000,...,,7,10000,,7,10000,,9,4619,
top,gg.mk,https://gg.mk/,"Тоа е тоа, екипа. Претпоследната епизода ја на...","<doc id=""macocu.mk.4"" title=""GG.MK - е-спорт &...",,A12 (promotion),"{7: 0.0152, 8: 0.1352, 0: 0.0334, 1: 0.0402, 6...",,News/Reporting,"{'Forum': 0.0036, 'News/Reporting': 0.0147, 'O...",...,,Promotion,"{'Promotion': 0.526, 'Opinion/Argumentation': ...",,Information/Explanation,"{'Other': 0.0953, 'Information/Explanation': 0...",,News,"{'Other': 0.0001, 'Information/Explanation': 0...",
freq,10,1,1,1,,4226,1,,3136,1,...,,4376,1,,3621,1,,3420,791,
mean,,,,,407.2226,,,0.707988,,,...,0.920332,,,0.773138,,,0.615557,,,0.974817
std,,,,,867.246509,,,0.213832,,,...,0.136157,,,0.182064,,,0.239739,,,0.081538
min,,,,,76.0,,,0.162058,,,...,0.277827,,,0.220517,,,0.147283,,,0.324877
25%,,,,,128.0,,,0.527706,,,...,0.937103,,,0.641003,,,0.393718,,,0.996886
50%,,,,,213.0,,,0.766465,,,...,0.984124,,,0.829714,,,0.641162,,,0.998792
75%,,,,,408.0,,,0.903888,,,...,0.989249,,,0.938752,,,0.851866,,,0.99899


In [39]:
dataset["X-GENRE"].value_counts()

News                       3420
Promotion                  2660
Information/Explanation    1802
Opinion/Argumentation       762
Instruction                 654
Other                       258
Legal                       243
Forum                       138
Prose/Lyrical                63
Name: X-GENRE, dtype: int64

In [40]:
dataset.columns

Index(['domain', 'url', 'text', 'doc', 'length', 'FTD',
       'FTD_label_distribution', 'FTD_confidence', 'GINCO',
       'GINCO_label_distribution', 'GINCO_confidence', 'CORE',
       'CORE_label_distribution', 'CORE_confidence', 'GINCO-X-GENRE',
       'GINCO-X-GENRE_label_distribution', 'GINCO-X-GENRE_confidence',
       'FTD-X-GENRE', 'FTD-X-GENRE_label_distribution',
       'FTD-X-GENRE_confidence', 'CORE-X-GENRE',
       'CORE-X-GENRE_label_distribution', 'CORE-X-GENRE_confidence', 'X-GENRE',
       'X-GENRE_label_distribution', 'X-GENRE_confidence'],
      dtype='object')

In [41]:
# Save the final dataset with results
dataset.to_csv("MaCoCu-mk_with_predictions.csv", sep="\t")