Import all necessary libraries and install everything you need for training:

In [19]:
# install the libraries necessary for data wrangling, prediction and result analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from numba import cuda
from itertools import islice
import time
from tqdm import tqdm

In [2]:
# Install transformers
# (this needs to be done on Kaggle each time you start the session)
#!pip install -q transformers

# Install the simpletransformers
#!pip install -q simpletransformers
from simpletransformers.classification import ClassificationModel

# Install wandb
#!pip install -q wandb
import wandb

# Login to wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtajak[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Import the MaCoCu sample

dataset = pd.read_csv("MaCoCu-sl-sample.csv", sep="\t", index_col = 0)
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124


In [5]:
dataset.describe(include="all")

Unnamed: 0,domain,url,text,doc,length
count,10010,10010,10010,10010,10010.0
unique,1001,9991,10010,10010,
top,ahp.si,https://psj.ff.uni-lj.si/obvestila,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",
freq,10,2,1,1,
mean,,,,,412.45025
std,,,,,891.6065
min,,,,,76.0
25%,,,,,122.0
50%,,,,,213.0
75%,,,,,418.0


## Apply classifier

Models used:
- FTD classifier - original FTD data (except multi-labeled texts and non-text texts) - 10 categories, 849 instances in training data
- GINCO-downcast classifier - used primary_level_4 downcasted GINCO labels - 9 labels. It was trained on 601 texts.
- CORE-main classifier - main categories only - 9 instances. All texts with multiple labels were discarded. It was trained on 10256 instances.
- GINCO X-GENRE classifier - 9 X-GENRE labels. It was trained on 535 texts (10% texts discarded - belonging to "discarded" labels)
- FTD X-GENRE classifier - 7 X-GENRE labels. It was trained on 630 texts (23% texts were discarded).
- CORE X-GENRE classifier - 9 X-GENRE labels. It was trained on 607 texts - large changes to the dataset were performed (change of distribution, taking only a sample to have a similar size as FTD and GINCO).
- X-GENRE classifier - 9 X-GENRE labels. Trained on the training splits of all of the X-GENRE datasets mentioned above: 1772 instances in the training dataset.


### Functions

In [29]:
def define_model(model_name):
	"""
	Define which model you want to use and download it.
	Args:
	- model_name: choose from "FTD", "GINCO", "CORE", "GINCO-X-GENRE", "FTD-X-GENRE", "CORE-X-GENRE", "X-GENRE"
	"""
	if model_name == "FTD":
		# Initialize Wandb: 
		run = wandb.init(project="FTD-learning-manual-hyperparameter-search", entity="tajak", name="applying-predictions-on-MaCoCu")

		# Load the FTD model from Wandb:
		artifact = run.use_artifact('tajak/FTD-learning-manual-hyperparameter-search/FTD-classifier:v1', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)

	elif model_name == "GINCO":
		# Initialize Wandb: 
		run = wandb.init(project="GINCO-hyperparameter-search", entity="tajak", name="applying-predictions-on-MaCoCu")

		#To load the GINCO-downcast model from Wandb:
		artifact = run.use_artifact('tajak/GINCO-hyperparameter-search/GINCO-downcast-classifier:v0', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)
	elif model_name == "CORE":
		# Initialize Wandb: 
		run = wandb.init(project="CORE-hyperparameter-search", entity="tajak", name="applying-predictions-on-MaCoCu")

		#To load the CORE-main model from Wandb:
		artifact = run.use_artifact('tajak/CORE-hyperparameter-search/CORE-main-classifier:v0', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)
	elif model_name == "GINCO-X-GENRE":
		# Initialize Wandb - for X-GENRE classifiers: 
		run = wandb.init(project="X-GENRE classifiers", entity="tajak", name="applying-predictions-on-MaCoCu")

		#To load the GINCO-X-GENRE model from Wandb:
		artifact = run.use_artifact('tajak/X-GENRE classifiers/SI-GINCO-X-GENRE-classifier:v0', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)
	elif model_name == "FTD-X-GENRE":
		# Initialize Wandb - for X-GENRE classifiers: 
		run = wandb.init(project="X-GENRE classifiers", entity="tajak", name="applying-predictions-on-MaCoCu")

		#To load the FTD-X-GENRE model from Wandb:
		artifact = run.use_artifact('tajak/X-GENRE classifiers/FTD-X-GENRE-classifier:v0', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)
	elif model_name == "CORE-X-GENRE":
		# Initialize Wandb - for X-GENRE classifiers: 
		run = wandb.init(project="X-GENRE classifiers", entity="tajak", name="applying-predictions-on-MaCoCu")

		# To access the model from the Wandb:
		artifact = run.use_artifact('tajak/X-GENRE classifiers/CORE-X-GENRE-classifier:v0', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)
	elif model_name == "X-GENRE":
		# Initialize Wandb - for X-GENRE classifiers: 
		run = wandb.init(project="X-GENRE classifiers", entity="tajak", name="applying-predictions-on-MaCoCu")

		# To access the model from the Wandb:
		artifact = run.use_artifact('tajak/X-GENRE classifiers/X-GENRE-classifier:v0', type='model')
		artifact_dir = artifact.download()

		# Loading a local save
		model = ClassificationModel(
			"xlmroberta", artifact_dir)

	return model

In [20]:
def predict(dataset, model, model_name):
	"""
	This function takes the dataset and applies the trained model on it to infer predictions.
	It saves the results to a new csv file.

	Args:
	- dataset (pandas DataFrame): dataset to apply prediction on. The text we want to predict should be in the column "text".
	- model: the model to use
	- model_name: the name of the classifier
	"""
	# Silence the model
	model.args.silent = True

	labels = model.args.labels_list

	# Split the dataframe into batches

	def chunk(arr_range, arr_size):
		arr_range = iter(arr_range)
		return iter(lambda: tuple(islice(arr_range, arr_size)), ())

	batches_list = list(chunk(dataset.text, 8))

	batches_list_new = []

	for i in batches_list:
		batches_list_new.append(list(i))

	print("The dataset is split into {} batches of {} texts.".format(len(batches_list_new),len(batches_list_new[0])))

	# Apply softmax to the raw outputs
	def softmax(x):
		#Compute softmax values for each sets of scores in x.
		return np.exp(x) / np.sum(np.exp(x), axis=0)

	y_pred = []
	y_distr = []
	most_probable = []
	batch_counter = 0

	print(f"Prediction with model {model_name} started.")
	start_time = time.time()

	for i in tqdm(batches_list_new):
		output = model.predict(i)
		current_y_pred = output[0]
		current_y_distr = output[1]
		current_y_distr_softmax = []
		current_y_distr_most_probable = []
		for i in current_y_distr:
			distr = softmax(i)
			distr_dict = {labels[i]: round(distr[i],4) for i in range(len(labels))}
			current_y_distr_softmax.append(distr_dict)
			# Also add the information for the softmax of the most probable category ("certainty")
			distr_sorted = np.sort(distr)
			current_y_distr_most_probable.append(distr_sorted[-1])

		for i in current_y_pred:
			y_pred.append(i)

		for i in current_y_distr_softmax:
			y_distr.append(i)

		for i in current_y_distr_most_probable:
			most_probable.append(i)

		batch_counter += 1
		#print("Batch {} predicted.".format(batch_counter))

	prediction_time = round((time.time() - start_time)/60,2)

	print("Prediction with model {} completed. It took {} minutes for {} instances - {} minutes per one instance.".format(model_name, prediction_time, dataset.shape[0], prediction_time/dataset.shape[0]))

	dataset[f"{model_name}"] = y_pred
	dataset[f"{model_name}_label_distribution"] = y_distr
	dataset[f"{model_name}_confidence"] = most_probable

	# Save the new dataframe which contains the y_pred values as well
	dataset.to_csv("MaCoCu-sl-sample-prediction-{}".format(model_name), sep="\t")

	return dataset

### Prediction

#### FTD

In [10]:
FTD = define_model("FTD")

# Define labels
FTD_labels = [7, 8, 0, 1, 6, 5, 2, 4, 3, 9]

ftd_mapping = {'A1 (argumentative)': 0, 'A11 (personal)': 1, 'A12 (promotion)': 2, 'A14 (academic)': 3, 'A16 (information)': 4, 'A17 (review)': 5, 'A4 (fiction)': 6, 'A7 (instruction)': 7, 'A8 (news)': 8, 'A9 (legal)': 9}

ftd_mapping_reverse = {list(ftd_mapping.values())[i]: list(ftd_mapping.keys())[i] for i in range(len(list(ftd_mapping.values())))}

[34m[1mwandb[0m: Downloading large artifact FTD-classifier:v1, 1081.90MB. 8 files... Done. 0:0:0


In [11]:
# Predict FTD to the dataset
dataset = predict(dataset, FTD, "FTD", FTD_labels)

The dataset is split into 1252 batches of 8 texts.
Prediction with model FTD started.
Batch 1 predicted.
Batch 2 predicted.
Batch 3 predicted.
Batch 4 predicted.
Batch 5 predicted.
Batch 6 predicted.
Batch 7 predicted.
Batch 8 predicted.
Batch 9 predicted.
Batch 10 predicted.
Batch 11 predicted.
Batch 12 predicted.
Batch 13 predicted.
Batch 14 predicted.
Batch 15 predicted.
Batch 16 predicted.
Batch 17 predicted.
Batch 18 predicted.
Batch 19 predicted.
Batch 20 predicted.
Batch 21 predicted.
Batch 22 predicted.
Batch 23 predicted.
Batch 24 predicted.
Batch 25 predicted.
Batch 26 predicted.
Batch 27 predicted.
Batch 28 predicted.
Batch 29 predicted.
Batch 30 predicted.
Batch 31 predicted.
Batch 32 predicted.
Batch 33 predicted.
Batch 34 predicted.
Batch 35 predicted.
Batch 36 predicted.
Batch 37 predicted.
Batch 38 predicted.
Batch 39 predicted.
Batch 40 predicted.
Batch 41 predicted.
Batch 42 predicted.
Batch 43 predicted.
Batch 44 predicted.
Batch 45 predicted.
Batch 46 predicted.
Bat

In [12]:
dataset.head(3)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,2,"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,2,"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781
2,osi.si,https://www.osi.si/,OSI je podjetje z dolgoletnimi izkušnjami na p...,"<doc id=""macocu.si.712"" title=""OSI sistemske i...",173,2,"{7: 0.0132, 8: 0.0125, 0: 0.0184, 1: 0.006, 6:...",0.899391


In [14]:
# Map the FTD labels to names
dataset["FTD"] = [ftd_mapping_reverse[x] for x in dataset["FTD"]]

dataset.head(3)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,A12 (promotion),"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781
2,osi.si,https://www.osi.si/,OSI je podjetje z dolgoletnimi izkušnjami na p...,"<doc id=""macocu.si.712"" title=""OSI sistemske i...",173,A12 (promotion),"{7: 0.0132, 8: 0.0125, 0: 0.0184, 1: 0.006, 6:...",0.899391


In [16]:
# At the end of each prediction and before downloading the next model, delete the previous one from the folder to release space.

# Remove previous classifier
%rm -rf wandb
%rm -rf artifacts

In [18]:
# Save the dataset with results
dataset.to_csv("MaCoCu-sl_with_predictions.csv", sep="\t")

#### Loop through other classifiers

In [32]:
def prediction_pipeline(model_name, dataset):
	"""Choose from ["GINCO", "CORE", "GINCO-X-GENRE", "FTD-X-GENRE", "CORE-X-GENRE", "X-GENRE"]"""

	# Define the model
	model = define_model(model_name)

	# Predict genre labels to the dataset
	dataset = predict(dataset, model, model_name)

	# Before downloading the next model, delete the previous one to release space.
	%rm -rf wandb
	%rm -rf artifacts

In [33]:
prediction_pipeline("GINCO", dataset)




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: Downloading large artifact GINCO-downcast-classifier:v0, 1081.89MB. 8 files... Done. 0:0:0


The dataset is split into 1252 batches of 8 texts.
Prediction with model GINCO started.


100%|██████████| 1252/1252 [32:14<00:00,  1.54s/it]


Prediction with model GINCO completed. It took 32.23 minutes for 10010 instances - 0.0032197802197802194 minutes per one instance.


In [34]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,A12 (promotion),"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997,Promotion,"{'Forum': 0.0017, 'News/Reporting': 0.0019, 'O...",0.954012
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409


In [37]:
prediction_pipeline("CORE", dataset)

[34m[1mwandb[0m: Downloading large artifact CORE-main-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1252 batches of 8 texts.
Prediction with model CORE started.


100%|██████████| 1252/1252 [23:54<00:00,  1.15s/it]


Prediction with model CORE completed. It took 23.9 minutes for 10010 instances - 0.0023876123876123874 minutes per one instance.


In [38]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,A12 (promotion),"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997,Promotion,"{'Forum': 0.0017, 'News/Reporting': 0.0019, 'O...",0.954012,Informational Description/Explanation,"{'Interactive Discussion': 0.0013, 'Narrative'...",0.958831
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196


In [39]:
dataset.CORE.value_counts(normalize=True)

Informational Description/Explanation    0.670130
Informational Persuasion                 0.122577
Narrative                                0.096803
How-To/Instructional                     0.051648
Opinion                                  0.036563
Interactive Discussion                   0.012587
Spoken                                   0.005994
Lyrical                                  0.003696
Name: CORE, dtype: float64

In [41]:
prediction_pipeline("GINCO-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact SI-GINCO-X-GENRE-classifier:v0, 1081.88MB. 44 files... Done. 0:0:0


The dataset is split into 1252 batches of 8 texts.
Prediction with model GINCO-X-GENRE started.


100%|██████████| 1252/1252 [24:20<00:00,  1.17s/it]


Prediction with model GINCO-X-GENRE completed. It took 24.35 minutes for 10010 instances - 0.002432567432567433 minutes per one instance.


In [42]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,A12 (promotion),"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997,Promotion,"{'Forum': 0.0017, 'News/Reporting': 0.0019, 'O...",0.954012,Informational Description/Explanation,"{'Interactive Discussion': 0.0013, 'Narrative'...",0.958831,Promotion,"{'Information/Explanation': 0.001, 'Opinion/Ar...",0.992724
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196,Promotion,"{'Information/Explanation': 0.0029, 'Opinion/A...",0.984429


In [43]:
dataset["GINCO-X-GENRE"].value_counts(normalize=True)

Promotion                  0.481718
Information/Explanation    0.146354
News                       0.125175
Opinion/Argumentation      0.090310
Instruction                0.072428
Other                      0.059640
Forum                      0.013586
Legal                      0.009291
Prose/Lyrical              0.001499
Name: GINCO-X-GENRE, dtype: float64

In [45]:
prediction_pipeline("FTD-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact FTD-X-GENRE-classifier:v0, 1081.88MB. 44 files... Done. 0:0:0


The dataset is split into 1252 batches of 8 texts.
Prediction with model FTD-X-GENRE started.


100%|██████████| 1252/1252 [23:54<00:00,  1.15s/it]


Prediction with model FTD-X-GENRE completed. It took 23.9 minutes for 10010 instances - 0.0023876123876123874 minutes per one instance.


In [46]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,GINCO_confidence,CORE,CORE_label_distribution,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,A12 (promotion),"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997,Promotion,"{'Forum': 0.0017, 'News/Reporting': 0.0019, 'O...",0.954012,Informational Description/Explanation,"{'Interactive Discussion': 0.0013, 'Narrative'...",0.958831,Promotion,"{'Information/Explanation': 0.001, 'Opinion/Ar...",0.992724,Promotion,"{'Promotion': 0.9176, 'Opinion/Argumentation':...",0.917588
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",0.919409,Informational Description/Explanation,"{'Interactive Discussion': 0.0251, 'Narrative'...",0.650196,Promotion,"{'Information/Explanation': 0.0029, 'Opinion/A...",0.984429,Promotion,"{'Promotion': 0.4754, 'Opinion/Argumentation':...",0.475421


In [47]:
dataset["FTD-X-GENRE"].value_counts(normalize=True)

Promotion                  0.654645
Information/Explanation    0.157642
News                       0.063536
Instruction                0.058242
Opinion/Argumentation      0.031069
Legal                      0.027173
Prose/Lyrical              0.007692
Name: FTD-X-GENRE, dtype: float64

In [49]:
prediction_pipeline("CORE-X-GENRE", dataset)

[34m[1mwandb[0m: Downloading large artifact CORE-X-GENRE-classifier:v0, 1081.88MB. 8 files... Done. 0:0:0


The dataset is split into 1252 batches of 8 texts.
Prediction with model CORE-X-GENRE started.


100%|██████████| 1252/1252 [24:19<00:00,  1.17s/it]


Prediction with model CORE-X-GENRE completed. It took 24.32 minutes for 10010 instances - 0.0024295704295704294 minutes per one instance.


In [50]:
dataset.head(2)

Unnamed: 0,domain,url,text,doc,length,FTD,FTD_label_distribution,FTD_confidence,GINCO,GINCO_label_distribution,...,CORE_confidence,GINCO-X-GENRE,GINCO-X-GENRE_label_distribution,GINCO-X-GENRE_confidence,FTD-X-GENRE,FTD-X-GENRE_label_distribution,FTD-X-GENRE_confidence,CORE-X-GENRE,CORE-X-GENRE_label_distribution,CORE-X-GENRE_confidence
0,ahp.si,https://ahp.si/,Bolnica za živali Postojna\nBolnica za živali ...,"<doc id=""macocu.si.221"" title=""Domov | Bolnica...",841,A12 (promotion),"{7: 0.0115, 8: 0.009, 0: 0.0149, 1: 0.0047, 6:...",0.917997,Promotion,"{'Forum': 0.0017, 'News/Reporting': 0.0019, 'O...",...,0.958831,Promotion,"{'Information/Explanation': 0.001, 'Opinion/Ar...",0.992724,Promotion,"{'Promotion': 0.9176, 'Opinion/Argumentation':...",0.917588,Information/Explanation,"{'Other': 0.0322, 'Information/Explanation': 0...",0.550356
1,pas.si,https://www.pas.si/,Prezračevanje prostorov je nujno zaradi najman...,"<doc id=""macocu.si.390"" title=""Preverjeno - Ak...",124,A12 (promotion),"{7: 0.1273, 8: 0.0115, 0: 0.08, 1: 0.0467, 6: ...",0.446781,List of Summaries/Excerpts,"{'Forum': 0.0041, 'News/Reporting': 0.0195, 'O...",...,0.650196,Promotion,"{'Information/Explanation': 0.0029, 'Opinion/A...",0.984429,Promotion,"{'Promotion': 0.4754, 'Opinion/Argumentation':...",0.475421,Opinion/Argumentation,"{'Other': 0.0831, 'Information/Explanation': 0...",0.521125


In [51]:
dataset["CORE-X-GENRE"].value_counts(normalize=True)

Information/Explanation    0.442957
Instruction                0.195804
Opinion/Argumentation      0.140559
News                       0.125075
Forum                      0.055145
Other                      0.020380
Prose/Lyrical              0.020080
Name: CORE-X-GENRE, dtype: float64

In [54]:
prediction_pipeline("X-GENRE", dataset)




Thread SenderThread:
Traceback (most recent call last):
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/internal/internal_util.py", line 54, in run
    self._run()
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/internal/internal_util.py", line 105, in _run
    self._process(record)
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/internal/internal.py", line 312, in _process
    self._sm.send(record)
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/internal/sender.py", line 237, in send
    send_handler(record)
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/internal/sender.py", line 375, in send_exit
    self._update_summary()
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/internal/sender.py", line 842, in _update_summary
    with open(summary_path, "w") as f:
FileNotFoundError: [Errno 2] No such file or directory: '/home/tajak/Genre-Datasets-Comparison/Genre-Datasets

Problem at: /tmp/ipykernel_19465/2728970569.py 76 define_model


Traceback (most recent call last):
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1711, in _atexit_cleanup
    self._on_finish()
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1829, in _on_finish
    self._backend.interface.communicate_poll_exit()
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 617, in communicate_poll_exit
    resp = self._communicate_poll_exit(poll_exit)
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 411, in _communicate_poll_exit
    result = self._communicate(rec)
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 222, in _communicate
    return self._communicate_async(rec, local=local).get(timeout=timeout)
  File "/home/tajak/anaconda3/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 227, in _commun

Exception: problem

In [None]:
# View the final dataset
dataset.describe(include="all")

In [None]:
dataset.head(2)

In [None]:
# Save the final dataset with results
dataset.to_csv("MaCoCu-sl_with_predictions.csv", sep="\t")