# Creating final JSON datasets for X-GENRE and EN-GINCO

In [1]:
# Order CUDA devices by PCI bus id, then expose only device 7 to this kernel.
# NOTE(review): none of the cells visible here appear to use a GPU - confirm this is still needed.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=7

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=7


In [2]:
import pandas as pd
import json

In [3]:
# Paths of the input files: the three X-GENRE splits and the EN-GINCO set
train, dev, test = "X-GENRE-train.csv", "X-GENRE-dev.csv", "X-GENRE-test.csv"
en_ginco_path = "EN-GINCO-X-GENRE.csv"

In [4]:
# Load the X-GENRE splits (tab-separated) and EN-GINCO (default comma separator);
# the first column of every file is used as the index
train_df, dev_df, test_df = (
	pd.read_csv(csv_path, sep="\t", index_col=0) for csv_path in (train, dev, test)
)
en_ginco = pd.read_csv(en_ginco_path, index_col=0)

In [5]:
# Preview the first rows and the summary statistics of every loaded frame.
# display() is used instead of print() so the tables are rendered the same way
# as in the other inspection cells of this notebook.
for df in [train_df, dev_df, test_df, en_ginco]:
	display(df.head(2))
	display(df.describe(include="all"))

                                                text labels
0  Seeking All Things Brilliant "I want people to...  Other
1  Meet Orchid du Bois I first met Hayley Mowday ...  Other
                                                     text labels
count                                                1772   1772
unique                                               1772      9
top     Seeking All Things Brilliant "I want people to...   News
freq                                                    1    344
                                                text                 labels
0  Yesterday, for the first time in 9 years, I em...  Opinion/Argumentation
1  Enjoy fun science experiments for kids that fe...            Instruction
                                                     text labels
count                                                 592    592
unique                                                592      9
top     Yesterday, for the first time in 9 years, I em...   News
freq   

In [6]:
# Identify which texts from the X-GENRE are from GINCO, FTD and CORE.
# For each source dataset: the file holding each of its splits and the language
# of its texts. Texts found in neither GINCO nor FTD stay attributed to CORE.
datasets = {
	"GINCO": {
		"train": "GINCO-SL-X-GENRE-train.csv",
		"test": "GINCO-SL-X-GENRE-test.csv",
		"dev": "GINCO-SL-X-GENRE-dev.csv",
		"lang": "Slovenian"},
	"FTD": {
		"train": "FTD-X-GENRE-train.txt",
		"test": "FTD-X-GENRE-test.txt",
		"dev": "FTD-X-GENRE-dev.txt",
		"lang": "English",
	}
}

# X-GENRE frame that corresponds to each split name used in `datasets`
x_genre_splits = {"train": train_df, "test": test_df, "dev": dev_df}

# First set all the values to CORE and English
for df in [train_df, test_df, dev_df]:
	df["dataset"] = "CORE"
	df["language"] = "English"

for dataset in ["GINCO", "FTD"]:
	for split in ["train", "test", "dev"]:
		# GINCO files are read with the default separator, FTD files are tab-separated
		if dataset == "GINCO":
			df = pd.read_csv(datasets[dataset][split], index_col=0)
		else:
			df = pd.read_csv(datasets[dataset][split], index_col=0, sep="\t")
		print(df.shape)

		# Mark every X-GENRE text of this split that also occurs in the source
		# file. A single vectorised membership test replaces the per-row scan of
		# the whole source column, and one code path serves all three splits.
		target_df = x_genre_splits[split]
		from_source = target_df["text"].isin(df["text"])
		target_df.loc[from_source, "dataset"] = dataset
		target_df.loc[from_source, "language"] = datasets[dataset]["lang"]


(535, 2)
(179, 2)
(179, 2)
(630, 2)
(210, 2)
(210, 2)


In [8]:
# Give EN-GINCO the same provenance columns as the X-GENRE splits
for column, value in {"dataset": "EN-GINCO", "language": "English"}.items():
	en_ginco[column] = value

In [22]:
# Inspect EN-GINCO: first rows, summary statistics, and the distribution of the
# dataset and language columns
display(
	en_ginco.head(2),
	en_ginco.describe(include="all"),
	en_ginco["dataset"].value_counts(),
	en_ginco["language"].value_counts(),
)

Unnamed: 0,text,labels,dataset,language,length
0,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English,43
1,Why graft thrives in postconflict zones <p> A ...,News,EN-GINCO,English,1016


Unnamed: 0,text,labels,dataset,language,length
count,272,272,272,272,272.0
unique,272,9,1,1,
top,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English,
freq,1,64,272,272,
mean,,,,,387.246324
std,,,,,394.726672
min,,,,,24.0
25%,,,,,116.75
50%,,,,,251.5
75%,,,,,516.75


EN-GINCO    272
Name: dataset, dtype: int64

English    272
Name: language, dtype: int64

In [14]:
# Add the text length, measured as the number of whitespace-separated tokens
for df in [train_df, dev_df, test_df, en_ginco]:
	df["length"] = [len(text.split()) for text in df["text"]]
	display(df.head(3))

Unnamed: 0,text,labels,dataset,language,length
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English,1310
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English,1429
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English,4610


Unnamed: 0,text,labels,dataset,language,length
0,"Yesterday, for the first time in 9 years, I em...",Opinion/Argumentation,CORE,English,915
1,Enjoy fun science experiments for kids that fe...,Instruction,CORE,English,225
2,It is said that human resource professionals o...,Instruction,CORE,English,848


Unnamed: 0,text,labels,dataset,language,length
0,"grantarctic1 Never caught one Neels, but off t...",Forum,CORE,English,323
1,Go back to Windows 7 or XP operating system If...,Forum,CORE,English,571
2,Almost a quarter of Republicans think Obama 'm...,News,CORE,English,1458


Unnamed: 0,text,labels,dataset,language,length
0,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English,43
1,Why graft thrives in postconflict zones <p> A ...,News,EN-GINCO,English,1016
2,Social Trip <p> On the evening of Wednesday 15...,Promotion,EN-GINCO,English,409


In [19]:
# Report the size of each dataset in words and in texts, and accumulate the totals
total_size = 0
total_size_texts = 0

for df in [train_df, dev_df, test_df, en_ginco]:
	n_words = df["length"].sum()
	n_texts = df.shape[0]

	print("words:")
	print(n_words)
	print("texts:")
	print(n_texts)

	total_size += n_words
	total_size_texts += n_texts

print("total size in words:{}".format(total_size))
print("total size in texts:{}".format(total_size_texts))

words:
1940317
texts:
1772
words:
583595
texts:
592
words:
798025
texts:
592
words:
105331
texts:
272
total size in words:3427268
total size in texts:3228


In [11]:
# For every frame show the first rows, the summary statistics, and how many
# texts belong to each source dataset and language
for df in [train_df, dev_df, test_df, en_ginco]:
	display(
		df.head(2),
		df.describe(include="all"),
		df["dataset"].value_counts(),
		df["language"].value_counts(),
	)

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English


Unnamed: 0,text,labels,dataset,language
count,1772,1772,1772,1772
unique,1772,9,3,2
top,"Seeking All Things Brilliant ""I want people to...",News,FTD,English
freq,1,344,630,1237


FTD      630
CORE     607
GINCO    535
Name: dataset, dtype: int64

English      1237
Slovenian     535
Name: language, dtype: int64

Unnamed: 0,text,labels,dataset,language
0,"Yesterday, for the first time in 9 years, I em...",Opinion/Argumentation,CORE,English
1,Enjoy fun science experiments for kids that fe...,Instruction,CORE,English


Unnamed: 0,text,labels,dataset,language
count,592,592,592,592
unique,592,9,3,2
top,"Yesterday, for the first time in 9 years, I em...",News,FTD,English
freq,1,115,210,413


FTD      210
CORE     203
GINCO    179
Name: dataset, dtype: int64

English      413
Slovenian    179
Name: language, dtype: int64

Unnamed: 0,text,labels,dataset,language
0,"grantarctic1 Never caught one Neels, but off t...",Forum,CORE,English
1,Go back to Windows 7 or XP operating system If...,Forum,CORE,English


Unnamed: 0,text,labels,dataset,language
count,592,592,592,592
unique,592,9,3,2
top,"grantarctic1 Never caught one Neels, but off t...",News,FTD,English
freq,1,114,210,413


FTD      210
CORE     203
GINCO    179
Name: dataset, dtype: int64

English      413
Slovenian    179
Name: language, dtype: int64

Unnamed: 0,text,labels,dataset,language
0,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English
1,Why graft thrives in postconflict zones <p> A ...,News,EN-GINCO,English


Unnamed: 0,text,labels,dataset,language
count,272,272,272,272
unique,272,9,1,1
top,Welcome to KBismarck.org! This is a community ...,Information/Explanation,EN-GINCO,English
freq,1,64,272,272


EN-GINCO    272
Name: dataset, dtype: int64

English    272
Name: language, dtype: int64

In [15]:
# Write each dataset to its own JSON Lines file (one record per line)
for out_path, frame in {
	"X-GENRE-train.jsonl": train_df,
	"X-GENRE-test.jsonl": test_df,
	"X-GENRE-dev.jsonl": dev_df,
	"EN-GINCO.jsonl": en_ginco,
}.items():
	frame.to_json(out_path, lines=True, orient="records")

In [2]:
# Load the published dataset back from the Hugging Face Hub
from datasets import load_dataset

# Before, sign in to HuggingFace by running huggingface-cli login in the command line
repo_id = "TajaKuzman/X-GENRE-multilingual-text-genre-dataset"
dataset = load_dataset(repo_id)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 13.7k/13.7k [00:00<00:00, 8.06MB/s]


Downloading and preparing dataset json/TajaKuzman--X-GENRE-multilingual-text-genre-dataset to /home/tajak/.cache/huggingface/datasets/TajaKuzman___json/TajaKuzman--X-GENRE-multilingual-text-genre-dataset-25a88d679af75105/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data: 100%|██████████| 11.2M/11.2M [00:00<00:00, 51.6MB/s]
Downloading data: 100%|██████████| 4.57M/4.57M [00:00<00:00, 6.51MB/s]
Downloading data: 100%|██████████| 3.36M/3.36M [00:00<00:00, 3.94MB/s]
Downloading data files: 100%|██████████| 3/3 [00:07<00:00,  2.54s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1595.20it/s]
                                                        

DatasetGenerationError: An error occurred while generating the dataset

In [36]:
# Show the splits and features of the object currently bound to `dataset`
dataset

DatasetDict({
    test: Dataset({
        features: ['sentence', 'country', 'annotator1', 'label', 'document_id', 'sentence_id', 'term', 'date', 'name', 'party', 'gender', 'birth_year', 'ruling'],
        num_rows: 5200
    })
})