# Preparing the dataset

Importing the necessary libraries

In [79]:
import json
import pandas as pd
from copy import deepcopy
import re
from tqdm import tqdm

In [80]:
# Load the GINCO dataset (a JSON list with one dictionary per document).
# The encoding is given explicitly so that reading the file does not depend
# on the default locale of the machine the notebook runs on.

with open("/kaggle/input/genre-identification-corpus-ginco-10/GINCO-1.0-suitable.json", encoding="utf-8") as f:
    dataset = json.load(f)

dataset[0]

 ## Extract text from paragraphs into one string: baseline text

We'll create an additional field for each document, "baseline_text", which contains the joined text of all paragraphs marked as "keep": the non-duplicates, and those duplicates that are still useful for genre identification. After joining the text, we'll empty the list of separate paragraphs to make the file smaller.

In [81]:
for instance in dataset:
    # Re-running this cell must not destroy the result of the first run:
    # processed documents have their paragraphs emptied, so joining them
    # again would overwrite "baseline_text" with an empty string.
    if "baseline_text" in instance and not instance["paragraphs"]:
        continue

    # Removing duplicates (only paragraphs flagged with "keep" are used)
    # and joining the remaining texts into one string:
    kept_texts = [p["text"] for p in instance["paragraphs"] if p["keep"]]
    baseline_text = " ".join(kept_texts)

    # Remove all symbols that raise an error in CLASSLA.
    # These are plain literals, so str.replace is enough (no regex needed):
    baseline_text = baseline_text.replace("&gt;", "").replace("&lt;", "")

    # Assigning texts to a new field:
    instance["baseline_text"] = baseline_text

    # Delete the text in paragraphs to save space:
    instance["paragraphs"] = []

    # Add a new field for text length (number of whitespace-separated words):
    instance["no_of_words"] = len(baseline_text.split())

dataset[0]

## Add text representations

Import the CLASSLA Pipeline for processing of Slovene

In [82]:
!pip install classla

In [None]:
# Load CLASSLA, fetch the Slovene ("sl") resources and build the pipeline
# object `nlp` that the processing cells below call on each text.
import classla

classla.download("sl")
nlp = classla.Pipeline("sl")

Let's first see how the pipeline works by processing the first text:

In [None]:
# Try the pipeline on the baseline text of the first document only
doc = nlp(dataset[0]["baseline_text"])

# Convert the result to plain Python objects and show the first two
# entries of the first sentence's word list
dictionary = doc.to_dict()
dictionary[0][0][:2]

In [None]:
def process_dataset_json(dataset, file_name):
    """
    Run the CLASSLA pipeline on every document and save the results as JSON.

    For each document with a non-empty text["baseline_text"] value, the text
    is processed with the module-level CLASSLA pipeline ``nlp``. The keys
    "lemmas", "upos", "xpos", "ner" and "dependency" are then added to the
    document dictionary; each holds the values for all words of the text as
    one string in which every value is followed by a single space.
    Documents with an empty "baseline_text" are reported with a printed
    message and are left out of the output file.

    The document dictionaries are updated in place, so the objects in
    ``dataset`` carry the new keys after the call.

    Be aware that if a text contains more than 3000 words, you might need to
    split the text into multiple instances to avoid overloading the memory.

    Args:
        dataset (list of dict): documents to process; each needs the key
            "baseline_text" ("id" is only used when reporting empty documents)
        file_name (str): name of the JSON file where the new dataset is written
    """
    # Key added to the document -> key of the per-word CLASSLA dictionary.
    representation_keys = {
        "lemmas": "lemma",
        "upos": "upos",
        "xpos": "xpos",
        "ner": "ner",
        "dependency": "deprel",
    }

    json_dict = []
    empty_counter = 0

    for text in tqdm(dataset):
        # Print out a notification if a document was empty and skip it.
        if not text["baseline_text"]:
            empty_counter += 1
            text_id = text.get("id")
            print(f"Document with the id {text_id} is empty. {empty_counter} of documents are empty, not processed.")
            continue

        # Process the baseline text and create plain Python objects from the result:
        nlp_dict = nlp(text["baseline_text"]).to_dict()

        # Collect the per-word dictionaries of all sentences in one flat list.
        # Element [0] of each sentence entry is the list of words
        # (presumably the entry is a (words, metadata) pair - confirm against
        # the installed CLASSLA version).
        words = [word for sentence in nlp_dict for word in sentence[0]]

        # Build all strings before touching the document, so that a word with
        # a missing key cannot leave the document half-updated.
        representations = {
            target_key: "".join(word[source_key] + " " for word in words)
            for target_key, source_key in representation_keys.items()
        }

        # Add the strings to the dataset instance:
        text.update(representations)

        # Add the object to the list of objects which will be saved into JSON
        json_dict.append(text)

    # The context manager closes the file even if json.dump raises.
    with open(file_name, "w") as new_representation_file:
        json.dump(json_dict, new_representation_file, indent="")

    print("Hooray, processing of the dataset is completed.")

In [None]:
# Try the function out on the first three documents only
process_dataset_json(dataset[:3], file_name="example.json")

In [None]:
dataset_rejected = []

def multiple_texts_from_long_text(instance, chunk_size=1000):
    """
    Splits a long text into instances with shorter texts.

    The "baseline_text" of the instance is split on whitespace and cut into
    consecutive chunks of at most chunk_size words. For every chunk, a deep
    copy of the instance is appended to the module-level dataset_rejected
    list with the following fields set:
        "short_text": True
        "position_short_text": position of the chunk in the text, starting at 1,
            so that the text can be merged again if needed
        "baseline_text": the words of the chunk joined with single spaces
        "no_of_words": number of words in the chunk

    The instance passed in is not modified. A text without any words
    produces no new instances.

    Args:
        instance (dictionary): an object from the dataset representing one document
        chunk_size (int): maximum number of words per short text (default 1000)

    Raises:
        ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = instance["baseline_text"].split()

    for position, start in enumerate(range(0, len(words), chunk_size), start=1):
        chunk = words[start:start + chunk_size]

        # Every short instance keeps all other fields of the original document.
        short_instance = deepcopy(instance)
        short_instance["short_text"] = True
        short_instance["position_short_text"] = position
        short_instance["baseline_text"] = " ".join(chunk)
        short_instance["no_of_words"] = len(chunk)
        dataset_rejected.append(short_instance)

In [None]:
# Process the split instances collected in dataset_rejected and save them to their own file.
# NOTE(review): dataset_rejected is only filled by calls to multiple_texts_from_long_text;
# no such call is visible in this part of the notebook - confirm it has been run before this cell.
process_dataset_json(dataset_rejected, file_name="GINCO-rejected-dataset.json")

Afterwards, the split instances would need to be merged back into whole documents (in the order given by "position_short_text") and added to the main processed dataset.