In [1]:
import WebNLG_xmlReader.benchmark_reader as xml_reader
import os.path
import pickle

# Dataset Cleaning and Preperations

CACAPO contains data for both pipeline and neural end-to-end structures. As this project only focuses on E2E models, we will not need a majority of the data. The code below extracts the data and makes it easier to retrieve for model fine-tuning

In [2]:
combined_train_dataset = [
                '../Data/CACAPO/en/Incidents/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/en/Sports/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/en/Stocks/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/en/Weather/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Incidents/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Sports/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Stocks/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Weather/WebNLGFormatTrain.xml']

combined_dev_dataset = [
                '../Data/CACAPO/en/Incidents/WebNLGFormatDev.xml', 
                '../Data/CACAPO/en/Sports/WebNLGFormatDev.xml', 
                '../Data/CACAPO/en/Stocks//WebNLGFormatDev.xml', 
                '../Data/CACAPO/en/Weather/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Incidents/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Sports/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Stocks/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Weather/WebNLGFormatDev.xml']

combined_test_dataset = [
                '../Data/CACAPO/en/Incidents/WebNLGFormatTest.xml', 
                '../Data/CACAPO/en/Sports/WebNLGFormatTest.xml', 
                '../Data/CACAPO/en/Stocks/WebNLGFormatTest.xml', 
                '../Data/CACAPO/en/Weather/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Incidents/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Sports/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Stocks/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Weather/WebNLGFormatTest.xml']

all_data = [combined_train_dataset, combined_dev_dataset, combined_test_dataset]

In [61]:

# create instance of benchmark class to transform xml
train_instance =  xml_reader.Benchmark()
dev_instance =  xml_reader.Benchmark()
test_instance =  xml_reader.Benchmark()



def transform_data(data):
    """
    This function calls the xml_reader code to transform the xml into a more suitable code version to use.
    The function takes in the 3 different datasets, which it then transforms and collects into usable variables.
    """
    #loop through the different datasets groups and transform the xml into usable code
    for datasets in data:
        #choose the right files
        files = xml_reader.select_files(datasets)
        
        #For each datasplit, transform the xml and store the transformation into a usable variable
        try:
            if datasets == combined_train_dataset:
                train_instance.fill_benchmark(files)
            elif datasets == combined_dev_dataset:
                dev_instance.fill_benchmark(files)
            elif datasets == combined_test_dataset:
                test_instance.fill_benchmark(files)
            
            print(f'Completed the transformation of the datasets \n')
        except:
            print("Error: The proper datasets have not been found. Please check that all dataset splits are available")
    
    return train_instance, dev_instance, test_instance


def total_data_check(data_instance, iteration):
    labels = ['Train', 'Dev', 'Test']
    print(f"Number of entries: in {labels[iteration]}:      {data_instance.entry_count()} ") 
    print(f"Number of texts: in {labels[iteration]}:      {data_instance.total_lexcount()} ")
    print(f"Number of distinct properties in {labels[iteration]}:      {len(list(data_instance.unique_p_mtriples()))}")
    print("\n")

    
def single_entry_check(data_instance):
    
    for entry in data_instance.entries:
        #print(f'entry.id        {entry.id}')
        if entry.id == 'Id1':
            print(f"Info about {entry.id} in category '{entry.category}' in size '{entry.size}':")
            print("# of lexicalisations", entry.count_lexs())
            print("Properties: ", entry.relations())
            print("RDF triples: ", entry.list_triples())
            print("Subject:", entry.modifiedtripleset.triples[0].s)
            print("Predicate:", entry.modifiedtripleset.triples[0].p)
            print("Lexicalisation:", entry.lexs[0].lex)
            #print("Another lexicalisation:", entry.lexs[1].lex)
            if entry.dbpedialinks:
                # dbpedialinks is a list where each element is a Triple instance
                print("DB link, en:", entry.dbpedialinks[0].s)  # subject in English

            print("Article text", entry.lexs[0].return_text()) 
            

def extract_data(data_instance):
    RDF_set, text_set = [], []

    for entry in data_instance.entries:
        #RDF_text_set.append((entry.list_triples(), entry.lexs[0].return_text()))

        RDF_set.append(entry.list_triples())
        text_set.append(entry.lexs[0].return_text())


    #print(len(RDF_text_set))
    return RDF_set, text_set 

def write_to_file(data, iteration, data_type):
    """
    Data = dataset
    Iteration = iteration to determine the dataset split
    data_type = is the dataset RDF or text
    """
    labels = ['Train', 'Dev', 'Test']

    #print(f"path check  {os.path.exists(f'Data/Cleaned_data/{labels[iteration]}')}")

    try:

        if (os.path.exists(f'Data/Cleaned_data/{labels[iteration]}') == False) :
            print("Entered path check")

            save_path = 'C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/'

            name_of_file = f'{labels[iteration]}_{data_type}'

            completeName = os.path.join(save_path, name_of_file+".pkl")         

            with open(completeName, 'wb') as fp:
                print(f"Entered pickle check         {completeName}")

                pickle.dump(data, fp)

    except:
        print(f'file for {labels[iteration]} already exists')


def retrieve_data(file_name):
    
    dataset_path = f"../Data/Cleaned_data/{file_name}.pkl"
    with open(dataset_path, 'rb') as f:
        dataset = pickle.load(f)
    
    return dataset

def Overal_function(data):
    transformed_train, transformed_dev, transformed_test = transform_data(data)
    # print(f'train:      {transformed_train}')
    # print(f'dev:      {transformed_dev}')
    # print(f'test:      {transformed_test}')

    combined_transformation = [transformed_train, transformed_dev, transformed_test]
    RDF_text_datasets =[]

    for iteration, dataset in enumerate(combined_transformation):
        
             
        # print(f'dataset {dataset}')
        # print(f'iteration {iteration}')
        
        #total_data_check(dataset, iteration)
        #single_entry_check(dataset)

        RDF_set, Text_set = extract_data(dataset)
        #RDF_text_datasets.append()
        write_to_file(RDF_set, iteration, 'RDF')
        write_to_file(Text_set, iteration, 'text')
        print("\n\n\n")

    #return RDF_text_datasets

In [62]:
Overal_function(all_data)

Completed the transformation of the datasets 

Completed the transformation of the datasets 

Completed the transformation of the datasets 

0
Entered path check
Entered pickle check         C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/Train_RDF.pkl
0
Entered path check
Entered pickle check         C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/Train_text.pkl




1
Entered path check
Entered pickle check         C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/Dev_RDF.pkl
1
Entered path check
Entered pickle check         C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/Dev_text.pkl




2
Entered path check
Entered pickle check         C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/Test_RDF.pkl
2
Entered path check
Entered pickle check         C:/Users/Simon/Desktop/Arria Thesis/MscThesis/Data/Cleaned_data/Test_text.pkl






# Transformer modelling

In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-win_amd64.whl (1.1 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
def tokenize_data(data):
    """
    Data is a list of tuples consisting of (RDF_triple, article text)
    """
    
    for entry in data:
        tokenizer
        

In [None]:
#data
datasets = Overal_function(all_data)
tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True)

In [None]:
from tensorflow.keras.optimizers import Adam
model.compile(optimizer=Adam(3e-5))

model.fit(tokenized_data, labels)