In [1]:
import json
import os
import re
import csv
import numpy as np
from annotationToString import convert_annotations_to_strings

# Location of the consolidated annotation export consumed by the rest of the notebook.
annotations_folder = 'annotations'
final_annotations_file = 'final_annotations.json'

file_path = os.path.join(annotations_folder, final_annotations_file)

# The context manager closes the file on exit, so no explicit close() is needed.
with open(file_path, 'r', encoding='utf-8') as f:
    final_data = json.load(f)

In [2]:
def dataset_generator(final_data):
    """Tokenise every annotated text and tag each token with a BIO label.

    Args:
        final_data: Mapping with an ``"annotations"`` list. Each usable item is
            a pair ``(text, info)`` where ``info['entities']`` is non-empty;
            ``None`` items and items without entities are skipped.

    Returns:
        A list of rows ``[item, token, tag, sentence, patent]`` where ``item``
        is a running 0-based token index, ``tag`` is ``'O'``, ``'B-<label>'``
        or ``'I-<label>'``, ``sentence`` is a 1-based counter that restarts for
        every text, and ``patent`` is the 1-based index of the text among the
        kept annotations.
    """
    prefix_pattern = re.compile(r'\d_\d+_')
    token_pattern = re.compile(r'\w+\s?|[^\w]\s?', re.UNICODE)

    dataset_aux = []
    final_rotulation = []
    item_count = 0
    patent_count = 0

    for annotation in final_data["annotations"]:
        if annotation is None or not annotation[1]['entities']:
            continue

        patent_count += 1
        text = annotation[0]
        entities = annotation[1]

        # A text without the leading id prefix used to crash on `.group()`;
        # treat it as a zero-length prefix instead.
        prefix = prefix_pattern.match(text)
        match_length = len(prefix.group()) if prefix else 0
        text = prefix_pattern.sub('', text)
        text = re.sub(r'\r', '', text)

        # NOTE(review): presumably returns, per entity, a sequence whose item 1
        # is the label and item 2 the (start, end) character span in `text` --
        # that is how the result is consumed below; confirm against the helper.
        rotulation = convert_annotations_to_strings(text, entities, match_length)
        final_rotulation.append(rotulation)

        start = 0
        sentence = 1

        for word in token_pattern.findall(text):
            # `word` may carry one trailing whitespace character; `end` is the
            # inclusive offset of its last character.
            end = start + len(word) - 1
            pieces = word.split()
            # Whitespace-only matches (double spaces, blank lines) carry no
            # token; skip the row but keep the character offsets advancing.
            if pieces:
                dataset_aux.append([
                    item_count,
                    pieces[0],
                    'O',
                    start,
                    end,
                    sentence,
                    patent_count,
                ])
                item_count += 1
            start += len(word)
            if word == '. ':
                sentence += 1

    for row in dataset_aux:
        # Full stops are never labelled.
        if row[1] == '.':
            continue
        # `patent_count` is 1-based and `final_rotulation` grows in lockstep
        # with it, so the patent number addresses the entity list directly
        # (this stays aligned even when a text produced no tokens).
        for entity in final_rotulation[row[6] - 1]:
            span_start = entity[2][0]
            span_end = entity[2][1]
            if row[3] == span_start and row[4] <= span_end:
                row[2] = 'B-' + entity[1]
            elif row[3] >= span_start and row[4] <= span_end:
                row[2] = 'I-' + entity[1]
                break

    # Drop the character offsets (columns 3 and 4) from the public rows.
    dataset = [sublist[:3] + sublist[5:] for sublist in dataset_aux]
    return dataset

In [3]:
from numpy import floor


def partition_maker(dataset, partition_number):
    """Split patents into ``partition_number`` groups with balanced entity counts.

    Entity tokens are counted per patent and per category. Each partition is
    then filled greedily, category by category, without letting any category
    exceed ``floor(total / partition_number)``. Patents that fit nowhere are
    distributed afterwards.

    Args:
        dataset: Rows ``[item, token, tag, sentence, patent]``; rows of the
            same patent are expected to be contiguous.
        partition_number: Number of partitions to build (>= 1).

    Returns:
        A list with ``partition_number`` lists of patent ids. Ids are read
        back from a NumPy array, so they are NumPy integer scalars.

    Raises:
        ValueError: If ``partition_number`` is smaller than 1.
    """
    if partition_number < 1:
        raise ValueError("partition_number must be a positive integer")
    if not dataset:
        return [[] for _ in range(partition_number)]

    category_map = {
        'FUNCIONALIDADE': 0,
        'HARDWARE': 1,
        'SOFTWARE': 2,
        'COMUNICAÇÃO': 3,
        'PESSOA': 4
    }
    n_categories = len(category_map)

    # One counter row per patent: n_categories counts followed by the patent id.
    counters_patent = []
    count_aux = [0] * (n_categories + 1)
    current_patent = dataset[0][4]

    for data in dataset:
        if data[4] != current_patent:
            count_aux[n_categories] = current_patent
            counters_patent.append(count_aux)
            count_aux = [0] * (n_categories + 1)
            current_patent = data[4]

        # 'B-HARDWARE' / 'I-HARDWARE' -> 'HARDWARE'; 'O' is not in the map.
        category = data[2].split('-')[-1]
        if category in category_map:
            count_aux[category_map[category]] += 1

    count_aux[n_categories] = current_patent
    counters_patent.append(count_aux)

    counters_array = np.array(counters_patent)
    total_counts = np.sum(counters_array[:, :-1], axis=0)
    media = total_counts / partition_number
    target = np.floor(media)

    sub_sets = [[np.zeros(n_categories), []] for _ in range(partition_number)]

    print(media)

    # Fill from the last category in the map to the first.
    for index in range(n_categories - 1, -1, -1):
        for partition in range(partition_number):
            totals, members = sub_sets[partition]
            while totals[index] < target[index]:
                found = False
                for i, counter in enumerate(counters_array):
                    # Check whether adding this counter would unbalance
                    # any of the other categories.
                    would_balance = not np.any(totals + counter[:-1] > target)

                    if counter[index] > 0 and would_balance:
                        totals += counter[:-1]
                        members.append(counter[n_categories])
                        counters_array = np.delete(counters_array, i, 0)
                        found = True
                        break

                if not found:
                    break

    # Hand-tuned placement (partition index, leftover row) used for the
    # original 10-partition split. It is kept verbatim whenever it is
    # applicable so previously generated splits stay reproducible.
    legacy_assignment = ((9, 2), (9, 0), (6, 4), (7, 3), (8, 1), (8, 5))
    assigned_rows = set()
    if partition_number > 9 and len(counters_array) > 5:
        for partition, row in legacy_assignment:
            totals, members = sub_sets[partition]
            totals += counters_array[row][:-1]
            members.append(counters_array[row][n_categories])
            assigned_rows.add(row)

    # Any patent still unassigned goes to the currently lightest partition
    # (fewest entity tokens, then fewest patents, then lowest index) instead
    # of being dropped or triggering an IndexError.
    for row, counter in enumerate(counters_array):
        if row in assigned_rows:
            continue
        lightest = min(
            range(partition_number),
            key=lambda p: (float(sub_sets[p][0].sum()), len(sub_sets[p][1])),
        )
        totals, members = sub_sets[lightest]
        totals += counter[:-1]
        members.append(counter[n_categories])

    return [sub_set[1] for sub_set in sub_sets]
    

In [6]:
def reorder_patents(sub_sets, dataset, validation_partition=8):
    """Renumber dataset rows following the partition order and tag the split.

    Args:
        sub_sets: One list of patent ids per partition, in partition order.
        dataset: Rows ``[item, token, tag, sentence, patent]``.
        validation_partition: 1-based partition number labelled
            ``"validation"``. Higher partitions are labelled ``"test"`` and
            lower ones ``"training"``.

    Returns:
        A new list of rows
        ``[item, token, tag, sentence, patent, partition, split]`` where
        ``item`` is renumbered from 0 and ``patent`` from 1 in output order.
        The input rows are copied, not modified.
    """
    # Group the rows by patent once instead of rescanning the whole dataset
    # for every patent; row order inside each patent is preserved.
    rows_by_patent = {}
    for row in dataset:
        rows_by_patent.setdefault(row[4], []).append(row)

    new_dataset = []
    count_item = 0
    count_patent = 1

    for partition, sub_set in enumerate(sub_sets, start=1):
        if partition == validation_partition:
            split = "validation"
        elif partition > validation_partition:
            split = "test"
        else:
            split = "training"

        for patent in sub_set:
            for row in rows_by_patent.get(patent, ()):
                data_aux = row.copy()
                data_aux[0] = count_item
                data_aux[4] = count_patent
                data_aux.append(partition)
                data_aux.append(split)
                new_dataset.append(data_aux)
                count_item += 1

            # The patent number advances even when no row matched the id.
            count_patent += 1

    return new_dataset


In [7]:
# Build the token-level rows, split the patents into 10 partitions, then
# renumber the rows following the partition order.
token_rows = dataset_generator(final_data)
sub_sets = partition_maker(token_rows, 10)
dataset = reorder_patents(sub_sets, token_rows)

# Preview only the first rows; printing the whole dataset floods the output.
for data in dataset[:20]:
    print(data)

[169.3  92.5  31.3  18.4   6.3]
[0, 'Fitting', 'O', 1, 1, 1, 'training']
[1, 'room', 'O', 1, 1, 1, 'training']
[2, 'management', 'O', 1, 1, 1, 'training']
[3, 'and', 'O', 1, 1, 1, 'training']
[4, 'occupancy', 'B-SOFTWARE', 1, 1, 1, 'training']
[5, 'monitoring', 'I-SOFTWARE', 1, 1, 1, 'training']
[6, 'system', 'I-SOFTWARE', 1, 1, 1, 'training']
[7, 'for', 'O', 1, 1, 1, 'training']
[8, 'retail', 'O', 1, 1, 1, 'training']
[9, 'operations', 'O', 1, 1, 1, 'training']
[10, 'having', 'O', 1, 1, 1, 'training']
[11, 'fitting', 'O', 1, 1, 1, 'training']
[12, 'rooms', 'O', 1, 1, 1, 'training']
[13, 'wherein', 'O', 1, 1, 1, 'training']
[14, 'the', 'O', 1, 1, 1, 'training']
[15, 'system', 'O', 1, 1, 1, 'training']
[16, 'if', 'O', 1, 1, 1, 'training']
[17, 'capable', 'O', 1, 1, 1, 'training']
[18, 'of', 'O', 1, 1, 1, 'training']
[19, 'notifying', 'B-FUNCIONALIDADE', 1, 1, 1, 'training']
[20, 'staff', 'I-FUNCIONALIDADE', 1, 1, 1, 'training']
[21, 'of', 'O', 1, 1, 1, 'training']
[22, 'fitting', 'O', 1

In [10]:
# Column names matching the rows produced by reorder_patents.
headers = ["item", "token", "tag", "sentence", "patent", "partition", "trainingTest"]

# Write UTF-8 explicitly: tags such as 'COMUNICAÇÃO' are non-ASCII and the
# platform default encoding is not guaranteed to be UTF-8.
# newline='' lets the csv module control line endings itself.
with open('final_dataset.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    writer.writerow(headers)
    writer.writerows(dataset)