In [1]:
import json
import os
import re
import csv
from annotationToString import convert_annotations_to_strings

# Location of the consolidated annotation file, relative to the notebook.
annotations_folder = 'annotations'
final_annotations_file = 'final_annotations.json'

file_path = os.path.join(annotations_folder, final_annotations_file)

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(file_path, 'r', encoding='utf-8') as f:
    final_data = json.load(f)

In [64]:
# Instruction prompt stored as the first element of every row built by
# instruction_dataset_generator. This is a runtime string: editing it changes
# the generated dataset.
# NOTE(review): the example output opens a ```json fence that is never closed,
# and "all functionalities needs" is ungrammatical -- confirm whether the
# prompt should be corrected before the dataset is regenerated.
question = """Given the provided patent text, your goal is to identify and extract key information, organizing it into a structured JSON output. The information to be extracted includes the functionalities of the patent, all functionalities needs to start with a verb; hardware, which refers to the physical components of IT; software, which encompasses instructions, operating systems, programs, and applications installed and executed on the hardware; communication, which refers to data transmission controlled by software between distinct hardware using signals in a specific medium, including wireless; and people, which refers to those who use or are affected by IT, interacting with software, hardware, ranging from users to IT professionals.

The specific categories of entities to be identified and extracted from the text include functionalities, hardware, software, communication, and people. Each identified entity must be correctly categorized and included in the JSON output, according to the format specified below. Attention to detail is crucial to ensure that all relevant entities are captured and correctly classified. All entities must be written literally in the patent.

Expected Output Format (JSON):
```json
{
  "Functionality": ["providing smart mobility assistance", "monitor an area", "tracking data"],
  "Hardware": ["processor", "sensor", "user equipment"],
  "Software": ["mobile application server", "machine learning model", "back-end server"],
  "Communication": ["near field communication", "bluetooth", "wi-fi network"],
  "People": ["user", "persons with limited mobility", "passenger"]
}"""


In [57]:
def generate_output(patent_data):
    """Build the JSON-formatted answer string for one patent.

    Args:
        patent_data: Iterable of entities, each indexable so that item 0 is
            the entity text and item 1 is its annotation label. Only the
            labels listed in ``category_map`` are kept; entities with any
            other label are ignored.

    Returns:
        A string holding one ``"Category": [...]`` line per non-empty
        category, in the order the categories appear in ``category_map``,
        wrapped in braces. Categories without entities are omitted.
    """
    category_map = {
        "FUNCIONALIDADE": "Functionality",
        "HARDWARE": "Hardware",
        "SOFTWARE": "Software",
        "COMUNICAÇÃO": "Communication",
        "PESSOA": "People"
    }

    grouped = {category: [] for category in category_map.values()}

    for entity in patent_data:
        category = category_map.get(entity[1])
        if category:
            # json.dumps escapes quotes, backslashes and control characters so
            # the answer stays valid JSON; ensure_ascii=False keeps accented
            # characters literal, as plain quoting did.
            grouped[category].append(json.dumps(str(entity[0]), ensure_ascii=False))

    output_lines = [
        f'"{key}": [{", ".join(values)}]'
        for key, values in grouped.items()
        if values
    ]
    return "{\n" + ",\n".join(output_lines) + "\n}"

In [62]:
def instruction_dataset_generator(final_data):
    """Turn annotated patents into ``[question, context, answer]`` rows.

    Args:
        final_data: Mapping with an ``"annotations"`` list. Each item is
            either ``None`` or a pair whose first element is the patent text
            and whose second element is a mapping with an ``"entities"``
            entry. Items that are ``None`` or have no entities are skipped.

    Returns:
        A list of three-item lists: the module-level ``question`` prompt, the
        patent text with its leading id prefix and carriage returns removed,
        and the answer string produced by ``generate_output``.
    """
    id_prefix = re.compile(r'\d_\d+_')
    dataset = []

    for annotation in final_data["annotations"]:
        if annotation is None or not annotation[1]['entities']:
            continue

        text, entities = annotation[0], annotation[1]

        # A text without the id prefix is processed with a zero-length prefix
        # instead of failing on a missing match.
        prefix = id_prefix.match(text)
        match_length = len(prefix.group()) if prefix else 0

        # Strip only the leading prefix that match_length measures; the same
        # digit pattern appearing later in the patent body is part of the text.
        # NOTE(review): match_length is presumably used by
        # convert_annotations_to_strings to shift entity offsets -- confirm.
        text = text[match_length:]
        text = text.replace('\r', '')

        rotulation = convert_annotations_to_strings(text, entities, match_length)
        dataset.append([question, text, generate_output(rotulation)])

    return dataset

In [84]:
dataset = instruction_dataset_generator(final_data)

# Show the size and a short readable preview instead of printing every row:
# each row repeats the full instruction prompt, so dumping the whole dataset
# floods the cell output.
print(f"{len(dataset)} rows generated")
for prompt, context, answer in dataset[:3]:
    print(prompt, context, answer, sep="\n")
dataset[:3]

Given the provided patent text, your goal is to identify and extract key information, organizing it into a structured JSON output. The information to be extracted includes the functionalities of the patent, all functionalities needs to start with a verb; hardware, which refers to the physical components of IT; software, which encompasses instructions, operating systems, programs, and applications installed and executed on the hardware; communication, which refers to data transmission controlled by software between distinct hardware using signals in a specific medium, including wireless; and people, which refers to those who use or are affected by IT, interacting with software, hardware, ranging from users to IT professionals.

The specific categories of entities to be identified and extracted from the text include functionalities, hardware, software, communication, and people. Each identified entity must be correctly categorized and included in the JSON output, according to the format 

[['Given the provided patent text, your goal is to identify and extract key information, organizing it into a structured JSON output. The information to be extracted includes the functionalities of the patent, all functionalities needs to start with a verb; hardware, which refers to the physical components of IT; software, which encompasses instructions, operating systems, programs, and applications installed and executed on the hardware; communication, which refers to data transmission controlled by software between distinct hardware using signals in a specific medium, including wireless; and people, which refers to those who use or are affected by IT, interacting with software, hardware, ranging from users to IT professionals.\n\nThe specific categories of entities to be identified and extracted from the text include functionalities, hardware, software, communication, and people. Each identified entity must be correctly categorized and included in the JSON output, according to the fo

In [66]:
headers = ["question", "context", "answer"]

# The encoding is pinned to UTF-8 (the annotations were read as UTF-8) so the
# export does not depend on the platform's default encoding. newline='' lets
# the csv module control line endings itself.
with open('instruction_dataset.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    writer.writerow(headers)
    writer.writerows(dataset)

In [83]:
import pandas as pd

# Round-trip check: load the exported CSV and print its first rows.
file_path = 'instruction_dataset.csv'
df = pd.read_csv(file_path)

print(df.head(n=5))

                                            question  \
0  Given the provided patent text, your goal is t...   
1  Given the provided patent text, your goal is t...   
2  Given the provided patent text, your goal is t...   
3  Given the provided patent text, your goal is t...   
4  Given the provided patent text, your goal is t...   

                                             context  \
0  Fitting room management and occupancy monitori...   
1  An X-ray sensing apparatus includes a detector...   
2  Apparatuses, methods and storage medium associ...   
3  A terminal device is provided to output inform...   
4  In one example, the present disclosure describ...   

                                              answer  
0  {\n"Functionality": ["notifying staff", "track...  
1  {\n"Functionality": ["generate a plurality of ...  
2  {\n"Functionality": ["receive sensor data", "t...  
3  {\n"Functionality": ["output information", "co...  
4  {\n"Functionality": ["providing smart mobility..