In [7]:
import json
import os
"""
DESC: Converts batches from the original format to the new format, which keeps only the 'requires rewrite' annotation.
* dialogs = number of dialogs to keep in a batch; use -1 to include all of them

"""

def decreas_turn_num(json_data, offset = -1):
    """Shift every ``turn_num`` in a batch by ``offset`` (default: -1).

    For each dialog, the entries under ``'dialog'`` are updated first and the
    entries under ``'annotations'`` second. The dictionaries are modified in
    place, and the same ``json_data`` object is returned.
    """
    for conversation in json_data.values():
        for section in ('dialog', 'annotations'):
            for entry in conversation[section]:
                entry['turn_num'] = entry['turn_num'] + offset

    return json_data
        

def process_json(json_data, dialogs):
    """Convert a batch from the original format to the "requires rewrite" format.

    For every dialog that is kept, the function:
      * sets ``'annotator_name'`` to ``None``;
      * replaces the entries stored under digit-only keys ("2", "3", ...) with
        an ``'annotations'`` list holding one
        ``{'turn_num', 'turn_id', 'requires rewrite': None}`` record per key;
      * removes the digit keys, ``'annotator_id'`` and ``'number_of_turns'``.

    Key "1" is removed without producing an annotation. Finally every
    ``turn_num`` is shifted by ``decreas_turn_num`` using its default offset.

    Args:
        json_data: mapping of dialog key -> dialog dict. Each dialog dict must
            contain a ``'dialog'`` list whose items carry ``'turn_id'`` and
            ``'turn_num'``.
        dialogs: maximum number of dialogs to keep (in iteration order), or
            -1 to keep all of them.

    Returns:
        A new dict holding the kept dialogs. The dialog dicts themselves are
        the caller's objects and are modified in place.

    Raises:
        KeyError: if a dialog has no ``'dialog'`` entry.
        IndexError: if a digit key points past the end of the ``'dialog'`` list.
    """
    new_dialog = {}
    for position, (dialog_key, dialog_data) in enumerate(json_data.items()):
        # -1 means "no limit"; otherwise stop once `dialogs` dialogs were taken.
        if dialogs != -1 and position >= dialogs:
            break

        dialog_data['annotator_name'] = None

        # Snapshot the digit keys first: they are removed from dialog_data below.
        digit_keys = [key for key in dialog_data if key.isdigit()]

        annotations = []
        for key in digit_keys:
            if key == "1":
                # The first turn never gets an annotation record.
                continue
            # NOTE(review): keys look like 1-based turn numbers, hence the -1
            # when indexing the 0-based 'dialog' list -- confirm with the data.
            annotations.append({
                'turn_num': int(key),
                'turn_id': dialog_data['dialog'][int(key) - 1]['turn_id'],
                'requires rewrite': None,
            })
        dialog_data['annotations'] = annotations

        # pop() with a default tolerates dialogs that lack the bookkeeping
        # fields; a plain `del` would raise KeyError on them.
        for key in ['annotator_id', 'number_of_turns'] + digit_keys:
            dialog_data.pop(key, None)

        new_dialog[dialog_key] = dialog_data

    return decreas_turn_num(new_dialog)


def convert_all_json_batches(folder_path, folder_output_path, dialog):
    """Convert every ``.json`` batch in ``folder_path`` and save the results.

    Each input file is loaded, passed through ``process_json`` and written to
    ``folder_output_path`` as ``batch_<n>_src.json`` (4-space indentation),
    where ``n`` starts at 1 and follows the sorted order of the input names.

    Args:
        folder_path: folder containing the original batch files.
        folder_output_path: folder receiving the converted files; it is
            created when it does not exist yet.
        dialog: forwarded to ``process_json`` as its ``dialogs`` limit.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # input-file -> batch-number mapping reproducible between runs.
    batch_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )

    # An empty path means "current directory", which makedirs cannot handle.
    if folder_output_path:
        os.makedirs(folder_output_path, exist_ok=True)

    for batch_num, file_name in enumerate(batch_files, start=1):
        file_path = os.path.join(folder_path, file_name)
        # Close the input before the output is opened.
        with open(file_path, 'r') as source:
            data = json.load(source)

        json_data = process_json(data, dialog)

        output_path = os.path.join(folder_output_path, f'batch_{batch_num}_src.json')
        # Write the modified data to the output file
        with open(output_path, 'w') as target:
            json.dump(json_data, target, indent=4)
   

# Input folder with the original batches and output folder for the converted ones.
# Plain string literals: there is nothing to interpolate, so no f-prefix is needed.
folder_path = "C:\\OneAI\\OneDrive - OneAI\\annotation\\requires_rewrite_only\\original-batches"
folder_output_path = "C:\\OneAI\\OneDrive - OneAI\\annotation\\requires_rewrite_only\\requires-rewrite-empty-batches-shortened"

# Convert every batch, keeping only the first 2 dialogs of each one.
convert_all_json_batches(folder_path, folder_output_path, dialog=2)


In [8]:
import json
import os

def create_batch_data(batch_num, dialog_num, turn_num):
    """Build one synthetic batch with ``dialog_num`` placeholder dialogs.

    Every dialog always contains turn 0, followed by turns ``1 .. turn_num - 1``.
    Each turn after the first also gets an annotation record whose
    ``'requires rewrite'`` value is ``None``.

    Returns:
        dict mapping ``"dialog_<i>"`` to
        ``{"dialog": [...], "annotator_name": None, "annotations": [...]}``.
    """
    def build_turn(dialog_idx, turn_idx):
        # Placeholder turn; only the question text embeds the batch number.
        return {
            'turn_num': turn_idx,
            'turn_id': f"dialog_{dialog_idx}_turn_{turn_idx}",
            "original_question": f"question_dialog_{batch_num}_{dialog_idx}_{turn_idx}",
            "original_question_id": f"question_id_{dialog_idx}_{turn_idx}",
            "answer": f"answer_{turn_idx}",
        }

    batch = {}
    for dialog_idx in range(dialog_num):
        follow_ups = range(1, turn_num)

        conversation = [build_turn(dialog_idx, 0)]
        conversation.extend(build_turn(dialog_idx, turn_idx) for turn_idx in follow_ups)

        pending = [
            {
                'turn_num': turn_idx,
                'turn_id': f"dialog_{dialog_idx}_turn_{turn_idx}",
                'requires rewrite': None,
            }
            for turn_idx in follow_ups
        ]

        batch[f"dialog_{dialog_idx}"] = {
            "dialog": conversation,
            "annotator_name": None,
            "annotations": pending,
        }
    return batch

def create_batches_and_save(output_path, batches_num, dialog_num, turn_num):
    """Generate ``batches_num`` toy batches and write each one to ``output_path``.

    Batch ``i`` (0-based) is built with
    ``create_batch_data(i, dialog_num, turn_num)`` and saved as
    ``batch_<i>.json`` with 4-space indentation.

    Args:
        output_path: destination folder; created when it does not exist yet.
        batches_num: number of batch files to generate.
        dialog_num: number of dialogs per batch.
        turn_num: forwarded to ``create_batch_data`` as the per-dialog turn count.
    """
    # An empty path means "current directory", which makedirs cannot handle.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for batch_i in range(batches_num):
        batch = create_batch_data(batch_i, dialog_num, turn_num)

        batch_output_path = os.path.join(output_path, f"batch_{batch_i}.json")
        with open(batch_output_path, 'w') as file:
            json.dump(batch, file, indent=4)

# Destination folder for the generated toy batches (relative to the working directory).
folder_output_path = "requires-rewrite-toy-batches"

# exist_ok=True keeps re-runs from failing when the folder is already there.
os.makedirs(folder_output_path, exist_ok=True)

# 3 batches, each built with 2 dialogs and turn_num=10.
create_batches_and_save(folder_output_path, batches_num=3, dialog_num=2, turn_num=10)

In [5]:
import json
import os
import pandas as pd

def read_json_file(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

def save_json_file(output_path, data):
    """Write ``data`` to ``output_path`` as JSON with 4-space indentation.

    The file is opened in ``'w'`` mode, so existing content is replaced.
    """
    with open(output_path, 'w') as handle:
        json.dump(data, handle, indent=4)

def execute_function_on_folder(input_path, output_path, function):
    """Apply ``function`` to every ``.json`` file in ``input_path`` and save the results.

    Each file is read with ``read_json_file``, transformed by ``function`` and
    written with ``save_json_file`` to ``output_path`` as
    ``batch_<n>_src.json``, where ``n`` starts at 1 and follows the sorted
    order of the input file names.

    Args:
        input_path: folder containing the input JSON files.
        output_path: folder receiving the results; created when it does not
            exist yet.
        function: callable taking the parsed JSON data and returning the
            JSON-serializable object to save.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # input-file -> batch-number mapping reproducible between runs.
    json_names = sorted(
        name for name in os.listdir(input_path) if name.endswith('.json')
    )

    # An empty path means "current directory", which makedirs cannot handle.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for batch_num, file_name in enumerate(json_names, start=1):
        data = read_json_file(os.path.join(input_path, file_name))
        new_data = function(data)
        output_file_path = os.path.join(output_path, f'batch_{batch_num}_src.json')
        # Write the modified data to the output file
        save_json_file(output_file_path, new_data)
            
def convert_xlsx_to_jsox(xlsx_file_path):
    """Read the Excel file at ``xlsx_file_path`` and return it as a dict.

    The file is loaded with ``pandas.read_excel`` and converted with
    ``DataFrame.to_dict()`` using that method's default orientation.
    """
    return pd.read_excel(xlsx_file_path).to_dict()

In [9]:
def remove_rewrites(json_data):
    """Return a copy of the batch with every per-turn annotation blanked out.

    For each dialog only three things are kept: the original ``'dialog'``
    list (same object, not copied), an ``'annotator_id'`` reset to ``None``,
    and one ``{"requires rewrite": None}`` entry per digit-only key. All other
    keys are dropped. The input mapping itself is not modified.
    """
    stripped = {}
    for dialog_id, source in json_data.items():
        entry = {"dialog": source['dialog'], "annotator_id": None}
        # Each digit key gets its own fresh placeholder dict.
        entry.update({
            name: {"requires rewrite": None}
            for name in source
            if name.isdigit()
        })
        stripped[dialog_id] = entry
    return stripped


# Blank out the annotations of every batch in the source folder and save the results.
execute_function_on_folder(
    input_path="vitalys-original-file",
    output_path="new-dataset",
    function=remove_rewrites,
)

In [20]:
xlsx_file_path = "C:\\Users\\oriro\\Projects\\requires-rewrite-tkinter\\asi-dataset\\sampled_qa_14_4.xlsx"

# Load the sheet, drop the 'Unnamed: 0' column and index the rows by 'sample_id'.
data = (
    pd.read_excel(xlsx_file_path)
    .drop(columns=['Unnamed: 0'])
    .set_index('sample_id')
)

# One entry per sample_id, each mapping column name -> value.
data_dict = data.to_dict(orient='index')


In [22]:
data_dict

{'QReCC-Train_1102_1': {'dialog_id': 'QReCC-Train_1102',
  'turn_id': 1,
  'question': 'Why did Gavin leave Titãs?',
  'response': 'Gavin stated that he was physically and mentally exhausted because of the Titas tours and album releases.',
  'need_rewrite': nan,
  'based_on_text': 'The first single from the album was " Antes de Você " ("Before You"), [15] and it received radio airplay on May 7. [16] It was featured at the Caras & Bocas (current 19pm Rede Globo telenovela ) soundtrack. [15] The second single was " Porque Eu Sei que É Amor " ( Because I Know It\'s Love ) which was featured in Cama de Gato (a Brazilian telenovela that also features the song "Pelo Avesso" as opening theme, from their 2003 album Como Estão Vocês? ) and reached #16 at Brasil Hot 100 Airplay [17] In an interview to Jornal da Tarde , and regarding the music of the new album, Bonadio stated: [18] “ They have an endless talent. I don\'t want to make just another album of Titãs, don\'t want it to be similar to th