In [1]:
import string
import json
import pandas as pd
import re
import os


def read_json_file(file_path):
    """
    Load and return the contents of a JSON file.

    Args:
        file_path (str): Path of the JSON file to read.

    Returns:
        The deserialized content, exactly as produced by `json.load`.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it `open` uses the platform default, which
    # mis-decodes non-ASCII text on systems whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json_file(output_path, data):
    """
    Write `data` to `output_path` as JSON indented by four spaces.

    Args:
        output_path (str): Path of the file to create or overwrite.
        data: Any object `json.dump` can serialize.

    Returns:
        None
    """
    # Explicit encoding keeps the writer independent of the platform default
    # (the output itself is ASCII because `json.dump` escapes non-ASCII text).
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def split_json_file(json_data):
    """
    Split a dictionary into two dictionaries, preserving entry order.

    Args:
        json_data (dict): The mapping to split.

    Returns:
        tuple: Two dictionaries. The first holds the leading
               `len(json_data) // 2` entries, the second holds all remaining
               entries (so it gets the extra entry when the size is odd).
    """
    entries = list(json_data.items())
    cut = len(entries) // 2
    return dict(entries[:cut]), dict(entries[cut:])

def process_yael_data(df, remove_no_welcome=True):
    """
    Convert a flat table of chat messages into per-chat dialog dictionaries.

    Rows are grouped by ``chat_id`` and chats are emitted in order of first
    appearance. Within a chat, rows are paired up in row order: the first row
    of each pair supplies ``original_question`` and the second supplies
    ``answer`` (the string "None" when the last question has no following
    row). When the chat has a welcome message it becomes turn 0 and pairing
    starts from row 1, so row 0 contributes only its welcome text.

    Args:
        df (DataFrame): Message table. The columns ``chat_id``,
            ``welcome_message`` and ``message`` are read. If an
            ``is_english`` column exists, chats whose first row holds a falsy
            value there are skipped.
        remove_no_welcome (bool): When True (default), chats whose first row
            has no welcome message are skipped; when False they are converted
            without a welcome turn.

    Returns:
        tuple: ``(json_data, not_enough)``. Both map a chat id to a dict with
        the keys ``number_of_turns``, ``annotator_id`` and ``dialog``
        (turn number -> turn dict). ``json_data`` holds the chats with at
        least one turn carrying the ``requires_rewrite`` / ``enough_context``
        placeholders; ``not_enough`` holds the chats with none.
    """
    chats = df.groupby('chat_id')
    # Iterating the de-duplicated column keeps first-appearance order and
    # yields plain Python scalars as dictionary keys.
    chat_ids = df['chat_id'].drop_duplicates()
    # The column set is the same for every group, so check it once.
    has_language_flag = "is_english" in df.columns

    json_data = {}
    not_enough = {}

    for position, group_name in enumerate(chat_ids):
        if position == 0:
            print(f"first group name: {group_name}")

        group_df = chats.get_group(group_name).reset_index(drop=True)
        first_row = group_df.iloc[0]

        if has_language_flag and not first_row['is_english']:
            continue

        welcome_text = first_row['welcome_message']
        # pd.isna recognises NaN (empty CSV cell) as well as None / pd.NA,
        # which a comparison of str(value) against "nan" lets through.
        has_welcome = not pd.isna(welcome_text)
        if not has_welcome and remove_no_welcome:
            print("no welcome message")
            continue

        # 1 when row 0 is consumed by the welcome turn, else 0; used both as
        # the first row to pair and as the turn-number offset.
        welcome_message = 1 if has_welcome else 0
        dialog_dict = {"number_of_turns": 0, "annotator_id": None, "dialog": {}}

        if has_welcome:
            dialog_dict["number_of_turns"] += 1
            dialog_dict["dialog"][0] = {
                "turn_num": 0,
                "original_question": welcome_text,
                "answer": "",
            }

        turns_counter = 0
        row_count = len(group_df)

        for index in range(welcome_message, row_count, 2):
            question_row = group_df.iloc[index]
            answer_row = group_df.iloc[index + 1] if index + 1 < row_count else None

            turn_num = index // 2 + welcome_message
            turn = {
                "turn_num": turn_num,
                "original_question": str(question_row['message']),
                "answer": str(answer_row['message']) if answer_row is not None else "None",
            }

            # Every turn except the very first one of a chat without a
            # welcome message gets the annotation placeholders.
            if index + welcome_message > 0:
                turn["requires_rewrite"] = None
                turn["enough_context"] = None
                turns_counter += 1

            dialog_dict["dialog"][turn_num] = turn
            dialog_dict["number_of_turns"] += 1

        if turns_counter > 0:
            json_data[group_name] = dialog_dict
        else:
            not_enough[group_name] = dialog_dict

    return json_data, not_enough

def count_dialogs_folder(input_path):
    """
    Print how many dialogs each CSV file in a folder yields, plus the total.

    Every file whose name ends with ``.csv`` is read with pandas and passed
    through `process_yael_data`; only the first element of its result (the
    dialogs with enough turns) is counted. Files are visited in sorted name
    order so the report is reproducible.

    Args:
        input_path (str): The path to the folder containing the CSV files.

    Returns:
        None
    """
    report_lines = []
    total = 0
    # os.listdir returns names in arbitrary order; sort for a stable report.
    for file_name in sorted(os.listdir(input_path)):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(input_path, file_name)
        dialogs = process_yael_data(pd.read_csv(file_path))[0]
        report_lines.append(f"file {file_name} has {len(dialogs)} dialogs\n")
        total += len(dialogs)

    header = f"working on folder {input_path}, combined number of dialogs is {total}\n"
    print(header + "".join(report_lines))

def convert_yael_files_in_folder(input_path, output_path):
    """
    Convert every ``.json`` file in a folder and save each result as a batch.

    Each input file is loaded, converted with `process_yael_data`, and the
    converted dialogs are written to ``batch_<n>_src.json`` in `output_path`,
    where ``n`` starts at 1 and follows the sorted order of the input names.

    Args:
        input_path (str): The path to the folder containing the Yael files.
        output_path (str): The path to the folder where the converted files will be saved.

    Returns:
        None
    """
    batch_num = 1
    # os.listdir returns names in arbitrary order; sort so that batch numbers
    # are assigned reproducibly.
    for file_name in sorted(os.listdir(input_path)):
        if not file_name.endswith('.json'):
            continue
        data = read_json_file(os.path.join(input_path, file_name))
        # process_yael_data calls DataFrame methods (groupby, iloc), so the
        # raw json.load result cannot be passed to it directly.
        # NOTE(review): assumes the JSON layout is one the DataFrame
        # constructor accepts (e.g. a list of row records) — confirm.
        converted, _not_enough = process_yael_data(pd.DataFrame(data))
        output_file_path = os.path.join(output_path, f'batch_{batch_num}_src.json')
        # NOTE(review): only the dialogs with enough turns are written; the
        # second element of the result is dropped — confirm this is intended.
        save_json_file(output_file_path, converted)
        batch_num += 1

def combine_csv_files(input_path):
    """
    Convert every CSV file in a folder and merge the results.

    Each file whose name ends with ``.csv`` is read with pandas and converted
    with `process_yael_data` (default options). Files are processed in sorted
    name order; when the same chat id occurs in several files, the entry from
    the later file replaces the earlier one.

    Args:
        input_path (str): The path to the directory containing the CSV files.

    Returns:
        tuple: ``(converted, not_enough)`` — two dictionaries keyed by chat
        id, merged over all files: the dialogs with enough turns and the
        dialogs without.
    """
    converted_data_all_files = {}
    not_enough_all_files = {}

    # os.listdir returns names in arbitrary order; sort so that the merge
    # (and which duplicate wins) is reproducible.
    for file_name in sorted(os.listdir(input_path)):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(input_path, file_name)
        # The second parameter of process_yael_data is the boolean
        # `remove_no_welcome`, not a file name, so rely on its default.
        converted_data_dialog, not_enough_dialog = process_yael_data(pd.read_csv(file_path))
        converted_data_all_files.update(converted_data_dialog)
        not_enough_all_files.update(not_enough_dialog)

    return converted_data_all_files, not_enough_all_files



In [2]:
# Convert and merge every CSV in the folder: `new_data` receives the dialogs
# with enough turns, `not_enough` receives the remaining ones.
new_data, not_enough = combine_csv_files("datasets_3_6/date_sorted_filtered_double")

first group name: chat_9812e6bb-a78c-475b-ad5a-dacf1d7e4f7e
first group name: chat_2837b83c-e7ef-4bf4-8c21-459d94d22fdb
first group name: chat_e9403e6d-64c7-4d08-9082-842ad1867197
first group name: chat_1ec0212f-67dd-429b-ba61-42f1e6ba1ceb
first group name: chat_ee2def6d-26ad-4434-bbef-505838f72be7
first group name: chat_7299aaff-dfbf-441a-b57c-046e176818f5
first group name: chat_726d4434-6d99-4000-8a27-5e1c45600462
no welcome message
first group name: chat_7e69d283-c3db-4e07-8571-182ec92051bb


In [44]:
import random


# NOTE: the string below is a bare expression statement, not a function
# docstring; this cell runs inline on the notebook-level `new_data`.
"""
Create a test file by shuffling the data and selecting a subset of dialogs.

Args:
    new_data (dict): A dictionary containing the data to be shuffled.

Returns:
    dict: A dictionary containing a subset of dialogs from the shuffled data.
"""

# Shuffle the (chat id, dialog) pairs. The shuffle is not seeded, so a
# different subset is selected on every run.
items = list(new_data.items())
random.shuffle(items)
suffled_data = dict(items)

# Take dialogs in shuffled order until their accumulated `number_of_turns`
# exceeds 60; the dialog that crosses the threshold is still included.
test_file = {}
dialog_test_key = []  # keys of the selected dialogs, in selection order
turn_counter = 0
for key, value in suffled_data.items():
    number_of_turns = value['number_of_turns']
    turn_counter += number_of_turns
    dialog_test_key.append(key)
    test_file[key] = value
    if turn_counter > 60:
        break
    
    
# Add an empty `needs_clarification` field to every turn of the selected
# dialogs. The turn dicts are the same objects held by `new_data`, so the
# field appears there as well.
for dialog_key, dialog_value in test_file.items():
    for turn_key, turn_value in dialog_value['dialog'].items():
        turn_value['needs_clarification'] = None


In [45]:
# Write the sampled dialogs to disk as indented JSON.
save_json_file("agent_conv_test.json", test_file)