In [20]:
import json
import pandas as pd
import random
import os

def check_if_ordered_stayed_the_same(original_yael_data, processed_annotation_data, not_enough_dialog=None, processed=False):
    if not processed:
        original_order = list(dict.fromkeys(original_yael_data['chat_id'].tolist()))
    else:
        original_order = list(dict.fromkeys(original_yael_data.keys()))
        
    if not_enough_dialog:
        original_order = [dialog_id for dialog_id in original_order if dialog_id not in not_enough_dialog.keys()]
    processed_order = list(processed_annotation_data.keys())

    if original_order == processed_order:
        print("The order stayed the same")
        return True
    else:
        print("The order changed")
        print(f"Length: {len(original_order)} | Original order: {original_order}")
        print(f"Length: {len(processed_order)} | Processed order: {processed_order}")
        changed_order_details(original_order, processed_order)
        return False

def changed_order_details(original_order, processed_order):
    original_set = set(original_order)
    processed_set = set(processed_order)

    if original_set == processed_set:
        print("Both orders have the same items when considered as sets.")
    else:
        print("The sets of items are different.")
        print(f"Items in original but not in processed: {original_set - processed_set}")
        print(f"Items in processed but not in original: {processed_set - original_set}")

    # Find where the orders differ
    min_length = min(len(original_order), len(processed_order))
    differences = []
    for i in range(min_length):
        if original_order[i] != processed_order[i]:
            differences.append((i, original_order[i], processed_order[i]))

    if differences:
        print("Differences in order at positions:")
        for index, original, processed in differences:
            print(f"Position {index}: Original: {original}, Processed: {processed}")
    else:
        if len(original_order) != len(processed_order):
            print(f"The lengths are different. Original length: {len(original_order)}, Processed length: {len(processed_order)}")
        else:
            print("The items are the same but the order is different.")
    
def read_json_file(file_path):
    """
    Reads a JSON file and returns the data as a Python object.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict: The data read from the JSON file.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        JSONDecodeError: If the file is not a valid JSON file.

    """
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def save_json_file(output_path, data):
    """
    Save data as JSON to the specified output path.

    Parameters:
    - output_path (str): The path to save the JSON file.
    - data (dict): The data to be saved as JSON.

    Returns:
    - None
    """
    with open(output_path, 'w') as file:
        json.dump(data, file, indent=4)

def split_json_file(json_data):
    """
    Splits a JSON file into two separate JSON objects.

    Args:
        json_data (dict): The JSON data to be split.

    Returns:
        tuple: A tuple containing two JSON objects. The first object contains the first half of the original JSON data,
               and the second object contains the second half of the original JSON data.
    """
    json_data1, json_data2 = {}, {}
    mid_index = len(json_data) // 2
    
    for key, value in json_data.items():
        if len(json_data1) < mid_index:
            json_data1[key] = value
        else:
            json_data2[key] = value

    return json_data1, json_data2

def process_yael_data(df, remove_no_welcome=True):
    """
    Process the Yael data and convert it into a specific format.

    Args:
        json_data (DataFrame): The input JSON data.

    Returns:
        tuple: A tuple containing two dictionaries. The first dictionary contains the processed data
        with enough turns, and the second dictionary contains the data with not enough turns.
    """
    df_grouped = df.groupby('chat_id')

    #order = df.drop_duplicates(subset='chat_id').reset_index(drop=True)['chat_id']
    
    order = list(dict.fromkeys((df['chat_id'].tolist())))
    json_data = {}
    not_fit_for_annotation = {}
    
    counter = 0
    
    for group_name in order:  # #your data
        if counter == 0:
            print(f"first group name: {group_name}")
        group_df = df_grouped.get_group(group_name)

        counter += 1

        if counter % 500 == 0:
            pass
            #print(counter)

        welcome_message = 0
        group_df = group_df.reset_index(drop=True)
        dialog_dict = {"number_of_turns": 0, "annotator_id": None, "dialog": {}}
        welcome_message = True
        if remove_no_welcome:
            if str(group_df.iloc[0]['welcome_message']) != "nan":
                dialog_dict["number_of_turns"] += 1
                welcome_message = 1
                dialog_dict["dialog"][0] = {
                    "turn_num": 0,
                    "original_question": group_df.iloc[0]['welcome_message'],
                    "answer": "",
                }
            else:
                welcome_message = False
        

        turns_counter = 0

        for index in range(welcome_message, len(group_df), 2):
            dialog_dict["number_of_turns"] += 1
            row1 = group_df.iloc[index]

            if index + 1 < len(group_df):
                row2 = group_df.iloc[index + 1]
            else:
                row2 = None

            turn_num = int(index / 2) + welcome_message

            dialog_dict["dialog"][turn_num] = {
                "turn_num": int(index / 2) + welcome_message,
                "original_question": str(row1['message']),
                "answer": str(row2['message']) if row2 is not None else "None",
            }

            if index + welcome_message > 0:
                dialog_dict["dialog"][turn_num]["requires_rewrite"] = None
                dialog_dict["dialog"][turn_num]["enough_context"] = None
                dialog_dict["dialog"][turn_num]["needs_clarification"] = None
                turns_counter += 1
            
        if "is_english" in group_df.keys(): # if the column is in the dialog
            if not group_df.iloc[0]['is_english']:
                not_fit_for_annotation[group_name] = dialog_dict
                
        elif remove_no_welcome and not welcome_message: # if the no welcome message (intro) in the dialog
            not_fit_for_annotation[group_name] = dialog_dict
            
        elif turns_counter > 0: # if the column is not in the data
           
            json_data[group_name] = dialog_dict
            
        else:
            not_fit_for_annotation[group_name] = dialog_dict

    return json_data, not_fit_for_annotation

def count_dialogs_folder(input_path):
    """
    Count the number of dialogs in each CSV file in the specified folder.

    Args:
        input_path (str): The path to the folder containing the CSV files.

    Returns:
        None
    """
    def count(name, data):
        return f"file {name} has {len(data)} dialogs\n", len(data)
    counter = 0
    output = f"working on folder {input_path}, combined number of dialogs is"
    temp_output = f""
    for file_name in os.listdir(input_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(input_path, file_name)
            data = process_yael_data(pd.read_csv(file_path))[0]
            str, num = count(file_name, data)
            temp_output += str
            counter += num

    output = output + f" {counter}\n" + temp_output
    print(output)

def convert_yael_files_in_folder(input_path, output_path):
    """
    Convert Yael files in a folder.

    Args:
        input_path (str): The path to the folder containing the Yael files.
        output_path (str): The path to the folder where the converted files will be saved.

    Returns:
        None
    """
    batch_num = 1
    for file_name in os.listdir(input_path):
        if file_name.endswith('.json'):
            file_path = os.path.join(input_path, file_name)
            data = read_json_file(file_path)
            new_data = process_yael_data(data)
            output_file_path = os.path.join(output_path, (f'batch_{batch_num}_src.json'))
            # Write the modified data to the output file
            save_json_file(output_file_path, new_data)
            batch_num+=1

def combine_csv_files(input_path):
    """
    Combines the data from multiple CSV files in the given input path and converts and combine all of them to the new format.

    Args:
        input_path (str): The path to the directory containing the CSV files.

    Returns:
        dict: A dictionary containing the combined data from all the CSV files.
    """
    converted_data_all_files = {}
    not_fit_all_files = {}
    combined_original_data = pd.DataFrame()

    for file_name in os.listdir(input_path):
   

        if file_name.endswith('.csv'):
            file_path = os.path.join(input_path, file_name)
            origina_data = pd.read_csv(file_path)
            combined_original_data = pd.concat([combined_original_data, origina_data], ignore_index=True)
            converted_data_dialog, not_fit_dialog = process_yael_data(origina_data)
            check_if_ordered_stayed_the_same(origina_data, converted_data_dialog, not_fit_dialog)
            converted_data_all_files.update(converted_data_dialog)
            not_fit_all_files.update(not_fit_dialog)
            
    


    print(f"check order for all dialogs in all files:")
    check_if_ordered_stayed_the_same(combined_original_data, converted_data_all_files, not_fit_all_files)
    return converted_data_all_files,  not_fit_all_files 

def new_combine_csv_files(input_path):
    """
    Combines the data from multiple CSV files in the given input path and converts and combines all of them to the new format.

    Args:
        input_path (str): The path to the directory containing the CSV files.

    Returns:
        dict: A dictionary containing the combined data from all the CSV files.
    """
    combined_original_data = pd.DataFrame()

    # Combine all CSV files into one DataFrame
    for file_name in os.listdir(input_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(input_path, file_name)
            original_data = pd.read_csv(file_path)
            combined_original_data = pd.concat([combined_original_data, original_data], ignore_index=True)

    # Process the combined data
    converted_data_all_files, not_fit_all_files = process_yael_data(combined_original_data)

    # Check if the order stayed the same for the combined data
    check_if_ordered_stayed_the_same(combined_original_data, converted_data_all_files, not_fit_all_files)

    return converted_data_all_files, not_fit_all_files, combined_original_data

def create_test_file(new_data):
    """
    Create a test file from the given data.

    Args:
        new_data (dict): A dictionary containing the data to be used for creating the test file.

    Returns:
        tuple: A tuple containing the test file (dict) and the list of dialog keys used in the test file.
    """
    items = list(new_data.items())
    random.shuffle(items)
    shuffled_data = dict(items)

    test_file = {}
    dialog_test_key = []
    turn_counter = 0

    for key, value in shuffled_data.items():
        number_of_turns = value['number_of_turns']
        turn_counter += number_of_turns
        dialog_test_key.append(key)
        test_file[key] = value
        if turn_counter > 60:
            break

    for dialog_key, dialog_value in test_file.items():
        for turn_key, turn_value in dialog_value['dialog'].items():
            if turn_key != '0':
                turn_value['needs_clarification'] = None

    return test_file, dialog_test_key

def expand_test_file(test_file, json_data):
    
    test_file_keys = list(test_file.keys())
  
    items = list(json_data.items())
    random.shuffle(items)
    shuffled_data = dict(items)
    
    #remove existing keys
    for key in test_file_keys:
        if key in shuffled_data.keys():
            del shuffled_data[key]

    new_test_file = {}
    turn_counter = 0
    number = 5
    for key, value in shuffled_data.items():
        number_of_turns = value['number_of_turns']
        if number_of_turns < 5:
            continue
        else:
            number = random.randint(3, 15)
        turn_counter += number_of_turns
        new_test_file[key] = value
        if turn_counter > 60:
            break

    for dialog_key, dialog_value in new_test_file.items():
        for turn_key, turn_value in dialog_value['dialog'].items():
            if turn_key != '0':
                turn_value['needs_clarification'] = None
                
    test_file.update(new_test_file)

    return test_file, new_test_file

In [22]:
from db import *
test_file = read_json_file('datasets_3_6\\results\\agent_conv_test.json')
json_data, not_enough, combind = new_combine_csv_files("datasets_3_6/date_sorted_filtered_double")
new_test_file, test_dialog_to_add = expand_test_file(test_file, json_data)
save_json_file('datasets_3_6\\results\\dialog_to_add_1.json', test_dialog_to_add)


first group name: chat_9812e6bb-a78c-475b-ad5a-dacf1d7e4f7e
The order stayed the same


In [None]:
retrieve_json_template_by_file_id("agent")

In [7]:
new_test_file

{'chat_1cc622c8-4525-4c3e-964c-cff03832cfb4': {'number_of_turns': 2,
  'annotator_id': None,
  'dialog': {'0': {'turn_num': 0,
    'original_question': "Hello! I'm agent aibot-openvpn.\nI'm here to answer any questions you have, or maybe you'd rather choose a question others are asking me?",
    'answer': '',
    'needs_clarification': None},
   '1': {'turn_num': 1,
    'original_question': 'how do I install access server on ubuntu?',
    'answer': 'To install Access Server on Ubuntu, follow these steps:\n\n1. **Update your operating system**:\n   - Sign in to your Linux system on the console or via SSH and obtain root privileges.\n   - Run the following commands to install updates and set the correct time and date:\n     ```sh\n     apt update\n     apt upgrade\n     apt install tzdata\n     ```\n\n2. **Download and install Access Server**:\n   - Sign in to the [Access Server portal](https://myaccount.openvpn.com/signin/as).\n   - Click "Get Access Server".\n   - Select "Ubuntu LTS" f

In [None]:
json_file_proccesed, not_enough = combine_csv_files("datasets_3_6/date_sorted_filtered_double") 
json_file_proccesed1, not_enough1, combined_csv = new_combine_csv_files("datasets_3_6/date_sorted_filtered_double") #change the path to the folder containing the csv files

In [122]:
check_if_ordered_stayed_the_same(combined_csv, json_file_proccesed1, not_enough_dialog=not_enough1,  processed=False)

The order stayed the same


True

In [114]:
not_enough['chat_wo9s8ncc9n839nc8s93nc9p3nc_101']

{'number_of_turns': 1,
 'annotator_id': None,
 'dialog': {0: {'turn_num': 0,
   'original_question': 'Hi, Welcome to OneAI, we provide AI agents for your business.👋\n\nWhat would be the business objective of your Agent?',
   'answer': ''}}}

In [145]:

def find_keywords_in_json(json_data, keyword = []):
    """
    Find the specified keyword in the given JSON data.

    Args:
        json_data (dict): The JSON data to be searched.
        keyword (list): The list of keywords to be searched in the JSON data.

    Returns:
        dict: A dictionary containing the dialogs that contain the specified keyword.
    """
    keyword_data = {}
    for dialog_key, dialog_value in json_data.items():
        for turn_key, turn_value in dialog_value['dialog'].items():
            for kw in keyword:
                if kw.lower() in turn_value['original_question'].lower().split(' ') or kw in turn_value['answer'].lower().split(' ') :
                    print("---------------------------------------------------")
                    print(f"first utterance: {dialog_value['dialog'][0]['original_question']}")
                    print(f"keyword: {kw} in dialog: {dialog_key}")
                    print(f"original question: {turn_value['original_question']}")
                    print(f"answer: {turn_value['answer']}")
    return keyword_data


find_keywords_in_json(json_file_proccesed, ['pussy'])

---------------------------------------------------
first utterance: Hi, I'm OneAI's GPT! 👋

*Do you want to boost leads and sales with an AI Agent like me?*
keyword: pussy in dialog: chat_41130fdb-eb28-4cae-8ec8-36d2cb5e67c5
original question: aswser my fucking question pussy
answer: I'm here to help! Let me know how I can assist you with the OneAgent solution. What are your business needs and objectives?


{}

In [128]:
h = "hello man"
h.split(' ')

['hello', 'man']