# Functions needed to create the datasets from the raw generated data
## Needed imports

In [3]:
import json
import os
import random
import re

from copy import deepcopy
from rouge import Rouge

## Functions

In [11]:
# Functions needed to build the dataset
def list_files(parent_directory: str = ".", extension: str = ".txt") -> list:
    """Recursively collect the paths of all files ending with a given extension.

    Args:
        parent_directory (str, optional): directory at which the walk starts. Defaults to ".".
        extension (str, optional): suffix a file name must end with to be kept. Defaults to ".txt".

    Returns:
        list: paths (walked directory joined with the file name) of the matching
            files, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(directory, name)
        for directory, _, names in os.walk(parent_directory)
        for name in names
        if name.endswith(extension)
    ]


def validate_line(line: str) -> bool:
    """Check whether a line holds valid, comma-separated JSON values.

    The line is wrapped in square brackets and the first comma that directly
    precedes a closing bracket is dropped, so ``'{"a": 1},'`` is checked as the
    array ``[{"a": 1}]``. An empty line is accepted too, since it becomes ``[]``.

    Args:
        line (str): line of the txt file

    Returns:
        bool: True if the line is a valid json, False otherwise
    """
    # Wrap in brackets, then drop the comma left dangling before the closing one
    candidate = re.sub(r"\,\s*\]", r"]", f"[{line}]", count=1)

    try:
        json.loads(candidate)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly
        # nested input. Anything else (e.g. KeyboardInterrupt) must propagate.
        return False
    return True


def _parse_json_line(line: str):
    """Parse one line of comma-separated JSON values, or return None if it is invalid."""
    # Wrap in brackets, then drop the comma left dangling before the closing one
    candidate = re.sub(r"\,\s*\]", r"]", f"[{line}]", count=1)
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        return None


def extract_json(filename: str) -> list:
    """Extract the json objects from a raw generation txt file.

    Everything before the first ``[`` is discarded. The ``</s>`` tag, every
    ``[`` and ``]`` (including those inside string values) and every literal
    backslash-n character pair are then removed. Each remaining line is parsed
    on its own and lines that are not valid JSON are dropped.

    Parsing line by line means one odd line (for instance an element without
    its trailing comma, or a lone ``,``) no longer makes the whole file fail.

    Args:
        filename (str): txt filename

    Returns:
        list: list of json objects, in file order
    """
    with open(filename, 'r') as f:
        text = f.read()

    # Skip whatever precedes the opening bracket of the generated array
    text = text[max(text.find('['), 0):]

    # Order matters: it mirrors the successive replacements of the original code.
    # "\\n" is the two-character sequence backslash + n, not a line break.
    for token in ("</s>", "[", "]", "\\n"):
        text = text.replace(token, "")

    # Remove malformed json elements: stray words between two objects
    text = re.sub(r"(\}\,\s*\w+\s*\{)", r"},\n{", text)

    # Only keep the valid json elements
    elements = []
    for line in text.split("\n"):
        parsed = _parse_json_line(line)
        if parsed is not None:
            elements.extend(parsed)
    return elements


def natural_sort(list_to_sort: list) -> list:
    """Sort strings in natural order, so that "file2" comes before "file10".

    Runs of ASCII digits are compared as integers and everything else is
    compared case-insensitively. The input is left untouched; any iterable of
    strings is accepted.

    Args:
        list_to_sort (list): list of strings to sort

    Returns:
        list: new sorted list
    """
    def alphanumeric_key(key: str) -> list:
        # With one capture group, re.split alternates text / digits / text ...,
        # so odd positions are exactly the ASCII digit runs. Deciding by position
        # instead of str.isdigit() keeps characters such as "²" (isdigit() is
        # True but int() fails) on the text side.
        chunks = re.split('([0-9]+)', key)
        return [
            int(chunk) if index % 2 else chunk.lower()
            for index, chunk in enumerate(chunks)
        ]

    return sorted(list_to_sort, key=alphanumeric_key)


def extract_dataset(dataset_files, tools):
    """Build a scored dataset by pairing raw generation files with tools.

    The files are put in natural order and parsed with ``extract_json``. The
    i-th file is paired with the i-th tool; ``zip`` silently ignores surplus
    files or tools. Every generation gets a ``"rouge_score"`` entry comparing
    its ``"user_request"`` to the ``"user_request"`` of the tool's first use case.

    Neither ``tools`` nor its dicts are modified: each returned entry is an
    independent copy of the tool with an added ``"dataset"`` key.

    Args:
        dataset_files (list): paths of the raw generation files
        tools (list): tool dicts, each with a ``'use_cases'`` list whose first
            item has a ``'user_request'``

    Returns:
        list: one dict per (file, tool) pair: the tool's fields plus
            ``"dataset"``, the list of scored generations
    """
    parsed_files = [extract_json(file) for file in natural_sort(dataset_files)]

    new_dataset = []
    for data, tool in zip(parsed_files, tools):
        reference = tool['use_cases'][0]['user_request']

        scored_generations = [
            {**generation, "rouge_score": rouge_score(generation['user_request'], reference)}
            for generation in data
        ]

        # Copy instead of updating `tool` in place so the caller's tools stay clean
        new_entry = deepcopy(tool)
        new_entry["dataset"] = scored_generations
        new_dataset.append(new_entry)

    return new_dataset


def make_json_file(data, filename: str) -> None:
    """Serialize data as JSON, indented by two spaces, into a file.

    Args:
        data (list): data to write in the json file
        filename (str): name of the json file; it is opened in write mode
    """
    with open(filename, 'w') as json_out:
        json.dump(obj=data, fp=json_out, indent=2)

# Functions needed to evaluate the dataset
# Single scorer instance shared by every call to rouge_score
rouge_scorer = Rouge()


def rouge_score(generation, reference) -> float:
    """Score a generated text against a reference text with ROUGE-L.

    Args:
        generation (str): generated text, passed as the hypothesis
        reference (str): reference text

    Returns:
        float: the "f" value of the "rouge-l" entry of the first result
    """
    all_scores = rouge_scorer.get_scores(hyps=generation, refs=reference)
    rouge_l = all_scores[0]["rouge-l"]
    return rouge_l["f"]


def print_dataset_mean_rouge(dataset):
    """Print the mean rouge score of each class and the total mean rouge score.

    The total is the mean over all generations: the sum of every score divided
    by the number of generations. A class without generations is reported with
    an average of 0.0 and does not contribute to the total; an empty dataset
    yields a total of 0.0.

    Args:
        dataset (list): dataset to evaluate; each entry has a 'tool_name' and a
            'dataset' list of generations carrying a 'rouge_score'
    """
    total_rouge_score = 0
    total_number_of_generations = 0

    for data in dataset:
        scores = [generation['rouge_score'] for generation in data['dataset']]
        class_rouge_score = sum(scores)
        average_rouge_score = class_rouge_score / len(scores) if scores else 0.0

        # Accumulate raw sums (not per-class averages) so that the final
        # division by the number of generations is a true mean.
        total_rouge_score += class_rouge_score
        total_number_of_generations += len(scores)

        print("Class:", data['tool_name'], "\taverage ROUGE score:", average_rouge_score)

    if total_number_of_generations:
        total_average_rouge_score = total_rouge_score / total_number_of_generations
    else:
        total_average_rouge_score = 0.0
    print("Total average ROUGE score:", total_average_rouge_score)


def print_dataset_size(dataset):
    """Print the size of each class and the total size of the dataset.

    Prints, in order: the total number of elements, a dict mapping each tool
    name to its number of elements, then the mean and the median class size.
    Mean and median are both computed over every entry of ``dataset``. For an
    empty dataset they are undefined and therefore not printed.

    Args:
        dataset (list): dataset to evaluate; each entry has a 'tool_name' and a
            'dataset' list
    """
    total_per_class = {data['tool_name']: len(data['dataset']) for data in dataset}
    class_sizes = sorted(len(data['dataset']) for data in dataset)
    total_number_of_elements = sum(class_sizes)

    print("Total number of elements:", total_number_of_elements)
    print(total_per_class)

    if not class_sizes:
        return

    # Compute mean
    mean = total_number_of_elements / len(class_sizes)
    print("Mean:", mean)

    # Compute median: middle value, or mean of the two middle values
    middle = len(class_sizes) // 2
    if len(class_sizes) % 2 == 0:
        median = (class_sizes[middle] + class_sizes[middle - 1]) / 2
    else:
        median = class_sizes[middle]

    print("Median:", median)

# Functions needed to automatically describe the dataset
def lower_case_first_letter(string: str) -> str:
    """Lower case the first letter of a string.

    An empty string is returned unchanged.

    Args:
        string (str): string to lower case

    Returns:
        str: string with the first letter lower cased
    """
    # Slicing (rather than indexing) keeps the empty string from raising IndexError
    return string[:1].lower() + string[1:]


def create_an_enumeration(list_of_strings: list) -> str:
    """Render a list of strings as a numbered list, one item per line.

    Args:
        list_of_strings (list): list of strings to enumerate

    Returns:
        str: the items prefixed with "1. ", "2. ", ... and joined by newlines
    """
    numbered_lines = []
    for position, item in enumerate(list_of_strings, start=1):
        numbered_lines.append(f"{position}. {item}")
    return "\n".join(numbered_lines)


def randomly_select_n_generations_for_a_specific_tool(tool, n: int):
    """Randomly select n generations for a specific tool.

    The tool's own ``'dataset'`` list is neither reordered nor modified: a
    copy is shuffled and the selected generations are deep-copied.

    Args:
        tool (dict): tool to select the generations from
        n (int): number of generations to select

    Returns:
        list: list of n generations (fewer if the tool has less than n)
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the caller's dataset as a side effect.
    shuffled_generations = list(tool['dataset'])
    random.shuffle(shuffled_generations)
    return deepcopy(shuffled_generations[:n])


def randomly_select_n_generations(dataset, n: int):
    """Print n randomly selected user requests for each tool in the dataset.

    For every tool, a sentence built from its name and description is printed,
    followed by a numbered list of the selected user requests. Nothing is
    returned.

    Args:
        dataset (list): dataset to select the generations from
        n (int): number of generations to select for each tool
    """
    for tool in dataset:
        generations = randomly_select_n_generations_for_a_specific_tool(tool, n)
        list_of_generations = [generation['user_request'] for generation in generations]
        # No quote after the description: the sentence matches the one
        # written by make_description.
        print(f"""The tool '{tool['tool_name']}' should {lower_case_first_letter(tool['description'])}
{create_an_enumeration(list_of_generations)}""")


def make_description(dataset_a, dataset_b, n: int = 5, filename: str = "form.txt"):
    """Write a side-by-side description of two datasets to a text file.

    For each pair of tools (taken in order from both datasets), the text gives
    the tool's name and description as found in dataset a, then a numbered list
    of randomly selected user requests from dataset a and one from dataset b.

    Args:
        dataset_a (list): dataset a
        dataset_b (list): dataset b
        n (int, optional): number of generations to select for each tool. Defaults to 5.
        filename (str, optional): name of the file to write the description in. Defaults to "form.txt".
    """
    def sample_enumeration(tool) -> str:
        # Draw n generations for one tool and number their user requests
        picked = randomly_select_n_generations_for_a_specific_tool(tool, n)
        return create_an_enumeration([entry['user_request'] for entry in picked])

    sections = []
    for tool_a, tool_b in zip(dataset_a, dataset_b):
        # Dataset a is sampled before dataset b, which fixes the order in
        # which the random generator is consumed.
        requests_a = sample_enumeration(tool_a)
        requests_b = sample_enumeration(tool_b)

        name = tool_a['tool_name']
        purpose = lower_case_first_letter(tool_a['description'])

        sections.append(f"""The tool '{name}' should {purpose}
In the dataset a, the following user requests were generated for the tool '{name}':

{requests_a}

In the dataset b, the following user requests were generated for the tool '{name}':

{requests_b}
""")

    with open(filename, 'w') as form:
        form.write("\n".join(sections))

## Generate the datasets from the raw data

In [8]:
TOOLS_FILE = "tools/tools.json"
DATASET_FILES_0 = "datasets/results_prompts_0"
DATASET_FILES_1 = "datasets/results_prompts_1"

# Load the tool definitions the raw generation files are paired with
with open(TOOLS_FILE, 'r') as json_file:
    tools = json.load(json_file)

# Every entry of each results directory is treated as a raw generation file
dataset_files_0 = [f"{DATASET_FILES_0}/{file}" for file in os.listdir(DATASET_FILES_0)]
dataset_files_1 = [f"{DATASET_FILES_1}/{file}" for file in os.listdir(DATASET_FILES_1)]

dataset_0 = extract_dataset(dataset_files_0, tools)
dataset_1 = extract_dataset(dataset_files_1, tools)

# Saved next to the results directories, as "<directory>.json"
make_json_file(dataset_0, f"{DATASET_FILES_0}.json")
make_json_file(dataset_1, f"{DATASET_FILES_1}.json")

# Sanity check that the two prompts did not produce identical datasets.
# The datasets built above are compared directly instead of re-parsing and
# re-scoring every file a second time.
print(dataset_1 == dataset_0)


False


## Study the datasets with ROUGE (scores are stored in the datasets themselves)
### Test Rouge

In [7]:
generation = "to make people trustworthy you need to trust them"
reference = "the way to make people trustworthy is to trust them"

# Show the reference first, then the generation, one per line
print(reference, generation, sep="\n")

# The "=" specifier echoes the expression text in front of its value
print(f"{rouge_score(generation, reference) = }")

the way to make people trustworthy is to trust them
to make people trustworthy you need to trust them
rouge_score(generation, reference) = 0.7058823479584776


## Study the dataset
1. Get a broad and granular view of the datasets' ROUGE scores
2. Compute the size of the datasets (number of elements, number of elements per class and statistical metrics)

### Compare the ROUGE scores of the datasets

In [9]:
# Per-tool and overall mean ROUGE score of dataset_0 (built from results_prompts_0)
print_dataset_mean_rouge(dataset_0)

Class: detect_object 	average ROUGE score: 0.38469106173714723
Class: enumerate_objects 	average ROUGE score: 0.3301340027102317
Class: navigation 	average ROUGE score: 0.516528920634861
Class: position 	average ROUGE score: 0.010416666471354169
Class: add_face 	average ROUGE score: 0.05291005201058202
Class: remove_face 	average ROUGE score: 0.06144781029461283
Class: look_for_face 	average ROUGE score: 0.010101009898989903
Class: enumerate_individuals 	average ROUGE score: 0.013935339704892384
Class: age_estimation 	average ROUGE score: 0.15151514904109012
Class: gender_estimation 	average ROUGE score: 0.06593406501567235
Class: emotion_estimation 	average ROUGE score: 0.09256198094665673
Class: colors 	average ROUGE score: 0.32435640436759766
Class: object_color 	average ROUGE score: 0.4510207879933894
Class: ocr 	average ROUGE score: 0.2859504082268971
Class: ocr_objects 	average ROUGE score: 0.2684777295577613
Class: money 	average ROUGE score: 0.17371845877603714
Class: environme

In [9]:
# Per-tool and overall mean ROUGE score of dataset_1 (built from results_prompts_1)
print_dataset_mean_rouge(dataset_1)

Class: detect_object 	average ROUGE score: 0.8560846510871253
Class: enumerate_objects 	average ROUGE score: 0.46048647825218103
Class: navigation 	average ROUGE score: 0.5571747043697814
Class: position 	average ROUGE score: 0.03571428526785715
Class: add_face 	average ROUGE score: 0.4015567715959727
Class: remove_face 	average ROUGE score: 0.4032633982902994
Class: look_for_face 	average ROUGE score: 0.37996031254269425
Class: enumerate_individuals 	average ROUGE score: 0.037037036851851855
Class: age_estimation 	average ROUGE score: 0.36381673412765986
Class: gender_estimation 	average ROUGE score: 0.15120772778003222
Class: emotion_estimation 	average ROUGE score: 0.10148336264821496
Class: colors 	average ROUGE score: 0.60030120898732
Class: object_color 	average ROUGE score: 0.7677018583550037
Class: ocr 	average ROUGE score: 0.7867768545116455
Class: ocr_objects 	average ROUGE score: 0.3409951110237474
Class: money 	average ROUGE score: 0.1476967451334499
Class: environment_desc

### Compute the size of the dataset

In [12]:
# Total size, per-tool size, mean and median class size: dataset_0 first, then dataset_1
print_dataset_size(dataset_0)
print_dataset_size(dataset_1)

Total number of elements: 510
{'detect_object': 22, 'enumerate_objects': 22, 'navigation': 22, 'position': 24, 'add_face': 21, 'remove_face': 66, 'look_for_face': 22, 'enumerate_individuals': 23, 'age_estimation': 58, 'gender_estimation': 23, 'emotion_estimation': 55, 'colors': 21, 'object_color': 22, 'ocr': 22, 'ocr_objects': 22, 'money': 21, 'environment_description': 22, 'environment_question': 22}
Mean: 28.333333333333332
Median: 22.0
Total number of elements: 455
{'detect_object': 27, 'enumerate_objects': 22, 'navigation': 23, 'position': 21, 'add_face': 24, 'remove_face': 22, 'look_for_face': 24, 'enumerate_individuals': 27, 'age_estimation': 22, 'gender_estimation': 69, 'emotion_estimation': 22, 'colors': 22, 'object_color': 23, 'ocr': 22, 'ocr_objects': 21, 'money': 21, 'environment_description': 22, 'environment_question': 21}
Mean: 25.27777777777778
Median: 22.0


## Build descriptions of the datasets based on the tools descriptions and randomly selected generated user requests

In [205]:
# Uncomment to make the selection of user requests reproducible:
# random.seed(42)
# Seeding with None draws the seed from the system time or OS randomness,
# so every run selects different user requests.
random.seed(None)
# Uses the defaults of make_description: 5 requests per tool, written to "form.txt"
make_description(dataset_0, dataset_1)