In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from collections import Counter

In [5]:
# Load the raw prompt samples. The encoding is pinned because open() otherwise
# falls back to the platform's locale encoding, while JSON files are
# conventionally UTF-8.
with open("../prompts.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

In [7]:
# Bucket every sample by question type:
#   TFs       - has non-null choices and every choice is "true"/"false" (any case)
#   MCQs      - has non-null choices of any other kind
#   SAQs      - no usable choices, but a non-null answer
#   what_else - neither choices nor answer, and exactly the keys {'question', 'sol_id'}
TFs, MCQs, SAQs, what_else = [], [], [], []

for sample in samples:
    choices = sample.get('choices')
    if choices is not None:
        is_true_false = all(choice.lower() in ['true', 'false'] for choice in choices)
        (TFs if is_true_false else MCQs).append(sample)
    elif sample.get('answer') is not None:
        SAQs.append(sample)
    elif set(sample.keys()) == {'question', 'sol_id'}:
        what_else.append(sample)

# A sample that fits none of the buckets is not stored anywhere, so the bucket
# sizes only add up when every sample was classified.
assert len(MCQs) + len(TFs) + len(SAQs) + len(what_else) == len(samples), "Something is wrong with the classification of samples"

In [9]:
# Sizes of the four buckets, in the order TF, MCQ, SAQ, unclassified.
print(*map(len, (TFs, MCQs, SAQs, what_else)))

1 64 35 0


## SAQ

In [10]:
# Keep only the SAQ samples that carry a non-null 'explanation'.
SAQs_with_explanation = [
    sample for sample in SAQs
    if sample.get('explanation') is not None
]

print("Number of SAQ Type question with explanation", len(SAQs_with_explanation))

Number of SAQ Type question with explanation 1


In [11]:
# Flatten list-valued SAQ answers into a single string, in place.
#   flat list       ['a', 'b']           -> "a, b"
#   list of lists   [['a', 'b'], ['c']]  -> "a, b. c"
# Every item goes through str() so non-string answers (e.g. numbers) do not make
# join() raise, and a bare item sitting next to sub-lists is kept whole instead
# of being split into its characters.
count = 0

for sample in SAQs:
    answer = sample['answer']
    if not isinstance(answer, list):
        continue

    count += 1
    if any(isinstance(part, list) for part in answer):
        # Nested answer: join each sub-list with ", " and the groups with ". ".
        sample['answer'] = '. '.join(
            ', '.join(str(item) for item in part) if isinstance(part, list) else str(part)
            for part in answer
        )
    else:
        # Flat answer list.
        sample['answer'] = ', '.join(str(part) for part in answer)

print("Number of solutions with answer in a list format: ", count)

Number of solutions with answer in a list format:  1


In [20]:
# Binds a second name to the same list object as SAQs; no copy is made.
SAQs_all = SAQs

## MCQ

In [12]:
def filter_sample_with_explanation(samples):
    """
    Partition samples by whether they carry an explanation.

    A sample counts as "with explanation" when it has an 'explanation' key
    whose value is not None. The samples themselves are neither copied nor
    modified, and the input order is preserved inside each partition.

    Returns a tuple (samples_with_explanation, samples_without_explanation).
    """
    explained, unexplained = [], []

    for entry in samples:
        bucket = explained if entry.get('explanation') is not None else unexplained
        bucket.append(entry)

    return explained, unexplained

In [14]:
# Split the MCQ samples by whether they carry a non-null explanation and
# print the size of each partition.
MCQs_with_explanation, MCQs_without_explanation = filter_sample_with_explanation(MCQs)
print(len(MCQs_with_explanation), len(MCQs_without_explanation))

20 44


In [15]:
# Rewrite every MCQ in place:
#   question <- original question followed by the list of choices
#   answer   <- "Correct answer is: ..." / "Correct answers are: ...", plus
#               ", Explanation: ..." for the samples that have one
# Both partitions go through the same loop so the two formats cannot drift apart.
# NOTE: the samples are mutated, so running this cell a second time would append
# the choices (and the answer prefix) again.
for group, has_explanation in (
    (MCQs_with_explanation, True),
    (MCQs_without_explanation, False),
):
    for sample in group:
        answer = sample['answer']
        if isinstance(answer, list):  # several correct answers
            # str() keeps join() from raising on non-string answers.
            assistant_response = "Correct answers are: " + ", ".join(str(item) for item in answer)
        else:
            assistant_response = "Correct answer is: " + str(answer)

        if has_explanation:
            assistant_response = f"{assistant_response}, Explanation: {sample['explanation']}"

        sample['question'] = sample['question'] + "\nchoices:\n" + '\n'.join(sample['choices'])
        sample['answer'] = assistant_response

In [17]:
# Recombine both MCQ partitions into one new list (explained samples first).
MCQs_all = MCQs_with_explanation + MCQs_without_explanation

## TF

In [19]:
# Split the True/False samples by whether they carry a non-null explanation,
# then display the size of each partition.
TFs_with_explanation, TFs_without_explanation = filter_sample_with_explanation(TFs)
len(TFs_with_explanation), len(TFs_without_explanation)

(0, 1)

In [21]:
# Keep both partitions: taking only the samples without an explanation would
# silently drop every True/False sample that has one.
TFs_all = TFs_with_explanation + TFs_without_explanation

## Save New Prompt

In [22]:
# Combine all question types into one list. The trailing expression displays the
# total, which a bare assignment would not do on a fresh run.
prompts_all = SAQs_all + TFs_all + MCQs_all
len(prompts_all)

100

In [26]:
# Drop the 'explanation' and 'choices' fields from every prompt, when present.
for prompt in prompts_all:
    prompt.pop("explanation", None)
    prompt.pop("choices", None)

In [27]:
# Write the combined prompts to disk as JSON indented by four spaces.
with open('../dataset/prompts_processed.json', mode='w') as out_file:
    json.dump(prompts_all, fp=out_file, indent=4)

## Add guid

In [2]:
import json

In [3]:
# Read the processed prompts back from disk; this rebinds `samples`.
with open("../dataset/prompts_processed.json") as prompts_file:
    samples = json.load(prompts_file)

In [4]:
# Load the results file. The encoding is pinned because open() otherwise uses
# the platform's locale encoding, while JSON files are conventionally UTF-8.
with open("../dataset/prompts_processed_result.json", "r", encoding="utf-8") as g:
    results = json.load(g)

In [6]:
# Copy each sample's guid onto the result at the same position.
guids = [sample['guid'] for sample in samples]

# The pairing is purely positional, so a length mismatch means the two files do
# not line up: index-based pairing would raise IndexError when there are more
# results than samples and leave results silently half-filled in the opposite case.
assert len(results) == len(guids), (
    f"Cannot pair results with samples by position: "
    f"{len(results)} results vs {len(guids)} samples"
)

for result, guid in zip(results, guids):
    result['guid'] = guid

In [7]:
# Write the guid-tagged results to disk as JSON indented by four spaces.
with open('../dataset/prompts_processed_result_guid.json', mode='w') as out_file:
    json.dump(results, fp=out_file, indent=4)

In [5]:
import json

In [8]:
def process_prompt_data(prompts_r_path: str, prompts_w_path: str):
    """Turn raw prompt samples into question/answer pairs and save them as JSON.

    Steps:
      1. Load the list of samples from ``prompts_r_path``.
      2. Drop every sample whose ``answer`` is an empty list and print how
         many were dropped.
      3. Classify the remaining samples:
           * TF   - non-null ``choices`` that are all "true"/"false" (any case)
           * MCQ  - any other non-null ``choices``
           * SAQ  - no usable ``choices`` but a non-null ``answer``
           * other - exactly the keys ``{'question', 'sol_id'}``
      4. MCQ and TF: append the choices to the question text and rewrite the
         answer as "Correct answer is: ..." / "Correct answers are: ...",
         followed by ", Explanation: ..." when the sample has a non-null
         explanation.
      5. SAQ: join a multi-item answer list with spaces; unwrap a one-item list.
      6. Remove the ``explanation`` and ``choices`` keys and write
         MCQ + TF + SAQ samples (in that order) to ``prompts_w_path``.
         Samples in the "other" bucket are not written.

    Args:
        prompts_r_path: path of the JSON file holding the raw samples.
        prompts_w_path: path of the JSON file to write.

    Returns:
        None. The result is the file written to ``prompts_w_path``.

    Raises:
        AssertionError: if some sample fits none of the four buckets.
    """
    def filter_sample_with_explanation(prompts):
        """Split ``prompts`` into (with explanation, without explanation)."""
        samples_with_explanation = []
        samples_without_explanation = []
        for prompt in prompts:
            if prompt.get('explanation') is not None:
                samples_with_explanation.append(prompt)
            else:
                samples_without_explanation.append(prompt)
        return samples_with_explanation, samples_without_explanation

    def process_explanation(prompts, expl_flag=False):
        """Rewrite question/answer of choice-based prompts in place.

        ``expl_flag`` selects whether ", Explanation: ..." is appended to the
        answer. Returns the same list it was given.
        """
        for prompt in prompts:
            answer = prompt['answer']
            if isinstance(answer, list) and len(answer) > 1:  # several correct answers
                # str() keeps join() from raising on non-string answers.
                assistant_response = "Correct answers are: " + ", ".join(str(item) for item in answer)
            else:
                if isinstance(answer, list) and len(answer) == 1:
                    # Unwrap so the text reads "A" rather than "['A']".
                    answer = answer[0]
                assistant_response = "Correct answer is: " + str(answer)

            if expl_flag:
                assistant_response = f"{assistant_response}, Explanation: {prompt['explanation']}"

            prompt['question'] = prompt['question'] + "\nchoices:\n" + '\n'.join(prompt['choices'])
            prompt['answer'] = assistant_response
        return prompts

    with open(prompts_r_path, "r", encoding="utf-8") as f:
        samples = json.load(f)

    total_length = len(samples)

    # Build a filtered list instead of calling remove() while iterating: removing
    # during iteration shifts the remaining items and skips the one that follows
    # each removed sample. .get() lets answerless samples reach the classifier.
    samples = [
        sample for sample in samples
        if not (isinstance(sample.get('answer'), list) and not sample['answer'])
    ]
    empty_count = total_length - len(samples)
    print(f"Inspected {empty_count} samples with empty answer. Remaining {len(samples)} out of {total_length}.")

    TFs, MCQs, SAQs, what_else = [], [], [], []
    for sample in samples:
        choices = sample.get('choices')
        if choices is not None:
            if all(choice.lower() in ['true', 'false'] for choice in choices):
                TFs.append(sample)  # every choice is True or False
            else:
                MCQs.append(sample)  # arbitrary multiple-choice options
        elif sample.get('answer') is not None:
            SAQs.append(sample)
        elif set(sample.keys()) == {'question', 'sol_id'}:
            # neither choices nor answer
            what_else.append(sample)

    assert len(MCQs) + len(TFs) + len(SAQs) + len(what_else) == len(
        samples), "Sample(s) with wrong form exist(s). Please check the original dataset..."

    # MCQ - each partition is formatted unconditionally; an empty list is a no-op.
    MCQs_with_explanation, MCQs_without_explanation = filter_sample_with_explanation(MCQs)
    MCQs_with_explanation = process_explanation(MCQs_with_explanation, expl_flag=True)
    MCQs_without_explanation = process_explanation(MCQs_without_explanation, expl_flag=False)
    MCQs_all = MCQs_with_explanation + MCQs_without_explanation

    # TF
    TFs_with_explanation, TFs_without_explanation = filter_sample_with_explanation(TFs)
    TFs_with_explanation = process_explanation(TFs_with_explanation, expl_flag=True)
    TFs_without_explanation = process_explanation(TFs_without_explanation, expl_flag=False)
    TFs_all = TFs_with_explanation + TFs_without_explanation

    # SAQ - empty answer lists were removed above, so index 0 always exists here.
    for sample in SAQs:
        if isinstance(sample['answer'], list):
            if len(sample['answer']) > 1:
                sample['answer'] = ' '.join(sample['answer'])
            else:
                sample['answer'] = sample['answer'][0]

    # Save
    samples_all = MCQs_all + TFs_all + SAQs
    for sample in samples_all:
        sample.pop("explanation", None)
        sample.pop("choices", None)

    with open(prompts_w_path, 'w', encoding="utf-8") as file:
        json.dump(samples_all, file, indent=4)

In [10]:
# Run the full pipeline on the raw prompts and write the result to a separate test file.
process_prompt_data('../prompts.json', '../dataset/prompts_processed_test.json')

Inspected 2 samples with empty answer. Remaining 98 out of 100.
