In [None]:
# First, clone the FrenchMedMCQA repo.

In [7]:
import os, json

def _build_chat_example(item):
    """Turn one question record into a chat-style fine-tuning example.

    The user message is the question followed by one "<letter>: <text>" line
    per entry of ``item['answers']``; the assistant message is the sorted,
    comma-joined content of ``item['correct_answers']``.

    Args:
        item (dict): Record providing the 'question', 'answers' (a mapping,
            iterated with ``.items()``) and 'correct_answers' keys.

    Returns:
        dict: ``{"messages": [user_message, assistant_message]}``.

    Raises:
        KeyError: If one of the three expected keys is missing.
    """
    # Question first, then one line per candidate answer, in the mapping's order.
    prompt_lines = [f"{item['question']}"]
    prompt_lines.extend(
        f"{letter}: {answer_text}"
        for letter, answer_text in item['answers'].items()
    )

    return {
        "messages": [
            {
                "role": "user",
                # strip() drops any whitespace surrounding the assembled prompt
                "content": '\n'.join(prompt_lines).strip(),
            },
            {
                "role": "assistant",
                # Sorting makes the answer string independent of the source order
                "content": ','.join(sorted(item['correct_answers'])),
            },
        ]
    }


def convert_json_to_jsonl(input_json_files):
    """
    Converts multiple JSON files containing medical questions to JSONL format
    suitable for fine-tuning Mistral's API.

    For every input file ``<name>.json`` a sibling file
    ``<name>_transformed_2.jsonl`` is written, holding one JSON object per
    line. A progress line is printed for each file.

    Args:
        input_json_files (list of str, or a single str / os.PathLike):
            Paths to the input JSON files. Each file must contain a JSON
            array of question records. A single path is accepted as a
            convenience and treated as a one-element list.

    Raises:
        KeyError: If a record lacks 'question', 'answers' or 'correct_answers'.
        OSError: If an input file cannot be read or an output file written.
    """
    # A bare string is itself iterable; without this guard the loop below
    # would walk over its characters and try to open each one as a file.
    if isinstance(input_json_files, (str, os.PathLike)):
        input_json_files = [input_json_files]

    for input_json_file in input_json_files:
        # Output path: same location and stem, with '_transformed_2.jsonl' appended
        output_jsonl_file = f"{os.path.splitext(input_json_file)[0]}_transformed_2.jsonl"

        print(f"Processing {input_json_file} to {output_jsonl_file}...")

        # Load all question records from the JSON file
        with open(input_json_file, 'r', encoding='utf-8') as f:
            records = json.load(f)

        # Convert everything before opening the output, so a malformed record
        # never leaves a truncated or empty output file behind.
        fine_tune_data = [_build_chat_example(item) for item in records]

        # One JSON object per line; ensure_ascii=False keeps accented
        # characters readable instead of \u-escaping them.
        with open(output_jsonl_file, 'w', encoding='utf-8') as out_f:
            out_f.writelines(
                json.dumps(entry, ensure_ascii=False) + '\n'
                for entry in fine_tune_data
            )

# Convert the dev, test and train splits; paths are relative to the
# directory that contains the cloned FrenchMedMCQA repo.
file_list = [
    'FrenchMedMCQA/corpus/dev.json',
    'FrenchMedMCQA/corpus/test.json',
    'FrenchMedMCQA/corpus/train.json',
]
convert_json_to_jsonl(file_list)

Processing FrenchMedMCQA/corpus/dev.json to FrenchMedMCQA/corpus/dev_transformed_2.jsonl...
Processing FrenchMedMCQA/corpus/test.json to FrenchMedMCQA/corpus/test_transformed_2.jsonl...
Processing FrenchMedMCQA/corpus/train.json to FrenchMedMCQA/corpus/train_transformed_2.jsonl...
