# Converting Json to Jsonl for Fine-Tune the Model


### Install the necesarry libraries. 

In [154]:
pip install openai

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [155]:
pip install openai[datalib]

zsh:1: no matches found: openai[datalib]
Note: you may need to restart the kernel to use updated packages.


In [156]:
pip install urllib3==1.26.6 

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [157]:
pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [158]:
pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


### Import the libraries and enviornment file to gain access to the Open API Key
#### The key can be generated here: https://platform.openai.com/account/api-keys

In [159]:
import os
from openai import OpenAI

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

### Authenticate to the API using the API Key
#### Pull from environment variables or use openai.api_key = ("your_key_here") to hardcode the key

In [160]:
client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY']  
)

### Helper Functions

In [162]:
import json
from collections import defaultdict

# Updated function to handle multi-turn chat examples and format them in the desired JSONL format
def json_to_chat_jsonl(input_file, output_file):
    """
    Converts a JSON file to JSONL format, ensuring multi-turn chat examples
    with weight keys are properly formatted.
    """
    try:
        # Open and load the input JSON file
        with open(input_file, 'r') as f:
            data = json.load(f)
        
        # Open the output file for writing in JSONL format
        with open(output_file, 'w') as outfile:
            for example in data:
                # Process each example and write to JSONL
                json.dump(example, outfile)
                outfile.write('\n')
        print(f"Converted JSON to JSONL format successfully. Output file: {output_file}")
    
    except Exception as e:
        print(f"Error processing file: {e}")


# Validation function for the dataset structure
def validate_chat_format(dataset):
    """
    Validates the structure of chat examples for correctness.
    """
    format_errors = defaultdict(int)

    for example in dataset:
        if not isinstance(example, dict):
            format_errors["example_not_dict"] += 1
            continue

        messages = example.get("messages", None)
        if not messages:
            format_errors["missing_messages"] += 1
            continue
        
        for message in messages:
            if not isinstance(message, dict):
                format_errors["message_not_dict"] += 1
                continue

            if "role" not in message or "content" not in message:
                format_errors["missing_role_or_content"] += 1

            if message.get("role") not in {"system", "user", "assistant"}:
                format_errors["invalid_role"] += 1

            if "weight" in message and message["role"] != "assistant":
                format_errors["invalid_weight"] += 1

    if format_errors:
        print("Validation errors found:")
        for error_type, count in format_errors.items():
            print(f"{error_type}: {count}")
    else:
        print("All examples are valid.")

# Example usage
# Input: 'input.json' containing chat examples
# Output: 'output.jsonl' in the desired JSONL format
input_file = 'custom_support.json'
output_file = 'output.jsonl'

json_to_chat_jsonl(input_file, output_file)

# Load the file again for validation if needed
with open(input_file, 'r') as f:
    dataset = json.load(f)

validate_chat_format(dataset)


Converted JSON to JSONL format successfully. Output file: output.jsonl
All examples are valid.


### Convert JSON to JSONL

In [163]:
json_to_chat_jsonl('custom_support.json', 'output.jsonl')

Converted JSON to JSONL format successfully. Output file: output.jsonl


### Check File Format

https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [164]:
from collections import defaultdict
import json

def check_chat_format(dataset):
    """
    Validates the structure of multi-turn chat examples.
    Ensures messages contain valid roles, keys, and weights where applicable.
    """
    format_errors = defaultdict(int)

    for idx, example in enumerate(dataset):
        # Check if the example is a dictionary
        if not isinstance(example, dict):
            format_errors["example_not_dict"] += 1
            continue

        # Ensure "messages" key exists and is a list
        messages = example.get("messages", None)
        if not messages or not isinstance(messages, list):
            format_errors["missing_or_invalid_messages"] += 1
            continue

        # Check messages within the example
        for message in messages:
            if not isinstance(message, dict):
                format_errors["message_not_dict"] += 1
                continue

            # Validate "role" and "content"
            if "role" not in message or message["role"] not in {"system", "user", "assistant"}:
                format_errors["invalid_or_missing_role"] += 1

            if "content" not in message or not isinstance(message["content"], str):
                format_errors["invalid_or_missing_content"] += 1

            # Check for unrecognized keys
            valid_keys = {"role", "content", "weight"}
            unrecognized_keys = set(message.keys()) - valid_keys
            if unrecognized_keys:
                format_errors["unrecognized_keys"] += 1

            # Validate "weight" key if present
            if "weight" in message:
                if message["role"] != "assistant" or not isinstance(message["weight"], int) or message["weight"] not in {0, 1}:
                    format_errors["invalid_weight"] += 1

        # Ensure at least one "assistant" message exists
        if not any(msg.get("role") == "assistant" for msg in messages):
            format_errors["missing_assistant_message"] += 1

    # Report the results
    if format_errors:
        print("Validation errors found:")
        for error_type, count in format_errors.items():
            print(f"{error_type}: {count}")
    else:
        print("All examples are valid.")

# Example usage
data_path = "output.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Check the format of the dataset
check_chat_format(dataset)


All examples are valid.


In [166]:
# Format validation

check_chat_format(dataset)

All examples are valid.
