In [108]:
input_csv_path = "data/train.csv"
output_jsonl_path = 5000
num_rows = 5000

In [109]:
import json
import pandas as pd
import re

def transform_conversations_to_jsonl(input_csv_path, output_jsonl_path, num_rows=None):
    def clean_and_transform_conversation_regex(conversation_str):
        """
        Clean up and transform a single conversation string using regular expressions.
        """
        pattern = r"\{'from': '(\w+)', 'value': \"(.*?)\"\}"
        matches = re.findall(pattern, conversation_str, re.DOTALL)
        transformed_conversation = []
        for match in matches:
            from_field, value = match
            role = "user" if from_field == "human" else "assistant"
            content = value.replace('\\"', '"').strip()
            transformed_conversation.append({"role": role, "content": content})
        return transformed_conversation

    df = pd.read_csv(input_csv_path, nrows=num_rows)
    
    df["transformed_conversations"] = df["conversations"].apply(clean_and_transform_conversation_regex)
    
    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for _, row in df.iterrows():
            output_record = {"messages": row["transformed_conversations"]}
            json.dump(output_record, jsonl_file)
            jsonl_file.write('\n')  


transform_conversations_to_jsonl(input_csv_path, f"data/output_{num_rows}.jsonl", num_rows)

data_path = f"data/output_{num_rows}.jsonl"
data_path

'data/output_5000.jsonl'

In [110]:
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 5000
First example:
{'role': 'user', 'content': "I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me."}
{'role': 'assistant', 'content': "Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on?"}
{'role': 'user', 'content': "I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me."}
{'role': 'assistant', 'content': "I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been facing at work?"}
{'role': 'user', 'content': "Well, the workload has increased significantly, and I find it ha

In [111]:
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Found errors:
missing_messages_list: 7
example_missing_assistant_message: 3


In [112]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 5000
Num examples missing user message: 10

#### Distribution of num_messages_per_example:
min / max: 0, 26
mean / median: 13.5964, 14.0
p5 / p95: 10.0, 17.0

#### Distribution of num_total_tokens_per_example:
min / max: 3, 1637
mean / median: 744.3164, 737.0
p5 / p95: 534.0, 972.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 0, 805
mean / median: 359.5268, 357.0
p5 / p95: 241.0, 485.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [113]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~3721582 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~11164746 tokens
