In [None]:
# Pin openai/httpx to work around the
# "Client.__init__() got an unexpected keyword argument 'proxies'" error seen with openai 1.56.0:
# https://community.openai.com/t/error-with-openai-1-56-0-client-init-got-an-unexpected-keyword-argument-proxies/1040332/11
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

In [None]:
import os
# Send signal 9 to the current (kernel) process, terminating it immediately.
# NOTE(review): presumably this forces a kernel restart so the packages reinstalled
# above are picked up on the next import — confirm. Because it kills the kernel,
# "Run All" stops at this cell and the remaining cells must be run afterwards.
os.kill(os.getpid(), 9)

In [None]:
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install datasets

## Log in to Hugging Face

In [None]:
import os
from getpass import getpass

# from google.colab import userdata
from huggingface_hub import login

# Never hard-code the access token in the notebook: it would be saved (and shared)
# with the file. Read it from the HF_TOKEN environment variable, and fall back to an
# interactive prompt that does not echo the value.
HF_Token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=HF_Token)

Dataset: [lavita/ChatDoctor-HealthCareMagic-100k](https://huggingface.co/datasets/lavita/ChatDoctor-HealthCareMagic-100k)

In [1]:
import json
from datasets import load_dataset

def convert_dataset_to_sft_format(dataset_name="lavita/ChatDoctor-HealthCareMagic-100k",
                                  split="train",
                                  output_file="sft_dataset.json",
                                  system_prompt=None,
                                  progress_every=1000):
    """
    Download a HuggingFace dataset and convert it to chat-style SFT format.

    Every kept sample is written as one JSON object per line (JSON Lines layout,
    even though the default file name ends in ``.json``), shaped as
    ``{"messages": [system, user, assistant]}`` where the user content comes from
    the sample's ``input`` field and the assistant content from its ``output`` field.

    Records are streamed to disk as they are converted, so the whole converted
    dataset is never held in memory at once.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to convert.
        output_file: Path of the file to write (overwritten if it exists).
        system_prompt: System message placed first in every conversation.
            ``None`` selects the built-in medical-assistant prompt.
        progress_every: Print a progress line each time this many samples have
            been scanned; ``0`` or ``None`` disables progress output.

    Returns:
        None. Failures (download, missing split, write error) are reported on
        stdout rather than raised.
    """
    print("Downloading dataset...")

    # Load dataset
    try:
        ds = load_dataset(dataset_name)
    except Exception as e:
        print(f"Failed to download dataset: {e}")
        return

    try:
        samples = ds[split]
    except KeyError:
        print(f"Failed to download dataset: split '{split}' not found")
        return

    print(f"Dataset loaded successfully, total samples: {len(samples)}")

    # System prompt
    if system_prompt is None:
        system_content = ("You are a highly professional medical assistant. Your task is to provide accurate and well-structured medical information based on a patient's symptoms. Your tone must be objective and professional. Always start with a polite greeting and end with a supportive, yet professional, closing. When discussing possible diagnoses or treatments, it is critical to state that this is for informational purposes only and strongly recommend a consultation with a healthcare professional.")
    else:
        system_content = system_prompt

    print(f"Saving to {output_file}...")

    written = 0
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            for i, sample in enumerate(samples):
                user_content = sample.get('input')
                assistant_content = sample.get('output')

                # A field that is absent or explicitly None cannot form a training
                # example (it would be serialized as JSON null), so skip the sample.
                if user_content is None or assistant_content is None:
                    print(f"Warning: Sample {i} missing required fields, skipping")
                    continue

                record = {
                    "messages": [
                        {"role": "system", "content": system_content},
                        {"role": "user", "content": user_content},
                        {"role": "assistant", "content": assistant_content},
                    ]
                }

                # One JSON object per line
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
                written += 1

                # Show progress periodically
                if progress_every and (i + 1) % progress_every == 0:
                    print(f"Processed {i + 1} samples...")

        print("Conversion completed!")
        print(f"- Total processed samples: {written}")
        print(f"- Output file: {output_file}")
        print("- Format: One JSON object per line")

    except Exception as e:
        print(f"Failed to save file: {e}")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def validate_output_format(file_path="sft_dataset.json", num_samples=3):
    """
    Preview the first records of the converted dataset file.

    Reads ``file_path`` line by line, parses each of the first ``num_samples``
    lines as JSON and pretty-prints it. Any error (missing file, malformed
    JSON, ...) is reported on stdout instead of being raised.
    """
    print(f"\nValidating output file format...")

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            # 1-based position doubles as the sample number shown to the user.
            for position, raw_line in enumerate(handle, start=1):
                if position > num_samples:
                    break

                record = json.loads(raw_line.strip())
                print(f"\nSample {position}:")
                print(json.dumps(record, ensure_ascii=False, indent=2))

    except Exception as exc:
        print(f"Error validating file: {exc}")

# Main: convert the dataset and preview the output

In [3]:
# Download the dataset and write it to sft_dataset.json (one JSON object per line).
convert_dataset_to_sft_format()
# Pretty-print the first few records of the file that was just written.
validate_output_format()

Downloading dataset...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 112165/112165 [00:00<00:00, 1113421.79 examples/s]


Dataset loaded successfully, total samples: 112165
Processed 1000 samples...
Processed 2000 samples...
Processed 3000 samples...
Processed 4000 samples...
Processed 5000 samples...
Processed 6000 samples...
Processed 7000 samples...
Processed 8000 samples...
Processed 9000 samples...
Processed 10000 samples...
Processed 11000 samples...
Processed 12000 samples...
Processed 13000 samples...
Processed 14000 samples...
Processed 15000 samples...
Processed 16000 samples...
Processed 17000 samples...
Processed 18000 samples...
Processed 19000 samples...
Processed 20000 samples...
Processed 21000 samples...
Processed 22000 samples...
Processed 23000 samples...
Processed 24000 samples...
Processed 25000 samples...
Processed 26000 samples...
Processed 27000 samples...
Processed 28000 samples...
Processed 29000 samples...
Processed 30000 samples...
Processed 31000 samples...
Processed 32000 samples...
Processed 33000 samples...
Processed 34000 samples...
Processed 35000 samples...
Processed 360

# Determining max_seq_length

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
SFT Max Sequence Length Analyzer
Analyze dataset to determine optimal max_seq_length for SFT training
"""

import json
import numpy as np
# import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from collections import Counter

def _summarize_lengths(lengths, include_spread=True):
    """Return min/max/mean/median (optionally std and p90/p95/p99) of an array of token counts."""
    summary = {
        'min': int(np.min(lengths)),
        'max': int(np.max(lengths)),
        'mean': float(np.mean(lengths)),
        'median': float(np.median(lengths)),
    }
    if include_spread:
        summary['std'] = float(np.std(lengths))
        for pct in (90, 95, 99):
            summary[f'p{pct}'] = float(np.percentile(lengths, pct))
    return summary


def _estimate_tokens(text, tokenizer):
    """Count tokens with the tokenizer, or approximate as len(text) // 4 when there is none."""
    if tokenizer is not None:
        return len(tokenizer.encode(text, add_special_tokens=False))
    return len(text) // 4


def _content_for_role(message, role):
    """Return the message content if it carries the expected role, otherwise an empty string."""
    return message['content'] if message['role'] == role else ""


def _print_length_stats(heading, stats):
    """Print one min/max/mean/median/percentile section under the given heading."""
    print(f"\n📊 {heading}:")
    print(f"  Min: {stats['min']} tokens")
    print(f"  Max: {stats['max']} tokens")
    print(f"  Mean: {stats['mean']:.1f} tokens")
    print(f"  Median: {stats['median']:.1f} tokens")
    print(f"  90th percentile: {stats['p90']:.0f} tokens")
    print(f"  95th percentile: {stats['p95']:.0f} tokens")
    print(f"  99th percentile: {stats['p99']:.0f} tokens")


def _print_coverage(heading, lengths, candidate_limits):
    """Print, for each candidate limit, the percentage of samples whose length fits within it."""
    print(heading)
    for limit in candidate_limits:
        coverage = (lengths <= limit).mean() * 100
        print(f"  max_seq_length={limit}: {coverage:.1f}% of samples")


def analyze_sequence_lengths(dataset_file="sft_dataset.json", model_name="meta-llama/Llama-3.2-3B",
                             current_max_seq_length=1536):
    """
    Analyze sequence lengths in the dataset to determine optimal max_seq_length

    The file is read line by line (one JSON object per line, each with a
    ``messages`` list of system, user and assistant entries). Lengths are
    measured in tokens when the tokenizer can be loaded; otherwise they are
    approximated as one token per four characters. A fixed overhead of 6 tokens
    is added per sample as a rough allowance for chat-template special tokens.

    Args:
        dataset_file: Path to the SFT dataset JSON file
        model_name: Model name for tokenizer (should match your target model)
        current_max_seq_length: The max_seq_length you currently plan to train
            with; the recommendations report how much of the data it covers.

    Returns:
        dict: Analysis results with keys ``total_samples``, ``input_stats``,
        ``total_stats`` and ``assistant_stats``; ``None`` if the file cannot be
        read or contains no valid samples.
    """
    print(f"Loading tokenizer for {model_name}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        print(f"Failed to load tokenizer: {e}")
        print("Using approximate character-based estimation (1 token ≈ 4 characters)")
        tokenizer = None

    print(f"Analyzing dataset: {dataset_file}")

    # Approximate allowance for <|begin_of_text|>, <|start_header_id|>, etc.
    special_tokens_overhead = 6

    # Statistics containers
    input_lengths = []  # system + user message lengths
    total_lengths = []  # system + user + assistant message lengths
    assistant_lengths = []  # assistant message lengths only

    try:
        with open(dataset_file, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines carry no record; do not report them as invalid JSON.
                    continue

                try:
                    data = json.loads(stripped)
                    messages = data.get('messages', [])

                    if len(messages) < 3:
                        print(f"Warning: Line {line_num} has fewer than 3 messages, skipping")
                        continue

                    system_tokens = _estimate_tokens(_content_for_role(messages[0], 'system'), tokenizer)
                    user_tokens = _estimate_tokens(_content_for_role(messages[1], 'user'), tokenizer)
                    assistant_tokens = _estimate_tokens(_content_for_role(messages[2], 'assistant'), tokenizer)

                    input_length = system_tokens + user_tokens + special_tokens_overhead

                    input_lengths.append(input_length)
                    total_lengths.append(input_length + assistant_tokens)
                    assistant_lengths.append(assistant_tokens)

                    if len(input_lengths) % 1000 == 0:
                        print(f"Processed {len(input_lengths)} samples...")

                except json.JSONDecodeError:
                    print(f"Warning: Invalid JSON on line {line_num}, skipping")
                except Exception as e:
                    # Best effort: a malformed record must not abort the whole analysis.
                    print(f"Warning: Error processing line {line_num}: {e}")

    except FileNotFoundError:
        print(f"Error: Dataset file {dataset_file} not found")
        return None
    except Exception as e:
        print(f"Error reading dataset: {e}")
        return None

    if not input_lengths:
        print("No valid samples found in dataset")
        return None

    # Calculate statistics
    input_lengths = np.array(input_lengths)
    total_lengths = np.array(total_lengths)
    assistant_lengths = np.array(assistant_lengths)

    results = {
        'total_samples': len(input_lengths),
        'input_stats': _summarize_lengths(input_lengths),
        'total_stats': _summarize_lengths(total_lengths),
        'assistant_stats': _summarize_lengths(assistant_lengths, include_spread=False),
    }

    # Print analysis results
    print("\n=== Sequence Length Analysis Results ===")
    print(f"Total samples analyzed: {results['total_samples']}")
    print(f"Tokenizer used: {'Actual tokenizer' if tokenizer is not None else 'Character-based estimation'}")

    _print_length_stats("INPUT LENGTH STATS (System + User + Special Tokens)", results['input_stats'])
    _print_length_stats("TOTAL LENGTH STATS (Input + Assistant)", results['total_stats'])

    # Calculate coverage percentages for different max_seq_length values
    common_lengths = [512, 1024, 1536, 2048, 3072, 4096]

    print("\n🎯 COVERAGE ANALYSIS (% samples that fit within max_seq_length):")
    _print_coverage("Based on INPUT length (System + User):", input_lengths, common_lengths)
    _print_coverage("\nBased on TOTAL length (Input + Assistant):", total_lengths, common_lengths)

    # Recommendations
    print("\n💡 RECOMMENDATIONS:")

    # Smallest multiple of 128 that covers the 95th percentile
    optimal_input_95 = int(np.ceil(results['input_stats']['p95'] / 128) * 128)
    optimal_total_95 = int(np.ceil(results['total_stats']['p95'] / 128) * 128)

    limit = current_max_seq_length
    input_coverage = (input_lengths <= limit).mean() * 100
    total_coverage = (total_lengths <= limit).mean() * 100

    print(f"  For 95% input coverage: max_seq_length = {optimal_input_95}")
    print(f"  For 95% total coverage: max_seq_length = {optimal_total_95}")
    print(f"  Current setting ({limit}): covers {input_coverage:.1f}% of inputs")
    # The training sequence contains the assistant reply too, so the full-sequence
    # coverage is the number that indicates how many samples would be truncated.
    print(f"  Current setting ({limit}): covers {total_coverage:.1f}% of full sequences (input + assistant)")

    if results['input_stats']['p95'] > limit:
        print(f"  ⚠️  Your current max_seq_length={limit} may truncate {(input_lengths > limit).mean()*100:.1f}% of inputs")
        print(f"  Consider increasing to {optimal_input_95} for better coverage")
    else:
        print(f"  ✅ Your current max_seq_length={limit} looks good for input coverage")

    return results

def plot_length_distribution(dataset_file="sft_dataset.json", model_name="meta-llama/Llama-3.2-3B"):
    """
    Placeholder for length-distribution histograms.

    No plot is produced and neither argument is read: the function only prints
    instructions telling the user to run the analysis first and plot its result.
    """
    # Simplified stub: real plotting would be built on the analysis output.
    notices = (
        "Creating length distribution plots...",
        "To create plots, run the analyze_sequence_lengths function first",
        "Then use the returned data to create matplotlib histograms",
    )
    for notice in notices:
        print(notice)

if __name__ == "__main__":
    # Analyze sequence lengths
    results = analyze_sequence_lengths()

    if results:
        # The max_seq_length this quick summary is judged against.
        current_max_seq_length = 1536
        # True when the setting is at or above the 95th percentile of input lengths.
        covers_p95 = current_max_seq_length >= results['input_stats']['p95']
        coverage_label = 'More than 95%' if covers_p95 else 'Less than 95%'

        print("\n🎯 Quick recommendation for your current setup:")
        print(f"Your max_seq_length={current_max_seq_length} setting covers approximately:")
        print(f"- {coverage_label} of input sequences")
        print("- Consider the analysis above to decide if you need to adjust it")

  from .autonotebook import tqdm as notebook_tqdm


Loading tokenizer for meta-llama/Llama-3.2-3B...
Failed to load tokenizer: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B.
401 Client Error. (Request ID: Root=1-689464ee-13e75a4c3ccc2a0a356a0940;bf4be3d5-99ec-433b-a4d0-1c277af84083)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in.
Using approximate character-based estimation (1 token ≈ 4 characters)
Analyzing dataset: sft_dataset.json
Processed 1000 samples...
Processed 2000 samples...
Processed 3000 samples...
Processed 4000 samples...
Processed 5000 samples...
Processed 6000 samples...
Processed 7000 samples...
Processed 8000 samples...
Processed 9000 samples...
Processed 10000 samples...
Processed 11000 samples...
Processed 12000 samples...
Processed 13000 samples...
Process