# Synthetic Prompts Checkpoint Analyzer

This notebook analyzes the checkpoint files created during the synthetic prompt generation process to verify the process is working as expected. Since synthetic prompt generation can be computationally expensive and time-consuming, this analysis helps monitor progress and quality.

In [None]:
# Import required libraries
import difflib
import glob
import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Enable inline plotting
%matplotlib inline

# Set styling: apply matplotlib's 'ggplot' stylesheet, then seaborn's 'whitegrid' theme.
# NOTE(review): sns.set() runs after plt.style.use(), so it presumably overrides
# parts of the ggplot style — confirm which look is actually intended.
plt.style.use('ggplot')
sns.set(style='whitegrid')

## Load Checkpoint Files

The checkpoint directory contains CSV files of synthetic prompts at different stages of generation.

In [None]:
# Directory holding the checkpoint CSVs. The path is resolved relative to the
# notebook's current working directory, not the project root.
checkpoint_dir = '../checkpoints/'


def parse_checkpoint_number(path):
    """Return the integer suffix of a checkpoint file name, or None.

    Only the file name is inspected: the extension is stripped and the text
    after the last underscore is parsed as an integer. For example,
    ``synthetic_prompts_200.csv`` yields ``200`` while
    ``synthetic_prompts_final.csv`` yields ``None``.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem.rsplit('_', 1)[-1]
    try:
        return int(suffix)
    except ValueError:
        return None


# Find all candidate CSV files in the checkpoint directory
candidate_files = glob.glob(os.path.join(checkpoint_dir, 'synthetic_prompts_*.csv'))

# The glob also matches names without a numeric suffix (e.g. "..._final.csv");
# those would crash the numeric sort, so they are set aside and reported.
checkpoint_files = [path for path in candidate_files if parse_checkpoint_number(path) is not None]
skipped_files = [path for path in candidate_files if parse_checkpoint_number(path) is None]

# Sort files by the numeric value in their filename
checkpoint_files.sort(key=parse_checkpoint_number)

# Display found checkpoint files
print(f"Found {len(checkpoint_files)} checkpoint files:")
for file in checkpoint_files:
    print(f" - {os.path.basename(file)}")

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) without a numeric checkpoint suffix:")
    for skipped in sorted(skipped_files):
        print(f" - {os.path.basename(skipped)}")

## Analyze Checkpoint Growth

Check how the number of synthetic prompts grows across checkpoints.

In [None]:
def summarize_checkpoint(path):
    """Read one checkpoint CSV and return its row count, on-disk size (KB) and data."""
    frame = pd.read_csv(path)
    return {
        'num_prompts': len(frame),
        'file_size_kb': os.path.getsize(path) / 1024,
        'data': frame,
    }


# Map each checkpoint number (parsed from the file name suffix) to its summary
checkpoint_stats = {
    int(path.split('_')[-1].split('.')[0]): summarize_checkpoint(path)
    for path in checkpoint_files
}

# One row per checkpoint; the loaded frames themselves stay in checkpoint_stats
stats_df = pd.DataFrame({
    'checkpoint': list(checkpoint_stats),
    'num_prompts': [entry['num_prompts'] for entry in checkpoint_stats.values()],
    'file_size_kb': [entry['file_size_kb'] for entry in checkpoint_stats.values()],
})

# Show the table as the cell output
stats_df

In [None]:
# Dual-axis line chart: prompt count on the left axis, file size on the right
fig, count_ax = plt.subplots(figsize=(10, 6))
checkpoint_axis = stats_df['checkpoint']

count_color = 'tab:blue'
count_ax.plot(checkpoint_axis, stats_df['num_prompts'], marker='o', color=count_color)
count_ax.set_xlabel('Checkpoint Number')
count_ax.set_ylabel('Number of Prompts', color=count_color)
count_ax.tick_params(axis='y', labelcolor=count_color)

# Second y-axis sharing the same x-axis
size_color = 'tab:red'
size_ax = count_ax.twinx()
size_ax.plot(checkpoint_axis, stats_df['file_size_kb'], marker='s', color=size_color)
size_ax.set_ylabel('File Size (KB)', color=size_color)
size_ax.tick_params(axis='y', labelcolor=size_color)

plt.title('Checkpoint Growth Analysis')
fig.tight_layout()
plt.show()

## Analyze Prompt Content

Examine the latest checkpoint file in detail to check the quality of generated prompts.

In [None]:
# Everything below needs at least one loaded checkpoint. Fail here with an
# actionable message instead of the bare "max() arg is an empty sequence".
if not checkpoint_stats:
    raise ValueError(
        "No checkpoints were loaded; check the checkpoint directory and the "
        "'synthetic_prompts_*.csv' file pattern before analyzing prompt content."
    )

# The latest checkpoint is the one with the highest checkpoint number
latest_checkpoint_num = max(checkpoint_stats)
latest_df = checkpoint_stats[latest_checkpoint_num]['data']

# Display basic info about the latest checkpoint
print(f"Latest checkpoint: {latest_checkpoint_num}")
print(f"Number of prompts: {len(latest_df)}")
print("\nColumns in the dataset:")
for col in latest_df.columns:
    print(f" - {col}")

# Display the first few rows of the latest checkpoint
latest_df.head()

In [None]:
# Analyze prompt length distribution
if 'prompt' in latest_df.columns:
    # Measure only rows that actually hold a prompt: len() on a missing value
    # (NaN) raises TypeError, and non-string values are measured via str().
    prompt_lengths = latest_df['prompt'].dropna().astype(str).str.len()
    missing_prompts = len(latest_df) - len(prompt_lengths)

    # Keep the length as a column so it shows up next to each prompt in later
    # cells; rows without a prompt get NaN through index alignment.
    latest_df['prompt_length'] = prompt_lengths

    if missing_prompts:
        print(f"Warning: {missing_prompts} row(s) have no prompt and are excluded from the length statistics.")

    if prompt_lengths.empty:
        print("No prompts available for length analysis.")
    else:
        mean_length = prompt_lengths.mean()

        length_fig, length_ax = plt.subplots(figsize=(10, 6))
        sns.histplot(prompt_lengths, kde=True, ax=length_ax)
        length_ax.set_title('Distribution of Prompt Lengths')
        length_ax.set_xlabel('Character Count')
        length_ax.set_ylabel('Frequency')
        length_ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
        length_ax.legend()
        plt.show()

        print(f"Minimum prompt length: {prompt_lengths.min()}")
        print(f"Maximum prompt length: {prompt_lengths.max()}")
        print(f"Average prompt length: {mean_length:.1f}")

## Sample Prompts

Review a random sample of prompts from the latest checkpoint to manually assess quality.

In [None]:
# Draw up to five rows at random for a manual quality check
sample_size = min(5, len(latest_df))
random_samples = latest_df.sample(sample_size)

print(f"Showing {sample_size} random prompts from the latest checkpoint:")
for sample_no, (_, record) in enumerate(random_samples.iterrows(), start=1):
    print(f"\nSample {sample_no}:")
    if 'prompt' in record:
        print(f"Prompt: {record['prompt']}")

    # Every remaining field that has a value is printed as "column: value"
    for field, value in record.items():
        if field == 'prompt' or pd.isna(value):
            continue
        print(f"{field}: {value}")

## Checkpoint Comparison

If multiple checkpoints exist, compare how the content has evolved.

In [None]:
# Compare prompt length distribution across checkpoints if we have multiple
if len(checkpoint_stats) > 1:
    comparison_fig, comparison_ax = plt.subplots(figsize=(12, 6))
    plotted_any = False

    for checkpoint_num, stats in checkpoint_stats.items():
        df = stats['data']
        if 'prompt' not in df.columns:
            continue

        # Skip missing prompts: len() on NaN raises TypeError.
        lengths = df['prompt'].dropna().astype(str).str.len()
        df['prompt_length'] = lengths  # rows without a prompt get NaN

        # A density estimate from fewer than two values is not meaningful.
        if len(lengths) < 2:
            print(f"Checkpoint {checkpoint_num}: too few prompts for a density estimate, skipped.")
            continue

        sns.kdeplot(lengths, label=f'Checkpoint {checkpoint_num}', ax=comparison_ax)
        plotted_any = True

    comparison_ax.set_title('Prompt Length Distribution Across Checkpoints')
    comparison_ax.set_xlabel('Character Count')
    comparison_ax.set_ylabel('Density')
    if plotted_any:
        # legend() without any labelled curve only produces a warning
        comparison_ax.legend()
    plt.show()

## Cost Analysis

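If the latest checkpoint contains `input_tokens` and `output_tokens` columns, estimate the cost so far. The per-token prices in the cell below are example values; adjust them to your actual model and pricing.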

In [None]:
# Check if we have token count information to estimate costs
# (str() guards against non-string column labels)
token_columns = [col for col in latest_df.columns if 'token' in str(col).lower()]

if token_columns:
    print("Token-related columns found. Performing cost analysis...")

    # The estimate needs both of these columns
    required_columns = ['input_tokens', 'output_tokens']
    missing_columns = [col for col in required_columns if col not in latest_df.columns]

    if not missing_columns:
        # Example pricing - adjust based on your actual model and pricing
        INPUT_TOKEN_PRICE = 0.0015 / 1000  # $0.0015 per 1000 tokens
        OUTPUT_TOKEN_PRICE = 0.002 / 1000  # $0.002 per 1000 tokens

        total_input_tokens = latest_df['input_tokens'].sum()
        total_output_tokens = latest_df['output_tokens'].sum()

        input_cost = total_input_tokens * INPUT_TOKEN_PRICE
        output_cost = total_output_tokens * OUTPUT_TOKEN_PRICE
        total_cost = input_cost + output_cost

        print(f"Total input tokens: {total_input_tokens:,}")
        print(f"Total output tokens: {total_output_tokens:,}")
        print(f"Estimated input cost: ${input_cost:.2f}")
        print(f"Estimated output cost: ${output_cost:.2f}")
        print(f"Estimated total cost: ${total_cost:.2f}")
    else:
        # Say why no estimate follows the "Performing cost analysis..." line
        print(f"Cannot estimate cost: missing expected column(s) {missing_columns}.")
        print(f"Token-related columns present: {token_columns}")
else:
    print("No token-related columns found in the dataset.")

## Generation Progress Rate

Analyze how quickly prompts are being generated, using the modification times of the checkpoint files as a proxy for when each checkpoint was written.

In [None]:
# Use file modification times as a proxy for when each checkpoint was written.
# NOTE(review): this assumes the mtimes still reflect generation time — copying
# or touching the files would skew the rates.
timestamps = {}
for file in checkpoint_files:
    checkpoint_num = int(file.split('_')[-1].split('.')[0])
    timestamps[checkpoint_num] = os.path.getmtime(file)

if len(timestamps) > 1:
    checkpoint_nums = sorted(timestamps.keys())

    # Per-transition measurements; the four lists stay index-aligned because
    # they are only appended to together.
    time_diffs = []         # elapsed hours
    prompt_diffs = []       # prompts added
    rates = []              # prompts per hour
    transition_labels = []  # "prev-curr" labels for the bar chart

    for prev_num, curr_num in zip(checkpoint_nums, checkpoint_nums[1:]):
        time_diff = timestamps[curr_num] - timestamps[prev_num]  # in seconds
        prompt_diff = checkpoint_stats[curr_num]['num_prompts'] - checkpoint_stats[prev_num]['num_prompts']

        # Transitions without positive elapsed time cannot yield a rate
        if time_diff > 0:
            hours = time_diff / 3600
            time_diffs.append(hours)
            prompt_diffs.append(prompt_diff)
            rates.append(prompt_diff / hours)
            transition_labels.append(f'{prev_num}-{curr_num}')

    if rates:
        # Time-weighted average (total prompts / total hours). A plain mean of
        # the per-interval rates lets short intervals dominate and distorts
        # the time estimate below.
        avg_rate = sum(prompt_diffs) / sum(time_diffs)

        print(f"Average generation rate: {avg_rate:.2f} prompts/hour")
        if avg_rate > 0:
            # Estimate time to generate 1000 more prompts
            time_for_1000 = 1000 / avg_rate
            print(f"Estimated time to generate 1000 more prompts: {time_for_1000:.2f} hours")
        else:
            # No net growth: dividing by the rate would fail or go negative
            time_for_1000 = None
            print("Cannot estimate time to generate 1000 more prompts: no net prompt growth between checkpoints.")

        # Visualize generation rate
        rate_fig, rate_ax = plt.subplots(figsize=(10, 6))
        positions = range(len(rates))
        rate_ax.bar(positions, rates)
        rate_ax.axhline(avg_rate, color='red', linestyle='--', label=f'Average: {avg_rate:.2f}')
        rate_ax.set_xlabel('Checkpoint Transition')
        rate_ax.set_ylabel('Generation Rate (prompts/hour)')
        rate_ax.set_title('Synthetic Prompt Generation Rate')
        rate_ax.set_xticks(list(positions))
        rate_ax.set_xticklabels(transition_labels)
        rate_ax.legend()
        plt.show()
    else:
        print("Checkpoint modification times do not increase between checkpoints; cannot compute a generation rate.")
else:
    print("Not enough checkpoints to analyze generation rate.")

## Checking for Duplicates

Check the latest checkpoint for exact duplicate prompts and, on a random sample of its prompts, for near-duplicates.

In [None]:
# Check for duplicates in the latest checkpoint
if 'prompt' in latest_df.columns:
    duplicate_count = latest_df.duplicated(subset=['prompt']).sum()
    duplicate_percent = (duplicate_count / len(latest_df)) * 100 if len(latest_df) > 0 else 0

    print(f"Number of duplicate prompts in latest checkpoint: {duplicate_count}")
    print(f"Percentage of duplicates: {duplicate_percent:.2f}%")

    # Check for near-duplicates using a simple similarity metric.
    # Missing prompts are dropped first: SequenceMatcher needs sequences and
    # fails on NaN values.
    comparable_prompts = latest_df['prompt'].dropna().astype(str)
    if len(comparable_prompts) > 1:
        print("\nChecking for near-duplicates (this might take a while for large datasets)...")

        # Pairwise comparison is quadratic, so only a sample of prompts is checked
        near_dup_sample_size = min(100, len(comparable_prompts))
        prompt_sample = comparable_prompts.sample(near_dup_sample_size).tolist()

        similarity_threshold = 0.9
        near_duplicate_pairs = []

        for i in range(len(prompt_sample)):
            for j in range(i + 1, len(prompt_sample)):
                matcher = difflib.SequenceMatcher(None, prompt_sample[i], prompt_sample[j])
                # Both quick ratios are documented upper bounds on ratio(), so
                # a pair failing them cannot exceed the threshold; this skips
                # the expensive exact computation without changing results.
                if matcher.real_quick_ratio() <= similarity_threshold:
                    continue
                if matcher.quick_ratio() <= similarity_threshold:
                    continue
                similarity = matcher.ratio()
                if similarity > similarity_threshold:
                    near_duplicate_pairs.append((i, j, similarity))

        if near_duplicate_pairs:
            print(f"Found {len(near_duplicate_pairs)} potential near-duplicate pairs in the sample:")
            for i, j, similarity in near_duplicate_pairs[:3]:  # Show just a few examples
                print(f"\nSimilarity: {similarity:.2f}")
                print(f"Prompt 1: {prompt_sample[i][:100]}...")
                print(f"Prompt 2: {prompt_sample[j][:100]}...")
            if len(near_duplicate_pairs) > 3:
                print(f"...and {len(near_duplicate_pairs) - 3} more pairs.")
        else:
            print("No near-duplicates found in the sample.")

## Conclusion

Summarize the findings from the checkpoint analysis.

In [None]:
# Print a summary of the findings.
# The `'name' in locals()` checks are there because the earlier cells define
# avg_rate, duplicate_count, token_columns and total_cost only on some branches.

# Thresholds that trigger a recommendation
DUPLICATE_WARN_PERCENT = 5  # flag when more than this share of prompts is duplicated
MIN_RATE_PER_HOUR = 10      # flag when generation is slower than this

print("## Synthetic Prompts Checkpoint Analysis Summary")

print("\n### Process Status:")
print(f"- Total checkpoints found: {len(checkpoint_stats)}")
if len(checkpoint_stats) > 0:
    last_checkpoint = max(checkpoint_stats.keys())
    print(f"- Latest checkpoint: {last_checkpoint}")
    print(f"- Total prompts generated so far: {checkpoint_stats[last_checkpoint]['num_prompts']}")

if len(checkpoint_stats) > 1:
    print("\n### Generation Progress:")
    first_checkpoint = min(checkpoint_stats.keys())
    last_checkpoint = max(checkpoint_stats.keys())
    total_growth = checkpoint_stats[last_checkpoint]['num_prompts'] - checkpoint_stats[first_checkpoint]['num_prompts']
    print(f"- Prompt growth: +{total_growth} prompts since first checkpoint")
    if 'avg_rate' in locals() and avg_rate > 0:
        print(f"- Average generation rate: {avg_rate:.2f} prompts/hour")
        print(f"- Estimated completion time for 1000 more prompts: {time_for_1000:.2f} hours")

print("\n### Data Quality:")
if 'duplicate_count' in locals():
    print(f"- Duplicate prompts: {duplicate_count} ({duplicate_percent:.2f}%)")

if 'token_columns' in locals() and token_columns:
    print("\n### Resource Usage:")
    if 'total_cost' in locals():
        print(f"- Estimated cost so far: ${total_cost:.2f}")

# Each check is evaluated independently so that several problems can be
# reported together; an if/elif chain would only ever show the first match.
recommendations = []
if len(checkpoint_stats) < 2:
    recommendations.append("Create more checkpoints to enable better progress tracking")
if 'duplicate_count' in locals() and duplicate_percent > DUPLICATE_WARN_PERCENT:
    recommendations.append("Review generation process to reduce duplicates")
if 'avg_rate' in locals() and avg_rate < MIN_RATE_PER_HOUR:
    recommendations.append("Consider optimization to improve generation rate")
if not recommendations:
    recommendations.append("Process appears to be running as expected")

print("\nRecommendations:")
for recommendation in recommendations:
    print(f"- {recommendation}")