In [1]:
import json
import random
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'punkt' tokenizer data. This only needs to succeed once per
# machine; on later runs NLTK just reports that the package is up to date.
# NOTE(review): word_tokenize is imported above but is not called anywhere in
# the code visible here -- confirm whether this download is still required.
nltk.download('punkt')

# Load base dataset
def load_base_dataset(file_path):
    """Load prompt pairs from a JSON file.

    The file must contain a JSON array of objects, each having an
    ``'original'`` and an ``'enhanced'`` key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of ``(original, enhanced)`` tuples, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks the ``'original'`` or ``'enhanced'`` key.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding
    # (e.g. cp1252 on Windows), which mis-decodes non-ASCII prompts.
    # 'utf-8-sig' additionally tolerates a leading BOM, which json.load
    # would otherwise reject.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    return [(item['original'], item['enhanced']) for item in data]

# Augmentation function
def augment_data(data, augment_factor=20, max_pairs=1000):
    """Create a larger prompt dataset by randomly perturbing each pair.

    Every pass over ``data`` applies a chain of random transformations to
    each ``(original, enhanced)`` pair: synonym replacement (both sides),
    prefix paraphrasing, filler insertion/removal, cropping and word
    swapping (original side only), and leading-verb variation (enhanced
    side only). A perturbed pair is kept only if it differs from its source.

    Randomness comes from the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand for reproducible output. The leading-word
    distribution of the enhanced prompts is printed before returning.

    Args:
        data: Iterable of ``(original, enhanced)`` string pairs.
        augment_factor: Number of augmentation passes over ``data``.
        max_pairs: Maximum number of pairs returned (the base pairs come
            first). ``None`` disables the cap.

    Returns:
        A list starting with the base pairs followed by the generated
        pairs, truncated to ``max_pairs`` entries.
    """
    # Local imports keep this block self-contained; the file does not
    # import these names at module level.
    import re
    from collections import Counter

    data = list(data)
    augmented_data = list(data)

    # Expanded synonym dictionary
    synonyms = {
        'explain': ['describe', 'clarify', 'elucidate', 'illustrate', 'detail'],
        'write': ['create', 'code', 'implement', 'develop', 'build'],
        'provide': ['give', 'supply', 'offer', 'present', 'deliver'],
        'how': ['ways to', 'method to', 'approach to', 'technique for', 'steps to'],
        'implement': ['build', 'develop', 'code', 'construct', 'execute'],
        'debug': ['troubleshoot', 'fix', 'resolve', 'diagnose', 'repair']
    }
    # Match whole words only: a plain substring replace would turn
    # "show" into "s ways to" or "rewrite" into "recreate".
    synonym_patterns = [
        (re.compile(r'\b' + re.escape(word) + r'\b'), syn_list)
        for word, syn_list in synonyms.items()
    ]

    # Ensure balanced verb usage
    verbs = ['Explain', 'Write', 'Debug', 'Create', 'Implement', 'Describe', 'Develop', 'Set up', 'Optimize', 'Compare']

    # Loop-invariant tables, built once instead of on every iteration.
    prefixes = ['Can you', 'I need', 'Please tell me how to', 'How do I', 'Show me how to', 'What is the way to']
    fillers = ['please', 'kindly', 'in detail', 'comprehensive', 'step-by-step', 'quickly']
    filler_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(filler) for filler in fillers) + r')\b'
    )

    for _ in range(augment_factor):
        for original, enhanced in data:
            new_original, new_enhanced = original, enhanced

            # Synonym Replacement (independent synonym choice per side)
            for pattern, syn_list in synonym_patterns:
                if random.random() < 0.4:
                    original_syn = random.choice(syn_list)
                    enhanced_syn = random.choice(syn_list)
                    # Callable replacements are inserted literally, with no
                    # backslash/group-reference processing.
                    new_original = pattern.sub(lambda _m, s=original_syn: s, new_original)
                    new_enhanced = pattern.sub(lambda _m, s=enhanced_syn: s, new_enhanced)

            # Paraphrasing: swap a recognised leading phrase for another one
            if random.random() < 0.5:
                for prefix in prefixes:
                    if new_original.startswith(prefix):
                        new_prefix = random.choice([p for p in prefixes if p != prefix])
                        new_original = new_prefix + new_original[len(prefix):]
                        break

            # Filler Word Addition/Removal
            if random.random() < 0.5:
                words = new_original.split()
                # An empty prompt has no valid insert position
                # (randint(1, 0) would raise ValueError), so skip it.
                if words:
                    insert_pos = random.randint(1, len(words))
                    filler = random.choice(fillers)
                    new_original = ' '.join(words[:insert_pos] + [filler] + words[insert_pos:])
            else:
                without_fillers = filler_pattern.sub('', new_original)
                if without_fillers != new_original:
                    # Collapse the gaps left behind by the removed fillers.
                    new_original = ' '.join(without_fillers.split())
                else:
                    new_original = new_original.strip()

            # Verb Variation (balanced)
            if random.random() < 0.4:
                for verb in verbs:
                    if new_enhanced.startswith(verb):
                        new_verb = random.choice([v for v in verbs if v != verb])
                        new_enhanced = new_verb + new_enhanced[len(verb):]
                        break

            # Random Cropping: drop up to two words from each end
            words = new_original.split()
            if random.random() < 0.3 and len(words) > 5:
                start = random.randint(0, 2)
                end = len(words) - random.randint(0, 2)
                new_original = ' '.join(words[start:end])

            # Random Word Shuffling: swap two random words
            if random.random() < 0.2:
                words = new_original.split()
                if len(words) > 3:
                    i, j = random.sample(range(len(words)), 2)
                    words[i], words[j] = words[j], words[i]
                    new_original = ' '.join(words)

            # Add augmented pair if different
            if (new_original, new_enhanced) != (original, enhanced):
                augmented_data.append((new_original, new_enhanced))

    # Report the leading word of each enhanced prompt; empty prompts have no
    # leading word and are skipped instead of raising IndexError.
    enhanced_tokens = (pair[1].split() for pair in augmented_data)
    verb_counts = Counter(tokens[0] for tokens in enhanced_tokens if tokens)
    print(f"Verb distribution: {verb_counts}")

    return augmented_data[:max_pairs]  # Limit to max_pairs (default 1,000) pairs

# Save dataset
def save_dataset(data, file_path='large_programming_prompts.json'):
    """Write prompt pairs to a JSON file.

    Args:
        data: Iterable of ``(original, enhanced)`` pairs.
        file_path: Destination path; the file is overwritten if it exists.

    The output is a JSON array of objects with ``'original'`` and
    ``'enhanced'`` keys, pretty-printed with a two-space indent.
    """
    records = []
    for source_prompt, improved_prompt in data:
        records.append({'original': source_prompt, 'enhanced': improved_prompt})

    with open(file_path, 'w') as out_file:
        json.dump(records, out_file, indent=2)

# Main execution: load the base pairs, augment them, write the result to the
# default output file and print a before/after size summary.
if __name__ == '__main__':
    # Load base dataset (expects 'base_programming_prompts.json' in the
    # current working directory)
    data = load_base_dataset('base_programming_prompts.json')
    
    # Augment to ~1,000 pairs (20 passes over the base data; augment_data
    # itself caps the result at 1,000 pairs)
    augmented_data = augment_data(data, augment_factor=20)
    
    # Save augmented dataset to save_dataset's default path,
    # 'large_programming_prompts.json'
    save_dataset(augmented_data)
    print(f"Original dataset size: {len(data)}, Augmented dataset size: {len(augmented_data)}")

Verb distribution: Counter({'Write': 165, 'Explain': 134, 'Implement': 111, 'Set': 88, 'Create': 71, 'Optimize': 62, 'Compare': 48, 'Debug': 45, 'Develop': 40, 'Handle': 34, 'Use': 34, 'Describe': 31, 'Containerize': 19, 'Validate': 18, 'Merge': 17, 'Join': 16, 'Parse': 15})
Original dataset size: 49, Augmented dataset size: 948


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saiet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
