In [None]:
import gc
import json
import random

def _iter_word_chunks(lines, chunk_size):
    """Yield consecutive lists of up to ``chunk_size`` whitespace-separated words.

    ``lines`` is any iterable of strings (e.g. an open text file). Every
    yielded list has exactly ``chunk_size`` words except possibly the last
    one, which holds the remainder. Nothing is yielded for input that
    contains no words.
    """
    pending = []
    for line in lines:
        pending.extend(line.split())
        if len(pending) >= chunk_size:
            # Emit every complete chunk currently buffered and keep only the
            # leftover words, so the buffer never grows beyond one line plus
            # one partial chunk.
            full = len(pending) - len(pending) % chunk_size
            for start in range(0, full, chunk_size):
                yield pending[start:start + chunk_size]
            pending = pending[full:]
    if pending:
        yield pending


def process_dialect_file_streaming(filename, dialect_key, output_handle):
    """Convert one dialect text file into JSONL prompt/completion examples.

    The file is read line by line and cut into chunks of 100
    whitespace-separated words; each example is written to ``output_handle``
    as soon as it is built, so memory use is bounded by the longest line of
    the input rather than by the size of the file.

    For every chunk:
      * fewer than 50 words  -> skipped (only the final chunk can be short);
      * 50 to 79 words       -> prompt is a randomly chosen template,
                                completion is the whole chunk;
      * 80 words or more     -> the chunk is split in half: prompt is the
                                template followed by the first half,
                                completion is the second half.

    Args:
        filename: Path of the UTF-8 text file to read.
        dialect_key: One of 'pontic', 'cretan', 'northern', 'cypriot';
            selects the set of prompt templates.
        output_handle: Object with a ``write(str)`` method; receives one
            JSON object per line.

    Returns:
        The number of examples written.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        KeyError: If ``dialect_key`` is not a known dialect.
        UnicodeDecodeError: If the file is not valid UTF-8. Because output is
            streamed, examples built before the bad bytes were reached have
            already been written.
    """

    prompt_templates = {
        'pontic': [
            "Γράψε στην ποντιακή διάλεκτο:",
            "Απάντησε στα ποντιακά:",
            "Στην ποντιακή γλώσσα:",
            "Συνέχισε στα ποντιακά:",
        ],
        'cretan': [
            "Γράψε στην κρητική διάλεκτο:",
            "Απάντησε στα κρητικά:",
            "Στην κρητική γλώσσα:",
            "Συνέχισε στα κρητικά:",
        ],
        'northern': [
            "Γράψε στη βορειοελλαδική διάλεκτο:",
            "Απάντησε στα βορειοελλαδικά:",
            "Στη διάλεκτο της Βόρειας Ελλάδας:",
            "Συνέχισε στα βορειοελλαδικά:",
        ],
        'cypriot': [
            "Γράψε στην κυπριακή διάλεκτο:",
            "Απάντησε στα κυπριακά:",
            "Στην κυπριακή γλώσσα:",
            "Συνέχισε στα κυπριακά:",
        ]
    }

    print(f"Processing {filename}...")

    chunk_size = 100   # words per chunk
    min_words = 50     # shorter chunks are dropped
    split_words = 80   # chunks at least this long are split prompt/completion
    total_words = 0
    count = 0

    with open(filename, 'r', encoding='utf-8') as f:
        # Looked up after the file is opened so a missing file is still
        # reported as FileNotFoundError even when the key is also invalid.
        templates = prompt_templates[dialect_key]

        for chunk in _iter_word_chunks(f, chunk_size):
            total_words += len(chunk)
            if len(chunk) < min_words:
                continue

            template = random.choice(templates)
            if len(chunk) >= split_words:
                mid = len(chunk) // 2
                example = {
                    'prompt': f"{template} {' '.join(chunk[:mid])}",
                    'completion': ' '.join(chunk[mid:]),
                }
            else:
                example = {
                    'prompt': template,
                    'completion': ' '.join(chunk),
                }

            # Write immediately instead of collecting examples in a list.
            output_handle.write(json.dumps(example, ensure_ascii=False) + '\n')
            count += 1

    # The word total is only known once the whole file has been streamed.
    print(f"  {total_words:,} words")
    print(f"  ✅ Wrote {count} examples")
    return count

def create_natural_prompts_streaming_with_cypriot():
    """Build the combined JSONL dataset from all dialect text files.

    Each (file, dialect) pair is handed to ``process_dialect_file_streaming``
    together with a single shared output handle. A file that is missing or
    fails for any other reason is reported on stdout and skipped; the
    remaining files are still processed. The output file is created (or
    truncated) even if no input file could be read.
    """

    # (input path, template key) pairs; the Cypriot corpus is split in two files.
    sources = (
        ('cretan.txt', 'cretan'),
        ('pontic.txt', 'pontic'),
        ('nothern.txt', 'northern'),
        ('cypriot_1.txt', 'cypriot'),
        ('cypriot_2.txt', 'cypriot'),
    )
    destination = 'natural_prompts_all_dialects.jsonl'
    written = 0

    # One output handle for the whole run; examples are appended as produced.
    with open(destination, 'w', encoding='utf-8') as sink:
        for source_path, dialect in sources:
            try:
                written += process_dialect_file_streaming(source_path, dialect, sink)
            except FileNotFoundError:
                print(f"❌ {source_path} not found")
            except Exception as exc:
                print(f"❌ Error: {exc}")

    print(f"\n✅ Total: {written:,} examples")
    print(f"✅ Saved to: {destination}")

if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly.
    # Force a garbage-collection pass first so unreachable objects left over
    # from earlier work in the same interpreter are freed before starting.
    gc.collect()
    # Convert every dialect text file into the combined JSONL dataset.
    create_natural_prompts_streaming_with_cypriot()