In [2]:
import os

# Load text file
def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

vnc_file_path = '../../data/raw/VN/visual-novels.txt'
file_content = load_text_file(vnc_file_path)
file_lines = file_content.split('\n')
print(f"Total lines in file: {len(file_lines)}")

Total lines in file: 3234086


In [3]:
# Split lines by pattern [          - title - Doki Doki Literature Club.txt          ]
import re
pattern = r'\[\s*- title - (.+?)\.txt\s*\]'
splits = re.split(pattern, file_content)
print(f"Total splits found: {len(splits)}")

Total splits found: 273


In [4]:
# Get lines for visual novel "Doki Doki Literature Club.txt"
vn_title = "Doki Doki Literature Club"
try:
    index = splits.index(vn_title)
    vn_content = splits[index + 1]
    vn_lines = vn_content.split('\n')
    print(f"Total lines in '{vn_title}': {len(vn_lines)}")
except ValueError:
    print(f"Visual novel '{vn_title}' not found.")

Total lines in 'Doki Doki Literature Club': 7249


In [5]:
vn_lines_sample = vn_lines[:10]
vn_lines_sample

['',
 'Sayori: "Heeeeeeeyyy!!"',
 "*I see an annoying girl running toward me from the distance, waving her arms in the air like she's totally oblivious to any attention she might draw to herself.*",
 '*That girl is Sayori, my neighbor and good friend since we were children.*',
 "*You know, the kind of friend you'd never see yourself making today, but it just kind of works out because you've known each other for so long?*",
 '*We used to walk to school together on days like this, but starting around high school she would oversleep more and more frequently, and I would get tired of waiting up.*',
 "*But if she's going to chase after me like this, I almost feel better off running away.*",
 '*However, I just sigh and idle in front of the crosswalk and let Sayori catch up to me.*',
 'Sayori: "I overslept again!"',
 'Sayori: "But I caught you this time!"']

In [6]:
# Remove lines that starts with and ends with *
cleaned_vn_lines = [line for line in vn_lines if not (line.startswith('*') and line.endswith('*'))]
print(f"Total lines after cleaning: {len(cleaned_vn_lines)}")
cleaned_vn_lines_sample = cleaned_vn_lines[:10]
cleaned_vn_lines_sample

Total lines after cleaning: 5522


['',
 'Sayori: "Heeeeeeeyyy!!"',
 'Sayori: "I overslept again!"',
 'Sayori: "But I caught you this time!"',
 '<USER>: "Maybe, but only because I decided to stop and wait for you."',
 'Sayori: "That\'s mean, <USER>!"',
 '<USER>: "Well, if people stare at you for acting weird then I don\'t want them to think we\'re a couple or something."',
 'Sayori: "But you did wait for me, after all."',
 'Sayori: "I guess you don\'t have it in you to be mean even if you want to~"',
 '<USER>: "Whatever you say, Sayori..."']

In [7]:
def clean_text_formatting(text):
    """
    Remove VN formatting artifacts from text.

    Removes:
    - All Ren'Py tags (formatting, animation, timing)
    - Excessive ellipsis (more than 3 dots)

    Keeps:
    - <USER> placeholder (intentional for inference)
    """
    # Remove Ren'Py markup tags (comprehensive patterns)

    # Basic formatting tags
    text = re.sub(r'\{/?i\}', '', text)  # {i} and {/i}
    text = re.sub(r'\{/?b\}', '', text)  # {b} and {/b}
    text = re.sub(r'\{/?u\}', '', text)  # {u} and {/u}
    text = re.sub(r'\{/?s\}', '', text)  # {s} and {/s}

    # Formatting with parameters (color, size, font, etc.)
    text = re.sub(r'\{/?color[^}]*\}', '', text)  # {color=...} and {/color}
    text = re.sub(r'\{/?size[^}]*\}', '', text)  # {size=...} and {/size}
    text = re.sub(r'\{/?font[^}]*\}', '', text)  # {font=...} and {/font}

    # Animation/timing tags (NEW)
    text = re.sub(r'\{nw\}', '', text)  # No-wait
    text = re.sub(r'\{w(?:=[\d.]+)?\}', '', text)  # Wait/pause
    text = re.sub(r'\{p(?:=[\d.]+)?\}', '', text)  # Paragraph pause
    text = re.sub(r'\{fast\}', '', text)  # Fast text mode
    text = re.sub(r'\{done\}', '', text)  # Text complete
    text = re.sub(r'\{clear\}', '', text)  # Clear screen

    # Text speed control (NEW)
    text = re.sub(r'\{cps=\d+\}', '', text)  # Character-per-second
    text = re.sub(r'\{/cps\}', '', text)  # End CPS

    # Spacing controls (NEW)
    text = re.sub(r'\{(?:v)?space=\d+\}', '', text)  # space/vspace

    # Clean excessive ellipsis (keep max 3 dots)
    text = re.sub(r'\.{4,}', '...', text)

    # Clean up extra whitespace
    text = ' '.join(text.split())

    return text.strip()

In [8]:
# Clean the VN lines
final_cleaned_vn_lines = [clean_text_formatting(line) for line in cleaned_vn_lines if line.strip()]
print(f"Total lines after final cleaning: {len(final_cleaned_vn_lines)}")

Total lines after final cleaning: 5519


In [9]:
# Remove empty lines
final_vn_lines = [line for line in cleaned_vn_lines if line.strip() != '']
print(f"Total lines after removing empty lines: {len(final_vn_lines)}")
final_vn_lines_sample = final_vn_lines[:15]
final_vn_lines_sample

Total lines after removing empty lines: 5519


['Sayori: "Heeeeeeeyyy!!"',
 'Sayori: "I overslept again!"',
 'Sayori: "But I caught you this time!"',
 '<USER>: "Maybe, but only because I decided to stop and wait for you."',
 'Sayori: "That\'s mean, <USER>!"',
 '<USER>: "Well, if people stare at you for acting weird then I don\'t want them to think we\'re a couple or something."',
 'Sayori: "But you did wait for me, after all."',
 'Sayori: "I guess you don\'t have it in you to be mean even if you want to~"',
 '<USER>: "Whatever you say, Sayori..."',
 'Sayori: "By the way, <USER>..."',
 'Sayori: "Have you decided on a club to join yet?"',
 '<USER>: "A club?"',
 '<USER>: "I told you already, I\'m really not interested in joining any clubs."',
 '<USER>: "I haven\'t been looking, either."',
 'Sayori: "You told me you would join a club this year!"']

In [10]:
# Get character names from the cleaned lines
character_names = set()
for line in final_vn_lines:
    if ':' in line:
        character = line.split(':')[0].strip()
        character_names.add(character)
print(f"Total unique character names: {len(character_names)}")
character_names

Total unique character names: 5


{'<USER>', 'Monika', 'Natsuki', 'Sayori', 'Yuri'}

In [11]:
# Get total number of lines spoken by each character
from collections import defaultdict
character_line_counts = defaultdict(int)
for line in final_vn_lines:
    if ':' in line:
        character = line.split(':')[0].strip()
        character_line_counts[character] += 1
character_line_counts_sample = dict(list(character_line_counts.items())[:10])
character_line_counts_sample

{'Sayori': 507, '<USER>': 1787, 'Yuri': 847, 'Natsuki': 653, 'Monika': 1725}

In [12]:
from dataclasses import dataclass

@dataclass
class ChunkingConfig:
    """Configuration for chunking parameters"""
    chunk_size: int = 20          # Lines per conversation
    overlap: int = 5              # Lines to overlap between chunks
    min_chunk_size: int = 5       # Minimum lines to keep a chunk
    include_other_chars: bool = False  # Include other characters as context
    
@dataclass
class CharacterPersona:
    """Character personality and traits"""
    name: str
    description: str

# Character definitions
CHARACTER_PERSONAS = {
    "Monika": CharacterPersona(
        name="Monika",
        description="You are Monika, the Literature Club president. Confident, intelligent, and caring. You're thoughtful and philosophical, ambitious and kind with a mysterious side."
    ),
    "Sayori": CharacterPersona(
        name="Sayori", 
        description="You are Sayori, a cheerful childhood friend. Bubbly, energetic, and optimistic, though you hide deeper feelings. Sunny personality, slightly clumsy, deeply caring."
    ),
    "Natsuki": CharacterPersona(
        name="Natsuki",
        description="You are Natsuki, a tsundere who loves manga and baking. Defensive exterior but sweet underneath. Feisty, proud, and secretly soft-hearted."
    ),
    "Yuri": CharacterPersona(
        name="Yuri",
        description="You are Yuri, shy and sophisticated with a passion for literature. Elegant but socially anxious. Intellectual, timid, and intense when comfortable."
    )
}

In [13]:
import json
import re
from typing import List, Dict, Tuple, Optional

def parse_dialogue_line(line: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse dialogue line in format: 'Character: "dialogue"'
    
    Args:
        line: Raw dialogue line
        
    Returns:
        Tuple of (character_name, dialogue_text) or (None, None) if parsing fails
    """
    # Handle various quote formats
    patterns = [
        r'(.+?):\s*"(.+?)"',      # Double quotes
        r"(.+?):\s*'(.+?)'",      # Single quotes
        r'(.+?):\s*"(.+?)"',      # Smart quotes
    ]
    
    for pattern in patterns:
        match = re.match(pattern, line.strip())
        if match:
            character = match.group(1).strip()
            dialogue = match.group(2).strip()
            return character, dialogue
    
    return None, None

In [14]:
def chunk_dialogue_with_window(dialogue_lines: List[str], 
                                config: ChunkingConfig) -> List[List[str]]:
    """
    Split flat dialogue into chunks using sliding window
    
    Args:
        dialogue_lines: Flat list of all dialogue lines
        config: Chunking configuration
        
    Returns:
        List of dialogue chunks
    """
    chunks = []
    stride = config.chunk_size - config.overlap
    
    for i in range(0, len(dialogue_lines), stride):
        chunk = dialogue_lines[i:i + config.chunk_size]
        
        # Only keep chunks that meet minimum size
        if len(chunk) >= config.min_chunk_size:
            chunks.append(chunk)
        elif chunk and i + config.chunk_size >= len(dialogue_lines):
            # Keep final chunk even if smaller (end of story)
            chunks.append(chunk)
    
    return chunks

In [15]:
import sys

# Try to import EmotionClassifier from local package 'src'. If the import fails,
# add the project root to sys.path so the local package can be found.
try:
    from src.utils.emotion_classifier import EmotionClassifier, EmotionUtils
except ModuleNotFoundError:
    # 'os' is imported in an earlier notebook cell; reuse it to build the project root.
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    if project_root not in sys.path:
        sys.path.insert(0, project_root)
    # Retry import
    from src.utils.emotion_classifier import EmotionClassifier, EmotionUtils


def extract_character_conversation_with_emotions(
    chunk: List[str],
    target_character: str,
    emotion_classifier: EmotionClassifier,
    include_others: bool = False
) -> tuple[List[Dict], List[str], List[str]]:
    """
    Convert chunk to message format WITH emotion classification
    
    Returns:
        Tuple of (messages, user_emotions, assistant_emotions)
    """
    messages = []
    user_emotions = []
    assistant_emotions = []
    
    user_buffer = []
    assistant_buffer = []
    context_buffer = []
    
    def flush_buffers():
        nonlocal user_buffer, assistant_buffer, context_buffer
        
        if user_buffer:
            content = " ".join(user_buffer)
            
            # Classify user emotion
            user_emotion = emotion_classifier.get_primary_emotion(content)
            user_emotions.append(user_emotion)
            
            if include_others and context_buffer:
                context_str = " | ".join(context_buffer)
                content = f"[Others: {context_str}]\n\n{content}"
            
            messages.append({
                "role": "user",
                "content": content
            })
            user_buffer = []
            context_buffer = []
        
        if assistant_buffer:
            content = " ".join(assistant_buffer)
            
            # Classify assistant emotion
            assistant_emotion = emotion_classifier.get_primary_emotion(content)
            assistant_emotions.append(assistant_emotion)
            
            messages.append({
                "role": "assistant",
                "content": content
            })
            assistant_buffer = []
    
    for line in chunk:
        character, dialogue = parse_dialogue_line(line)
        
        if not character or not dialogue:
            continue
        
        if character == '<USER>':
            if assistant_buffer:
                flush_buffers()
            user_buffer.append(dialogue)
            
        elif character == target_character:
            if user_buffer or context_buffer:
                flush_buffers()
            assistant_buffer.append(dialogue)
            
        else:
            if include_others:
                context_buffer.append(f"{character}: {dialogue}")
    
    flush_buffers()
    return messages, user_emotions, assistant_emotions

In [16]:
def estimate_affection_with_emotions(
    messages: List[Dict],
    user_emotions: List[str],
    assistant_emotions: List[str],
    chunk_index: int,
    total_chunks: int,
    character: str
) -> int:
    """
    Enhanced affection estimation using emotion classification
    """
    # Base affection from story progress
    progress = chunk_index / max(total_chunks, 1)
    base_affection = int(20 + (progress * 60))
    
    # Add emotion-based adjustments
    emotion_impact = 0
    
    for user_emotion in user_emotions:
        impact = EmotionUtils.get_affection_impact(user_emotion)
        emotion_impact += impact
    
    # Bonus for emotional reciprocity
    # If user shows positive emotion and assistant responds warmly
    for i, user_emotion in enumerate(user_emotions):
        if i < len(assistant_emotions):
            assistant_emotion = assistant_emotions[i]
            
            # Positive user + positive assistant = bonus
            if (user_emotion in EmotionUtils.POSITIVE_EMOTIONS and 
                assistant_emotion in EmotionUtils.POSITIVE_EMOTIONS):
                emotion_impact += 3
            
            # Negative user + caring assistant = big bonus (emotional support)
            if (user_emotion in EmotionUtils.NEGATIVE_EMOTIONS and 
                assistant_emotion in {'caring', 'optimism', 'approval'}):
                emotion_impact += 5
    
    total_affection = base_affection + emotion_impact
    return min(max(total_affection, 0), 100)

def format_training_example_with_emotions(
    messages: List[Dict],
    user_emotions: List[str],
    assistant_emotions: List[str],
    persona: CharacterPersona,
    affection: int,
    chunk_idx: int
) -> Dict:
    """
    Format training example with emotion context in system prompt
    """
    # Get primary user emotion for this conversation
    primary_user_emotion = max(set(user_emotions), key=user_emotions.count) if user_emotions else "neutral"
    
    # Create emotion-aware system prompt
    emotion_guidance = get_emotion_response_guidance(primary_user_emotion)
    
    system_message = {
        "role": "system",
        "content": f"""{persona.description}

Current affection: {affection}/100
User's emotional state: {primary_user_emotion}

{emotion_guidance}"""
    }
    
    return {
        "messages": [system_message] + messages,
        "metadata": {
            "character": persona.name,
            "chunk_index": chunk_idx,
            "affection": affection,
            "user_emotions": user_emotions,
            "assistant_emotions": assistant_emotions,
            "primary_user_emotion": primary_user_emotion,
            "num_turns": len(messages) // 2
        }
    }

def get_emotion_response_guidance(emotion: str) -> str:
    """Get response guidance based on user emotion"""
    guidance = {
        'joy': "The user is happy! Match their enthusiasm and share in their joy.",
        'sadness': "The user seems sad. Be empathetic, supportive, and caring.",
        'anger': "The user appears upset. Stay calm, be understanding, and don't escalate.",
        'fear': "The user is anxious or scared. Be reassuring and comforting.",
        'love': "The user is expressing affection. Respond warmly and appreciate their feelings.",
        'caring': "The user is being caring. Show appreciation and reciprocate the warmth.",
        'curiosity': "The user is curious. Be informative and engaging in your response.",
        'confusion': "The user seems confused. Be clear, patient, and helpful in explaining.",
        'gratitude': "The user is thankful. Acknowledge their gratitude warmly.",
        'disappointment': "The user is disappointed. Be understanding and try to uplift them.",
        'excitement': "The user is excited! Share their excitement and be energetic.",
        'annoyance': "The user seems annoyed. Be patient and try to understand their frustration.",
    }
    return guidance.get(emotion, "Respond naturally based on the conversation context.")

In [17]:
def get_top_emotions(emotions: List[str], top_k: int) -> List[tuple]:
    """Get most common emotions"""
    from collections import Counter
    counter = Counter(emotions)
    return counter.most_common(top_k)


def process_vn_dialogue(
    dialogue_lines: List[str],
    target_character: str,
    config: Optional[ChunkingConfig] = None,
    emotion_classifier: Optional[EmotionClassifier] = None
) -> List[Dict]:
    """
    Complete pipeline with emotion classification
    """
    if config is None:
        config = ChunkingConfig()
    
    if emotion_classifier is None:
        emotion_classifier = EmotionClassifier()
    
    if target_character not in CHARACTER_PERSONAS:
        raise ValueError(f"Unknown character: {target_character}")
    
    persona = CHARACTER_PERSONAS[target_character]
    
    print(f"{'='*60}")
    print(f"Processing dialogue for {target_character} with emotion analysis")
    print(f"{'='*60}")
    
    chunks = chunk_dialogue_with_window(dialogue_lines, config)
    print(f"Created {len(chunks)} chunks")
    
    training_examples = []
    
    for chunk_idx, chunk in enumerate(chunks):
        if not any(target_character in line for line in chunk):
            continue
        
        # Extract with emotion classification
        messages, user_emotions, assistant_emotions = extract_character_conversation_with_emotions(
            chunk,
            target_character,
            emotion_classifier,
            include_others=config.include_other_chars
        )
        
        if len(messages) < 2:
            continue
        
        # Calculate affection with emotion context
        affection = estimate_affection_with_emotions(
            messages,
            user_emotions,
            assistant_emotions,
            chunk_idx,
            len(chunks),
            target_character
        )
        
        # Format with emotion-aware system prompt
        training_example = format_training_example_with_emotions(
            messages,
            user_emotions,
            assistant_emotions,
            persona,
            affection,
            chunk_idx
        )
        
        training_examples.append(training_example)
    
    print(f"✓ Created {len(training_examples)} emotion-aware training examples")
    
    # Emotion statistics
    all_user_emotions = []
    all_assistant_emotions = []
    for ex in training_examples:
        all_user_emotions.extend(ex['metadata']['user_emotions'])
        all_assistant_emotions.extend(ex['metadata']['assistant_emotions'])
    
    print(f"\nEmotion Statistics:")
    print(f"  Most common user emotions: {get_top_emotions(all_user_emotions, 5)}")
    print(f"  Most common assistant emotions: {get_top_emotions(all_assistant_emotions, 5)}")
    
    return training_examples

In [18]:
def get_statistics(training_examples: List[Dict]) -> Dict:
    """Get statistics about the processed dataset"""
    if not training_examples:
        return {}
    
    stats = {
        "total_examples": len(training_examples),
        "total_turns": sum(ex["metadata"]["num_turns"] for ex in training_examples),
        "avg_turns_per_conversation": sum(ex["metadata"]["num_turns"] for ex in training_examples) / len(training_examples),
        "affection_min": min(ex["metadata"]["affection"] for ex in training_examples),
        "affection_max": max(ex["metadata"]["affection"] for ex in training_examples),
        "affection_avg": sum(ex["metadata"]["affection"] for ex in training_examples) / len(training_examples),
    }
    
    return stats


In [19]:
# Configuration
config = ChunkingConfig(
    chunk_size=23,
    overlap=5,
    min_chunk_size=4,
    include_other_chars=False
)

# Remove "<USER>" from list of character names if present
character_names.discard("<USER>")

# Process
training_data = {}

for character in character_names:
    print(f"Processing character: {character}...")

    training_data[character] = process_vn_dialogue(
        dialogue_lines=final_vn_lines,
        target_character=character,
        config=config
    )

training_data

Processing character: Yuri...
Loading emotion classifier on cuda...


Device set to use cuda:0


✓ Emotion classifier loaded
Processing dialogue for Yuri with emotion analysis
Created 307 chunks


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✓ Created 128 emotion-aware training examples

Emotion Statistics:
  Most common user emotions: [('neutral', 258), ('joy', 81), ('surprise', 51), ('anger', 27), ('sadness', 12)]
  Most common assistant emotions: [('neutral', 249), ('joy', 70), ('anger', 34), ('surprise', 26), ('sadness', 25)]
Processing character: Natsuki...
Loading emotion classifier on cuda...


Device set to use cuda:0


✓ Emotion classifier loaded
Processing dialogue for Natsuki with emotion analysis
Created 307 chunks
✓ Created 111 emotion-aware training examples

Emotion Statistics:
  Most common user emotions: [('neutral', 200), ('joy', 69), ('anger', 45), ('surprise', 23), ('sadness', 11)]
  Most common assistant emotions: [('neutral', 117), ('anger', 94), ('joy', 61), ('surprise', 31), ('sadness', 22)]
Processing character: Sayori...
Loading emotion classifier on cuda...


Device set to use cuda:0


✓ Emotion classifier loaded
Processing dialogue for Sayori with emotion analysis
Created 307 chunks
✓ Created 90 emotion-aware training examples

Emotion Statistics:
  Most common user emotions: [('neutral', 129), ('joy', 57), ('anger', 53), ('surprise', 34), ('sadness', 12)]
  Most common assistant emotions: [('joy', 112), ('neutral', 79), ('anger', 40), ('sadness', 28), ('surprise', 19)]
Processing character: Monika...
Loading emotion classifier on cuda...


Device set to use cuda:0


✓ Emotion classifier loaded
Processing dialogue for Monika with emotion analysis
Created 307 chunks
✓ Created 110 emotion-aware training examples

Emotion Statistics:
  Most common user emotions: [('neutral', 170), ('joy', 57), ('surprise', 29), ('anger', 28), ('sadness', 8)]
  Most common assistant emotions: [('neutral', 163), ('joy', 75), ('anger', 19), ('sadness', 15), ('surprise', 11)]


{'Yuri': [{'messages': [{'role': 'system',
     'content': "You are Yuri, shy and sophisticated with a passion for literature. Elegant but socially anxious. Intellectual, timid, and intense when comfortable.\n\nCurrent affection: 14/100\nUser's emotional state: joy\n\nThe user is happy! Match their enthusiasm and share in their joy."},
    {'role': 'user',
     'content': "Don't make promises you can't keep! Fine... I'll stop by for a cupcake, okay? I told you, don't call me a 'new member--'"},
    {'role': 'assistant',
     'content': "Welcome to the Literature Club. It's a pleasure meeting you. Sayori always says nice things about you."},
    {'role': 'user', 'content': '... S-Sorry...'},
    {'role': 'assistant', 'content': 'Natsuki...'},
    {'role': 'user',
     'content': "Ah... Well, it's nice to meet both of you. Y-<USER> too, Monika."}],
   'metadata': {'character': 'Yuri',
    'chunk_index': 2,
    'affection': 14,
    'user_emotions': ['anger', 'neutral', 'joy'],
    'assist

In [20]:
def save_training_data(training_examples: List[Dict],
                      output_path: str,
                      format: str = "jsonl"):
    """
    Save training data in various formats
    
    Args:
        training_examples: Processed training examples
        output_path: Path to save file
        format: 'jsonl', 'json', or 'dataset'
    """
    if format == "jsonl":
        # Save as JSON Lines (one example per line)
        with open(output_path, 'w', encoding='utf-8') as f:
            for example in training_examples:
                # Remove metadata for training
                training_format = {"messages": example["messages"]}
                f.write(json.dumps(training_format, ensure_ascii=False) + '\n')
        print(f"✓ Saved to {output_path} (JSONL format)")
        
    elif format == "json":
        # Save as single JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(training_examples, f, indent=2, ensure_ascii=False)
        print(f"✓ Saved to {output_path} (JSON format)")
        
    elif format == "dataset":
        # Save as HuggingFace Dataset
        try:
            from datasets import Dataset
            
            # Prepare data (remove metadata)
            clean_data = [{"messages": ex["messages"]} for ex in training_examples]
            dataset = Dataset.from_list(clean_data)
            dataset.save_to_disk(output_path)
            print(f"✓ Saved to {output_path} (HF Dataset format)")
        except ImportError:
            print("⚠ Install datasets: pip install datasets")
            raise


In [21]:
# Save training data
output_dir = '../../data/processed/VN/'
os.makedirs(output_dir, exist_ok=True)

for character, data in training_data.items():

    # json
    output_path = os.path.join(output_dir, f'vn_training_data_{character}.jsonl')
    save_training_data(
        training_examples=data,
        output_path=output_path,
        format='jsonl'
    )

    # # dataset
    # output_path = os.path.join(output_dir, f'vn_training_data_{data["metadata"]["character"].lower()}_dataset')
    # save_training_data(
    #     training_examples=[data],
    #     output_path=output_path,
    #     format='dataset'
    # )

✓ Saved to ../../data/processed/VN/vn_training_data_Yuri.jsonl (JSONL format)
✓ Saved to ../../data/processed/VN/vn_training_data_Natsuki.jsonl (JSONL format)
✓ Saved to ../../data/processed/VN/vn_training_data_Sayori.jsonl (JSONL format)
✓ Saved to ../../data/processed/VN/vn_training_data_Monika.jsonl (JSONL format)
