In [None]:
import json
import pandas as pd
from datasets import Dataset

def preprocess_data(train_file, valid_file):
    """
    Preprocess the training and validation data for LLaMA fine-tuning.

    Despite the parameter names, both arguments are the *contents* of JSON
    Lines data (one JSON object per line), not file paths: each is split on
    newlines and every non-blank line is decoded with ``json.loads``.

    A line contributes an example only if it decodes to a JSON object that
    has all of the keys ``instruction``, ``input`` and ``output``. Blank
    lines, malformed JSON, non-object JSON values and objects missing a
    required key are skipped (best-effort parsing).

    Args:
        train_file: JSONL text holding the training examples.
        valid_file: JSONL text holding the validation examples.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple of ``Dataset`` objects,
        each with the columns ``text``, ``prompt`` and ``completion``.
    """
    required_keys = ('instruction', 'input', 'output')
    # Pinning the columns keeps the schema stable even when no line yields
    # an example; an empty DataFrame would otherwise have no columns at all.
    columns = ['text', 'prompt', 'completion']

    def parse_file(file_content):
        # Extract instruction-input-output pairs
        examples = []
        for line in file_content.split('\n'):
            if not line.strip():
                continue

            # Only the decode step is guarded, so unrelated bugs (and
            # KeyboardInterrupt/SystemExit) are no longer swallowed.
            try:
                data = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input. Either way the line is skipped.
                continue

            # Valid JSON that is not an object (number, string, list, ...)
            # cannot carry the required fields.
            if not isinstance(data, dict):
                continue
            if not all(k in data for k in required_keys):
                continue

            # Format the prompt
            prompt = f"### Instruction: {data['instruction']}\n\n### Input: {data['input']}\n\n### Response:"

            # Format the completion/response
            completion = data['output']

            examples.append({
                'text': f"{prompt}\n{completion}",
                'prompt': prompt,
                'completion': completion
            })
        return examples

    def to_dataset(examples):
        # Convert to Datasets format
        return Dataset.from_pandas(pd.DataFrame(examples, columns=columns))

    # Create datasets
    train_dataset = to_dataset(parse_file(train_file))
    valid_dataset = to_dataset(parse_file(valid_file))

    return train_dataset, valid_dataset

def format_for_llama(example):
    """
    Build the record used for LLaMA training from a single example.

    Only the ``text``, ``prompt`` and ``completion`` entries of *example*
    are copied into the returned dict; any other entries are dropped. A
    missing entry raises ``KeyError``.
    """
    kept_fields = ('text', 'prompt', 'completion')
    return {field: example[field] for field in kept_fields}