In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
# Data-preparation settings. The raw documents are split 9:1 into training and
# evaluation sets, and each document's title and body are combined into a single
# text field to give the model context.
# Raw input: read line by line, one JSON object per line (JSON Lines).
INPUT_FILE = "data/raw_bmw_data.jsonl"
# Output for the training split: a JSON list of {"text": ...} records.
TRAIN_FILE = "data/train.json"
# Output for the validation split, in the same format as TRAIN_FILE.
VAL_FILE = "data/val.json"

def load_data(filepath):
    """Read a JSON Lines file into a DataFrame.

    Each non-blank line of the file is parsed as one JSON document and becomes
    one row of the result.

    Args:
        filepath: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        A pandas DataFrame built from the parsed records (empty if the file
        contains no records).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message includes the file path and the 1-based line number.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the dump) carry no record and would make json.loads fail.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, but say where in the file
                # the bad record is; exc alone only knows the position inside
                # the single line it was given.
                raise json.JSONDecodeError(
                    f"{exc.msg} [{filepath}, file line {line_number}]",
                    exc.doc,
                    exc.pos,
                ) from exc
    return pd.DataFrame(records)

def _clean_field(value):
    """Return value as a stripped string, mapping missing values to ''."""
    if isinstance(value, str):
        return value.strip()
    # Records that lack a key come out of the DataFrame as None/NaN, which have
    # no .strip(); treat them as empty rather than crashing the whole run.
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).strip()


def preprocess_text(row):
    """Combine a document's title and text into one labelled string.

    Args:
        row: A mapping-like object (DataFrame row or dict) with 'title' and
            'text' entries.

    Returns:
        "Title: <title>\\nContent: <text>" with leading/trailing whitespace
        removed from both fields. Missing values (None/NaN) become empty
        strings; other non-string values are converted with str().

    Raises:
        KeyError: If 'title' or 'text' is not present in row at all.
    """
    title = _clean_field(row['title'])
    text = _clean_field(row['text'])
    return f"Title: {title}\nContent: {text}"

def main(test_size=0.1, random_state=42):
    """Build the train/validation JSON files from the raw JSONL dump.

    Reads INPUT_FILE, formats every document with preprocess_text, splits the
    formatted texts into a training and a validation set, and writes each set
    to TRAIN_FILE / VAL_FILE as a JSON list of {"text": ...} records.

    Args:
        test_size: Passed to train_test_split; the default 0.1 gives the 9:1
            train:validation split.
        random_state: Seed passed to train_test_split so the split is
            reproducible between runs.

    Returns:
        None. Progress and early-exit reasons are reported with print().
    """
    if not os.path.exists(INPUT_FILE):
        print("Raw data not found. Run scrape.py first.")
        return

    df = load_data(INPUT_FILE)
    print(f"Loaded {len(df)} raw documents.")

    # A split needs at least one sample on each side; with fewer than two
    # documents train_test_split would fail with an opaque ValueError.
    if len(df) < 2:
        print("Need at least 2 documents to create a train/validation split.")
        return

    # Preprocessing: one formatted string per document.
    texts = df.apply(preprocess_text, axis=1)

    # Split
    train_texts, val_texts = train_test_split(
        texts, test_size=test_size, random_state=random_state
    )

    # Convert to HuggingFace dataset friendly dict format
    train_data = [{"text": text} for text in train_texts]
    val_data = [{"text": text} for text in val_texts]

    # Save
    for path, records in ((TRAIN_FILE, train_data), (VAL_FILE, val_data)):
        # The output location may differ from the input's directory, so make
        # sure it exists before opening the file for writing.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(records, f, indent=2)

    print(f"Saved {len(train_data)} training samples and {len(val_data)} validation samples.")

# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()

Loaded 20 raw documents.
Saved 18 training samples and 2 validation samples.
