In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
# Split data 9:1 for training:eval. Format the text by combining Title + Body to give the model context.
# Raw input: one JSON document per line (parsed line by line in load_data).
INPUT_FILE = "data/raw_bmw_data.jsonl"
# Outputs written by main(): each file holds a JSON list of {"text": ...} records.
TRAIN_FILE = "data/train.json"
VAL_FILE = "data/val.json"

def load_data(filepath):
    """Load a JSON Lines file into a DataFrame.

    Every non-blank line of ``filepath`` is parsed as one JSON document and
    becomes one row of the result.

    Args:
        filepath: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        pd.DataFrame with one row per parsed line (empty when the file
        contains no records).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads would reject them.
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, but say where the bad line is.
                raise json.JSONDecodeError(
                    f"{filepath}, line {line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return pd.DataFrame(records)

def preprocess_text(row):
    """Combine a document's title and text into a single training string.

    Args:
        row: Mapping-like object (dict or DataFrame row) with 'title' and
            'text' entries.

    Returns:
        str of the form "Title: <title>\\nContent: <text>", with leading and
        trailing whitespace trimmed from both fields. A missing field
        (None / NaN) is rendered as an empty string.

    Raises:
        KeyError: If 'title' or 'text' is absent from ``row``.
    """
    def _clean(value):
        # Scraped records can lack a field; pandas represents that as
        # None/NaN, neither of which has .strip().
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value).strip()

    # Only the ends are trimmed; whitespace inside the fields is kept as-is.
    title = _clean(row['title'])
    text = _clean(row['text'])
    return f"Title: {title}\nContent: {text}"

def main():
    """Build the train/validation JSON files from the raw scraped data.

    Steps:
      1. Load INPUT_FILE via load_data (prints a hint and returns if the
         file does not exist).
      2. Add a 'processed_text' column by applying preprocess_text per row.
      3. Split 9:1 into train/validation with a fixed random_state so the
         split is reproducible.
      4. Write each split to TRAIN_FILE / VAL_FILE as a JSON list of
         {"text": ...} records.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if not os.path.exists(INPUT_FILE):
        print("Raw data not found. Run scrape.py first.")
        return

    df = load_data(INPUT_FILE)
    print(f"Loaded {len(df)} raw documents.")

    # With test_size=0.1, train_test_split raises ValueError when fewer than
    # two samples are available (one side of the split would be empty).
    if len(df) < 2:
        print("Need at least 2 documents to create a train/validation split.")
        return

    # Preprocessing
    df['processed_text'] = df.apply(preprocess_text, axis=1)

    # Split
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

    # Convert to HuggingFace dataset friendly dict format
    train_data = [{"text": text} for text in train_df['processed_text']]
    val_data = [{"text": text} for text in val_df['processed_text']]

    # Save
    for path, records in ((TRAIN_FILE, train_data), (VAL_FILE, val_data)):
        # The output paths are configurable and need not share the input's
        # directory, so make sure the target directory exists.
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(records, f, indent=2)

    print(f"Saved {len(train_data)} training samples and {len(val_data)} validation samples.")

# Run the pipeline when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

Loaded 20 raw documents.
Saved 18 training samples and 2 validation samples.


In [9]:
# NOTE(review): `a` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. Presumably it is a DataFrame that already has
# a 'processed_text' column — confirm and load/define it explicitly here.
train_df, val_df = train_test_split(a, test_size=0.1, random_state=42)

In [15]:
# Display the validation split for inspection.
val_df

Unnamed: 0,url,title,date_str,text,processed_text
0,https://www.press.bmwgroup.com/global/article/...,Technical specifications of the MINI John Coop...,,Julian KischBMW GroupTel: +49-151-601-38072sen...,Title: Technical specifications of the MINI Jo...
17,https://www.press.bmwgroup.com/global/article/...,BMW Group and Solid Power take next step in AS...,,Bernhard EdererBMW GroupTel: +49-89-382-28556F...,Title: BMW Group and Solid Power take next ste...


In [13]:
# Wrap every validation document in a {"text": ...} record.
val_data = [
    {"text": document}
    for document in val_df["processed_text"]
]

In [14]:
# Display the validation records for inspection.
val_data

[{'text': 'Title: Technical specifications of the MINI John Cooper Works Convertible, valid from 11/2024.\nContent: Julian KischBMW GroupTel: +49-151-601-38072send an e-mail Julian KischBMW Group In the attachment you will find the technical specifications of the\n  MINI John Cooper Works Convertible, valid from 11/2024. Here you can see the webcast of the BMW Group Keynote at the Japan Mobility Show 2025.'},
 {'text': 'Title: BMW Group and Solid Power take next step in ASSB development path: new partner Samsung SDI joins the effort\nContent: Bernhard EdererBMW GroupTel: +49-89-382-28556Fax: +49-89-382-20626send an e-mail Bernhard EdererBMW Group Munich.Since 2022, the BMW Group and Solid Power,\n  Inc. (Nasdaq: SLDP) have intensified their activities for the\n  development of all-solid-state battery (ASSB) technology through their\n  technology transfer agreement. The latest milestone was the\n  integration of Solid Power’s large-format pure ASSB cells into a BMW\n  i7 technology test