In [10]:
import requests
from bs4 import BeautifulSoup
import json
import os
import time
from urllib.parse import urljoin
from datetime import datetime

In [11]:
# Configuration: values a reader may want to tune before running the scraper.
# Listing page that get_article_links() fetches to discover article URLs.
BASE_URL = "https://www.press.bmwgroup.com/global"
# Destination file; main() writes one JSON object per line (JSON Lines).
OUTPUT_FILE = "data/raw_bmw_data.jsonl"
# Upper bound on the number of links get_article_links() returns.
MAX_ARTICLES = 20  # Limit the number of articles.

In [12]:
def get_article_links(base_url, max_articles=None):
    """
    Collect article detail-page links from the press homepage.

    Every ``<a href>`` whose target contains '/article/detail/' is resolved
    against ``base_url`` and de-duplicated while preserving page order.
    The order on the homepage may not strictly follow chronological sequence
    (some posts may be pinned), so callers are expected to sort by the dates
    extracted from the detail pages. Note that the result is truncated in
    page order, i.e. before any date-based sorting happens.

    Args:
        base_url: URL of the listing page to fetch.
        max_articles: Maximum number of links to return. When omitted, the
            module-level MAX_ARTICLES is used.

    Returns:
        A list of absolute article URLs (at most ``max_articles`` entries),
        or an empty list if the listing page could not be fetched.
    """
    limit = MAX_ARTICLES if max_articles is None else max_articles

    print(f"Fetching articles from {base_url}...")
    # Browser-like User-Agent sent with the listing request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # 15s timeout. On a network failure or an HTTP error status the problem is
    # reported and an empty list is returned (the program keeps running).
    # Only request-level errors are caught so genuine bugs are not masked.
    try:
        response = requests.get(base_url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Connection error: {e}")
        return []

    # Parse the downloaded HTML into a searchable tree.
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    seen_urls = set()

    # Walk every anchor on the page and keep first occurrences of article URLs.
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if '/article/detail/' not in href:
            continue
        full_link = urljoin(base_url, href)
        if full_link in seen_urls:
            continue
        seen_urls.add(full_link)
        links.append(full_link)

    print(f"Found {len(links)} potential articles.")
    return links[:limit]

def parse_bmw_date(date_str):
    """
    Convert a BMW press-site date string into a datetime object for sorting.

    Supported layouts (surrounding whitespace is ignored):
      * '15.03.2024'       -- DD.MM.YYYY
      * '15 Oct 2023'      -- day, abbreviated English month, year
      * '15 October 2023'  -- day, full English month, year

    Args:
        date_str: The raw date text. Non-string values (e.g. None) are
            treated as unparseable rather than raising.

    Returns:
        The parsed datetime, or datetime(1970, 1, 1) as a sentinel when no
        layout matches, so such entries sort last in a newest-first ordering.
    """
    fallback = datetime(1970, 1, 1)

    # Guard against None / non-text input instead of failing on .strip().
    if not isinstance(date_str, str):
        return fallback

    cleaned = date_str.strip()
    # Try each known layout in turn; the first one that parses wins.
    for fmt in ("%d.%m.%Y", "%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue

    # Nothing matched: return the old sentinel date.
    return fallback

def scrape_article(url):
    """
    Download one article page and extract its title, date, and body text.

    Args:
        url: Absolute URL of the article detail page.

    Returns:
        A dict with the keys 'url', 'title', 'date_str' (raw date text, ''
        if no date element was found), 'timestamp' (datetime used for
        sorting; datetime(1970, 1, 1) if no date element was found) and
        'text' (paragraphs joined by single spaces). Returns None when the
        request fails, the server answers with an HTTP error status, or
        parsing raises.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; EducationalProject/1.0)'}
        response = requests.get(url, headers=headers, timeout=15)
        # Treat 4xx/5xx responses as failures so an error page is never
        # parsed and stored as if it were an article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # 1. Extract the title from the first <h1>.
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # 2. Extract the date: prefer <div class="date">, else <span class="date">.
        date_str = ""
        date_obj = datetime(1970, 1, 1)  # Default value when no date is present.

        date_tag = soup.find('div', class_='date')
        if not date_tag:
            date_tag = soup.find('span', class_='date')

        if date_tag:
            date_str = date_tag.get_text(strip=True)
            date_obj = parse_bmw_date(date_str)

        # 3. Extract the text from the first matching content container.
        content_div = soup.find('div', class_='detail_content')
        if not content_div:
            content_div = soup.find('div', class_='ezxmltext-field')

        if content_div:
            paragraphs = content_div.find_all('p')
        else:
            # Fallback: no known container, use every paragraph on the page.
            paragraphs = soup.find_all('p')

        # Keep only paragraphs longer than 20 characters; each paragraph's
        # text is extracted once and reused for both the filter and the join.
        text_parts = []
        for p in paragraphs:
            paragraph_text = p.get_text(strip=True)
            if len(paragraph_text) > 20:
                text_parts.append(paragraph_text)
        text = " ".join(text_parts)

        return {
            "url": url,
            "title": title,
            "date_str": date_str,
            "timestamp": date_obj,  # Datetime object used for sorting.
            "text": text
        }
    except Exception as e:
        # Deliberately best-effort: one bad article must not abort the run.
        print(f"Error scraping {url}: {e}")
        return None

def main():
    """
    Run the full scrape: collect links, download articles, sort, and save.

    Links are taken from BASE_URL, each article is scraped with a one-second
    pause between requests, articles with 200 characters of text or fewer
    (or that failed to scrape) are skipped, the rest are sorted newest first
    and written to OUTPUT_FILE as JSON Lines (one object per line).
    """
    # Create the directory that OUTPUT_FILE actually lives in, so changing
    # the configured path does not leave the target directory missing.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    links = get_article_links(BASE_URL)

    data = []
    print(f"Scraping {len(links)} articles...")

    total = len(links)
    for i, link in enumerate(links):
        print(f"[{i+1}/{total}] Processing {link}")
        article = scrape_article(link)

        if article and len(article['text']) > 200:
            data.append(article)
        else:
            print("   -> Skipped (Content too short or parse failed)")

        # Pause between requests; nothing follows the last one, so skip it there.
        if i < total - 1:
            time.sleep(1)

    # Sort by time.
    print("Sorting articles by date (newest first)...")
    data.sort(key=lambda x: x['timestamp'], reverse=True)

    # The datetime object cannot be JSON-serialized directly, so build a copy
    # of each record without it (the scraped dicts themselves stay intact);
    # the string date is retained for reference.
    final_data = []
    for entry in data:
        record = {key: value for key, value in entry.items() if key != 'timestamp'}
        final_data.append(record)
        print(f"   - {record['date_str']}: {record['title'][:50]}...")

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for entry in final_data:
            f.write(json.dumps(entry) + '\n')

    print(f"\nSuccessfully saved {len(final_data)} articles to {OUTPUT_FILE}")

# Entry point: run the scrape when this code is executed directly
# (__name__ is "__main__" both for a script run and inside a notebook cell).
if __name__ == "__main__":
    main()

Fetching articles from https://www.press.bmwgroup.com/global...
Found 70 potential articles.
Scraping 20 articles...
[1/20] Processing https://www.press.bmwgroup.com/global/article/detail/T0450133EN/technical-specifications-of-the-mini-john-cooper-works-convertible-valid-from-11/2024
[2/20] Processing https://www.press.bmwgroup.com/global/article/detail/T0450131EN/technical-specifications-of-the-mini-john-cooper-works-valid-from-11/2024
[3/20] Processing https://www.press.bmwgroup.com/global/article/detail/T0450130EN/technical-specifications-of-the-mini-john-cooper-works-aceman-valid-from-01/2025
[4/20] Processing https://www.press.bmwgroup.com/global/article/detail/T0450128EN/technical-specifications-of-the-mini-john-cooper-works-electric-valid-from-11/2024
[5/20] Processing https://www.press.bmwgroup.com/global/article/detail/T0443474EN/specifications-of-the-bmw-5-series-sedan-valid-from-03/2025
[6/20] Processing https://www.press.bmwgroup.com/global/article/detail/T0452972EN/bmw-mus