In [19]:
#!/usr/bin/python

# About:    Scraping Bicycle Geometry Data from geometrics.mtb-news.de
# Author:   Dorian Prill & Susana Kohlhaas
# Date:     2023-03-22
# License:  MIT

import os
import requests  # type: ignore
import json
from time import sleep
from bs4 import BeautifulSoup  # type: ignore
import polars as pl  # type: ignore

# Define URLs
entry_url = 'https://geometrics.mtb-news.de/bikes'
bike_api_url = 'https://geometrics.mtb-news.de/api/bikes?variants='
target_class_list = 'mtbnews-geometry__bike-list'

# Define save path
savename = './data/geometrics.mtb-news.de'

# Ensure the "data" directory exists
os.makedirs("./data", exist_ok=True)

# Define headers to avoid 403 Forbidden error (mimic a real browser)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
}

# the desired columns
columns = [
    'URL', 'Brand', 'Model', 'Year', 'Category', 'Motorized', 'Frame Size',
    'Frame Config', 'Wheel Size', 'Reach', 'Stack', 'STR', 'Front Center',
    'Head Tube Angle', 'Seat Tube Angle Effective', 'Seat Tube Angle Real',
    'Top Tube Length', 'Top Tube Length Horizontal', 'Head Tube Length',
    'Seat Tube Length', 'Standover Height', 'Chainstay Length', 'Wheelbase',
    'Bottom Bracket Offset', 'Bottom Bracket Height', 'Fork Installation Height',
    'Fork Offset', 'Fork Trail', 'Suspension Travel (rear)', 'Suspension Travel (front)',
]

dtypes = [
    pl.Utf8, pl.Utf8, pl.Utf8, pl.Int32, pl.Utf8, pl.Boolean, pl.Utf8, pl.Utf8, pl.Utf8, pl.Float32,
    pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, 
    pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, pl.Float32, 
    pl.Float32, pl.Float32, pl.Float32, pl.Float32
]

# create empty entry dict
values = [[] for _ in columns]
data = dict(zip(columns, values))

# translate categories to English
category_map = {
    'Mountainbike': 'Mountain',
    'Rennrad': 'Road',
    'Gravel-Bike/CycloCross-Bike': 'Gravel/CX',
    'Sonstiges': 'Other',  # rare
}

### GO SCRAPE ###

# Get the page
try: 
    page = requests.get(entry_url, headers=headers)
    page.raise_for_status()  # Raise an error if the request fails
except Exception as e:
    print(f'Error fetching {entry_url}: {e}')
    exit(1)

soup = BeautifulSoup(page.content, 'html.parser')

# select the bike manufacturer starting letter sublists (0-9, A-Z)
sublists = soup.find_all('ul', attrs={'class': target_class_list})

# follow links to individual bikes (default tab on load lists all bike categories)
for ul in sublists:
    # for all bike entries in the sublists
    for li in ul.find_all('li'):

        # experiment with rate limiting, introduce random sleep to avoid detection
        sleep(0.5)

        # follow href to individual bike page 
        bikeurl = li.find('a').get('href')
        bikepage = requests.get(bikeurl, headers=headers)
        bikesoup = BeautifulSoup(bikepage.content, 'html.parser')
        
        # find detail table link from button with text 'Diese Geometrien untereinander vergleichen'
        button = bikesoup.find('a', attrs={'class': 'btn btn-primary'})

        if button is None or ' '.join(button.get_text().split()) != 'Diese Geometrien untereinander vergleichen':
            print(f'Skipping {bikeurl} - No detail table button found')
            continue

        table_url = button.get('href')

        # disassemble it to get the bike IDs for the API call
        api_variants = table_url.split('/')[-1].split('@')[0].replace('_', ',')
        api_call = f'{bike_api_url}{api_variants}'

        try:
            biketable = requests.get(api_call, headers=headers)
            biketable.raise_for_status()
            model_variants = biketable.json()['data']
        except Exception as e:
            print(f'No variants found for {bikeurl} - Error: {e}')
            continue

        for entry in model_variants:
            # Store bike data in dictionary
            data['URL'].append(entry['model']['url'])
            data['Brand'].append(entry['model']['brand']['name'])
            data['Model'].append(entry['model']['model_name'])
            data['Year'].append(entry['model']['year'])
            data['Category'].append(category_map.get(entry['model']['type'], 'Unknown'))
            data['Motorized'].append(entry['model']['has_motor'])
            data['Frame Size'].append(entry['frame_size'])
            data['Frame Config'].append(entry['frame_config'])
            data['Wheel Size'].append(entry['wheelsize'])
            data['Reach'].append(entry['reach'])
            data['Stack'].append(entry['stack'])
            data['STR'].append(entry['stack_to_reach'])
            data['Front Center'].append(entry['front_center'])
            data['Head Tube Angle'].append(entry['head_angle'])
            data['Seat Tube Angle Effective'].append(entry['seat_angle_effective'])
            data['Seat Tube Angle Real'].append(entry['seat_angle_real'])
            data['Top Tube Length'].append(entry['top_tube_length'])
            data['Top Tube Length Horizontal'].append(entry['top_tube_horizontal_length'])
            data['Head Tube Length'].append(entry['head_tube_length'])
            data['Seat Tube Length'].append(entry['seat_tube_length'])
            data['Standover Height'].append(entry['standover_height'])
            data['Chainstay Length'].append(entry['chainstay_length'])
            data['Wheelbase'].append(entry['wheel_base'])
            data['Bottom Bracket Offset'].append(entry['bottom_bracket_offset'])
            data['Bottom Bracket Height'].append(entry['bottom_bracket_height'])
            data['Fork Installation Height'].append(entry['fork_installation_height'])
            data['Fork Offset'].append(entry['fork_offset'])
            data['Fork Trail'].append(entry['fork_trail'])
            data['Suspension Travel (rear)'].append(entry['travel_rear'])
            data['Suspension Travel (front)'].append(entry['travel_front'])

# construct data frame from dict
df = pl.DataFrame(data=data, infer_schema_length=len(data['Model']))

# display first few rows
print(df.head())

# save data to CSV and IPC
df.write_csv(savename + '.csv')
df.write_ipc(savename + '.arrow', compression='zstd')

print("Scraping complete! Data saved successfully.")


Error fetching https://geometrics.mtb-news.de/bikes: 403 Client Error: Forbidden for url: https://geometrics.mtb-news.de/bikes
shape: (0, 30)
┌──────┬───────┬───────┬──────┬───┬─────────────┬────────────┬───────────────┬─────────────────────┐
│ URL  ┆ Brand ┆ Model ┆ Year ┆ … ┆ Fork Offset ┆ Fork Trail ┆ Suspension    ┆ Suspension Travel   │
│ ---  ┆ ---   ┆ ---   ┆ ---  ┆   ┆ ---         ┆ ---        ┆ Travel (rear) ┆ (front)             │
│ null ┆ null  ┆ null  ┆ null ┆   ┆ null        ┆ null       ┆ ---           ┆ ---                 │
│      ┆       ┆       ┆      ┆   ┆             ┆            ┆ null          ┆ null                │
╞══════╪═══════╪═══════╪══════╪═══╪═════════════╪════════════╪═══════════════╪═════════════════════╡
└──────┴───────┴───────┴──────┴───┴─────────────┴────────────┴───────────────┴─────────────────────┘
Scraping complete! Data saved successfully.
