In [1]:
import json
import os
import re
import pandas as pd
from datetime import datetime
from tqdm import tqdm

### Get sorted paths of snapshots

In [2]:
folder_path = "data/"

# Regular expression to match the file format
pattern = re.compile(r"nextbike_data_(\d{8}_\d{6})")

files_with_dates = []

for filename in os.listdir(folder_path):
    match = pattern.match(filename)
    if match:
        date_str = match.group(1)
        try:
            file_datetime = datetime.strptime(date_str, "%Y%m%d_%H%M%S")
            full_path = os.path.join(folder_path, filename)
            files_with_dates.append((file_datetime, full_path))
        except ValueError:
            print(f"Skipping file with invalid datetime: {filename}")

files_with_dates.sort()
sorted_paths = [path for _, path in files_with_dates]

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/'

### Helper functions

In [None]:
def extract_station_data(data, idx):
    stations = {}
    city  = data["countries"][idx]["cities"][0]
    for place in city.get("places", []):
        stations[place["uid"]] = {
            "name": place["name"],
            "lat": place["lat"],
            "lng": place["lng"],
            "bikes": set(bike["number"] for bike in place.get("bike_list", []))
        }
    return stations

In [None]:
def load_state(path):
    with open(path, "r") as f:
        state = json.load(f)
        return state

In [None]:
def extract_timestamp_from_path(path):
    filename = os.path.basename(path)
    datetime_str = filename.replace("nextbike_data_", "").replace(".json", "")
    return datetime.strptime(datetime_str, "%Y%m%d_%H%M%S")

In [None]:
idx2city = {0: "berlin", 1: "barcelona", 2: "innsbruck", 3: "warsaw"}

### Main loop, only for one city (Berlin)

In [None]:
for k,v in idx2city.items():
    print(f"{k}: {v}")

0: berlin
1: barcelona
2: innsbruck
3: warsaw


In [None]:
for idx, city in idx2city.items():
    print(f"Processing city: {city}")
    save_path = f"bike_movements_{city}.csv"

    bike_movements = []
    disappeared_bikes = {}  # {bike_id: {"from_station": name, "departure_time": timestamp}}

    previous_path = sorted_paths[0]
    previous_state = extract_station_data(load_state(previous_path), idx)
    previous_timestamp = extract_timestamp_from_path(previous_path)

    for current_path in tqdm(sorted_paths[1:]):
        try:
            current_state = extract_station_data(load_state(current_path), idx)
            current_timestamp = extract_timestamp_from_path(current_path)

            # Check for disappeared bikes across all stations
            for station_id, station_info in previous_state.items():
                if station_id in current_state:
                    current_bikes = current_state[station_id]["bikes"]
                    disappeared = station_info["bikes"] - current_bikes
                    for bike in disappeared:
                        if bike not in disappeared_bikes:
                            disappeared_bikes[bike] = {
                                "from_station": station_info["name"],
                                "from_station_lat": station_info["lat"],
                                "from_station_lng": station_info["lng"],
                                "departure_time": previous_timestamp
                            }

            # Try to locate disappeared bikes in the current state
            bikes_found = []
            for new_station_id, new_station_info in current_state.items():
                if re.match(r"^BIKE \d+$", new_station_info["name"]): # We do not process temporary stations, those stattion are created when bike is rented but not returned, can be seen in bike_movements_old.ipynb 
                    continue
                for bike in new_station_info["bikes"]:
                    if bike in disappeared_bikes:
                        movement = {
                            "bike_id": bike,
                            "from_station": disappeared_bikes[bike]["from_station"],
                            "from_station_lat": disappeared_bikes[bike]["from_station_lat"],
                            "from_station_lng": disappeared_bikes[bike]["from_station_lng"],
                            "to_station": new_station_info["name"],
                            "to_station_lat": new_station_info["lat"],
                            "to_station_lng": new_station_info["lng"],
                            "departure_time": disappeared_bikes[bike]["departure_time"],
                            "arrival_time": current_timestamp
                        }
                        bike_movements.append(movement)
                        # print(f"Bike {bike} moved from {movement['from_station']} at {movement['departure_time']} "
                        #       f"to {movement['to_station']} at {movement['arrival_time']}")
                        bikes_found.append(bike)

            # Remove bikes we've found from the disappeared set
            for bike in bikes_found:
                del disappeared_bikes[bike]

            previous_state = current_state
            previous_timestamp = current_timestamp
            # print("-" * 20)
            if len(bike_movements) > 1000:
                new_data = pd.DataFrame(bike_movements)
                new_data.to_csv(save_path, mode='a', header=not os.path.exists(save_path), index=False)
                bike_movements = []

        except Exception as e:
            print(f"Error processing path {current_path}: {e}")

    # Save any remaining bike movements
    new_data = pd.DataFrame(bike_movements)
    new_data.to_csv(save_path, mode='a', header=not os.path.exists(save_path), index=False)

Processing city: berlin


 28%|██▊       | 4284/15127 [16:27<38:38,  4.68it/s]   

Error processing path data/nextbike_data_20250420_150400.json: 'NoneType' object is not subscriptable


 57%|█████▋    | 8695/15127 [32:41<15:03,  7.12it/s]   

Error processing path data/nextbike_data_20250423_210341.json: 'NoneType' object is not subscriptable


 81%|████████  | 12288/15127 [46:28<07:17,  6.49it/s]  

Error processing path data/nextbike_data_20250426_105401.json: 'NoneType' object is not subscriptable


 98%|█████████▊| 14814/15127 [56:53<00:36,  8.52it/s]  

Error processing path data/nextbike_data_20250429_062750.json: 'NoneType' object is not subscriptable


100%|██████████| 15127/15127 [57:55<00:00,  4.35it/s]


Processing city: barcelona


 28%|██▊       | 4284/15127 [23:16<50:46,  3.56it/s]   

Error processing path data/nextbike_data_20250420_150400.json: 'NoneType' object is not subscriptable


 57%|█████▋    | 8694/15127 [46:03<10:11, 10.52it/s]   

Error processing path data/nextbike_data_20250423_210341.json: 'NoneType' object is not subscriptable


 81%|████████  | 12286/15127 [1:04:12<17:32,  2.70it/s]  

Error processing path data/nextbike_data_20250426_105401.json: 'NoneType' object is not subscriptable


 98%|█████████▊| 14812/15127 [1:16:52<01:20,  3.92it/s]  

Error processing path data/nextbike_data_20250429_062750.json: 'NoneType' object is not subscriptable


100%|██████████| 15127/15127 [1:19:08<00:00,  3.19it/s]


Processing city: innsbruck


 28%|██▊       | 4284/15127 [18:36<23:07,  7.81it/s]   

Error processing path data/nextbike_data_20250420_150400.json: 'NoneType' object is not subscriptable


 57%|█████▋    | 8693/15127 [38:34<24:30,  4.37it/s]   

Error processing path data/nextbike_data_20250423_210341.json: 'NoneType' object is not subscriptable


 81%|████████  | 12288/15127 [53:42<06:24,  7.39it/s]  

Error processing path data/nextbike_data_20250426_105401.json: 'NoneType' object is not subscriptable


 98%|█████████▊| 14812/15127 [1:04:57<01:48,  2.91it/s]  

Error processing path data/nextbike_data_20250429_062750.json: 'NoneType' object is not subscriptable


100%|██████████| 15127/15127 [1:06:17<00:00,  3.80it/s]


Processing city: warsaw


 28%|██▊       | 4284/15127 [18:47<35:10,  5.14it/s]   

Error processing path data/nextbike_data_20250420_150400.json: 'NoneType' object is not subscriptable


 57%|█████▋    | 8688/15127 [39:44<23:26,  4.58it/s]   

Error processing path data/nextbike_data_20250423_210341.json: 'NoneType' object is not subscriptable


 81%|████████  | 12286/15127 [57:16<14:08,  3.35it/s]   

Error processing path data/nextbike_data_20250426_105401.json: 'NoneType' object is not subscriptable


 98%|█████████▊| 14812/15127 [1:09:21<01:40,  3.13it/s]  

Error processing path data/nextbike_data_20250429_062750.json: 'NoneType' object is not subscriptable


100%|██████████| 15127/15127 [1:10:42<00:00,  3.57it/s]


In [None]:
df = pd.read_csv("bike_movements_warsaw.csv")

In [None]:
df["from_station"].value_counts()

from_station
Metro Centrum Nauki Kopernik        3352
Arkadia                             2029
Metro Dworzec Wileński - Targowa    1944
Stefana Banacha - UW                1867
Westfield Mokotów                   1707
                                    ... 
AMB - METRO La Pau                    25
AMB - Campus Diagonal - Besòs         16
AMB - Diagonal II                      8
AMB - METRO Baró de Viver              6
AMB - Riera Blanca Nord                5
Name: count, Length: 343, dtype: int64

### Movement count for each bike

In [None]:
from collections import defaultdict

bike_movement_counts = defaultdict(int)

for movement in bike_movements:
    bike_id = movement["bike_id"]
    bike_movement_counts[bike_id] += 1

bike_movement_summary = sorted(bike_movement_counts.items(), key=lambda x: x[1], reverse=True)

for bike_id, count in bike_movement_summary[:10]:
    print(f"Bike {bike_id} moved {count} times")

Bike 16204 moved 7 times
Bike 100064 moved 5 times
Bike 19155 moved 5 times
Bike 16968 moved 5 times
Bike 15180 moved 5 times
Bike 10914 moved 5 times
Bike 14425 moved 5 times
Bike 17778 moved 5 times
Bike 14328 moved 5 times
Bike 19473 moved 5 times


### Movements for selected bike

In [None]:
target_bike_id = "16204"

target_bike_movements = [
    move for move in bike_movements if move["bike_id"] == target_bike_id
]

for move in target_bike_movements:
    print(
        f"Bike {move['bike_id']} departed from {move['from_station']} at {move['departure_time']} "
        f"and arrived at {move['to_station']} at {move['arrival_time']}"
    )


Bike 16204 departed from S+U Neukölln | BONUS-Station: Return(Rückgabe) here=15 mins free at 2025-04-02 20:32:34 and arrived at virtuell - Oderstraße/Siegfriedstraße at 2025-04-02 20:41:47
Bike 16204 departed from virtuell - Oderstraße/Siegfriedstraße at 2025-04-02 21:01:18 and arrived at virtuell - Oderstraße/Siegfriedstraße at 2025-04-02 21:23:58
Bike 16204 departed from virtuell - Oderstraße/Siegfriedstraße at 2025-04-02 21:43:29 and arrived at S Ostbahnhof | BONUS-Station: Return(Rückgabe) here=15 mins free at 2025-04-02 22:13:15
Bike 16204 departed from S Ostbahnhof | BONUS-Station: Return(Rückgabe) here=15 mins free at 2025-04-02 22:34:50 and arrived at Köpenicker Straße/Eisenbahnstraße | BONUS-Station: Return(Rückgabe) here=15 mins free at 2025-04-02 22:42:02
Bike 16204 departed from Köpenicker Straße/Eisenbahnstraße | BONUS-Station: Return(Rückgabe) here=15 mins free at 2025-04-03 01:10:14 and arrived at virtuell - Leipziger Straße/Jerusalemer Straße at 2025-04-03 01:30:49
Bike