In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("bmh")
import seaborn as sns
sns.set()
sns.set_palette("GnBu_d")
import folium
from folium import plugins
from folium.plugins import HeatMap
from datetime import datetime, timedelta, date, time
# from haversine import haversine
import urllib
import json
import ssl
import pickle

---

### **Import the data**

Import CSV files

In [2]:
df_bonn = pd.read_csv("data/bonn.csv")
df_essen = pd.read_csv("data/essen.csv")

In [3]:
df_bonn.head()

Unnamed: 0,day,time,b_number,city,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng
0,2019-03-06,01:13:00,21169,bonn,0 days 02:51:00.000000000,50.921682,6.959204,50.953793,6.899248
1,2019-03-06,14:43:00,21169,bonn,0 days 00:11:00.000000000,50.953793,6.899251,50.947463,6.922385
2,2019-03-06,15:21:00,21169,bonn,0 days 00:21:00.000000000,50.947462,6.922391,50.929703,6.932212
3,2019-03-06,17:45:00,21169,bonn,0 days 00:51:00.000000000,50.929703,6.932212,50.945799,6.915687
4,2019-03-07,05:58:00,21169,bonn,0 days 00:07:00.000000000,50.945765,6.915829,50.951143,6.91555


In [4]:
df_bonn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329629 entries, 0 to 329628
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   day            329629 non-null  object 
 1   time           329629 non-null  object 
 2   b_number       329629 non-null  int64  
 3   city           329629 non-null  object 
 4   trip_duration  329629 non-null  object 
 5   orig_lat       329629 non-null  float64
 6   orig_lng       329629 non-null  float64
 7   dest_lat       329629 non-null  float64
 8   dest_lng       329629 non-null  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 22.6+ MB


In [5]:
# Check df_essen for any trip_durations longer than a day
for i in list(df_essen["trip_duration"]):
    if i[0] != "0":
        print(i[0])

Import API data

In [6]:
context = ssl._create_unverified_context()

def request(city_id):
    response = urllib.request.urlopen(f"https://api.nextbike.net/maps/nextbike-live.json?city={city_id}", context=context)
    response = response.read().decode("utf-8")
    return json.loads(response)

bonn_json = request(547)["countries"][0] # Bonn city ID at NextBike is 547
essen_json = request(158)["countries"][0] # Essen city ID at NextBike ist 133
# JSONs only contain one key on 0th level, "countries", which only contains a list with 1 element

In [7]:
# Use as center points for maps:
bonn = np.array([bonn_json["lat"], bonn_json["lng"]])
essen = np.array([essen_json["lat"], essen_json["lng"]])

---

### **Format the data**

In [8]:
def format_trip_duration(df):
    """
    Assumption: all time durations are less than 1 day
    Assumption: all time durations are measured in seconds (not milliseconds)
    First turn "0 days 02:51:00.0000" into list split by " ", so we get 
    x=["0", "days", "02:51:00.0000"].
    Since all trip_durations are less than 1 day, we can discard x[0] and x[1] and 
    focus on x[2].
    We then split x[2], i.e. "02:51:00.0000", into a list split by ":", so we get 
    x=["02", "51", "00.0000"].
    We now turn these elements into numbers and pass them into the timedelta constructor.
    Timedelta objects describe differences in time, like time durations.
    """
    df["trip_duration"] = df["trip_duration"].apply(
        lambda s: s.split(" ")[2].split(":"))
    df["trip_duration"] = df["trip_duration"].apply(
        lambda x: timedelta(hours=int(x[0]), minutes=int(x[1]), seconds=int(float(x[2]))))
    return df

def str_to_date(row):
    return datetime.strptime(f"{row['day']} {row['time']}", "%Y-%m-%d %H:%M:%S")

In [9]:
%%time
# Sort values, reset index (and drop old index)
# Bonn:
df_bonn.sort_values(["day", "time"], inplace=True)
df_bonn = df_bonn.reset_index()
df_bonn = df_bonn.drop("index", axis=1)
# Essen:
df_essen.sort_values(["day", "time"], inplace=True)
df_essen = df_essen.reset_index()
df_essen = df_essen.drop("index", axis=1)

# Format trip duration
df_bonn = format_trip_duration(df_bonn)
df_essen = format_trip_duration(df_essen)

# Create timestamp, weekday, hour columns
# Bonn:
df_bonn["timestamp"] = df_bonn.apply(str_to_date, axis=1)
df_bonn["weekday"] = df_bonn["timestamp"].apply(lambda row: row.weekday())
df_bonn["hour"] = df_bonn["timestamp"].apply(lambda row: row.hour)
# Essen:
df_essen["timestamp"] = df_essen.apply(str_to_date, axis=1)
df_essen["weekday"] = df_essen["timestamp"].apply(lambda row: row.weekday())
df_essen["hour"] = df_essen["timestamp"].apply(lambda row: row.hour)

# Combine longitude/latitude coordinate columns: orig
# Bonn:
df_bonn["orig"] = list(zip(df_bonn["orig_lat"].round(4), df_bonn["orig_lng"].round(4)))
df_bonn["orig"] = df_bonn["orig"].apply(np.array) # turn tuples in np.arrays
df_bonn["dest"] = list(zip(df_bonn["dest_lat"].round(4), df_bonn["dest_lng"].round(4)))
df_bonn["dest"] = df_bonn["dest"].apply(np.array) # turn tuples in np.arrays
# Essen:
df_essen["orig"] = list(zip(df_essen["orig_lat"].round(4), df_essen["orig_lng"].round(4)))
df_essen["orig"] = df_essen["orig"].apply(np.array) # turn tuples in np.arrays
df_essen["dest"] = list(zip(df_essen["dest_lat"].round(4), df_essen["dest_lng"].round(4)))
df_essen["dest"] = df_essen["dest"].apply(np.array) # turn tuples in np.arrays

CPU times: user 17.3 s, sys: 281 ms, total: 17.5 s
Wall time: 17.6 s


In [19]:
df_bonn

Unnamed: 0,day,time,b_number,city,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng,timestamp,weekday,hour,orig,dest
0,2019-02-01,00:03:00,44894,bonn,00:07:00,50.736571,7.100907,50.739031,7.092978,2019-02-01 00:03:00,4,0,"[50.7366, 7.1009]","[50.739, 7.093]"
1,2019-02-01,00:12:00,44973,bonn,00:20:00,50.736387,7.092718,50.732040,7.100436,2019-02-01 00:12:00,4,0,"[50.7364, 7.0927]","[50.732, 7.1004]"
2,2019-02-01,00:37:00,44802,bonn,00:06:00,50.737204,7.091110,50.733627,7.095017,2019-02-01 00:37:00,4,0,"[50.7372, 7.0911]","[50.7336, 7.095]"
3,2019-02-01,00:38:00,44717,bonn,00:07:00,50.747591,7.082589,50.754293,7.071525,2019-02-01 00:38:00,4,0,"[50.7476, 7.0826]","[50.7543, 7.0715]"
4,2019-02-01,00:44:00,44639,bonn,00:10:00,50.739222,7.090649,50.752462,7.071222,2019-02-01 00:44:00,4,0,"[50.7392, 7.0906]","[50.7525, 7.0712]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329624,2019-06-30,23:49:00,44862,bonn,00:46:00,50.715853,7.113276,50.725320,7.153470,2019-06-30 23:49:00,6,23,"[50.7159, 7.1133]","[50.7253, 7.1535]"
329625,2019-06-30,23:50:00,44838,bonn,00:06:00,50.649484,7.200588,50.657667,7.194236,2019-06-30 23:50:00,6,23,"[50.6495, 7.2006]","[50.6577, 7.1942]"
329626,2019-06-30,23:54:00,45084,bonn,00:21:00,50.737742,7.101788,50.709573,7.115528,2019-06-30 23:54:00,6,23,"[50.7377, 7.1018]","[50.7096, 7.1155]"
329627,2019-06-30,23:55:00,45010,bonn,00:05:00,50.737391,7.103318,50.744449,7.100407,2019-06-30 23:55:00,6,23,"[50.7374, 7.1033]","[50.7444, 7.1004]"


In [11]:
df_bonn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329629 entries, 0 to 329628
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype          
---  ------         --------------   -----          
 0   day            329629 non-null  object         
 1   time           329629 non-null  object         
 2   b_number       329629 non-null  int64          
 3   city           329629 non-null  object         
 4   trip_duration  329629 non-null  timedelta64[ns]
 5   orig_lat       329629 non-null  float64        
 6   orig_lng       329629 non-null  float64        
 7   dest_lat       329629 non-null  float64        
 8   dest_lng       329629 non-null  float64        
 9   timestamp      329629 non-null  datetime64[ns] 
 10  weekday        329629 non-null  int64          
 11  hour           329629 non-null  int64          
 12  orig           329629 non-null  object         
 13  dest           329629 non-null  object         
dtypes: datetime64[ns](1), float64(4), in