In [140]:
import pandas as pd
import numpy as np
import folium
import plotly.express as px
import seaborn as sns
from folium import plugins
from folium.plugins import HeatMap

# Load processed dataframe

In [141]:
# Read the prepared rides table. Station and bike ids are read as int64 and the
# name / user-type columns as pandas' 'string' dtype.
df_boston = pd.read_csv(
    './data/prepared/rides_data_prepared.csv',
    dtype={
        'start_station_id': np.int64,
        'end_station_id': np.int64,
        'end_station_name': 'string',
        'start_station_name': 'string',
        'bike_id': np.int64,
        'user_type': 'string',
    },
)

# Spell the time part out explicitly: '%X' means "the locale's time representation"
# and therefore only matches HH:MM:SS under some locales.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
df_boston['start_time'] = pd.to_datetime(df_boston['start_time'], format=TIMESTAMP_FORMAT)
df_boston['end_time'] = pd.to_datetime(df_boston['end_time'], format=TIMESTAMP_FORMAT)
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,start_station_municipality,end_station_municipality,date_time
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0,42.387995,-71.119084,42.373379,-71.111075,Cambridge,Cambridge,2015-01-01 00:00:00
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0,42.361962,-71.092053,42.372969,-71.094445,Cambridge,Cambridge,2015-01-01 00:00:00
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0,42.366277,-71.09169,42.36507,-71.1031,Cambridge,Cambridge,2015-01-01 01:00:00
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0,42.387995,-71.119084,42.373379,-71.111075,Cambridge,Cambridge,2015-01-01 01:00:00
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0,42.356954,-71.113687,42.374035,-71.101427,Cambridge,Cambridge,2015-01-01 01:00:00


In [142]:
# Preview the first rows of the rides table (same call as the last line of the loading cell).
df_boston.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,trip_length,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,start_station_municipality,end_station_municipality,date_time
0,2015-01-01 00:21:44,2015-01-01 00:30:47,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,277,Subscriber,543.0,42.387995,-71.119084,42.373379,-71.111075,Cambridge,Cambridge,2015-01-01 00:00:00
1,2015-01-01 00:27:03,2015-01-01 00:34:21,80,95,MIT Stata Center at Vassar St / Main St,Cambridge St - at Columbia St / Webster Ave,648,Subscriber,438.0,42.361962,-71.092053,42.372969,-71.094445,Cambridge,Cambridge,2015-01-01 00:00:00
2,2015-01-01 00:31:31,2015-01-01 00:35:46,91,68,One Kendall Square at Hampshire St / Portland St,Central Square at Mass Ave / Essex St,555,Subscriber,255.0,42.366277,-71.09169,42.36507,-71.1031,Cambridge,Cambridge,2015-01-01 01:00:00
3,2015-01-01 00:53:46,2015-01-01 01:00:58,115,96,Porter Square Station,Cambridge Main Library at Broadway / Trowbridg...,1307,Subscriber,432.0,42.387995,-71.119084,42.373379,-71.111075,Cambridge,Cambridge,2015-01-01 01:00:00
4,2015-01-01 01:07:06,2015-01-01 01:19:21,105,88,Lower Cambridgeport at Magazine St/Riverside Rd,Inman Square at Vellucci Plaza / Hampshire St,177,Customer,735.0,42.356954,-71.113687,42.374035,-71.101427,Cambridge,Cambridge,2015-01-01 01:00:00


# Station analysis

In [143]:
# function to visualize station volume over year
def map_stations(df, lat, lon, count, data, zoom=12, height=300, width=800):
    """Draw one bubble per row of ``df`` on an OpenStreetMap base map.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns named by the other arguments.
    lat, lon : str
        Names of the columns with the marker latitude and longitude.
    count : str
        Name of the column that drives both marker size and marker colour
        (orange at the low end of the scale, red at the high end).
    data : str
        Name of an additional column to show in the hover tooltip.
    zoom, height, width : int, optional
        Initial zoom level and figure size; the defaults are the values that
        were previously hard-coded, so existing calls render unchanged.

    Returns
    -------
    The plotly figure created by ``px.scatter_mapbox``; call ``.show()`` on it
    to render.
    """
    color_scale = [(0, 'orange'), (1, 'red')]

    fig = px.scatter_mapbox(
        df,
        lat=lat,
        lon=lon,
        color=count,
        size=count,
        hover_data=[data],
        color_continuous_scale=color_scale,
        zoom=zoom,
        height=height,
        width=width,
    )

    # Open tiles (no Mapbox token needed) and no padding around the map.
    fig.update_layout(
        mapbox_style="open-street-map",
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
    )
    return fig


## Start station volume per year
Visualization which contains information on how many rides start at a respective station per year

In [144]:
# Count how often each station is used as the start of a ride.
# Suffixes: _s = subscribers only, _c = customers only, no suffix = all rides.
is_subscriber = df_boston['user_type'] == "Subscriber"
is_customer = df_boston['user_type'] == "Customer"
df_s = df_boston[is_subscriber]
df_c = df_boston[is_customer]

# One output row per distinct (station id, latitude, longitude) combination.
start_keys = ['start_station_id', 'start_station_latitude', 'start_station_longitude']
df_start = df_boston.groupby(start_keys).size().reset_index(name='counts_start')
df_start_s = df_s.groupby(start_keys).size().reset_index(name='counts_start')
df_start_c = df_c.groupby(start_keys).size().reset_index(name='counts_start')

In [145]:
# Map start-station usage: subscribers first, then customers.
s_fig_s = map_stations(
    df=df_start_s,
    lat="start_station_latitude",
    lon="start_station_longitude",
    count="counts_start",
    data="start_station_id",
)
s_fig_s.show()  # subscribers

s_fig_c = map_stations(
    df=df_start_c,
    lat="start_station_latitude",
    lon="start_station_longitude",
    count="counts_start",
    data="start_station_id",
)
s_fig_c.show()  # customers

We can make out three major hotspots for subscribers: the city center around downtown, the stations around MIT, and the stations around Harvard University. The most heavily used stations are the ones near the universities.
In contrast, regular customers tend to use the bikes mostly in the downtown area near the train stations and the city center, and hardly at all in the university area.


## End station rides volume
Visualization which contains information on how many rides end at a respective station per year

In [146]:
# Count how often each station is used as the end of a ride.
# Suffixes: _s = subscribers only, _c = customers only, no suffix = all rides.
end_keys = ['end_station_id', 'end_station_latitude', 'end_station_longitude']
df_end = df_boston.groupby(end_keys).size().reset_index(name='counts_end')
df_end_s = df_s.groupby(end_keys).size().reset_index(name='counts_end')
df_end_c = df_c.groupby(end_keys).size().reset_index(name='counts_end')

In [147]:
# Map end-station usage: subscribers first, then customers.
e_fig_s = map_stations(
    df=df_end_s,
    lat="end_station_latitude",
    lon="end_station_longitude",
    count="counts_end",
    data="end_station_id",
)
e_fig_s.show()  # subscribers

e_fig_c = map_stations(
    df=df_end_c,
    lat="end_station_latitude",
    lon="end_station_longitude",
    count="counts_end",
    data="end_station_id",
)
e_fig_c.show()  # customers

There seems to be no big difference between the number of trips started and ended in a given area, which might indicate that people who arrive by bike also choose a bike again later when leaving their destination.

Also, the observations about customer and subscriber usage of start stations carry over unchanged to end stations.

## Total station rides volume
Visualization which contains information on how many rides start and end at a respective station per year

In [148]:
# Prepare a dataframe with the total usage of each station as start and end point.
# Both count tables get a common 'station_id' key. df_start keeps the renamed column
# because the heat-map cell further down reads df_start['station_id'].
df_start = df_start.rename(columns={'start_station_id': 'station_id'})
df_end = df_end.rename(columns={'end_station_id': 'station_id'})

# Outer join: a station that only ever appears as a start (or only as an end) must
# still show up in the totals instead of being dropped by an inner join.
df_total = df_start.merge(df_end, on='station_id', how='outer')

# A side that is missing for a station means zero rides on that side.
count_cols = ['counts_start', 'counts_end']
df_total[count_cols] = df_total[count_cols].fillna(0).astype('int64')

# Fill coordinates from whichever side is present so every row can be placed on the map.
for axis in ('latitude', 'longitude'):
    start_col = f'start_station_{axis}'
    end_col = f'end_station_{axis}'
    start_vals = df_total[start_col]
    end_vals = df_total[end_col]
    df_total[start_col] = start_vals.fillna(end_vals)
    df_total[end_col] = end_vals.fillna(start_vals)

df_total['total_rides'] = df_total['counts_start'] + df_total['counts_end']

In [149]:
# Map the combined (start + end) usage of every station.
fig_t = map_stations(
    df=df_total,
    lat="start_station_latitude",
    lon="start_station_longitude",
    count="total_rides",
    data="station_id",
)
fig_t.show()

## Volume share per station 
What share of all ride starts and ends does each station account for?

In [150]:
# Share of each station in the overall usage, most frequently used stations first.
# 'total_rides' is starts + ends per station, so the denominator is the sum of those
# per-station totals.
yearly_total_rides = df_total['total_rides'].sum()  # vectorised instead of builtin sum()

# df_total already carries the id as 'station_id', so no rename is needed here.
df_station_share = (
    df_total
    .assign(share=(df_total['total_rides'] / yearly_total_rides) * 100)
    .drop(columns=['start_station_latitude', 'start_station_longitude', 'counts_start', 'counts_end'])
    .sort_values('share', ascending=False)
    .reset_index(drop=True)
)
df_station_share.head(10)


Unnamed: 0,station_id,end_station_latitude,end_station_longitude,total_rides,share
0,22,42.352175,-71.055547,61578,2.783576
1,67,42.3581,-71.093198,58993,2.666724
2,74,42.373268,-71.118579,45167,2.041732
3,68,42.36507,-71.1031,42022,1.899566
4,109,42.365942,-71.060515,38725,1.750528
5,80,42.361962,-71.092053,38368,1.73439
6,36,42.349673,-71.077303,38133,1.723767
7,75,42.363465,-71.100573,38097,1.72214
8,60,42.360835,-71.07084,33413,1.510404
9,16,42.348074,-71.07657,32322,1.461086


Two stations seem to be exceptionally valuable, as each has a share of more than 2.5% of total trips (IDs 22 and 67). They are located at Boston's main train station and at MIT; the latter is the station nearest to the Harvard Bridge crossing the Charles River.

There are a number of stations that have a share ranging from 1.5% to 2% of total trips. These stations are located between the universities and in the downtown area.

## Station Distribution in Boston

In [152]:
def heat_map(df, station_id=None):
    """Build a folium heat map with one unweighted point per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'start_station_latitude' and 'start_station_longitude'.
        Rows with a missing coordinate are skipped.
    station_id : optional
        Not used by the function; kept (now optional) so existing calls that
        pass it keep working.

    Returns
    -------
    folium.Map
        Map centred on the mean coordinate of ``df`` with the heat layer added.
    """
    coord_cols = ['start_station_latitude', 'start_station_longitude']

    # define new map inside a fixed-size figure
    fig = folium.Figure(width=800, height=400)
    station_map = folium.Map(
        location=[df[coord_cols[0]].mean(), df[coord_cols[1]].mean()],
        tiles='OpenStreetMap', zoom_start=13, control_scale=True, max_zoom=20).add_to(fig)

    # add heat map: plain [lat, lon] pairs, built in one vectorised step instead of iterrows()
    heat_data = df[coord_cols].dropna().to_numpy().tolist()
    HeatMap(heat_data).add_to(station_map)
    # heat map for destination points looks pretty much identical
    return station_map

# Heat map over the rows of df_start; the station ids are passed as the second argument.
station_ids = df_start['station_id'].to_numpy()
fig_heat = heat_map(df_start, station_ids)
fig_heat

Consistent with the yearly volume per station, the stations themselves are also most densely clustered in the areas of frequent usage: around both universities, downtown, and the area south of the Harvard Bridge.

# Most popular routes  
This section tries to explain which routes are most frequently used.

In [None]:
# Number of trips for every directed (start station -> end station) pair,
# most frequent pair first, with a fresh 0..n-1 index.
route_keys = [
    'start_station_id',
    'end_station_id',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude',
]
df_trips = (
    df_boston
    .groupby(route_keys)
    .size()
    .reset_index(name='count_trips')
    .sort_values('count_trips', ascending=False)
    .reset_index(drop=True)
)
df_trips.head(10)

Unnamed: 0,start_station_id,end_station_id,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,count_trips
0,118,100,42.397828,-71.130516,42.396969,-71.123024,3011
1,100,118,42.396969,-71.123024,42.397828,-71.130516,2740
2,114,100,42.402763,-71.126908,42.396969,-71.123024,2042
3,67,53,42.3581,-71.093198,42.350851,-71.089886,2022
4,40,22,42.363871,-71.050877,42.352175,-71.055547,1979
5,67,68,42.3581,-71.093198,42.36507,-71.1031,1836
6,53,67,42.350851,-71.089886,42.3581,-71.093198,1783
7,22,40,42.352175,-71.055547,42.363871,-71.050877,1740
8,68,67,42.36507,-71.1031,42.3581,-71.093198,1607
9,20,22,42.35977,-71.051601,42.352175,-71.055547,1521


In [None]:
# To find the most frequently used routes the direction of travel is ignored:
# A -> B and B -> A are folded into a single row.
N_DIRECTED_ROUTES = 47  # cut-off from the original analysis: only directions with more than 1000 trips
N_TOP_ROUTES = 16       # cut-off from the original analysis: big gap after rank 16 (over 2100 vs. 1300 trips)

station_cols = ['start_station_id', 'end_station_id']
df_top_trips = df_trips.head(N_DIRECTED_ROUTES).copy()

# Direction-independent route key: (smaller station id, larger station id).
route_key = ['_route_lo', '_route_hi']
df_top_trips['_route_lo'] = df_top_trips[station_cols].min(axis=1)
df_top_trips['_route_hi'] = df_top_trips[station_cols].max(axis=1)

# Every row receives the combined count of its route. A round trip (start == end)
# forms its own group and simply keeps its count.
df_top_trips['count_trips'] = df_top_trips.groupby(route_key)['count_trips'].transform('sum')

# df_trips is sorted by count (descending), so keep='first' retains the busier
# direction's stations and coordinates as the representative row of the route.
df_top_trips = (
    df_top_trips
    .drop_duplicates(subset=route_key, keep='first')
    .drop(columns=route_key)
    .sort_values('count_trips', ascending=False, kind='stable')
    .reset_index(drop=True)
    .rename(columns={'start_station_id': 'Station_A', 'end_station_id': 'Station_B'})
    .head(N_TOP_ROUTES)
)
df_top_trips


Unnamed: 0,Station_A,Station_B,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,count_trips
0,118,100,42.397828,-71.130516,42.396969,-71.123024,5751
1,67,53,42.3581,-71.093198,42.350851,-71.089886,3805
2,40,22,42.363871,-71.050877,42.352175,-71.055547,3719
3,67,68,42.3581,-71.093198,42.36507,-71.1031,3443
4,114,100,42.402763,-71.126908,42.396969,-71.123024,3191
5,20,22,42.35977,-71.051601,42.352175,-71.055547,2980
6,67,75,42.3581,-71.093198,42.363465,-71.100573,2886
7,67,74,42.3581,-71.093198,42.373268,-71.118579,2692
8,67,107,42.3581,-71.093198,42.3625,-71.08822,2603
9,22,109,42.352175,-71.055547,42.365942,-71.060515,2380


In [None]:
# Mapping the top trips: one red line per route plus a bicycle marker at both ends.

# Centre the map on the mean start coordinate of all counted routes.
avg_lat = df_trips['start_station_latitude'].mean()
avg_long = df_trips['start_station_longitude'].mean()

fig_trips = folium.Figure(width=800, height=500)
map_trips = folium.Map(location=[avg_lat, avg_long], zoom_start=12).add_to(fig_trips)

# itertuples keeps each column's own dtype, so station ids and counts stay integers
# (row-wise .iloc[i] access converts the whole row to floats).
for route in df_top_trips.itertuples(index=False):
    start_point = [route.start_station_latitude, route.start_station_longitude]
    end_point = [route.end_station_latitude, route.end_station_longitude]

    # Each line gets exactly its own two end points, so routes are never chained together.
    folium.PolyLine([start_point, end_point], color="red", weight=2.5, opacity=1).add_to(map_trips)

    for station, point in ((route.Station_A, start_point), (route.Station_B, end_point)):
        folium.Marker(
            point,
            # Readable popup text instead of the repr of a (station, count) tuple.
            popup=f"Station {station}: {route.count_trips} trips on this route",
            icon=folium.Icon(color='blue', prefix='fa', icon='bicycle'),
        ).add_to(map_trips)

map_trips


Again we can observe that the most frequently used routes lie in the areas around the city center, as well as between and around the universities Harvard and MIT. In particular, the station located directly at MIT (ID = 67) and the station at Boston's main train station (ID = 22) seem to be major connection points for bike usage. This is not surprising, as they also have the biggest overall share of rides during the year.

However, there are also two heavily used routes in the north-west of the city around Davis Square. This might be due to its proximity to Tufts University in the north-west of Boston, as well as the subway connection from Davis Square to both other universities and to the city center.