In [8]:
import pandas as pd
import numpy as np
import folium
import plotly.express as px
import seaborn as sns
from folium import plugins
from folium.plugins import HeatMap

# Load processed dataframe

In [9]:
df_boston = pd.read_csv('./data/prepared/rides_data_prepared.csv', dtype={'start_station_id': np.int64, 'end_station_id': 'string', 'end_station_name': 'string', 'start_station_name': 'string', 'bike_id': np.int64, 'user_type': 'string'})
df_boston['start_time'] = pd.to_datetime(df_boston['start_time'], format='%Y-%m-%d %X')
df_boston['end_time'] = pd.to_datetime(df_boston['end_time'], format='%Y-%m-%d %X')
df_boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122554 entries, 0 to 1122553
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   Unnamed: 0               1122554 non-null  int64         
 1   start_time               1122554 non-null  datetime64[ns]
 2   end_time                 1122554 non-null  datetime64[ns]
 3   start_station_id         1122554 non-null  int64         
 4   end_station_id           1122554 non-null  string        
 5   start_station_name       1122554 non-null  string        
 6   end_station_name         1122554 non-null  string        
 7   bike_id                  1122554 non-null  int64         
 8   user_type                1122554 non-null  string        
 9   trip_length              1122554 non-null  float64       
 10  start_station_latitude   1122554 non-null  float64       
 11  start_station_longitude  1122554 non-null  float64       
 12  

# Station analysis

## Starts stations volume  per year
Visualization which contains information on how many rides start at a respective station per year

In [10]:
df_start = df_boston.groupby(['start_station_id','start_station_latitude', 'start_station_longitude']).size().reset_index(name='counts_start')


color_scale = [(0, 'orange'), (1,'red')]

fig = px.scatter_mapbox(df_start, 
                        lat="start_station_latitude", 
                        lon="start_station_longitude", 
                        color='counts_start',
                        color_continuous_scale=color_scale,
                        size='counts_start',
                        zoom=8, 
                        height=800,
                        width=800)


fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## End station rides volume
Visualization which contains information on how many rides end at a respective station per year

In [11]:
df_end = df_boston.groupby(['end_station_id','end_station_latitude', 'end_station_longitude']).size().reset_index(name='counts_end')

color_scale = [(0, 'orange'), (1,'red')]

fig = px.scatter_mapbox(df_end, 
                        lat="end_station_latitude", 
                        lon="end_station_longitude", 
                        color='counts_end',
                        color_continuous_scale=color_scale,
                        size='counts_end',
                        zoom=8, 
                        height=800,
                        width=800)


fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## Volume share per station 
What is the percentage of total of total rides a station is used? 

In [12]:
#share of different stations, most frequently used stations
df_station_share = df_start.copy()
df_station_share['total_rides'] = df_start['counts_start'] + df_end['counts_end']
yearly_total_rides = sum(df_station_share['total_rides'])
df_station_share['share'] = (df_station_share['total_rides'] / yearly_total_rides)*100
df_station_share.sort_values('share', ascending=False).head(5)

Unnamed: 0,start_station_id,start_station_latitude,start_station_longitude,counts_start,total_rides,share
75,81,42.352409,-71.062679,10343,41627,1.85412
19,22,42.352175,-71.055547,30798,37720,1.680097
55,60,42.360835,-71.07084,17291,33169,1.47739
121,130,42.317225,-71.065421,761,30666,1.365903
88,95,42.372969,-71.094445,10397,30271,1.348309


## Station Distribution in Boston

In [34]:
def heat_map(df, station_id):
    # define new map
    heat_map = folium.Map(
        location=[df['start_station_latitude'].mean(), df['start_station_longitude'].mean()], 
        tiles='OpenStreetMap', 
        zoom_start=13, 
        control_scale=True, 
        max_zoom=20)
    # add heat map
    heat_data = [[row['start_station_latitude'],row['start_station_longitude']] for index, row in df.iterrows()]
    HeatMap(heat_data).add_to(heat_map)
    #heat map for destination points looks pretty much identical
    return heat_map
m = heat_map(df_start, np.array(df_start['start_station_id']))
m

# Most popular routes  
This section tries to explain which routes are most often used.

In [None]:
#number of trips between a start and end station

df_trips = df_boston.groupby(['start_station_id','end_station_id', 'start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude']).size().reset_index(name='count_trips')

df_trips = df_trips.sort_values('count_trips', ascending = False)

df_trips.index = range(0, len(df_trips))

df_trips.head(10)

In [None]:

avg_lat = df_trips['start_station_latitude'].mean()
avg_long = df_trips['start_station_longitude'].mean()

m = folium.Map(location=[avg_lat, avg_long], zoom_start=15)

points = []

df_top_trips = df_trips.head(10)


for i in range(0,len(df_top_trips)):
    points.append([df_top_trips.iloc[i]['start_station_latitude'],df_top_trips.iloc[i]['start_station_longitude']])
    points.append([df_top_trips.iloc[i]['end_station_latitude'],df_top_trips.iloc[i]['end_station_longitude']])
    folium.PolyLine(points, color="red", weight=2.5, opacity=1).add_to(m)
    folium.Marker([df_top_trips.iloc[i]['start_station_latitude'],df_top_trips.iloc[i]['start_station_longitude']],popup=(df_top_trips.iloc[i]['start_station_id'], df_top_trips.iloc[i]['count_trips' ]), icon = folium.Icon(color='green',prefix='fa', icon='bicycle')).add_to(m)
    folium.Marker([df_top_trips.iloc[i]['end_station_latitude'],df_top_trips.iloc[i]['end_station_longitude']],popup=(df_top_trips.iloc[i]['end_station_id'], df_top_trips.iloc[i]['count_trips' ]), icon = folium.Icon(color='red',prefix='fa', icon='bicycle')).add_to(m)
    points = [] #if points is not reseted then each station will be connected with every other station in array

m 

In [None]:
df_top_trips = df_trips.head(47).copy() #most common trips ignoring direction of trip, filter is 47 as we only use values above 1000 

for j in range(0, len(df_top_trips)):
    if j < len(df_top_trips):
        for i in range (0, len(df_top_trips)):
            if i < len(df_top_trips):
                if df_top_trips.at[j,'start_station_id'] == df_top_trips.at[i, 'end_station_id'] and df_top_trips.at[i,'start_station_id'] == df_top_trips.at[j, 'end_station_id']:
                    df_top_trips.at[j, 'count_trips'] = df_top_trips.at[i, 'count_trips'] + df_top_trips.at[j, 'count_trips']
                    df_top_trips = df_top_trips.drop(i)
                df_top_trips.index = range(0, len(df_top_trips))


df_top_trips = df_top_trips.sort_values('count_trips', ascending = False)
df_top_trips.index = range(0, len(df_top_trips))
df_top_trips = df_top_trips.rename(columns = {'start_station_id' : 'Station_A', 'end_station_id' : 'Station_B'})
df_top_trips = df_top_trips.head(11) #above 2300 total trips and close to top 10 
df_top_trips


In [None]:
#mapping the top trips 


avg_lat = df_trips['start_station_latitude'].mean()
avg_long = df_trips['start_station_longitude'].mean()

m = folium.Map(location=[avg_lat, avg_long], zoom_start=15)

points = []


for i in range(0,len(df_top_trips)):
    points.append([df_top_trips.iloc[i]['start_station_latitude'],df_top_trips.iloc[i]['start_station_longitude']])
    points.append([df_top_trips.iloc[i]['end_station_latitude'],df_top_trips.iloc[i]['end_station_longitude']])
    folium.PolyLine(points, color="red", weight=2.5, opacity=1).add_to(m)
    folium.Marker([df_top_trips.iloc[i]['start_station_latitude'],df_top_trips.iloc[i]['start_station_longitude']],popup=(df_top_trips.iloc[i]['Station_A'], df_top_trips.iloc[i]['count_trips' ]), icon = folium.Icon(color='blue',prefix='fa', icon='bicycle')).add_to(m)
    folium.Marker([df_top_trips.iloc[i]['end_station_latitude'],df_top_trips.iloc[i]['end_station_longitude']],popup=(df_top_trips.iloc[i]['Station_B'], df_top_trips.iloc[i]['count_trips' ]), icon = folium.Icon(color='blue',prefix='fa', icon='bicycle')).add_to(m)
    points = [] #if points is not reseted then each station will be connected with every other station in array

m 

#findings: near universities (harvard and mit) as well as around citycenter(haymarket) and the trainstations (north and south)