In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as mpl
import matplotlib.colors as colors

import networkx as nx
import community as community_louvain

import folium
from folium.plugins import MarkerCluster, PolyLineTextPath
from collections import Counter

from datetime import datetime
import calendar
import holidays

import os
import requests
from zipfile import ZipFile
from io import BytesIO

### Download and open the dataset

In [2]:
base_url = "https://s3.amazonaws.com/hubway-data/"

def download_and_unzip(year, month):
    """Fetch one month of trip data (if not already on disk) and load it.

    The archive ``{year}{month:02d}-{brand}-tripdata.zip`` is downloaded from
    ``base_url`` and extracted into a directory with the same stem in the
    current working directory, unless the extracted CSV is already present.

    Parameters
    ----------
    year, month : int
        Calendar year and month (1-12) of the archive to load.

    Returns
    -------
    pandas.DataFrame
        The month's trips with every row that contains a missing value (NaN or
        the literal ``\\N`` marker) removed. For 'bluebikes' files the newer
        column names are renamed to the older ones wherever they are present.

    Raises
    ------
    requests.HTTPError
        If the download responds with an error status.
    """
    # File naming switches from 'hubway' to 'bluebikes' starting with 2018-05.
    brand = 'hubway' if (year, month) < (2018, 5) else 'bluebikes'

    stem = f"{year}{month:02d}-{brand}-tripdata"
    filename = f"{stem}.zip"
    extract_dir = stem  # replace with your local path
    csv_path = os.path.join(extract_dir, f"{stem}.csv")

    # Check for the CSV itself (not just the directory) so that an empty or
    # half-extracted directory triggers a fresh download instead of a read error.
    if not os.path.exists(csv_path):
        print(f"Downloading {filename}...")
        # The whole body is needed in memory for ZipFile, so streaming is not
        # used; the timeout keeps a stalled connection from hanging the kernel.
        response = requests.get(base_url + filename, timeout=(10, 120))
        response.raise_for_status()

        # Unzip
        with ZipFile(BytesIO(response.content)) as thezip:
            print("Unzipping...")
            thezip.extractall(extract_dir)
            print(f"Unzipped to {extract_dir}")
    else:
        print(filename,"already exists.")

    df_full = pd.read_csv(csv_path)
    # '\N' is used as a missing-value marker; rows with any missing field are dropped.
    df_full = df_full.replace('\\N', np.nan).dropna()

    # Map the newer column names onto the older ones so both brands share one schema.
    # Columns that are absent are simply ignored by rename().
    if brand == 'bluebikes':
        df_full = df_full.rename(columns={
            'started_at': 'starttime',
            'ended_at': 'stoptime',
            'start_station_id': 'start station id',
            'end_station_id': 'end station id',
            'start_lat': 'start station latitude',
            'start_lng': 'start station longitude',
            'end_lat': 'end station latitude',
            'end_lng': 'end station longitude',
            'member_casual': 'usertype',
        })

    return df_full

### Filter based on date and time

In [3]:
def filter_date_time(df, year, month, start_day, start_hour, start_minute, end_day, end_hour, end_minute):
    """Keep business-day trips that start and end inside a daily time window.

    A trip is kept when all of the following hold:
      * it starts in ``year``/``month`` on a day in ``[start_day, end_day]``;
      * it starts at or after ``start_hour:start_minute``;
      * it ends on the same calendar day, at or before ``end_hour:end_minute``;
      * it starts Monday-Friday on a date that is not a Massachusetts holiday;
      * its duration is more than 60 and at most 3600 seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with 'starttime' and 'stoptime' columns (anything
        ``pd.to_datetime`` accepts). If a 'tripduration' column (seconds) is
        missing it is derived from the two timestamps. ``df`` is not modified.
    year, month, start_day, end_day : int
        Calendar bounds (days inclusive).
    start_hour, start_minute, end_hour, end_minute : int
        Daily window bounds (inclusive).

    Returns
    -------
    pandas.DataFrame
        The matching trips with a fresh 0..n-1 index, datetime 'starttime' /
        'stoptime' columns and a 'tripduration' column.
    """
    # Define holidays for Massachusetts, USA
    # NOTE(review): newer `holidays` releases may prefer `subdiv='MA'` over `state='MA'` — confirm
    # against the installed version.
    usa_holidays = holidays.UnitedStates(years=year, state='MA')

    # Work on a copy so the caller's frame does not silently gain/convert columns.
    df = df.copy()

    # Ensure starttime and stoptime are in datetime format
    df['starttime'] = pd.to_datetime(df['starttime'])
    df['stoptime'] = pd.to_datetime(df['stoptime'])

    # Calculate trip duration in seconds
    if 'tripduration' not in df.columns:
        df['tripduration'] = (df['stoptime'] - df['starttime']).dt.total_seconds()

    start = df['starttime']
    stop = df['stoptime']

    in_month = (
        (start.dt.year == year) &
        (start.dt.month == month) &
        start.dt.day.between(start_day, end_day)
    )

    # The minute bound only applies within the first hour of the window. Testing
    # `minute >= start_minute` on its own would also drop e.g. 09:10 for an 08:30 start.
    starts_in_window = (
        (start.dt.hour > start_hour) |
        ((start.dt.hour == start_hour) & (start.dt.minute >= start_minute))
    )

    # Stop time is on the same day as the start and at or before end_hour:end_minute.
    ends_in_window = (
        (stop.dt.date == start.dt.date) &
        (
            (stop.dt.hour < end_hour) |
            ((stop.dt.hour == end_hour) & (stop.dt.minute <= end_minute))
        )
    )

    # dayofweek: Monday=0 ... Sunday=6, so <= 4 means weekdays only.
    business_day = (
        (start.dt.dayofweek <= 4) &
        ~start.dt.date.isin(list(usa_holidays.keys()))
    )

    # Longer than 60 seconds, no longer than 3600 seconds.
    duration_ok = (df['tripduration'] > 60) & (df['tripduration'] <= 3600)

    keep = in_month & starts_in_window & ends_in_window & business_day & duration_ok
    return df[keep].reset_index(drop=True)

### Full function filtering and information extraction

In [36]:
def initial_cleaning(month, year, start_hour, end_hour):
    '''
    Load one month of trips and reduce it to member, weekday, in-window, non-loop trips.

    INPUT
    month      : Integer (or int-like) month of analysis
    year       : Integer (or int-like) year of analysis
    start_hour : Integer start hour of the daily window (inclusive)
    end_hour   : Integer end hour of the daily window (exclusive: trips must end
                 by minute 59 of hour end_hour - 1)

    OUTPUT
    data_filt    : Dataframe containing the complete rows of the trips that passed the filters
    tc           : Dataframe with one row per (start station, end station, coordinates)
                   combination and its number of trips in 'trip_count'
    stations_loc : Dataframe with one row per station id ('station id', 'latitude',
                   'longitude'), sorted by station id
    '''
    # Normalise once so the download, calendar and filter calls all see the same ints.
    month, year = int(month), int(year)

    # Download/open the file
    bsn_full = download_and_unzip(year, month)

    # Whole month, from start_hour:00 up to (end_hour - 1):59
    start_day = 1
    _, end_day = calendar.monthrange(year, month)  # last day of the month
    start_minute, end_minute = 0, 59
    last_hour = end_hour - 1

    data_filt = filter_date_time(bsn_full, year, month, start_day, start_hour, start_minute,
                                 end_day, last_hour, end_minute)

    # Filter self loop, member only
    not_self_loop = data_filt['start station id'] != data_filt['end station id']
    is_member = data_filt['usertype'].isin(['Subscriber', 'member'])
    data_filt = data_filt[not_self_loop & is_member].reset_index(drop=True)

    # Trip_counts
    tc = data_filt.groupby(['start station id', 'end station id','start station latitude', 'start station longitude',
                            'end station latitude', 'end station longitude']).size().reset_index(name='trip_count')

    # Location of the stations: stack start and end stations under common column names
    station_cols = ['station id', 'latitude', 'longitude']
    start_stations = data_filt[['start station id', 'start station latitude', 'start station longitude']].copy()
    end_stations = data_filt[['end station id', 'end station latitude', 'end station longitude']].copy()
    start_stations.columns = station_cols
    end_stations.columns = station_cols

    # Keep the first coordinates seen for each station id, then order by id.
    stations_loc = (
        pd.concat([start_stations, end_stations], ignore_index=True)
        .drop_duplicates(subset='station id')
        .sort_values(by='station id', ascending=True)
        .reset_index(drop=True)
    )

    return data_filt, tc, stations_loc

For the current simple analysis, September is chosen as the best month because campuses are active and it is a period with relatively little rain.

In [199]:
# November 2016, window with start_hour=8 and end_hour=9
data_161, trip_counts_161, stations_loc_161 = initial_cleaning(
    month=11,
    year=2016,
    start_hour=8,
    end_hour=9,
)

201611-hubway-tripdata.zip already exists.


In [200]:
# November 2016, window with start_hour=7 and end_hour=8
data_162, trip_counts_162, stations_loc_162 = initial_cleaning(
    month=11,
    year=2016,
    start_hour=7,
    end_hour=8,
)

201611-hubway-tripdata.zip already exists.


In [201]:
# data
# Trip records of both 2016 windows, stacked
data_16 = pd.concat([data_161, data_162], ignore_index=True)

# Trip counts: align the two windows on station pair + coordinates, then add them up
pair_keys_16 = [
    'start station id', 'end station id',
    'start station latitude', 'start station longitude',
    'end station latitude', 'end station longitude',
]
trip_counts_16 = pd.merge(
    trip_counts_161, trip_counts_162,
    on=pair_keys_16, how='outer', suffixes=('_161', '_162'),
)

# A pair seen in only one window has no count for the other one: treat that as 0
for window_col_16 in ('trip_count_161', 'trip_count_162'):
    trip_counts_16[window_col_16] = trip_counts_16[window_col_16].fillna(0)

trip_counts_16['trip_count'] = trip_counts_16['trip_count_161'] + trip_counts_16['trip_count_162']

# Stations location: union of both windows without exact duplicate rows
stations_loc_16 = (
    pd.concat([stations_loc_161, stations_loc_162], ignore_index=True)
    .drop_duplicates(subset=['station id', 'latitude', 'longitude'])
)

In [202]:
# November 2018, window with start_hour=7 and end_hour=8
data_181, trip_counts_181, stations_loc_181 = initial_cleaning(
    month=11,
    year=2018,
    start_hour=7,
    end_hour=8,
)

201811-bluebikes-tripdata.zip already exists.


In [203]:
# November 2018, window with start_hour=8 and end_hour=9
data_182, trip_counts_182, stations_loc_182 = initial_cleaning(
    month=11,
    year=2018,
    start_hour=8,
    end_hour=9,
)

201811-bluebikes-tripdata.zip already exists.


In [204]:
# data
# Trip records of both 2018 windows, stacked
data_18 = pd.concat([data_181, data_182], ignore_index=True)

# Trip counts: align the two windows on station pair + coordinates, then add them up
pair_keys_18 = [
    'start station id', 'end station id',
    'start station latitude', 'start station longitude',
    'end station latitude', 'end station longitude',
]
trip_counts_18 = pd.merge(
    trip_counts_181, trip_counts_182,
    on=pair_keys_18, how='outer', suffixes=('_181', '_182'),
)

# A pair seen in only one window has no count for the other one: treat that as 0
for window_col_18 in ('trip_count_181', 'trip_count_182'):
    trip_counts_18[window_col_18] = trip_counts_18[window_col_18].fillna(0)

trip_counts_18['trip_count'] = trip_counts_18['trip_count_181'] + trip_counts_18['trip_count_182']

# Stations location: union of both windows without exact duplicate rows
stations_loc_18 = (
    pd.concat([stations_loc_181, stations_loc_182], ignore_index=True)
    .drop_duplicates(subset=['station id', 'latitude', 'longitude'])
)

In [309]:
# September 2016, window with start_hour=16 and end_hour=18 (rebinds the 2016 variables)
data_16, trip_counts_16, stations_loc_16 = initial_cleaning(
    month=9,
    year=2016,
    start_hour=16,
    end_hour=18,
)

201609-hubway-tripdata.zip already exists.


In [310]:
# September 2018, window with start_hour=16 and end_hour=18 (rebinds the 2018 variables).
# The station table must be bound to `stations_loc_18` (it was `stations_loc_181`): the
# network and community-map cells read `stations_loc_18`, which would otherwise still hold
# the stations from the earlier November cells.
data_18, trip_counts_18, stations_loc_18 = initial_cleaning(
    month=9,
    year=2018,
    start_hour=16,
    end_hour=18,
)

201809-bluebikes-tripdata.zip already exists.


### Network of the Bike Sharing

In [311]:
def create_network(trip_count_df, stations_locs, plot=False):
    '''
    Build weighted directed and undirected station graphs from station-pair trip counts.

    INPUT
    trip_count_df  : Dataframe containing 'start station id', 'end station id' and 'trip_count'
    stations_locs  : Dataframe containing 'station id', 'latitude' and 'longitude'
                     (only used for the node positions when plot is True)
    plot           : If True, draw the directed graph with nodes placed at their coordinates

    OUTPUT
    BSN     : Directed graph (nx.DiGraph); edge weight = trips from start to end station
    UNDIR   : Undirected graph (nx.Graph); edge weight = trips in both directions combined
    adj_df  : Dataframe of the adjacency matrix of the directed graph (rows = start station)
    '''
    # Create directed graph
    BSN = nx.DiGraph()

    # One edge per station pair. trip_count_df can hold several rows for the same pair
    # (e.g. when a station appears with more than one coordinate), and add_edge() on an
    # existing edge overwrites its weight, so repeated pairs are accumulated explicitly.
    for _, row in trip_count_df.iterrows():
        start, end, trips = row['start station id'], row['end station id'], row['trip_count']
        if BSN.has_edge(start, end):
            BSN[start][end]['weight'] += trips
        else:
            BSN.add_edge(start, end, weight=trips)

    # Adjacency matrix as a labelled dataframe (same node order on both axes)
    node_order = list(BSN.nodes())
    adj_mat = nx.adjacency_matrix(BSN, nodelist=node_order)
    adj_df = pd.DataFrame(adj_mat.todense().astype(int), index=node_order, columns=node_order)

    # Undirected network: every directed edge contributes its weight to the unordered pair,
    # so a pair with trips in both directions ends up with the sum of both weights.
    # (An earlier variant kept only pairs that have edges in both directions.)
    UNDIR = nx.Graph()
    for u, v, data in BSN.edges(data=True):
        if UNDIR.has_edge(u, v):
            UNDIR[u][v]['weight'] += data['weight']
        else:
            UNDIR.add_edge(u, v, weight=data['weight'])

    if plot:
        plt.figure(figsize=(17, 17))

        # Layout based on coordinates: x = longitude, y = latitude
        pos = {row['station id']: (row['longitude'], row['latitude']) for idx, row in stations_locs.iterrows()}
        nx.draw_networkx_nodes(BSN, pos, node_size=500, node_color='lightblue')
        nx.draw_networkx_edges(BSN, pos, edge_color='grey', arrows=True, alpha=0.2)

        plt.axis('off')
        plt.title('Network of the Bike Sharing')

        plt.show()

    return BSN, UNDIR, adj_df

In [312]:
# Graphs and adjacency matrix for the 2016 trips
BSN16, UNDIR16, adj16 = create_network(
    trip_count_df=trip_counts_16,
    stations_locs=stations_loc_16,
    plot=False,
)

In [313]:
# Graphs and adjacency matrix for the 2018 trips
BSN18, UNDIR18, adj18 = create_network(
    trip_count_df=trip_counts_18,
    stations_locs=stations_loc_18,
    plot=False,
)

In [314]:
# The adjacency weights should add up to the number of 2016 trip rows.
# NOTE(review): the message says "after" but this cell checks the 2016 data — confirm the label.
adjacency_matches_16 = adj16.sum().sum() == len(data_16)
print("Sanity check if the number of trip after data in the adjacency matrix match of that in the trips database:", adjacency_matches_16)

Sanity check if the number of trip after data in the adjacency matrix match of that in the trips database: True


In [315]:
# The adjacency weights should add up to the number of 2018 trip rows.
# NOTE(review): the message says "before" but this cell checks the 2018 data — confirm the label.
adjacency_matches_18 = adj18.sum().sum() == len(data_18)
print("Sanity check if the number of trip before data in the adjacency matrix match of that in the trips database:", adjacency_matches_18)

Sanity check if the number of trip before data in the adjacency matrix match of that in the trips database: True


## Plotting
### Option 1: With the marker cluster

<span style="color:red;">TODO: refactor this part into a function.</span>

In [316]:
# Switch for the two folium trip-map cells: they only build their maps when this is True.
Plotting = False

In [317]:
if Plotting:
    map_boston = folium.Map(location=[42.3601, -71.0589], tiles='CartoDB positron', zoom_start=13)

    marker_cluster = MarkerCluster().add_to(map_boston) # To cluster the station markers

    # Reference landmarks: (popup label, [latitude, longitude], icon colour)
    landmarks = [
        ('MIT', [42.3601, -71.0942], 'darkblue'),
        ('Harvard University', [42.3770, -71.1167], 'darkred'),
        ('Central Square', [42.3655, -71.1039], 'darkgreen'),
    ]
    for label, coords, icon_color in landmarks:
        folium.Marker(
            location=coords,
            popup=label,
            icon=folium.Icon(color=icon_color, icon='info-sign')
        ).add_to(map_boston)

    max_weight = trip_counts_18['trip_count'].max()
    min_weight = trip_counts_18['trip_count'].min()
    # Line widths are min-max scaled to [0, 1]; fall back to a span of 1 so that
    # identical counts for every pair do not cause a division by zero.
    weight_span = (max_weight - min_weight) or 1

    # Start station (green), end station (red) and one line per station pair
    for idx, row in trip_counts_18.iterrows():
        start_coords = [row['start station latitude'], row['start station longitude']]
        end_coords = [row['end station latitude'], row['end station longitude']]

        for coords, marker_color in ((start_coords, 'green'), (end_coords, 'red')):
            folium.CircleMarker(
                location=coords,
                radius=3,
                color=marker_color,
                fill=True,
                fill_color=marker_color,
                fill_opacity=0.7,
            ).add_to(marker_cluster)

        # Line represents the bike trips between the two stations
        line = folium.PolyLine(
            locations=[start_coords, end_coords],
            color='purple',
            weight=(row['trip_count'] - min_weight) / weight_span,
        )
        map_boston.add_child(line)

    # map_boston.save('map_boston.html')

    # A bare `map_boston` inside the `if` body is not the cell's last top-level expression,
    # so it would not be rendered; display() (provided by the IPython kernel) shows it.
    display(map_boston)

### Option 2: Without the cluster mark

In [318]:
if Plotting:
    map_boston = folium.Map(location=[42.3601, -71.0589], tiles='CartoDB positron', zoom_start=13)

    # Reference landmarks: (popup label, [latitude, longitude], icon colour)
    landmarks = [
        ('MIT', [42.3601, -71.0942], 'darkblue'),
        ('Harvard University', [42.3770, -71.1167], 'darkred'),
        ('Central Square', [42.3655, -71.1039], 'darkgreen'),
    ]
    for label, coords, icon_color in landmarks:
        folium.Marker(
            location=coords,
            popup=label,
            icon=folium.Icon(color=icon_color, icon='info-sign')
        ).add_to(map_boston)

    # NOTE(review): this cell read `trip_counts`, which no cell in the notebook defines
    # (NameError on a fresh kernel). `trip_counts_18` is used to match Option 1 — confirm
    # that this is the intended table.
    map_trip_counts = trip_counts_18

    max_weight = map_trip_counts['trip_count'].max()
    min_weight = map_trip_counts['trip_count'].min()
    # Line widths are min-max scaled to [0, 1]; fall back to a span of 1 so that
    # identical counts for every pair do not cause a division by zero.
    weight_span = (max_weight - min_weight) or 1

    for idx, row in map_trip_counts.iterrows():
        start_coords = [row['start station latitude'], row['start station longitude']]
        end_coords = [row['end station latitude'], row['end station longitude']]

        # Both ends of the trip are drawn as red dots directly on the map (no clustering)
        for coords in (start_coords, end_coords):
            folium.CircleMarker(
                location=coords,
                radius=3,
                color='red',
                fill=True,
                fill_color='red',
                fill_opacity=0.7,
            ).add_to(map_boston)

        # Line represents the bike trips between the two stations
        line = folium.PolyLine(
            locations=[start_coords, end_coords],
            color='purple',
            weight=(row['trip_count'] - min_weight) / weight_span,
        )
        map_boston.add_child(line)

    # map_boston.save('map_boston2.html')

    # A bare `map_boston` inside the `if` body is not the cell's last top-level expression,
    # so it would not be rendered; display() (provided by the IPython kernel) shows it.
    display(map_boston)

## Analysis 1 -- Community Comparison

#### Create Community - Louvain Algorithm

First, find the communities using the Louvain algorithm

Threshold : stopping criterion 
           If moving a node results in a modularity gain that is smaller than the threshold, 
           the algorithm concludes that it cannot meaningfully increase the modularity by further moves.
           When the modularity gain across all nodes is below this threshold, the algorithm stops.

Resolution : If resolution is less than 1, the algorithm favors larger communities. Greater than 1 favors smaller communities

In [319]:
# Function to find the best resolutions 
def com_finder (network,resolutions_range, plot=False):

    '''
    Run Louvain community detection for several resolutions and keep the best partition.

    INPUT
    network           : NetworkX graph with trip counts in the 'weight' edge attribute
    resolutions_range : Non-empty iterable of resolution values to try
    plot              : If True, plot modularity against resolution

    OUTPUT
    best_mod     : Value of the highest modularity
    best_res     : Resolution that produced the highest modularity (first one on ties)
    communities  : List of node sets, the partition found at best_res

    RAISES
    ValueError   : If resolutions_range is empty
    '''
    resolutions_range = list(resolutions_range)
    if not resolutions_range:
        raise ValueError("resolutions_range must contain at least one resolution value")

    modularity_scores = []
    partitions = []

    for res in resolutions_range:
        coms = nx.community.louvain_communities(network, weight='weight', resolution=res, threshold=1e-8, seed=123123)
        partitions.append(coms)
        modularity_scores.append(nx.community.modularity(network, coms, weight='weight'))

    best_idx = int(np.argmax(modularity_scores))
    best_mod = np.max(modularity_scores)
    best_res = resolutions_range[best_idx]
    print("Best resolution is on", best_res, "with the value of modularity is", np.round(best_mod,2))

    # The partition for best_res was already computed in the loop (same fixed seed),
    # so it is reused instead of running Louvain a second time.
    communities = partitions[best_idx]

    if plot:
        plt.figure(figsize=(10, 5))
        plt.plot(resolutions_range, modularity_scores, marker='o', color='red')
        plt.title('Modularity vs. Resolution')
        plt.xlabel('Resolution')
        plt.ylabel('Modularity Score')
        plt.grid(True)
        plt.show()

    return best_mod, best_res, communities

In [320]:
# Louvain communities of the 2016 network at resolution 1.
# For a sweep, use resolutions_range = [0.2,0.5,0.8,0.9,1,1.1,1.2,1.5,1.8] with plot=True.
resolutions_16 = [1]
bmod16, bres16, commun16 = com_finder(
    BSN16,
    resolutions_range=resolutions_16,
    plot=False,
)

Best resolution is on 1 with the value of modularity is 0.33


In [321]:
# Louvain communities of the 2018 network at resolution 1.
# Alternatives tried: a sweep over [0.2,0.5,0.8,0.9,1,1.1,1.2,1.5,1.8] with plot=True,
# and a single run at resolution 1.5.
resolutions_18 = [1]
bmod18, bres18, commun18 = com_finder(
    BSN18,
    resolutions_range=resolutions_18,
    plot=False,
)

Best resolution is on 1 with the value of modularity is 0.32


### Plotting of the communities

In [322]:
# Polyline vertices, as (latitude, longitude) pairs, for Galileo Galilei Way and Broadway Street.
# Both lists start at the same point. They are only referenced by commented-out
# folium.PolyLine calls in plot_community.
gal_way = [(42.364808, -71.089398), (42.365852, -71.089163),(42.366154, -71.088719),(42.366319, -71.088180)]
bdwy_str = [(42.364808, -71.089398), (42.364037, -71.087621)]

In [323]:
# Polyline vertices, as (latitude, longitude) pairs, for Cambridge Street and Oxford Street.
# These are the two streets passed to plot_community and drawn on the community maps.
cam_str = [(42.373918, -71.102282),(42.374924, -71.110130),(42.375454, -71.114163)]
ox_str = [(42.386056, -71.116185),(42.385192, -71.116162),(42.384312, -71.116105),(42.383633, -71.116101),(42.378770, -71.116624),(42.376446, -71.115758)]

In [324]:
# Polyline vertices, as (latitude, longitude) pairs, for Beacon Street.
# Only referenced by a commented-out folium.PolyLine call in plot_community.
beacon_str = [(42.386197, -71.116081),(42.384711, -71.114328),(42.383195, -71.112392),(42.380188, -71.108679),(42.378308, -71.106383),
              (42.374322, -71.101545)]

In [325]:
# Palette for the community maps: community i gets colors[i % len(colors)]
# (see create_community_color_df). plot_community reads this module-level name.
# NOTE(review): this rebinds `colors`, which the import cell bound to `matplotlib.colors`;
# after this cell the module is no longer reachable under that name.
colors = [
    'red', 'blue', 'green', 'yellow', 'purple', 'orange', 'pink', 'gray', 'brown', 'cyan',
     'magenta', 'lime', 'maroon', 'olive', 'navy', 'teal', 'aqua', 'fuchsia', 'silver'
]

In [326]:
def create_community_color_df(communities, colors):
    """
    Tabulate, for every node, the community it belongs to and that community's colour.

    INPUT
    communities : Iterable of sets; each set holds the node IDs of one community.
    colors      : List of colour strings; reused cyclically if there are more
                  communities than colours.

    OUTPUT
    community_color_df : DataFrame with columns 'Node', 'Community' and 'Color'.
                         Communities are numbered in order of their smallest node ID,
                         and nodes are listed in ascending order within a community.
    """
    # Number the communities by their smallest member
    ordered = sorted(communities, key=min)
    palette_size = len(colors)

    records = [
        (member, label, colors[label % palette_size])
        for label, group in enumerate(ordered)
        for member in sorted(group)
    ]
    community_color_df = pd.DataFrame(records, columns=['Node', 'Community', 'Color'])

    # Node IDs that arrive as floats (e.g. 88.0) are stored as integers
    node_col = community_color_df['Node']
    if pd.api.types.is_float_dtype(node_col):
        community_color_df['Node'] = node_col.astype(int)

    return community_color_df

In [327]:
def plot_community(communities, stations_locs_df, cam_str,ox_str):
    '''
    Draw the stations on a folium map, coloured by community, with two streets overlaid.

    INPUT
    communities        : List of sets containing the stations grouped into communities
    stations_locs_df   : DataFrame with 'station id', 'latitude' and 'longitude' columns
    cam_str            : List of coordinates drawn as a blue line (Cambridge Street)
    ox_str             : List of coordinates drawn as an orange line (Oxford Street)

    OUTPUT
    community_color_df : DataFrame mapping each node to its community and color
    map_boston         : Folium map object with the plotted stations and streets

    The palette is taken from the module-level `colors` list. Stations that are in
    no community are drawn in black.
    '''
    # Node -> colour lookup built from the community table
    community_color_df = create_community_color_df(communities, colors)
    node_to_color = dict(zip(community_color_df['Node'], community_color_df['Color']))

    map_boston = folium.Map(location=[42.3601, -71.0589], tiles='CartoDB positron', zoom_start=11)

    # One circle per station, coloured by its community
    for _, station in stations_locs_df.iterrows():
        station_id = station['station id']
        station_color = node_to_color.get(station_id, "black")

        marker = folium.CircleMarker(
            location=[station['latitude'], station['longitude']],
            popup=f'Station ID: {station_id}',
            radius=5,
            color=station_color,
            fill=True,
            fill_color=station_color,
            fill_opacity=0.7
        )
        marker.add_to(map_boston)

    # Street overlays. Galileo Galilei Way, Broadway Street and Beacon Street
    # were drawn here previously and are currently disabled.
    for street_coords, street_color in ((cam_str, 'blue'), (ox_str, 'orange')):
        folium.PolyLine(street_coords, color=street_color, weight=5, opacity=1).add_to(map_boston)

    return community_color_df, map_boston

In [328]:
# Community map for 2016, with Cambridge Street and Oxford Street overlaid
comm_col_16, map_16 = plot_community(
    communities=commun16,
    stations_locs_df=stations_loc_16,
    cam_str=cam_str,
    ox_str=ox_str,
)
map_16

In [329]:
# Community map for 2018, with Cambridge Street and Oxford Street overlaid
comm_col_18, map_18 = plot_community(
    communities=commun18,
    stations_locs_df=stations_loc_18,
    cam_str=cam_str,
    ox_str=ox_str,
)
map_18

In [65]:
# Difference in the number of detected communities (2018 minus 2016)
community_delta = len(commun18) - len(commun16)
print('The number of communities are changing by', community_delta)

The number of communities are changing by 2


## Analysis 2 -- Street effects on trips between stations that are likely to pass along the street

- Cambridge Street

In this case, the stations on the right side of the street are 88, 95, and 225 (2018 only), while the stations on the left side are 89, 104, 110, and 108. Of particular interest are the trips between these two groups.

- Oxford Street

In this case, the stations on the south side of the street are 110 and 108, while the stations on the north side are 99, 176, and 115. Of particular interest are the trips between these two groups.

In [31]:
# Change in the total number of trips (2018 minus 2016)
total_trips_16 = np.sum(trip_counts_16['trip_count'])
total_trips_18 = np.sum(trip_counts_18['trip_count'])
print(' The number of trips total increases by', total_trips_18 - total_trips_16)

 The number of trips total increases by 2567.0


In [66]:
# Station groups: sect_1 / sect_2 are the two sides for Cambridge Street,
# sect_3 / sect_4 the two sides for Oxford Street.
sect_1 = [88,95,225, 90]
sect_2 = [89,104,108,110]
sect_3 = [110,108]
sect_4 = [99,115,176]


def _trips_between(trips, origins, destinations):
    """Rows of `trips` that start at one of `origins` and end at one of `destinations`."""
    starts_at_origin = trips['start station id'].isin(origins)
    ends_at_destination = trips['end station id'].isin(destinations)
    return trips.loc[starts_at_origin & ends_at_destination]


# 2016 uses only the first two stations of sect_1 (88 and 95)
sect_1_2016 = sect_1[:2]
trip_sect1_16 = _trips_between(data_16, sect_1_2016, sect_2)
trip_sect2_16 = _trips_between(data_16, sect_2, sect_1_2016)
trip_sect3_16 = _trips_between(data_16, sect_4, sect_3)
trip_sect4_16 = _trips_between(data_16, sect_3, sect_4)

# 2018 uses the full sect_1 list
trip_sect1_18 = _trips_between(data_18, sect_1, sect_2)
trip_sect2_18 = _trips_between(data_18, sect_2, sect_1)
trip_sect3_18 = _trips_between(data_18, sect_4, sect_3)
trip_sect4_18 = _trips_between(data_18, sect_3, sect_4)

In [67]:
## Number of trips analysis
# Total trips over the four section selections, per year
sections_16 = (trip_sect1_16, trip_sect2_16, trip_sect3_16, trip_sect4_16)
sections_18 = (trip_sect1_18, trip_sect2_18, trip_sect3_18, trip_sect4_18)
trip_sect_16 = sum(len(section) for section in sections_16)
trip_sect_18 = sum(len(section) for section in sections_18)

dif_trip = trip_sect_18 - trip_sect_16
print('The total number of trip changes by', dif_trip ,'trips')

The total number of trip changes by -38 trips


In [68]:
def _mean_duration_by_pair(trips):
    """Mean 'tripduration' per (start, end) station pair, with every column cast to int."""
    pair_means = trips.groupby(['start station id', 'end station id'])['tripduration'].mean()
    return pair_means.reset_index().astype(int)


# 2016: one table per section selection, then stacked
a1, a2, a3, a4 = (
    _mean_duration_by_pair(section)
    for section in (trip_sect1_16, trip_sect2_16, trip_sect3_16, trip_sect4_16)
)
td16 = (
    pd.concat([a1, a2, a3, a4], ignore_index=True)
    .rename(columns={'tripduration': 'avg duration 16'})
)

# 2018: same treatment
b1, b2, b3, b4 = (
    _mean_duration_by_pair(section)
    for section in (trip_sect1_18, trip_sect2_18, trip_sect3_18, trip_sect4_18)
)
td18 = (
    pd.concat([b1, b2, b3, b4], ignore_index=True)
    .rename(columns={'tripduration': 'avg duration 18'})
)

In [69]:
# Line up the 2016 and 2018 averages per station pair; a pair missing in one year gets 0 there
td = td16.merge(td18, on=['start station id', 'end station id'], how='outer')
for duration_col in ('avg duration 16', 'avg duration 18'):
    td[duration_col] = td[duration_col].fillna(0)


def _duration_change(pair_row):
    """2016 average minus 2018 average, or 0 unless the pair has trips in both years."""
    dur_16 = pair_row['avg duration 16']
    dur_18 = pair_row['avg duration 18']
    if dur_16 > 0 and dur_18 > 0:
        return dur_16 - dur_18
    return 0


# Only count the changes when the trips exist in both years
td['changes_duration'] = td.apply(_duration_change, axis=1)
td

Unnamed: 0,start station id,end station id,avg duration 16,avg duration 18,changes_duration
0,88,108,345.0,397.0,-52.0
1,88,110,314.0,310.0,4.0
2,90,108,0.0,962.0,0.0
3,90,110,0.0,873.0,0.0
4,95,89,604.0,0.0,0.0
5,95,108,635.0,667.0,-32.0
6,95,110,486.0,560.0,-74.0
7,99,108,428.0,401.0,27.0
8,99,110,520.0,373.0,147.0
9,108,88,563.0,0.0,0.0


In [70]:
# Sum of the per-pair changes divided by the number of pairs with a non-zero change.
# NOTE(review): the message says "Total" but the value printed is this average — confirm the wording.
pairs_with_change = sum(td['changes_duration'] != 0)
avg_duration_change = np.sum(td['changes_duration']) / pairs_with_change
print('Total duration changes on the trips are', np.round(avg_duration_change, 2), 'seconds')

Total duration changes on the trips are -6.27 seconds


8 April 24
Example
- Garden Street, Cambridge, MA 
- Mount Auburn, Cambridge, MA 

- Task:

1. Clean up (DONE)
	1. eliminate self loop
	2. business days only, certain activity hour 
	3. eliminate trips shorter than 60 secs and longer than 1 hr
	4. take data from subscriber/member only
    
2. Locate new bike lane (DONE)
3. Compare before after (clustering use Louvain, connections) -- package louvain community, heat diffusion 
4. Maybe email the people asking for data (new bike lane, when completed) (DONE)
5. Removing nodes/edge to see if there is a significant change