# Exploring Journey Network

Having cleaned the journey data, we will now do some initial exploration and visualisation of the network. 

To begin with, I will just take a random sample of the dataset, as usual. Ideally, I will eventually do a **full scan** of the journeys and extract the relevant summaries. 

In [3]:
import pickle
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [5]:
# Input locations, relative to the notebook. Forward slashes are used so the
# paths resolve on Windows as well as POSIX systems.
journeys_path = '../data/cycle_journeys/JourneysDataCombined_CLEANSED.csv'

# Lookup dictionaries keyed by bike point id (per the file names).
# NOTE(review): pickle.load can execute arbitrary code; only load pickles that
# were produced locally / come from a trusted source.
# The `with` blocks make sure the file handles are closed after loading.
with open('../data/cycle_journeys/bikepointid_to_commonname.p', 'rb') as fh:
    bp_to_name = pickle.load(fh)
with open('../data/cycle_journeys/bikepointid_to_latlongs.p', 'rb') as fh:
    bp_to_latlon = pickle.load(fh)

## Creating a sample df and network

In [6]:
# Read roughly 1% of the journeys by randomly skipping rows while parsing.
# The seed is fixed so the same rows are drawn on every run.
p = 0.01
random.seed(16)

df_samp = pd.read_csv(
    journeys_path,
    header=0,
    sep=',',
    # Row 0 (the header) is always kept and consumes no random draw; any other
    # row is kept only when its draw is <= p.
    skiprows=lambda i: not (i <= 0 or random.random() <= p),
    parse_dates=['Start Date', 'End Date'],
    # NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0;
    # kept as-is because the notebook output indicates an older pandas.
    infer_datetime_format=True,
)

In [7]:
# Index the sample by rental id.
# Guarded and non-inplace so that re-running this cell is a no-op instead of
# raising KeyError once 'Rental Id' has already been moved into the index.
if 'Rental Id' in df_samp.columns:
    df_samp = df_samp.set_index('Rental Id')

In [8]:
# Print the sampled frame's column dtypes, non-null counts and memory usage.
df_samp.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 791827 entries, 50754299.0 to 47943912.0
Data columns (total 6 columns):
Duration           774933 non-null float64
Bike Id            779402 non-null float64
End Date           774933 non-null datetime64[ns]
EndStation Id      791827 non-null int64
Start Date         779405 non-null datetime64[ns]
StartStation Id    791827 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(2)
memory usage: 42.3 MB


Group the dataframe into a weighted edge list.

In [9]:
# Give every sampled journey a unit weight, so that summing 'weight' per
# (start, end) station pair counts the journeys on that edge.
df_samp['weight'] = 1

In [10]:
# Collapse journeys into one row per (start station, end station) pair, with
# 'weight' holding the summed unit weights, i.e. the journey count.
edge_weights = (
    df_samp
    .groupby(['StartStation Id', 'EndStation Id'])['weight']
    .sum()
    .reset_index()
)

Drop edges to/from -1

In [11]:
# Remove, in place, every edge whose start or end station id is -1.
edge_weights.drop(
    edge_weights.index[
        edge_weights[['StartStation Id', 'EndStation Id']].eq(-1).any(axis=1).values
    ],
    inplace=True,
)

Convert to Network

In [12]:
# Directed graph: one edge per (start, end) station pair, carrying the
# journey count as its 'weight' attribute.
G = nx.from_pandas_edgelist(
    edge_weights,
    source='StartStation Id',
    target='EndStation Id',
    edge_attr='weight',
    create_using=nx.DiGraph,
)

## Defining a map

In [13]:
# Transpose the coordinate tuples into one list per axis. The last axis is
# taken as the longitudes and the one before it as the latitudes; `x` is
# left with those two lists popped off.
x = [list(axis) for axis in zip(*bp_to_latlon.values())]
longs, lats = x.pop(), x.pop()

# Bounding box of all stations: (min lon, max lon, min lat, max lat).
boundary_box = (min(longs), max(longs), min(lats), max(lats))

In [14]:
# Display the bounding box as (min lon, max lon, min lat, max lat).
boundary_box

(-0.236769, -0.002275, 51.454752, 51.549369)

In [15]:
# [min, max] extent along each axis of the station coordinates.
latrange, lonrange = ([min(axis), max(axis)] for axis in (lats, longs))

In [18]:
# Plotly's sample US airport traffic dataset, fetched over the network.
# It is only used for the example airport/flight-path figure that the
# cycle-data map further down is modelled on.
df_airports = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')
# Show the first rows to inspect the available columns.
df_airports.head()

Unnamed: 0,iata,airport,city,state,country,lat,long,cnt
0,ORD,Chicago O'Hare International,Chicago,IL,USA,41.979595,-87.904464,25129
1,ATL,William B Hartsfield-Atlanta Intl,Atlanta,GA,USA,33.640444,-84.426944,21925
2,DFW,Dallas-Fort Worth International,Dallas-Fort Worth,TX,USA,32.895951,-97.0372,20662
3,PHX,Phoenix Sky Harbor International,Phoenix,AZ,USA,33.434167,-112.008056,17290
4,DEN,Denver Intl,Denver,CO,USA,39.858408,-104.667002,13781


In [17]:
# Plotly's sample flight-path dataset (one row per route, with start/end
# coordinates and a count), fetched over the network for the example figure.
df_flight_paths = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_aa_flight_paths.csv')
# Show the first rows to inspect the available columns.
df_flight_paths.head()

Unnamed: 0,start_lat,start_lon,end_lat,end_lon,airline,airport1,airport2,cnt
0,32.895951,-97.0372,35.040222,-106.609194,AA,DFW,ABQ,444
1,41.979595,-87.904464,30.194533,-97.669872,AA,ORD,AUS,166
2,32.895951,-97.0372,41.938874,-72.683228,AA,DFW,BDL,162
3,18.439417,-66.001833,41.938874,-72.683228,AA,SJU,BDL,56
4,32.895951,-97.0372,33.562943,-86.75355,AA,DFW,BHM,168


In [19]:
# Example figure built from Plotly's sample datasets: airports as markers and
# flight paths as lines. It serves as the template for the cycle-journey map.
# `go` (plotly.graph_objects) is imported with the other imports at the top of
# the notebook.
fig = go.Figure()

# One marker per airport; hovering shows the airport name.
fig.add_trace(go.Scattergeo(
    locationmode='USA-states',
    lon=df_airports['long'],
    lat=df_airports['lat'],
    hoverinfo='text',
    text=df_airports['airport'],
    mode='markers',
    marker=dict(
        size=2,
        color='rgb(255, 0, 0)',
        line=dict(
            width=3,
            color='rgba(68, 68, 68, 0)'
        )
    )))

# The busiest route sets the opacity scale; computed once rather than on
# every loop iteration.
max_cnt = float(df_flight_paths['cnt'].max())

# One line trace per route, with opacity proportional to the route's count.
for path in df_flight_paths.itertuples(index=False):
    fig.add_trace(
        go.Scattergeo(
            locationmode='USA-states',
            lon=[path.start_lon, path.end_lon],
            lat=[path.start_lat, path.end_lat],
            mode='lines',
            line=dict(width=1, color='red'),
            opacity=float(path.cnt) / max_cnt,
        )
    )

fig.update_layout(
    title_text='Feb. 2011 American Airline flight paths<br>(Hover for airport names)',
    showlegend=False,
    geo=dict(
        scope='north america',
        projection_type='azimuthal equal area',
        showland=True,
        landcolor='rgb(243, 243, 243)',
        countrycolor='rgb(204, 204, 204)',
    ),
)

fig.show()

## Reproducing for Cycle Data

In [30]:
# One-column frame of station names, indexed by the keys of bp_to_name
# (bike point ids, going by the pickle's file name).
stn = pd.DataFrame.from_dict(bp_to_name, orient='index', columns=['Station Name'])

In [31]:
# Station coordinates indexed by the keys of bp_to_latlon; each value is
# split into a 'Lat' and a 'Long' column.
stloc = pd.DataFrame.from_dict(bp_to_latlon, orient='index', columns=['Lat','Long'])

In [41]:
# Combine names and coordinates on their shared index, keeping only the
# station ids present in both frames.
station_df = stn.join(stloc, how='inner')
station_df.head()

Unnamed: 0,Station Name,Lat,Long
1,"River Street , Clerkenwell",51.529163,-0.10997
2,"Phillimore Gardens, Kensington",51.499606,-0.197574
3,"Christopher Street, Liverpool Street",51.521283,-0.084605
4,"St. Chad's Street, King's Cross",51.530059,-0.120973
5,"Sedding Street, Sloane Square",51.49313,-0.156876


In [43]:
# Work on a copy: a plain assignment would alias `edge_weights`, so the
# columns added to `journey_df` afterwards would silently appear on
# `edge_weights` as well.
journey_df = edge_weights.copy()

In [44]:
# Attach each edge's endpoint coordinates by looking the station ids up in
# bp_to_latlon. The columns are added to journey_df in place; an id missing
# from the dictionary maps to NaN.
journey_df['start_coords'] = journey_df['StartStation Id'].map(bp_to_latlon)
journey_df['end_coords'] = journey_df['EndStation Id'].map(bp_to_latlon)

In [63]:
# Keep the 1000 heaviest edges (most journeys first) for plotting.
journey_df_top = journey_df.sort_values('weight', ascending=False).head(1000)

In [79]:
# Map of the cycle network: stations as markers, the most-travelled station
# pairs as lines whose opacity scales with journey count.
fig = go.Figure()

# One marker per station; hovering shows the station name.
fig.add_trace(go.Scattergeo(
    lon=station_df['Long'],
    lat=station_df['Lat'],
    hoverinfo='text',
    text=station_df['Station Name'],
    mode='markers',
    marker=dict(
        size=2,
        color='rgb(0, 0, 255)',
        line=dict(
            width=3,
            color='rgba(68, 68, 68, 0)'
        )
    )))

# The heaviest edge sets the opacity scale; computed once rather than on
# every loop iteration.
max_weight = float(journey_df_top['weight'].max())

# Coordinates are stored as (lat, lon) pairs, so index 1 is the longitude and
# index 0 the latitude.
for start, end, weight in zip(
    journey_df_top['start_coords'],
    journey_df_top['end_coords'],
    journey_df_top['weight'],
):
    fig.add_trace(
        go.Scattergeo(
            lon=[start[1], end[1]],
            lat=[start[0], end[0]],
            mode='lines',
            line=dict(width=2, color='red'),
            opacity=float(weight) / max_weight,
            hoverinfo='skip'
        )
    )

# Zoom the map automatically to the plotted locations.
fig.update_geos(fitbounds="locations")

fig.update_layout(showlegend=False)

fig.show()