# London Cycling Project

### Load datasets

In [1]:
# Import packages
import numpy as np
import pandas as pd
import json

In [2]:
# London data
# The three area-level count files are read in the order inner, outer, central
inner, outer, central = map(
    pd.read_csv,
    ('London/inner.csv', 'London/Outer London.csv', 'London/central.csv'),
)
# Spreadsheet read into `sites`
sites = pd.read_excel('London/Biking sites.xlsx')

In [3]:
# New York data
ny_count, ny_park = map(
    pd.read_csv,
    ('New York/Bicycle_Counts.csv', 'New York/Bicycle_Parking.csv'),
)
ny_routes = pd.read_excel('New York/Bicycle_Routes.xlsx')
# Sydney data
syd_site, syd_survey = map(
    pd.read_csv,
    ('Sydney/Bicycle_count_sites.csv', 'Sydney/Bicycle_count_surveys.csv'),
)

In [2]:
# JSON data (crossing file)
# Parse the file straight from the handle, then flatten each entry under
# 'features' into one row (nested keys become dotted column names)
with open('London/crossing.json', 'r') as crossing_file:
    crossing_json = json.load(crossing_file)
crossing = pd.json_normalize(crossing_json, record_path=['features'])
crossing.head()

Unnamed: 0,type,geometry.type,geometry.coordinates,properties.FEATURE_ID,properties.SVDATE,properties.CRS_SIGNAL,properties.CRS_SEGREG,properties.CRS_CYGAP,properties.CRS_PEDEST,properties.CRS_LEVEL,properties.BOROUGH,properties.PHOTO1_URL,properties.PHOTO2_URL
0,Feature,LineString,"[[-0.0813618204, 51.6368433881], [-0.081492329...",RWG150471,2017-09-17,True,False,False,False,False,Enfield,https://cycleassetimages.data.tfl.gov.uk/RWG15...,https://cycleassetimages.data.tfl.gov.uk/RWG15...
1,Feature,LineString,"[[-0.1237165558, 51.6516975845], [-0.123747692...",RWG150537,2017-09-27,True,False,False,False,False,Enfield,https://cycleassetimages.data.tfl.gov.uk/RWG15...,https://cycleassetimages.data.tfl.gov.uk/RWG15...
2,Feature,LineString,"[[-0.1097495774, 51.6155949778], [-0.109740965...",RWG187222,2017-09-28,True,True,False,False,False,Enfield,https://cycleassetimages.data.tfl.gov.uk/RWG18...,https://cycleassetimages.data.tfl.gov.uk/RWG18...
3,Feature,LineString,"[[-0.1019413172, 51.614532404], [-0.1018617852...",RWG187223,2017-09-28,True,False,False,False,False,Enfield,https://cycleassetimages.data.tfl.gov.uk/RWG18...,https://cycleassetimages.data.tfl.gov.uk/RWG18...
4,Feature,LineString,"[[-0.0457269989, 51.6270701469], [-0.045848418...",RWG187224,2018-01-19,True,False,False,False,False,Enfield,https://cycleassetimages.data.tfl.gov.uk/RWG18...,https://cycleassetimages.data.tfl.gov.uk/RWG18...


In [5]:
# Add columns
# Tag each frame with its survey area so rows stay identifiable once the
# three frames are concatenated
central['area'] = 'Central'
inner['area'] = 'Inner'
outer['area'] = 'Outer'

# Survey year = first whitespace-separated token of the survey-wave label.
# The vectorised .str accessor avoids one label lookup per row (which only
# works with a default 0..n-1 index) and yields NaN for missing or blank
# labels instead of raising.
central['Survey wave (year)'] = (
    central['Survey wave (calendar quarter)'].str.split().str[0]
)

In [6]:
# Join London dataframes
# Stack central, inner and outer rows (in that order) under a fresh 0..n-1 index
london = pd.concat([central, inner, outer], ignore_index=True)

In [7]:
# Get day of week and date from survey date column

# Abbreviated day name (text before ', ' in 'Survey date') -> English day name.
# Any abbreviation not listed here, including an empty one, maps to NaN.
day_mapping = {
    'lun': 'Monday',
    'mar': 'Tuesday',
    'mer': 'Wednesday',
    'jeu': 'Thursday',
    'ven': 'Friday',
    'sam': 'Saturday',
    'dim': 'Sunday',
}

# Split '<day>, <date>' into two columns in one vectorised pass.
# - A single-space entry is treated as missing.
# - Missing entries, or entries without the ', ' separator, give NaN parts
#   rather than raising.
# - reindex guarantees that columns 0 and 1 exist even if no row contains the
#   separator.
# Assignment is used instead of chained `inplace=True` calls, which act on a
# temporary copy under pandas copy-on-write and so would not update `london`.
survey_parts = (
    london['Survey date']
    .replace(' ', np.nan)
    .str.split(', ', expand=True)
    .reindex(columns=[0, 1])
)

# 'date' is added before 'day_of_week' to keep the resulting column order
london['date'] = survey_parts[1].replace('', np.nan)
london['day_of_week'] = survey_parts[0].map(day_mapping)

# The raw column is no longer needed once both parts have been extracted
london = london.drop(columns=['Survey date'])

In [8]:
# Drop columns that may be less important in analysis
london = london.drop(
    columns=[
        'Survey wave (calendar quarter)',
        'Equivalent financial quarter',
        'Location',
        'Time',
        'Direction',
        'Start minute',
        'Number of unknown cycles',
    ]
)

In [9]:
# Rename columns
# Short snake_case names for the columns kept for analysis
london = london.rename(
    columns={
        'Site ID': 'site',
        'Weather': 'weather',
        'Period': 'period',
        'Start hour': 'hour',
        'Number of private cycles': 'private_cycles',
        'Number of cycle hire bikes': 'hire_cycles',
        'Total cycles': 'total_cycles',
        'Survey wave (year)': 'year',
        'Number of male cycles': 'male_cycles',
        'Number of female cycles': 'female_cycles',
    }
)
# Remove leading/trailing whitespace from the year labels
london['year'] = london['year'].str.strip()

In [10]:
# Standardise weather
# Lower-case the free-text descriptions; missing descriptions become ''
london['weather'] = london['weather'].str.lower().fillna('')

# Keyword fragments matched as substrings of the lower-cased description
wet_list = ['wet', 'rain', 'drizz', 'shower', 'damp', 'mizzle', 'down pour', 'deluge', 'spit', ' shr']
good_list = ['sun', 'fine', 'fair', 'warm', 'hot', 'good', 'mild', 'clear', 'bright']
poor_list = ['cloud', 'overcast', 'cold', 'cool', 'wind', 'dull', 'dark', 'chill', 'hazy', 'foggy',
             'blustery', 'mist', 'snow', 'hail', 'sleet', 'storm', 'thunder']


def _mentions_any(text, keywords):
    """Return True when at least one keyword occurs as a substring of text."""
    return any(keyword in text for keyword in keywords)


def _weather_category(text):
    """Map one lower-cased weather description to a category label.

    Precedence: 'dry' together with a wet keyword -> 'mixed'; 'dry' alone ->
    'dry'; a wet keyword -> 'wet'; good and poor keywords together, or the
    literal text 'mixed' -> 'mixed'; then 'good'; then 'poor'; anything else
    -> 'unknown'.
    """
    has_wet = _mentions_any(text, wet_list)
    if 'dry' in text:
        return 'mixed' if has_wet else 'dry'
    if has_wet:
        return 'wet'

    has_good = _mentions_any(text, good_list)
    has_poor = _mentions_any(text, poor_list)
    if (has_good and has_poor) or text == 'mixed':
        return 'mixed'
    if has_good:
        return 'good'
    if has_poor:
        return 'poor'
    return 'unknown'


weather_cat = [_weather_category(description) for description in london['weather']]
london['weather'] = weather_cat

In [11]:
# Convert date data
# errors='coerce' turns unparseable or missing dates into NaT, so 'month' is
# NaN for those rows.
# NOTE(review): no explicit format/dayfirst is passed — confirm that ambiguous
# day/month values in the raw data are parsed as intended.
parsed_dates = pd.to_datetime(london['date'], errors='coerce')
london = london.assign(date=parsed_dates, month=parsed_dates.dt.month)

In [12]:
# Checking for unexpected values
# Print the frequency table of every column in turn
for column_name, column_values in london.items():
    print(column_values.value_counts())

# Checking null values
# Count of missing entries per column (displayed as the cell result)
london.isnull().sum()

CENCY008    3714
CENCY001    3712
CENCY136    3712
CENCY125    3712
CENCY126    3712
            ... 
INNCY479     640
INNCY492     640
INNCY455     640
INNCY463     640
INNCY502     640
Name: site, Length: 1258, dtype: int64
dry        1385589
wet         194965
good         37002
unknown      22362
poor         13044
mixed         4637
Name: weather, dtype: int64
Inter-peak (10:00-16:00)       621642
AM peak (07:00-10:00)          310824
PM peak (16:00-19:00)          310781
Evening (19:00-22:00)          310738
Early Morning (06:00-07:00)    103608
Name: period, dtype: int64
6.0     103608
7.0     103608
8.0     103608
9.0     103608
10.0    103608
11.0    103608
12.0    103608
13.0    103608
14.0    103605
15.0    103605
16.0    103604
18.0    103589
17.0    103588
19.0    103584
20.0    103580
21.0    103574
Name: hour, dtype: int64
0.0      200509
1.0      144984
2.0      117913
3.0       93606
4.0       80113
          ...  
476.0         1
362.0         1
428.0         1
512.0 

site                    0
weather                 0
period                  6
hour                    6
private_cycles     375724
hire_cycles        375724
total_cycles            0
area                    0
year               899436
male_cycles       1281939
female_cycles     1281939
date                14332
day_of_week         14332
month               14332
dtype: int64

In [13]:
# View dataset
london.head()

Unnamed: 0,site,weather,period,hour,private_cycles,hire_cycles,total_cycles,area,year,male_cycles,female_cycles,date,day_of_week,month
0,CENCY001,dry,Early Morning (06:00-07:00),6.0,0.0,0.0,0,Central,2014,,,2014-01-24,Friday,1.0
1,CENCY001,dry,Early Morning (06:00-07:00),6.0,15.0,0.0,15,Central,2014,,,2014-01-24,Friday,1.0
2,CENCY001,dry,Early Morning (06:00-07:00),6.0,35.0,0.0,35,Central,2014,,,2014-01-24,Friday,1.0
3,CENCY001,dry,Early Morning (06:00-07:00),6.0,59.0,2.0,61,Central,2014,,,2014-01-24,Friday,1.0
4,CENCY001,dry,AM peak (07:00-10:00),7.0,73.0,0.0,73,Central,2014,,,2014-01-24,Friday,1.0


### Next steps on the data:

- Look into the features that predict cycling levels and start creating visualisations
- Figure out how to open and apply JSON files

### Next steps to research:

- May need to find the coordinates of the count sites
- May need to find some vehicle count figures for comparison

In [16]:
# Start looking at data
# Mean of total_cycles for each grouping column.
# total_cycles is selected *before* aggregating: calling .mean() on the whole
# grouped frame would also try to average the text columns, which raises a
# TypeError on pandas >= 2.0 and computes means that are then thrown away.
for group_col in ['area', 'day_of_week', 'weather']:
    display(london.groupby(group_col)[['total_cycles']].mean())

Unnamed: 0_level_0,total_cycles
area,Unnamed: 1_level_1
Central,16.65304
Inner,6.196408
Outer,1.589695


Unnamed: 0_level_0,total_cycles
day_of_week,Unnamed: 1_level_1
Friday,8.897934
Monday,11.555405
Saturday,4.970703
Sunday,5.015363
Thursday,9.435291
Tuesday,9.975308
Wednesday,9.591853


Unnamed: 0_level_0,total_cycles
weather,Unnamed: 1_level_1
dry,10.195013
good,11.830388
mixed,10.252965
poor,7.820914
unknown,10.781728
wet,7.765127


In [15]:
# Investigate whether number of cyclists is related to:
# year, month, day of week, time of day, weather, central/inner/outer
# proximity to various cycling infrastructure 