# Data modelling on Bike Sharing data in London

Data aquired from: https://www.kaggle.com/edenau/london-bike-sharing-system-data  
Data project initiated: 25/01/2019  
Author: Sedar Olmez

Data modelling:  
    GDS1: Data Gathering, Preparation and Exploration.  
    GDS2: Data Representation and Transformation.  
    GDS3: Computing with Data.   
    GDS4: Data Visualisation and Presentation.   
    GDS5: Data Modelling.   
    GDS6: Science about Data Science. 

Assessment:
![Assessment](assessment.png)

In [44]:
# Libraries
from __future__ import print_function
import matplotlib.pyplot as plt
import seaborn as sea
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor as DTR

In [45]:
%%time
# Load up the journeys.csv data into dataframes
dataframe_journeys = pd.read_csv('data/journeys.csv')
dataframe_journeys.columns = ['journey duration', 'journey ID', 'End date', 'End month', 'End year', 
                   'End hour', 'End minute', 'End station ID', 'Start date', 'Start month', 'Start year', 
                   'Start hour', 'Start minute', 'Start station ID']
dataframe_journeys.dropna()

CPU times: user 3.01 s, sys: 420 ms, total: 3.43 s
Wall time: 1.82 s


In [46]:
dataframe_journeys

Unnamed: 0,journey duration,journey ID,End date,End month,End year,End hour,End minute,End station ID,Start date,Start month,Start year,Start hour,Start minute,Start station ID
0,2040.0,953,19,9,17,18,0,478,19,9,17,17,26,251
1,1800.0,12581,19,9,17,15,21,122,19,9,17,14,51,550
2,1140.0,1159,15,9,17,17,1,639,15,9,17,16,42,212
3,420.0,2375,14,9,17,12,16,755,14,9,17,12,9,163
4,1200.0,14659,13,9,17,19,33,605,13,9,17,19,13,36
5,1320.0,2351,14,9,17,14,53,514,14,9,17,14,31,589
6,720.0,7252,17,9,17,17,12,484,17,9,17,17,0,478
7,720.0,9782,17,9,17,17,12,484,17,9,17,17,0,478
8,540.0,13500,15,9,17,13,42,367,15,9,17,13,33,153
9,960.0,11205,15,9,17,16,3,350,15,9,17,15,47,396


In [47]:
list(dataframe_journeys)

['journey duration',
 'journey ID',
 'End date',
 'End month',
 'End year',
 'End hour',
 'End minute',
 'End station ID',
 'Start date',
 'Start month',
 'Start year',
 'Start hour',
 'Start minute',
 'Start station ID']

In [48]:
dataframe_journeys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1542844 entries, 0 to 1542843
Data columns (total 14 columns):
journey duration    1542844 non-null float64
journey ID          1542844 non-null int64
End date            1542844 non-null int64
End month           1542844 non-null int64
End year            1542844 non-null int64
End hour            1542844 non-null int64
End minute          1542844 non-null int64
End station ID      1542844 non-null int64
Start date          1542844 non-null int64
Start month         1542844 non-null int64
Start year          1542844 non-null int64
Start hour          1542844 non-null int64
Start minute        1542844 non-null int64
Start station ID    1542844 non-null int64
dtypes: float64(1), int64(13)
memory usage: 164.8 MB


In [49]:
stations_dataframe = pd.read_csv('data/stations.csv')
stations_dataframe.columns = ['Station ID', 'Capacity', 'Latitude', 'Longitude', 'Station Name']
stations_dataframe

Unnamed: 0,Station ID,Capacity,Latitude,Longitude,Station Name
0,1,19,51.529163,-0.109970,"River Street , Clerkenwell"
1,2,37,51.499606,-0.197574,"Phillimore Gardens, Kensington"
2,3,32,51.521283,-0.084605,"Christopher Street, Liverpool Street"
3,4,23,51.530059,-0.120973,"St. Chad's Street, King's Cross"
4,5,27,51.493130,-0.156876,"Sedding Street, Sloane Square"
5,6,18,51.518117,-0.144228,"Broadcasting House, Marylebone"
6,7,16,51.534300,-0.168074,"Charlbert Street, St. John's Wood"
7,8,18,51.528341,-0.170134,"Lodge Road, St. John's Wood"
8,9,19,51.507385,-0.096440,"New Globe Walk, Bankside"
9,10,18,51.505974,-0.092754,"Park Street, Bankside"


In [50]:
stations_dataframe.dropna()

Unnamed: 0,Station ID,Capacity,Latitude,Longitude,Station Name
0,1,19,51.529163,-0.109970,"River Street , Clerkenwell"
1,2,37,51.499606,-0.197574,"Phillimore Gardens, Kensington"
2,3,32,51.521283,-0.084605,"Christopher Street, Liverpool Street"
3,4,23,51.530059,-0.120973,"St. Chad's Street, King's Cross"
4,5,27,51.493130,-0.156876,"Sedding Street, Sloane Square"
5,6,18,51.518117,-0.144228,"Broadcasting House, Marylebone"
6,7,16,51.534300,-0.168074,"Charlbert Street, St. John's Wood"
7,8,18,51.528341,-0.170134,"Lodge Road, St. John's Wood"
8,9,19,51.507385,-0.096440,"New Globe Walk, Bankside"
9,10,18,51.505974,-0.092754,"Park Street, Bankside"


In [51]:
list(stations_dataframe)

['Station ID', 'Capacity', 'Latitude', 'Longitude', 'Station Name']

In [52]:
stations_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 773 entries, 0 to 772
Data columns (total 5 columns):
Station ID      773 non-null int64
Capacity        773 non-null int64
Latitude        773 non-null float64
Longitude       773 non-null float64
Station Name    773 non-null object
dtypes: float64(2), int64(2), object(1)
memory usage: 30.3+ KB


In [53]:
dataframe_journeys.head()

Unnamed: 0,journey duration,journey ID,End date,End month,End year,End hour,End minute,End station ID,Start date,Start month,Start year,Start hour,Start minute,Start station ID
0,2040.0,953,19,9,17,18,0,478,19,9,17,17,26,251
1,1800.0,12581,19,9,17,15,21,122,19,9,17,14,51,550
2,1140.0,1159,15,9,17,17,1,639,15,9,17,16,42,212
3,420.0,2375,14,9,17,12,16,755,14,9,17,12,9,163
4,1200.0,14659,13,9,17,19,33,605,13,9,17,19,13,36


In [61]:
# Convert date columns from float to integer
column_date = ['End date', 'End month', 'End year']
dataframe_journeys[column_date] = dataframe_journeys[column_date].astype(int)
dataframe_journeys.astype(int)

Unnamed: 0,journey duration,journey ID,End date,End month,End year,End hour,End minute,End station ID,Start date,Start month,Start year,Start hour,Start minute,Start station ID
0,2040,953,19,9,17,18,0,478,19,9,17,17,26,251
1,1800,12581,19,9,17,15,21,122,19,9,17,14,51,550
2,1140,1159,15,9,17,17,1,639,15,9,17,16,42,212
3,420,2375,14,9,17,12,16,755,14,9,17,12,9,163
4,1200,14659,13,9,17,19,33,605,13,9,17,19,13,36
5,1320,2351,14,9,17,14,53,514,14,9,17,14,31,589
6,720,7252,17,9,17,17,12,484,17,9,17,17,0,478
7,720,9782,17,9,17,17,12,484,17,9,17,17,0,478
8,540,13500,15,9,17,13,42,367,15,9,17,13,33,153
9,960,11205,15,9,17,16,3,350,15,9,17,15,47,396


In [62]:
# Merge the date columns into a single column (data clean-up)
from datetime import datetime
dataframe_journeys['Date'] = dataframe_journeys.apply(lambda row: datetime(
                              row['End date'], row['End month'], row['End year']), axis=1)

TypeError: ('integer argument expected, got float', u'occurred at index 0')

In [None]:
columns_of_interest = ['journey duration', '']
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sea.set_palette(flatui)
sea.pairplot(dataframe_journeys)
plt.tight_layout()
plt.show()