# Data modelling on Bike Sharing data in London

Data aquired from: https://www.kaggle.com/edenau/london-bike-sharing-system-data  
Data project initiated: 25/01/2019  
Author: Sedar Olmez

Data modelling:  
    GDS1: Data Gathering, Preparation and Exploration.  
    GDS2: Data Representation and Transformation.  
    GDS3: Computing with Data.   
    GDS4: Data Visualisation and Presentation.   
    GDS5: Data Modelling.   
    GDS6: Science about Data Science. 

Assessment:
![Assessment](assessment.png)

In [36]:
# Libraries
from __future__ import print_function
import matplotlib.pyplot as plt
import seaborn as sea
import pandas as pd
import numpy as np
from datetime import date
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor as DTR

In [37]:
%%time
# Load up the journeys.csv dataset into dataframes
dataframe_journeys = pd.read_csv('data/journeys.csv')
dataframe_journeys.columns = ['journey duration', 'journey ID', 'End date', 'End month', 'End year', 
                   'End hour', 'End minute', 'End station ID', 'Start date', 'Start month', 'Start year', 
                   'Start hour', 'Start minute', 'Start station ID']
dataframe_journeys.dropna()

CPU times: user 3.03 s, sys: 373 ms, total: 3.41 s
Wall time: 1.8 s


In [38]:
dataframe_journeys

Unnamed: 0,journey duration,journey ID,End date,End month,End year,End hour,End minute,End station ID,Start date,Start month,Start year,Start hour,Start minute,Start station ID
0,2040.0,953,19,9,17,18,0,478,19,9,17,17,26,251
1,1800.0,12581,19,9,17,15,21,122,19,9,17,14,51,550
2,1140.0,1159,15,9,17,17,1,639,15,9,17,16,42,212
3,420.0,2375,14,9,17,12,16,755,14,9,17,12,9,163
4,1200.0,14659,13,9,17,19,33,605,13,9,17,19,13,36
5,1320.0,2351,14,9,17,14,53,514,14,9,17,14,31,589
6,720.0,7252,17,9,17,17,12,484,17,9,17,17,0,478
7,720.0,9782,17,9,17,17,12,484,17,9,17,17,0,478
8,540.0,13500,15,9,17,13,42,367,15,9,17,13,33,153
9,960.0,11205,15,9,17,16,3,350,15,9,17,15,47,396


In [39]:
list(dataframe_journeys)

['journey duration',
 'journey ID',
 'End date',
 'End month',
 'End year',
 'End hour',
 'End minute',
 'End station ID',
 'Start date',
 'Start month',
 'Start year',
 'Start hour',
 'Start minute',
 'Start station ID']

In [40]:
dataframe_journeys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1542844 entries, 0 to 1542843
Data columns (total 14 columns):
journey duration    1542844 non-null float64
journey ID          1542844 non-null int64
End date            1542844 non-null int64
End month           1542844 non-null int64
End year            1542844 non-null int64
End hour            1542844 non-null int64
End minute          1542844 non-null int64
End station ID      1542844 non-null int64
Start date          1542844 non-null int64
Start month         1542844 non-null int64
Start year          1542844 non-null int64
Start hour          1542844 non-null int64
Start minute        1542844 non-null int64
Start station ID    1542844 non-null int64
dtypes: float64(1), int64(13)
memory usage: 164.8 MB


In [41]:
#stations_dataframe = pd.read_csv('data/stations.csv')
#stations_dataframe.columns = ['Station ID', 'Capacity', 'Latitude', 'Longitude', 'Station Name']
#stations_dataframe

In [42]:
#stations_dataframe.dropna()

In [43]:
#list(stations_dataframe)

In [44]:
#stations_dataframe.info()

In [45]:
dataframe_journeys.head()

Unnamed: 0,journey duration,journey ID,End date,End month,End year,End hour,End minute,End station ID,Start date,Start month,Start year,Start hour,Start minute,Start station ID
0,2040.0,953,19,9,17,18,0,478,19,9,17,17,26,251
1,1800.0,12581,19,9,17,15,21,122,19,9,17,14,51,550
2,1140.0,1159,15,9,17,17,1,639,15,9,17,16,42,212
3,420.0,2375,14,9,17,12,16,755,14,9,17,12,9,163
4,1200.0,14659,13,9,17,19,33,605,13,9,17,19,13,36


In [57]:
# Function developed to parse the date segments into a new column.
def date_parser(year, month, day, new_column_name):
    y = dataframe_journeys[year].astype(str)     
    m = dataframe_journeys[month].fillna(1).astype(int).astype(str)
    d = dataframe_journeys[day].fillna(1).astype(int).astype(str)
    x = y +' '+ m +' '+ d
    dataframe_journeys[new_column_name] = pd.to_datetime(x)

In [53]:
# We use the newly created date_parser method to create a new end date column.
date_parser('End year', 'End month', 'End date', 'End Date')

In [54]:
# We use the newly created date_parser method to create a new start date column.
date_parser('Start year', 'Start month', 'Start date', 'Start Date')

In [55]:
dataframe_journeys.head()

Unnamed: 0,journey duration,journey ID,End date,End month,End year,End hour,End minute,End station ID,Start date,Start month,Start year,Start hour,Start minute,Start station ID,End Date,Start Date
0,2040.0,953,19,9,17,18,0,478,19,9,17,17,26,251,2019-09-17,2019-09-17
1,1800.0,12581,19,9,17,15,21,122,19,9,17,14,51,550,2019-09-17,2019-09-17
2,1140.0,1159,15,9,17,17,1,639,15,9,17,16,42,212,2015-09-17,2015-09-17
3,420.0,2375,14,9,17,12,16,755,14,9,17,12,9,163,2014-09-17,2014-09-17
4,1200.0,14659,13,9,17,19,33,605,13,9,17,19,13,36,2013-09-17,2013-09-17


In [58]:
columns_to_drop = ['End date', 'End month', 'End year', 'Start date', 'Start month', 'Start year']
dataframe_journeys.drop(columns_to_drop, axis = 1)

Unnamed: 0,journey duration,journey ID,End hour,End minute,End station ID,Start hour,Start minute,Start station ID,End Date,Start Date
0,2040.0,953,18,0,478,17,26,251,2019-09-17,2019-09-17
1,1800.0,12581,15,21,122,14,51,550,2019-09-17,2019-09-17
2,1140.0,1159,17,1,639,16,42,212,2015-09-17,2015-09-17
3,420.0,2375,12,16,755,12,9,163,2014-09-17,2014-09-17
4,1200.0,14659,19,33,605,19,13,36,2013-09-17,2013-09-17
5,1320.0,2351,14,53,514,14,31,589,2014-09-17,2014-09-17
6,720.0,7252,17,12,484,17,0,478,2017-09-17,2017-09-17
7,720.0,9782,17,12,484,17,0,478,2017-09-17,2017-09-17
8,540.0,13500,13,42,367,13,33,153,2015-09-17,2015-09-17
9,960.0,11205,16,3,350,15,47,396,2015-09-17,2015-09-17


In [46]:
#columns_of_interest = ['journey duration', '']
#flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
#sea.set_palette(flatui)
#sea.pairplot(dataframe_journeys)
#plt.tight_layout()
#plt.show()