In [1]:
# Import all the necessary packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
london_central = pd.read_csv('central_london_edit.csv')

london_central.head()

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0,,,
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15,,,
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35,,,
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61,,,
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73,,,


The original London Central data was edited in Excel. The original dataframe had day and date in one column with french abbreviation for the day. 

This information was split to clearly show dys of the week in English and then the dates are in a column of their own

## Cleaning Weather Column

Looking at the below code can see there is 283 different classifications for weather. Looking into the Excel file can see that there are a lot of Weather classifications some which are the same just spelt differently and some which are spelt incorrectly. This needs cleaning to categorise the data better and improve processes. 

In [3]:
london_central['Weather'].nunique()

283

In [4]:
# Change relevant values to Cloudy
london_central['Weather'] = london_central['Weather'].replace(['Cloudy/dry', 'Cloudy And Warm', 
                                                               'Dry/cloudy', 'Overcast', 
                                                          'Overcast And Dull','Partly Cloudy'], 
                                                         'Cloudy')

In [5]:
# Check that values have been changed
london_central[london_central['Weather'] == 'Cloudy And Warm']

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles,Unnamed: 14,Unnamed: 15,Unnamed: 16


In [6]:
# Change relevant values to Cold
london_central['Weather'] = london_central['Weather'].replace(['Cold Dry', 'Coldish', 'Cold/dry', 'Dry/cold',
                                                               'Dry & Cold','Fine Cold', 'Fine V Cold',
                                                              'Now Starts To Get Chilly', 'Very Cold/dry'], 
                                                         'Cold')

In [7]:
# Change relevant values to Damp
london_central['Weather'] = london_central['Weather'].replace(['Almost Dry'], 'Damp')

In [8]:
# Change relevant values to Dark
london_central['Weather'] = london_central['Weather'].replace(['Dark Sunny', 'Dry Dark', 'Dark Dry'],
                                                             'Dark')

In [9]:
# Change relevant values to Dry
london_central['Weather'] = london_central['Weather'].replace(['2 Snowflakes Otherwise Dry',
                                                              'A Bit Chilly At First', 'dry', 'Ddry',
                                                              'Drty', 'Dry & Mild', 'Dry 3/4 Dry',
                                                              'Dry And Fine', 'Dry And Overcast',
                                                              'Dry But Rain Threatening', 'Dry Fri',
                                                              'Dry Mon', 'Dry Thu', 'Dry Wed', 'Dry Y',
                                                              'Dry/good', 'Dry/sunny/cold', 'Drying Up',
                                                              'Dryish', 'Dull', 'Dy', 'Fair', 'Fine',
                                                              'Fine + Dry Chilly At First', 'Fine And Dry',
                                                              'Good/dry', 'Kdry', 'Mild', 'Rain Looking Likely',
                                                              'Rain Stopped', 'Rain Stopped-dry'], 'Dry')

In [10]:
# Change relevant values Dry & Rain
london_central['Weather'] = london_central['Weather'].replace(['Cold & Dry Early Rain Later',
                                                              'Dry - Rain', 'Rain Dry', 'Rain/dry']
                                                              , 'Dry & Rain')

In [11]:
# Change relevant values to Dry & Wet
london_central['Weather'] = london_central['Weather'].replace(['Dry - Wet', 'Dry + Wet',
                                                              'Dry + Wet', 'Dry A.m Wet P.m',
                                                              'Dry But Wet Road', 'Dry But Wet Roads',
                                                              'Dry Road Still Wet', 'Dry Road Wet With Leaves',
                                                              'Dry Wet Road', 'Dry-Wet', 'Dry-wet', 'Dry/wet Road Surface',
                                                              'Wet - Dry', 'Wet + Dry', 'Wet First Then Dry', 
                                                              'Wet-dry', 'Wet/ Dry', 'Wet/dry', 'Wetr First Then Dry',
                                                              'Dry/wet'], 'Dry & Wet')

In [12]:
# Change relevant value to Frost / Fog
london_central['Weather'] = london_central['Weather'].replace(['Dry (frost & Fog)', 'Foggy',
                                                              'Foogy/v Cold', 'Foggy/v Cold'], 'Frost / Fog')

In [13]:
# Change relevant value to Hail / Sleet
london_central['Weather'] = london_central['Weather'].replace(['Cloudy/hail', 'Hail', 'Hail',
                                                              'Hail Shower', 'Sleet', 
                                                               'Wet Light Hailstone'], 'Hail / Sleet')

In [14]:
# Change relevant values to Heavy Rain
london_central['Weather'] = london_central['Weather'].replace(['Deluge', 'Down Pour',
                                                              'H Rain', 'Rain-heavy', 
                                                              'Very Heavy Rain'], 'Heavy Rain')

In [15]:
# Change relevant values to Hot
london_central['Weather'] = london_central['Weather'].replace(['Dry/hot', 'Hot + Humid',
                                                              'Hot And Humid', 'Hot And Sunny',
                                                              'Hot/dry', 'Very Hot Dry'], 'Hot')

In [16]:
# Change relevant values to Light Rain
london_central['Weather'] = london_central['Weather'].replace(['(drizzle)', 'A Few Drops Of Rain',
                                                              'A Few Rain Showers', 'Cloudy/drizzle',
                                                              'Cold/showery', 'Damp & Drizzly', 'Drizzle',
                                                              'Drizzle Damp', 'Drizzle/rain', 'Drizzle/showers',
                                                              'Drizzle/wet', 'Drizzling', 'Drizzly', 'Drizzly Rain',
                                                              'Dry (+brief Speels Of Drizzle)', 'Dry With Intermitent Rain', 'Dry/drizzle',  
                                                              'Dry/drizzly', 'Fine Drizzle', 'Intermitent Light Rain',
                                                              'Intermitent Light Showers', 'Light Drizzle', 'Light Shower',
                                                              'Light Showers', 'Light Showers Inc Some Hail', 'Light Shrs',
                                                              'Lt Rain', 'Shower', 'Shower/wet', 'Showers', 'Showers Mix',
                                                              'Showery', 'Slight Drizzle', 'Slight Drizzle Till End', 'Some Showers',
                                                              'Spitting', 'Spitting ', 'V Cold Showers', 'V Light Drizzle', 'V Light Rain',
                                                              'V Light Showers', 'V Light Shrs', 'V Lt Rain', 'V. Light Rain',
                                                              'V.light Drizzle', 'V.light Rain', 'Very Light Rain', 'V.wet', 
                                                              'Very Light Drizzle', 'Wet Drizzle',
                                                              'Wet/drizzle', 'Wet/light Showers'], 'Light Rain')

In [17]:
# Change relevant values to Mist
london_central['Weather'] = london_central['Weather'].replace(['Damp/misty', 'Damp/misty/wet'],
                                                             'Mist')

In [18]:
# Change relevant values to Rain
london_central['Weather'] = london_central['Weather'].replace(['(rain After)', 'Cloud/rain',
                                                              'Cloudy/ Rain', 'Cloudy/rain',
                                                              'Cold/ Rain', 'Cold/rain',
                                                              'Damp - Rain', 'Heavy Shower',
                                                              'Heavy Showers', 'Heavy Showers Throughout Day',
                                                              'L/rain', 'Rain Damp',
                                                              'Rain/cloudy', 'Rain/drizzle',
                                                              'Rain/wind', 'Rains', 'Rainy', 'Some Heavy Showers',
                                                              'Steady Rain', 'Wet/rain'], 'Rain')

In [19]:
# Change relevant values to Snow
london_central['Weather'] = london_central['Weather'].replace(['Heavy Snow', 'Occasional Lt Snow Shrs',
                                                              'Snowing', 'Wet/ Snowing'], 'Snow')

In [20]:
# Change relevant values to Sun
london_central['Weather'] = london_central['Weather'].replace(['Cloudy/sunny', 'Dry & Sunny',
                                                              'Dry + Sunny', 'Dry And Sunny',
                                                              'Dry Sunny', 'Dry/sunny', 'Fine & Sunny',
                                                              'Hot & Sunny', 'Mild And Sunny',
                                                              'Road Drying Sun Out', 'Sun/clouds',
                                                              'Sun/cloudy', 'Sunny', 'Sunny Cloudy',
                                                              'Sunny Cold', 'Sunny Periods And Warm',
                                                              'Sunny/cloudy', 'Sunny/dry',
                                                              'Warm & Sunny Chilly Later', 
                                                               'Warm + Sunny'], 'Sun')

In [21]:
# Change relevant values to Sun & Rain
london_central['Weather'] = london_central['Weather'].replace(['Cold Sunny Rain', 'Sun/rain',
                                                              'Sunny/rainy'], 'Sun & Wet')

In [22]:
# Change relevant values to Sun & Wet
london_central['Weather'] = london_central['Weather'].replace(['Wet/sunny'], 'Sun & Wet')

In [23]:
# Change relevant values to Sun & Wind
london_central['Weather'] = london_central['Weather'].replace(['Sunny & Windy', 'Sunny But Very Windy',
                                                              'Sunny Until Evening But Windy',
                                                              'Very Cold Sunny But Windy',
                                                              'Warm + Sunny Cloudy + Windy',
                                                              'Warm Sunny And Windy',
                                                              'Warm & Sunny But Windy & Cold'], 'Sun & Wind')

In [24]:
# Change relevant values to Unknown
london_central['Weather'] = london_central['Weather'].replace(['D', 'Wed', 'X'], 'Unknown')

In [25]:
# Change relevant values to Warm
london_central['Weather'] = london_central['Weather'].replace(['Cold At First Then Warm/sunny',
                                                              'Dry And Warm', 'Warm + Dry',
                                                              'Warm And Humid', 'Warm And Overcast'],
                                                             'Warm')

In [26]:
# Change relevant values to Warm & Wind 
london_central['Weather'] = london_central['Weather'].replace(['Warm With A Slight Wind',
                                                              'Warm And Windy'], 'Warm & Wind')

In [27]:
# Change relevant values to Wet
london_central['Weather'] = london_central['Weather'].replace(['Getting Wet', 'No Rain Wet Roads',
                                                              'Really Wet', 'Road Wet', 'S. Wet',
                                                              'Slightly Wet', 'V Wet', 'V. Wet',
                                                              'Very Wet', 'Wert', 'Wet (spitting)',
                                                              'Wet (windy)', 'Wet Again', 'Wet Damp',
                                                              'Wet Intermittently', 'Wet Road', 'Wet T',
                                                              'Wet/cloudy', 'Wet/damp', 'Wetish', 
                                                              'Wetr', 'Wetter', 'Winds Rather Chilly', 'Wet '],
                                                             'Wet')

In [28]:
# Change relevant values to Wet & Wind
london_central['Weather'] = london_central['Weather'].replace(['Wet And Very Windy',
                                                              'Wet And Windy',
                                                              'Wet & Windy',
                                                              'Wet + Windy'], 'Wet & Wind')

In [29]:
# Change relevant values to Wind
london_central['Weather'] = london_central['Weather'].replace(['Blustery', 'Cloudy/windy',
                                                              'Cold Wind', 'Cold Windy Dry',
                                                              'Dry (windy)', 'Dry & Very Windy',
                                                              'Dry And Very Windy', 'Dry And Windy',
                                                              'Dry Windy', 'Dry/gusty', 'Dry/very Windy',
                                                              'Dry/windy', 'Dry/windy/strong Wind',
                                                              'Fine (windy)', 'Fine Windy',
                                                              'High Winds & Spits Of Rain',
                                                              'Periods of Rain Quite Windy',
                                                              'Very Windy & Cold', 'Windy',
                                                              'Windy Dry', 'Windy Showery',
                                                              'Windy/cloidy', 'Windy/cloudy', 'Windy/drizzle',
                                                              'Windy/dry', 'Cold Then Dry And Windy',
                                                              'Dry & Windy', 'Dry But A Bit Windy'], 'Wind')

In [30]:
# Change relevant values to Wind & Rain
london_central['Weather'] = london_central['Weather'].replace(['Windy/ Rain',
                                                              'Periods Of Rain Quite Windy'], 'Wind & Rain')

In [31]:
london_central['Weather'].nunique()

29

In [32]:
london_central['Weather'].value_counts()

Dry               641187
Wet                79468
Rain                6916
Cloudy              4872
Sun                 3928
Light Rain          3556
Wind                1232
Dry & Wet           1212
Cold                 912
Damp                 643
Hot                  412
Sun & Wind           278
Sun & Wet            234
Snow                 192
Warm                 190
Dry & Rain           147
Wet & Wind           142
Frost / Fog          138
Unknown              134
Wind & Rain          128
Warm & Wind          112
Heavy Rain           106
Hail / Sleet          88
Hazy                  64
Dark                  18
Mist                  12
Storm                  4
Thunder                2
Rain & Thunder         2
Name: Weather, dtype: int64

## Data Cleaning 

Process of data cleaning - removing unecessary rows and replacing values 

## Looking at NaN values - which columns and potential causes / solutions

In [33]:
# Check if there are any missing data presented in the dataframe.
london_central.isnull().sum()

Survey wave (calendar quarter)         0
Equivalent financial quarter           0
Site ID                                0
Location                               0
Survey day                             0
Survey date                        10412
Weather                            11834
Time                                   0
Period                                 0
Direction                              0
Start hour                             0
Start minute                           0
Number of private cycles              64
Number of cycle hire bikes            64
Total cycles                           0
Unnamed: 14                       758163
Unnamed: 15                       758163
Unnamed: 16                       758163
dtype: int64

In [34]:
# Remove redundant columns ('Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16').
london_central = london_central.drop(['Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16'], axis = 1)

# View column names.
print(london_central.shape)
london_central.head()

(758163, 15)


Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73


In [35]:
# Check columns have been dropped
london_central.isnull().sum()

Survey wave (calendar quarter)        0
Equivalent financial quarter          0
Site ID                               0
Location                              0
Survey day                            0
Survey date                       10412
Weather                           11834
Time                                  0
Period                                0
Direction                             0
Start hour                            0
Start minute                          0
Number of private cycles             64
Number of cycle hire bikes           64
Total cycles                          0
dtype: int64

## Deeper investigation of NaN values 

Created a subset of the data to look further into the NaN values - see if there is a pattern of which are missing and to decide how best to replace the values 

In [36]:
# Subset of the data of only null values
null_data = london_central[london_central.isna().any(axis=1)]
null_data

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
35264,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,6.0,1.0,7
35265,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,9.0,0.0,9
35266,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,15.0,3.0,18
35267,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,20.0,1.0,21
35268,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,29.0,1.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718838,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2045 - 2100,Evening (19:00-22:00),Eastbound,20,45,,,0
718839,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2100 - 2115,Evening (19:00-22:00),Eastbound,21,0,,,0
718840,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2115 - 2130,Evening (19:00-22:00),Eastbound,21,15,,,0
718841,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2130 - 2145,Evening (19:00-22:00),Eastbound,21,30,,,0


In [37]:
# Columns which have null values and the number of null values 
null_data[null_data.columns[null_data.isnull().any()]].isnull().sum()

Survey date                   10412
Weather                       11834
Number of private cycles         64
Number of cycle hire bikes       64
dtype: int64

In [38]:
# Subset of null data frame to look weather null values 
# Wanted to see if there was a sensible way to replace the null based on loc or quarter
weather_null = null_data[['Location', 'Survey wave (calendar quarter)', 'Weather']]
weather_null

Unnamed: 0,Location,Survey wave (calendar quarter),Weather
35264,York Road,2014 Q2 (April-June),
35265,York Road,2014 Q2 (April-June),
35266,York Road,2014 Q2 (April-June),
35267,York Road,2014 Q2 (April-June),
35268,York Road,2014 Q2 (April-June),
...,...,...,...
718838,Grosvenor Street,2021 Q3 (July-September),Dry
718839,Grosvenor Street,2021 Q3 (July-September),Dry
718840,Grosvenor Street,2021 Q3 (July-September),Dry
718841,Grosvenor Street,2021 Q3 (July-September),Dry


In [39]:
# All the entries with null weather data 
w_null = weather_null[weather_null['Weather'].isna()]
w_null

Unnamed: 0,Location,Survey wave (calendar quarter),Weather
35264,York Road,2014 Q2 (April-June),
35265,York Road,2014 Q2 (April-June),
35266,York Road,2014 Q2 (April-June),
35267,York Road,2014 Q2 (April-June),
35268,York Road,2014 Q2 (April-June),
...,...,...,...
678587,Poland Street,2020 Q3 (July-September),
678588,Poland Street,2020 Q3 (July-September),
678589,Poland Street,2020 Q3 (July-September),
678590,Poland Street,2020 Q3 (July-September),


Can see the 11834 rows with null values for Weather

In [40]:
# Check which locations have the most NaN values 
print(w_null['Location'].value_counts())

Welbeck Street                               256
Wimpole Street                               256
Blackfriars Road                             256
Buckingham Gate                              256
Millbank (north of Great Peter Street)       256
                                            ... 
Hastings Street                               46
Old Compton Street                            40
Black Prince Road (east of Gibson Street)      8
Great Queen Street                             2
Goswell Road (south)                           2
Name: Location, Length: 86, dtype: int64


able to see which locations have the most NaN values. Allows us to identify where data recording could be improved 

In [41]:
# Exported to new csv file 
w_null.to_csv('w_null.csv')

Exported the whole dataframe as a Csv to take a deeper look in Excel - see if there were any obvious trends / analysis to be identified. 

Can see which locations have the null values but also can see which time frames they have been recorded in - might be sensible to assume that if it was summer then it would be dry and if it was winter it would be wet / cold

After looking at the Csv in Excel it is difficult to definitely say what the weather was at that time as locations have different weather conditions recorded for the same period. Therefore, NaN values will be replaced with unknown to maintain record and show where data recording could be improved.

## Replacing Values 

In [42]:
# Make a copy of london central dataframe: 
ldn_cent = london_central.copy()
ldn_cent

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758158,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2045 - 2100,Evening (19:00-22:00),Southbound,20,45,22.0,1.0,23
758159,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2100 - 2115,Evening (19:00-22:00),Southbound,21,0,20.0,0.0,20
758160,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2115 - 2130,Evening (19:00-22:00),Southbound,21,15,16.0,1.0,17
758161,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2130 - 2145,Evening (19:00-22:00),Southbound,21,30,10.0,1.0,11


In [43]:
# Replace NaN data entries with Unknown value
ldn_cent['Weather'].fillna('Unknown', inplace = True)
ldn_cent

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758158,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2045 - 2100,Evening (19:00-22:00),Southbound,20,45,22.0,1.0,23
758159,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2100 - 2115,Evening (19:00-22:00),Southbound,21,0,20.0,0.0,20
758160,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2115 - 2130,Evening (19:00-22:00),Southbound,21,15,16.0,1.0,17
758161,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2130 - 2145,Evening (19:00-22:00),Southbound,21,30,10.0,1.0,11


In [44]:
# Check all NaN values have been replaced to Unknown
ldn_cent[ldn_cent['Weather'] == 'Unknown']

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
35264,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,Unknown,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,6.0,1.0,7
35265,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,Unknown,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,9.0,0.0,9
35266,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,Unknown,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,15.0,3.0,18
35267,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,Unknown,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,20.0,1.0,21
35268,2014 Q2 (April-June),2014-15 Q1,CENCY079,York Road,Friday,09/05/2014,Unknown,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,29.0,1.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678587,2020 Q3 (July-September),2020-21 Q2,CENCY196,Poland Street,Empty,,Unknown,2045 - 2100,Evening (19:00-22:00),Southbound,20,45,0.0,0.0,0
678588,2020 Q3 (July-September),2020-21 Q2,CENCY196,Poland Street,Empty,,Unknown,2100 - 2115,Evening (19:00-22:00),Southbound,21,0,0.0,0.0,0
678589,2020 Q3 (July-September),2020-21 Q2,CENCY196,Poland Street,Empty,,Unknown,2115 - 2130,Evening (19:00-22:00),Southbound,21,15,0.0,0.0,0
678590,2020 Q3 (July-September),2020-21 Q2,CENCY196,Poland Street,Empty,,Unknown,2130 - 2145,Evening (19:00-22:00),Southbound,21,30,0.0,0.0,0


Can see that there are now 11834 records of unknown for the weather column - all NaN values have successfully been replaced 

In [45]:
# View NaN data entries in the ldn_cent dataframe for number of private cycles 
ldn_cent[ldn_cent['Number of private cycles'].isna()]

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
718779,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Wednesday,18/08/2021,Dry,0600 - 0615,Early Morning (06:00-07:00),Eastbound,6,0,,,0
718780,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Wednesday,18/08/2021,Dry,0615 - 0630,Early Morning (06:00-07:00),Eastbound,6,15,,,0
718781,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Wednesday,18/08/2021,Dry,0630 - 0645,Early Morning (06:00-07:00),Eastbound,6,30,,,0
718782,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Wednesday,18/08/2021,Dry,0645 - 0700,Early Morning (06:00-07:00),Eastbound,6,45,,,0
718783,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Wednesday,18/08/2021,Dry,0700 - 0715,AM peak (07:00-10:00),Eastbound,7,0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718838,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2045 - 2100,Evening (19:00-22:00),Eastbound,20,45,,,0
718839,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2100 - 2115,Evening (19:00-22:00),Eastbound,21,0,,,0
718840,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2115 - 2130,Evening (19:00-22:00),Eastbound,21,15,,,0
718841,2021 Q3 (July-September),2021-22 Q2,CENCY112,Grosvenor Street,Tuesday,24/08/2021,Dry,2130 - 2145,Evening (19:00-22:00),Eastbound,21,30,,,0


In [46]:
# Run a test of the skewness 
ldn_cent.skew(axis = 0, skipna = True)

  ldn_cent.skew(axis = 0, skipna = True)


Start hour                    0.000214
Start minute                  0.000003
Number of private cycles      5.136676
Number of cycle hire bikes    5.150004
Total cycles                  5.037641
dtype: float64

Skewness test is run to guage the distribution of the data. If the values are less than -1 or greater than +1 indicates that our data is heavily skewed. 

For the columns in question being Number of private cycles and Number of cucle hire bikes we can see than both are comfortably over +1 and therefore suggests the data is skewed. 

The puurpose of determining the skew is to see which measure of central tendancy (mean, mode, median) would be best to use to replace the missing values. Due to the data being skewed we will replace values using the median. 

In [47]:
# Median value for Number of private cycles
ldn_cent['Number of private cycles'].median()

7.0

In [48]:
# Replace all NaN values with the median 
ldn_cent['Number of private cycles'].fillna(7, inplace = True)
ldn_cent

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758158,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2045 - 2100,Evening (19:00-22:00),Southbound,20,45,22.0,1.0,23
758159,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2100 - 2115,Evening (19:00-22:00),Southbound,21,0,20.0,0.0,20
758160,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2115 - 2130,Evening (19:00-22:00),Southbound,21,15,16.0,1.0,17
758161,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2130 - 2145,Evening (19:00-22:00),Southbound,21,30,10.0,1.0,11


In [49]:
# Check that all NaN values have been replaced 
ldn_cent[ldn_cent['Number of private cycles'].isna()]

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles


No output so all NaN values have been replaced.

In [50]:
# Median value for hire cycles 
ldn_cent['Number of cycle hire bikes'].median()

1.0

In [51]:
# Replace all NaN values with the average 
ldn_cent['Number of cycle hire bikes'].fillna(1, inplace = True)
ldn_cent

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,24/01/2014,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758158,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2045 - 2100,Evening (19:00-22:00),Southbound,20,45,22.0,1.0,23
758159,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2100 - 2115,Evening (19:00-22:00),Southbound,21,0,20.0,0.0,20
758160,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2115 - 2130,Evening (19:00-22:00),Southbound,21,15,16.0,1.0,17
758161,2021 Q4 (October-December),2021-22 Q3,CENCY702,Haymarket,Tuesday,21/12/2021,Dry,2130 - 2145,Evening (19:00-22:00),Southbound,21,30,10.0,1.0,11


In [52]:
# Check that all NaN values have been replaced 
ldn_cent[ldn_cent['Number of cycle hire bikes'].isna()]

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles


Again no output so we can see that all the values have been replaced

In [53]:
ldn_cent['Weather'].value_counts()

Dry               641187
Wet                79468
Unknown            11968
Rain                6916
Cloudy              4872
Sun                 3928
Light Rain          3556
Wind                1232
Dry & Wet           1212
Cold                 912
Damp                 643
Hot                  412
Sun & Wind           278
Sun & Wet            234
Snow                 192
Warm                 190
Dry & Rain           147
Wet & Wind           142
Frost / Fog          138
Wind & Rain          128
Warm & Wind          112
Heavy Rain           106
Hail / Sleet          88
Hazy                  64
Dark                  18
Mist                  12
Storm                  4
Thunder                2
Rain & Thunder         2
Name: Weather, dtype: int64

## Missing dates 

In [54]:
# View missing data
ldn_cent.isnull().sum()

Survey wave (calendar quarter)        0
Equivalent financial quarter          0
Site ID                               0
Location                              0
Survey day                            0
Survey date                       10412
Weather                               0
Time                                  0
Period                                0
Direction                             0
Start hour                            0
Start minute                          0
Number of private cycles              0
Number of cycle hire bikes            0
Total cycles                          0
dtype: int64

Only missing entries that are left are in the Survey date column

In [55]:
# Check data types to see what dates are stored as 
ldn_cent.dtypes

Survey wave (calendar quarter)     object
Equivalent financial quarter       object
Site ID                            object
Location                           object
Survey day                         object
Survey date                        object
Weather                            object
Time                               object
Period                             object
Direction                          object
Start hour                          int64
Start minute                        int64
Number of private cycles          float64
Number of cycle hire bikes        float64
Total cycles                        int64
dtype: object

In [56]:
# Change data type to survey date 
ldn_cent['Survey date']= pd.to_datetime(ldn_cent['Survey date'])
ldn_cent.dtypes

Survey wave (calendar quarter)            object
Equivalent financial quarter              object
Site ID                                   object
Location                                  object
Survey day                                object
Survey date                       datetime64[ns]
Weather                                   object
Time                                      object
Period                                    object
Direction                                 object
Start hour                                 int64
Start minute                               int64
Number of private cycles                 float64
Number of cycle hire bikes               float64
Total cycles                               int64
dtype: object

## Create a new column ffil to clean survey dates column

This is done to keep the original column, not loose any data but ensure we have a column with complete entries for any date analysis

In [57]:
ldn_cent['ffill_date'] = ldn_cent['Survey date'].ffill()
ldn_cent.head()

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles,ffill_date
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0,2014-01-24
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15,2014-01-24
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35,2014-01-24
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61,2014-01-24
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73,2014-01-24


In [58]:
# View missing data
ldn_cent.isnull().sum()

Survey wave (calendar quarter)        0
Equivalent financial quarter          0
Site ID                               0
Location                              0
Survey day                            0
Survey date                       10412
Weather                               0
Time                                  0
Period                                0
Direction                             0
Start hour                            0
Start minute                          0
Number of private cycles              0
Number of cycle hire bikes            0
Total cycles                          0
ffill_date                            0
dtype: int64

Can see that all the missing values for the survey dates for the Survey date column 

## Check for Duplicates 

In [59]:
ldn_cent[ldn_cent.duplicated()].count()

Survey wave (calendar quarter)    0
Equivalent financial quarter      0
Site ID                           0
Location                          0
Survey day                        0
Survey date                       0
Weather                           0
Time                              0
Period                            0
Direction                         0
Start hour                        0
Start minute                      0
Number of private cycles          0
Number of cycle hire bikes        0
Total cycles                      0
ffill_date                        0
dtype: int64

No true duplicates in the dataframe

In [60]:
# View Dataframe
ldn_cent.head()

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey day,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles,ffill_date
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,0.0,0.0,0,2014-01-24
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,15.0,0.0,15,2014-01-24
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,35.0,0.0,35,2014-01-24
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,59.0,2.0,61,2014-01-24
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),Friday,2014-01-24,Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,73.0,0.0,73,2014-01-24


In [61]:
# View missing data
ldn_cent.isnull().sum()

Survey wave (calendar quarter)        0
Equivalent financial quarter          0
Site ID                               0
Location                              0
Survey day                            0
Survey date                       10412
Weather                               0
Time                                  0
Period                                0
Direction                             0
Start hour                            0
Start minute                          0
Number of private cycles              0
Number of cycle hire bikes            0
Total cycles                          0
ffill_date                            0
dtype: int64

Now have a clean dataframe to work from

In [None]:
# Exported to new csv file 
ldn_cent.to_csv('ldn_cent_clean.csv')