### This script contains:

#### 1. Import libraries & dataset
#### 2. Splitting out a subset
#### 3. Creating a map using Folium
#### 4. Removing superfluous column
#### 5. Exporting the dataset

## 1. Import libraries & dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json
from folium.plugins import MarkerCluster

In [2]:
#Project folder as a usable string.
#The location can be overridden by setting the CITIBIKE_PROJECT_PATH environment
#variable; when it is not set, the original hard-coded folder is used unchanged.
path = os.environ.get(
    'CITIBIKE_PROJECT_PATH',
    r'C:\Users\willm\Dropbox\1 Data Analytics Course\1 New York Citibike Hire',
)

In [3]:
#Import Hire Dataset from the 'Prepared Data' folder of the project
hire_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020_1day_Script2.pkl')
NYB2020_1day = pd.read_pickle(hire_pickle_file)

In [4]:
#This command prompts matplotlib visuals to appear in the notebook
%matplotlib inline

In [5]:
#Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [6]:
#Display 10 decimal places so the Latitudes and Longitudes are shown in full
pd.options.display.precision = 10

## 2. Splitting out a subset

In [7]:
#Preview the first 5 rows of the imported dataset
NYB2020_1day.head(n=5)

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,start_hour,start_date2,temp,rain_snow,day_of_week,day_of_week_number,month,month_number,TripMins,age,Day_or_More,Over_99,1col,uses_by_age,age_use
0,789,2020-01-01 00:00:55.3900,2020-01-01 00:14:05.1470,504,1 Ave & E 16 St,40.73221853,-73.98165557,307,Canal St & Rutgers St,40.71427487,-73.98990025,30326,Subscriber,1992,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,13,28,Less than a Day,N,1,802960.0,High Use Age
1,1541,2020-01-01 00:01:08.1020,2020-01-01 00:26:49.1780,3423,West Drive & Prospect Park West,40.6610633719,-73.9794525504,3300,Prospect Park West & 8 St,40.6651468153,-73.9763760567,17105,Customer,1969,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,26,51,Less than a Day,N,1,2212026.0,High Use Age
2,1464,2020-01-01 00:01:42.1400,2020-01-01 00:26:07.0110,3687,E 33 St & 1 Ave,40.7432268143,-73.9744978398,259,South St & Whitehall St,40.70122128,-74.01234218,40177,Subscriber,1963,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,24,57,Less than a Day,N,1,197375.0,Mid Use Age
3,592,2020-01-01 00:01:45.5610,2020-01-01 00:11:38.1550,346,Bank St & Hudson St,40.73652889,-74.00618026,490,8 Ave & W 33 St,40.751551,-73.993934,27690,Subscriber,1980,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,10,40,Less than a Day,N,1,374103.0,Mid Use Age
4,702,2020-01-01 00:01:45.7880,2020-01-01 00:13:28.2400,372,Franklin Ave & Myrtle Ave,40.6945460872,-73.9580136538,3637,Fulton St & Waverly Ave,40.6832386546,-73.9659959078,32583,Subscriber,1982,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,12,38,Less than a Day,N,1,444645.0,High Use Age


In [8]:
#New column 'uses_by_station': for every row, the sum of '1col' across all rows
#that share the same 'start_station_id' (i.e. the total uses of that start station).
NYB2020_1day["uses_by_station"] = (
    NYB2020_1day['1col']
    .groupby(NYB2020_1day["start_station_id"])
    .transform("sum")
)

In [9]:
#Store the station totals as 64-bit integers
NYB2020_1day['uses_by_station'] = NYB2020_1day['uses_by_station'].astype(dtype='int64')

In [10]:
#String copy of 'uses_by_station'; Folium needs a STR to show the numbers in the popup text.
NYB2020_1day['uses_by_stationSTR'] = NYB2020_1day['uses_by_station'].astype(str)

In [11]:
#Preview the first 5 rows to check the two new 'uses_by_station' columns
NYB2020_1day.head(n=5)

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,start_hour,start_date2,temp,rain_snow,day_of_week,day_of_week_number,month,month_number,TripMins,age,Day_or_More,Over_99,1col,uses_by_age,age_use,uses_by_station,uses_by_stationSTR
0,789,2020-01-01 00:00:55.3900,2020-01-01 00:14:05.1470,504,1 Ave & E 16 St,40.73221853,-73.98165557,307,Canal St & Rutgers St,40.71427487,-73.98990025,30326,Subscriber,1992,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,13,28,Less than a Day,N,1,802960.0,High Use Age,59052,59052
1,1541,2020-01-01 00:01:08.1020,2020-01-01 00:26:49.1780,3423,West Drive & Prospect Park West,40.6610633719,-73.9794525504,3300,Prospect Park West & 8 St,40.6651468153,-73.9763760567,17105,Customer,1969,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,26,51,Less than a Day,N,1,2212026.0,High Use Age,48810,48810
2,1464,2020-01-01 00:01:42.1400,2020-01-01 00:26:07.0110,3687,E 33 St & 1 Ave,40.7432268143,-73.9744978398,259,South St & Whitehall St,40.70122128,-74.01234218,40177,Subscriber,1963,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,24,57,Less than a Day,N,1,197375.0,Mid Use Age,73511,73511
3,592,2020-01-01 00:01:45.5610,2020-01-01 00:11:38.1550,346,Bank St & Hudson St,40.73652889,-74.00618026,490,8 Ave & W 33 St,40.751551,-73.993934,27690,Subscriber,1980,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,10,40,Less than a Day,N,1,374103.0,Mid Use Age,38585,38585
4,702,2020-01-01 00:01:45.7880,2020-01-01 00:13:28.2400,372,Franklin Ave & Myrtle Ave,40.6945460872,-73.9580136538,3637,Fulton St & Waverly Ave,40.6832386546,-73.9659959078,32583,Subscriber,1982,1,0,2020-01-01,3,0.0,Wednesday,3,January,1,12,38,Less than a Day,N,1,444645.0,High Use Age,8059,8059


In [12]:
#Summary of the full dataset: row count, column names and dtypes
NYB2020_1day.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19487603 entries, 0 to 19506856
Data columns (total 32 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tripduration             int64  
 1   starttime                object 
 2   stoptime                 object 
 3   start_station_id         int16  
 4   start_station_name       object 
 5   start_station_latitude   float64
 6   start_station_longitude  float64
 7   end_station_id           int16  
 8   end_station_name         object 
 9   end_station_latitude     float64
 10  end_station_longitude    float64
 11  bikeid                   int32  
 12  usertype                 object 
 13  birth_year               int16  
 14  gender                   int8   
 15  start_hour               int8   
 16  start_date2              object 
 17  temp                     int8   
 18  rain_snow                float16
 19  day_of_week              object 
 20  day_of_week_number       int8   
 21  month 

In [13]:
#View just 2 columns of the last 20 rows to check the new 'uses_by_station' totals.
#The tail is taken first so that only 20 rows are sliced, instead of first copying
#both columns of the whole (roughly 19.5 million row) dataset.
NYB2020_1day.tail(20)[['start_station_id', 'uses_by_station']]

Unnamed: 0,start_station_id,uses_by_station
19506837,464,24810
19506838,312,47032
19506839,3628,18869
19506840,4099,1488
19506841,3143,38724
19506842,490,64620
19506843,3288,32554
19506844,3977,1314
19506845,372,8059
19506846,3169,24391


In [14]:
%%time
#Creating a subset of the Start Stations
NYB2020_UniqueStation = NYB2020_1day.drop_duplicates(subset = ["start_station_id"])

Wall time: 530 ms


In [15]:
#Summary of the unique-station subset: row count, column names, non-null counts and dtypes
NYB2020_UniqueStation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1213 entries, 0 to 19504238
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tripduration             1213 non-null   int64  
 1   starttime                1213 non-null   object 
 2   stoptime                 1213 non-null   object 
 3   start_station_id         1213 non-null   int16  
 4   start_station_name       1213 non-null   object 
 5   start_station_latitude   1213 non-null   float64
 6   start_station_longitude  1213 non-null   float64
 7   end_station_id           1213 non-null   int16  
 8   end_station_name         1213 non-null   object 
 9   end_station_latitude     1213 non-null   float64
 10  end_station_longitude    1213 non-null   float64
 11  bikeid                   1213 non-null   int32  
 12  usertype                 1213 non-null   object 
 13  birth_year               1213 non-null   int16  
 14  gender              

In [16]:
#View just 2 columns of the last 20 rows to check that the drop_duplicates worked
NYB2020_UniqueStation.loc[:, ['start_station_id', 'uses_by_station']].tail(20)

Unnamed: 0,start_station_id,uses_by_station
18598472,4250,11
18698498,4239,385
18707832,4248,235
18737188,4215,450
18748081,537,773
18876390,232,495
19073219,4214,260
19085737,3705,362
19143265,4304,53
19192287,4280,172


## 3. Creating a map using Folium

In [17]:
%%time
m = folium.Map(location = [40.719587, -74.046089], tiles = 'OpenStreetMap', zoom_start=12)

markerCluster = MarkerCluster().add_to(m)

for i, row in NYB2020_UniqueStation.iterrows():
    lat = NYB2020_UniqueStation.at[i, 'start_station_latitude']
    lng = NYB2020_UniqueStation.at[i, 'start_station_longitude']
    
    popup = 'Station : ' + str(NYB2020_UniqueStation.at[i, 'start_station_name']) + '<br>' + 'Uses : ' + NYB2020_UniqueStation.at[i, 'uses_by_stationSTR']

    folium.Marker(location = [lat, lng], popup= popup, icon = folium.Icon(color='blue', icon='fire')).add_to(markerCluster)
m

Wall time: 158 ms


## 4. Removing superfluous column

In [18]:
#Removing the 'uses_by_stationSTR' column: it was only created for the Folium popups
#and the same numbers remain available in 'uses_by_station'.
NYB2020_1dayPT3 = NYB2020_1day.drop(['uses_by_stationSTR'], axis='columns')

## 5. Exporting the dataset

In [19]:
#Export the dataset (without the STR column) as a pickle in the 'Prepared Data' folder
export_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'NYB2020_1dayScript3.pkl')
NYB2020_1dayPT3.to_pickle(export_pickle_file)