In [2]:
import pandas as pd
from sqlalchemy import create_engine
# Remove the cap on how many rows pandas will render. The combined frame in
# this notebook has ~1.9M rows, so inspect it only through .head()/.tail().
pd.options.display.max_rows = None

In [10]:
# Load only the Q1 2018 ridership file as a quick sanity check of the
# columns and value formats.
csv_file = "Resources/Bike Share Toronto Ridership_Q1 2018.csv"
biker_data_df = pd.read_csv(filepath_or_buffer=csv_file)
biker_data_df.head(n=5)

Unnamed: 0,trip_id,trip_duration_seconds,from_station_id,trip_start_time,from_station_name,trip_stop_time,to_station_id,to_station_name,user_type
0,2383648,393,7018,1/1/2018 0:47,Bremner Blvd / Rees St,1/1/2018 0:54,7176,Bathurst St / Fort York Blvd,Annual Member
1,2383649,625,7184,1/1/2018 0:52,Ossington Ave / College St,1/1/2018 1:03,7191,Central Tech (Harbord St),Annual Member
2,2383650,233,7235,1/1/2018 0:55,Bay St / College St (West Side) - SMART,1/1/2018 0:59,7021,Bay St / Albert St,Annual Member
3,2383651,1138,7202,1/1/2018 0:57,Queen St W / York St (City Hall),1/1/2018 1:16,7020,Phoebe St / Spadina Ave,Annual Member
4,2383652,703,7004,1/1/2018 1:00,University Ave / Elm St,1/1/2018 1:12,7060,Princess St / Adelaide St E,Annual Member


In [11]:
# Read each quarterly 2018 ridership file and stack them into one frame.
# ignore_index=True discards the per-file row labels and renumbers from 0.
df = pd.concat(
    (
        pd.read_csv(quarter_path)
        for quarter_path in [
            'Resources/Bike Share Toronto Ridership_Q1 2018.csv',
            'Resources/Bike Share Toronto Ridership_Q2 2018.csv',
            'Resources/Bike Share Toronto Ridership_Q3 2018.csv',
            'Resources/Bike Share Toronto Ridership_Q4 2018.csv',
        ]
    ),
    ignore_index=True,
)
df.tail(n=3)

Unnamed: 0,trip_id,trip_duration_seconds,from_station_id,trip_start_time,from_station_name,trip_stop_time,to_station_id,to_station_name,user_type
1922952,4581275,340,7020,12/31/2018 23:49,Phoebe St / Spadina Ave,12/31/2018 23:55,7000,Fort York Blvd / Capreol Ct,Annual Member
1922953,4581276,1466,7014,12/31/2018 23:52,Sherbourne St / Carlton St (Allan Gardens),1/1/2019 0:17,7269,Toronto Eaton Centre (Yonge St),Annual Member
1922954,4581277,333,7299,12/31/2018 23:58,Mill St / Parliament St,1/1/2019 0:04,7013,Scott St / The Esplanade,Annual Member


In [24]:
# Keep only the three columns used from here on. .copy() makes clean_df an
# independent frame, so later column edits never touch df.
clean_df = df.loc[:, ["trip_id", "trip_duration_seconds", "trip_start_time"]].copy()
clean_df.head(n=5)

Unnamed: 0,trip_id,trip_duration_seconds,trip_start_time
0,2383648,393,1/1/2018 0:47
1,2383649,625,1/1/2018 0:52
2,2383650,233,1/1/2018 0:55
3,2383651,1138,1/1/2018 0:57
4,2383652,703,1/1/2018 1:00


In [25]:
# Rename the trip start time column to "date".
# The renamed frame is rebound to clean_df instead of using inplace=True:
# rename() then returns a new frame, so no object is mutated behind the
# reader's back and the call can be chained if this step grows.
clean_df = clean_df.rename(columns={'trip_start_time': 'date'})

In [26]:
# Preview the first five rows to confirm the column is now called "date".
clean_df.head(n=5)

Unnamed: 0,trip_id,trip_duration_seconds,date
0,2383648,393,1/1/2018 0:47
1,2383649,625,1/1/2018 0:52
2,2383650,233,1/1/2018 0:55
3,2383651,1138,1/1/2018 0:57
4,2383652,703,1/1/2018 1:00


In [27]:
# Parse the "date" strings into pandas datetimes for later merging.
# NOTE(review): no explicit format is passed, so pandas infers the layout
# itself; the rows shown in this notebook look like month/day/year hour:minute.
# Confirm all four quarters share that layout before pinning format=.
clean_df["date"] = clean_df["date"].pipe(pd.to_datetime)
clean_df.head(n=5)

Unnamed: 0,trip_id,trip_duration_seconds,date
0,2383648,393,2018-01-01 00:47:00
1,2383649,625,2018-01-01 00:52:00
2,2383650,233,2018-01-01 00:55:00
3,2383651,1138,2018-01-01 00:57:00
4,2383652,703,2018-01-01 01:00:00


In [28]:
# Strip the hours, minutes, and seconds from the date column, leaving
# calendar dates only.
# .dt.date yields an object-dtype column of datetime.date values, on which the
# .dt accessor is no longer available. Coercing through pd.to_datetime first
# (a no-op when the column is already datetime64) keeps this cell safe to
# re-run instead of raising AttributeError on the second execution.
clean_df['date'] = pd.to_datetime(clean_df['date']).dt.date

In [29]:
# Preview the first five rows to confirm "date" no longer carries a time.
clean_df.head(n=5)

Unnamed: 0,trip_id,trip_duration_seconds,date
0,2383648,393,2018-01-01
1,2383649,625,2018-01-01
2,2383650,233,2018-01-01
3,2383651,1138,2018-01-01
4,2383652,703,2018-01-01


In [30]:
# Persist the cleaned frame as a CSV for later use. index=False leaves the
# row labels out of the file.
clean_df.to_csv(
    path_or_buf='Resources/bikeshare_clean.csv',
    index=False,
)