In [42]:
# Dependencies

import pandas as pd
from pathlib import Path

from calendar import monthrange


In [43]:
# Location of the raw trip-data CSV, relative to this notebook

filepath = Path('../data/202004-divvy-tripdata/202004-divvy-tripdata.csv')

# Load the CSV into a DataFrame and preview its first five rows

trip_data = pd.read_csv(filepath_or_buffer=filepath)

trip_data.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,A847FADBBC638E45,docked_bike,2020-04-26 17:45:14,2020-04-26 18:12:03,Eckhart Park,86,Lincoln Ave & Diversey Pkwy,152.0,41.8964,-87.661,41.9322,-87.6586,member
1,5405B80E996FF60D,docked_bike,2020-04-17 17:08:54,2020-04-17 17:17:03,Drake Ave & Fullerton Ave,503,Kosciuszko Park,499.0,41.9244,-87.7154,41.9306,-87.7238,member
2,5DD24A79A4E006F4,docked_bike,2020-04-01 17:54:13,2020-04-01 18:08:36,McClurg Ct & Erie St,142,Indiana Ave & Roosevelt Rd,255.0,41.8945,-87.6179,41.8679,-87.623,member
3,2A59BBDF5CDBA725,docked_bike,2020-04-07 12:50:19,2020-04-07 13:02:31,California Ave & Division St,216,Wood St & Augusta Blvd,657.0,41.903,-87.6975,41.8992,-87.6722,member
4,27AD306C119C6158,docked_bike,2020-04-18 10:22:59,2020-04-18 11:15:54,Rush St & Hubbard St,125,Sheridan Rd & Lawrence Ave,323.0,41.8902,-87.6262,41.9695,-87.6547,casual


In [44]:
# Data quality checks on the freshly loaded frame

# Non-null count per column, followed by each column's dtype

display(trip_data.count(), trip_data.dtypes)

# Missing values per column, then a tally of rows flagged as full-row
# duplicates of an earlier row (False = not a duplicate, True = duplicate)

missing_per_column = trip_data.isna().sum()
duplicate_flags = trip_data.duplicated()

display(missing_per_column, duplicate_flags.value_counts())


ride_id               84776
rideable_type         84776
started_at            84776
ended_at              84776
start_station_name    84776
start_station_id      84776
end_station_name      84677
end_station_id        84677
start_lat             84776
start_lng             84776
end_lat               84677
end_lng               84677
member_casual         84776
dtype: int64

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id        int64
end_station_name       object
end_station_id        float64
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

ride_id                0
rideable_type          0
started_at             0
ended_at               0
start_station_name     0
start_station_id       0
end_station_name      99
end_station_id        99
start_lat              0
start_lng              0
end_lat               99
end_lng               99
member_casual          0
dtype: int64

False    84776
Name: count, dtype: int64

In [45]:
# Derive TRIP LENGTH: parse both timestamp columns, then subtract start from end

for timestamp_col in ('started_at', 'ended_at'):
    trip_data[timestamp_col] = pd.to_datetime(trip_data[timestamp_col])

trip_data['trip_length'] = trip_data['ended_at'].sub(trip_data['started_at'])

# Break the start timestamp into a calendar date and an hour of day

ride_start = trip_data['started_at'].dt
trip_data['start_day'] = ride_start.date
trip_data['start_hour'] = ride_start.hour

# Order rows from shortest to longest trip and display the result.
# Rows whose ended_at precedes started_at carry a negative length and sort first.

trip_data = trip_data.sort_values(by='trip_length')

trip_data


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,trip_length,start_day,start_hour
25124,3C19503CC3A81CCE,docked_bike,2020-04-29 16:54:01,2020-04-29 16:51:05,Sheffield Ave & Webster Ave,327,Clark St & Wellington Ave,156.0,41.9215,-87.6538,41.9365,-87.6475,member,-1 days +23:57:04,2020-04-29,16
65987,F6F91F2D50F2B535,docked_bike,2020-04-27 18:49:03,2020-04-27 18:47:52,Columbus Dr & Randolph St,195,Clinton St & Madison St,77.0,41.8847,-87.6195,41.8822,-87.6411,member,-1 days +23:58:49,2020-04-27,18
22637,00ED4786F962B827,docked_bike,2020-04-28 06:55:20,2020-04-28 06:54:33,Orleans St & Hubbard St,636,Wells St & Huron St,53.0,41.8900,-87.6366,41.8947,-87.6344,member,-1 days +23:59:13,2020-04-28,6
58835,AD5373DC1F4D6B59,docked_bike,2020-04-01 14:18:09,2020-04-01 14:17:26,Clark St & Elm St,176,Sedgwick St & Schiller St,236.0,41.9030,-87.6313,41.9076,-87.6386,member,-1 days +23:59:17,2020-04-01,14
54011,C6F50A326A5F883E,docked_bike,2020-04-19 18:51:46,2020-04-19 18:51:10,Kedzie Ave & Milwaukee Ave,260,Humboldt Blvd & Armitage Ave,507.0,41.9296,-87.7079,41.9175,-87.7018,member,-1 days +23:59:24,2020-04-19,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75669,DEA089D56E87CE35,docked_bike,2020-04-03 23:59:19,2020-05-04 19:34:24,Wabash Ave & Grand Ave,199,Michigan Ave & Washington St,43.0,41.8915,-87.6268,41.8840,-87.6247,casual,30 days 19:35:05,2020-04-03,23
34340,A2210AEAD28F5ED2,docked_bike,2020-04-20 16:05:31,2020-05-22 17:30:07,Wood St & Augusta Blvd,657,Eckhart Park,86.0,41.8992,-87.6722,41.8964,-87.6610,casual,32 days 01:24:36,2020-04-20,16
32054,610943B07C04C09A,docked_bike,2020-04-04 23:10:33,2020-05-12 13:35:00,Latrobe Ave & Chicago Ave,642,Laramie Ave & Madison St,540.0,41.8947,-87.7569,41.8802,-87.7553,member,37 days 14:24:27,2020-04-04,23
83799,F423D1055877936F,docked_bike,2020-04-03 16:33:09,2020-05-12 08:37:02,Sedgwick St & Schiller St,236,LaSalle St & Jackson Blvd,283.0,41.9076,-87.6386,41.8782,-87.6319,casual,38 days 16:03:53,2020-04-03,16


In [46]:
# Keep only the columns needed downstream

kept_columns = ['ride_id', 'start_day', 'start_hour', 'trip_length',
                'start_lat', 'start_lng', 'member_casual', 'rideable_type']
trip_sorted_data = trip_data[kept_columns]

# Drop bad rows: keep only trips whose length is strictly greater than zero.
# "P0DT0H0M0S" is the ISO 8601 spelling of a zero-length duration.

has_positive_length = trip_sorted_data['trip_length'] > "P0DT0H0M0S"
trip_data_clean = trip_sorted_data[has_positive_length]

# Order chronologically by start day, then start hour, and renumber rows from 0
# (ride_id is already the first column, so the old index is simply discarded)

trip_data_clean = (
    trip_data_clean
    .sort_values(by=['start_day', 'start_hour'])
    .reset_index(drop=True)
)

trip_data_clean

Unnamed: 0,ride_id,start_day,start_hour,trip_length,start_lat,start_lng,member_casual,rideable_type
0,07F785C9DDA3404C,2020-04-01,0,0 days 00:00:33,41.8708,-87.6257,member,docked_bike
1,545019BF3EF4B419,2020-04-01,0,0 days 00:03:58,41.8576,-87.6615,member,docked_bike
2,643593E85E46A45C,2020-04-01,0,0 days 00:05:23,41.8938,-87.6417,member,docked_bike
3,432C76DCFB84366A,2020-04-01,0,0 days 00:05:26,41.9542,-87.6544,member,docked_bike
4,782CEA3C6968D2A6,2020-04-01,0,0 days 00:05:28,41.8938,-87.6417,member,docked_bike
...,...,...,...,...,...,...,...,...
84712,66C254E16E3A28C5,2020-04-30,23,0 days 00:19:33,41.8715,-87.6699,member,docked_bike
84713,467F335339D13DC0,2020-04-30,23,0 days 00:22:43,41.8846,-87.6319,member,docked_bike
84714,7DD8DE9332472655,2020-04-30,23,0 days 00:33:32,41.9215,-87.6538,casual,docked_bike
84715,B36FDA6B10E35E91,2020-04-30,23,0 days 00:33:58,41.9437,-87.6640,member,docked_bike


In [None]:
# Restrict rides to one coarse location cell, labelled Central Chicago.
# Rounding to one decimal place buckets coordinates into 0.1-degree cells
# (the original note sizes such a cell at roughly 7 x 11.1 km).
# NOTE(review): the start_lat values shown in the raw preview (e.g. 41.8964,
# 41.8945, 41.8902) round to 41.9, so those rides fall outside the 41.8 cell
# kept below - confirm that 41.8 / -87.6 is the intended cell.

for coord in ('start_lat', 'start_lng'):
    # Overwrites the precise coordinate with its one-decimal bucket
    trip_data_clean[coord] = trip_data_clean[coord].round(1)

in_lat_cell = trip_data_clean['start_lat'].eq(41.8)
in_lng_cell = trip_data_clean['start_lng'].eq(-87.6)
chicago_city = trip_data_clean[in_lat_cell & in_lng_cell]

# Non-null count per column of the filtered frame

chicago_city.count()

ride_id          4771
start_day        4771
start_hour       4771
trip_length      4771
start_lat        4771
start_lng        4771
member_casual    4771
rideable_type    4771
dtype: int64

In [None]:
# Save the filtered rides as CSV in the cleaned-data folder

export_filepath = Path('../cleaned_data/202004-cleaned-tripdata.csv')

# Create the destination folder when it is missing: to_csv does not create
# parent directories, so the export would otherwise fail on a fresh checkout.
export_filepath.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the written file
chicago_city.to_csv(export_filepath, index=False)
