In [2]:
import pandas as pd

# Read the Q3 and Q4 2017 trip extracts and stack them row-wise.
# ignore_index=True replaces each file's own row labels with one fresh 0..n-1 index.
trip_files = ['Divvy_Trips_2017_Q3.csv', 'Divvy_Trips_2017_Q4.csv']
trips = pd.concat([pd.read_csv(csv_name) for csv_name in trip_files], ignore_index=True)

# Metadata for Trips:

Variables:

trip_id: ID attached to each trip taken

start_time: day and time trip started, in CST

end_time: day and time trip ended, in CST

bikeid: ID attached to each bike

tripduration: time of trip in seconds

from_station_name: name of station where trip originated

to_station_name: name of station where trip terminated

from_station_id: ID of station where trip originated

to_station_id: ID of station where trip terminated

usertype: "Customer" is a rider who purchased a 24-Hour Pass; "Subscriber" is a rider who purchased an Annual Membership

gender: gender of rider

birthyear: birth year of rider


Notes:

* First row contains column names
* Trips that did not include a start or end date are excluded
* Trips less than 1 minute in duration are excluded
* Trips greater than 24 hours in duration are excluded
* Gender and birth year are only available for Subscribers
* Divvy_Trips_2017_Q3 has 1,608,270 rows
* Divvy_Trips_2017_Q4 has 669,239 rows

In [3]:
# Show the combined trips table; pandas elides the middle rows in the rendered output.
trips

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,16734065,9/30/2017 23:59:58,10/1/2017 00:05:47,1411,349,216,California Ave & Division St,259,California Ave & Francis Pl,Subscriber,Male,1985.0
1,16734064,9/30/2017 23:59:53,10/1/2017 00:05:47,3048,354,216,California Ave & Division St,259,California Ave & Francis Pl,Subscriber,Male,1979.0
2,16734063,9/30/2017 23:59:06,10/1/2017 00:02:52,2590,226,141,Clark St & Lincoln Ave,144,Larrabee St & Webster Ave,Subscriber,Male,1993.0
3,16734062,9/30/2017 23:58:56,10/1/2017 00:07:37,551,521,96,Desplaines St & Randolph St,217,Racine Ave (May St) & Fulton St,Customer,,
4,16734061,9/30/2017 23:58:47,10/1/2017 00:07:37,1287,530,96,Desplaines St & Randolph St,217,Racine Ave (May St) & Fulton St,Subscriber,Female,1994.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2277504,16734070,10/1/2017 0:01,10/1/2017 0:15,1099,837,289,Wells St & Concord Ln,226,Racine Ave & Belmont Ave,Subscriber,Male,1991.0
2277505,16734069,10/1/2017 0:00,10/1/2017 0:07,3688,366,45,Michigan Ave & Congress Pkwy,52,Michigan Ave & Lake St,Customer,,
2277506,16734068,10/1/2017 0:00,10/1/2017 0:05,1416,264,520,Greenview Ave & Jarvis Ave,447,Glenwood Ave & Morse Ave,Customer,,
2277507,16734067,10/1/2017 0:00,10/1/2017 0:06,5396,361,288,Larrabee St & Armitage Ave,289,Wells St & Concord Ln,Subscriber,Female,1984.0


In [4]:
# Missing-value tally per column of the combined trips table.
# isna() is the canonical spelling of isnull(); sum() reduces down the rows (axis=0) by default.
trips.isna().sum()

trip_id                   0
start_time                0
end_time                  0
bikeid                    0
tripduration              0
from_station_id           0
from_station_name         0
to_station_id             0
to_station_name           0
usertype                  0
gender               519960
birthyear            520075
dtype: int64

In [5]:
# Load the station reference table; it is joined onto trips by station id in a later cell.
stations = pd.read_csv('Divvy_Stations_2017_Q3Q4.csv')

In [6]:
# Show the raw station table, including the empty 'Unnamed: 7' column that is dropped before the join.
stations

Unnamed: 0,id,name,city,latitude,longitude,dpcapacity,online_date,Unnamed: 7
0,2,Buckingham Fountain,Chicago,41.876393,-87.620328,27,6/10/2013 10:43,
1,3,Shedd Aquarium,Chicago,41.867226,-87.615355,55,6/10/2013 10:44,
2,4,Burnham Harbor,Chicago,41.856268,-87.613348,23,6/10/2013 10:46,
3,5,State St & Harrison St,Chicago,41.874053,-87.627716,23,6/10/2013 10:46,
4,6,Dusable Harbor,Chicago,41.885041,-87.612794,39,6/10/2013 11:18,
...,...,...,...,...,...,...,...,...
580,622,California Ave & Cortez St,Chicago,41.900363,-87.696704,15,4/27/2017 9:17,
581,623,Michigan Ave & 8th St,Chicago,41.872773,-87.623981,31,6/13/2017 15:23,
582,624,Dearborn St & Van Buren St (*),Chicago,41.876268,-87.629155,16,7/26/2017 21:25,
583,625,Chicago Ave & Dempster St,Evanston,42.041691,-87.680687,15,8/11/2017 16:53,


In [7]:
# Keep only the station attributes needed for the join.
# errors='ignore' makes this cell safe to re-run: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
stations = stations.drop(columns=['name', 'online_date', 'Unnamed: 7'], errors='ignore')

# Origin-side lookup: station columns renamed with a from_ prefix.
# dpcapacity is kept un-prefixed, so in df it describes the origin station.
from_stations = stations.rename(columns={
    'id': 'from_station_id',
    'city': 'from_city',
    'latitude': 'from_station_latitude',
    'longitude': 'from_station_longitude',
}).drop_duplicates()

# Left join keeps every trip. sort=True orders the result by the join key.
# validate='many_to_one' raises MergeError if a station id occurs more than once in
# the lookup (drop_duplicates only removes fully identical rows), instead of
# silently duplicating trips.
df = pd.merge(trips, from_stations, how='left', on=['from_station_id'],
              sort=True, validate='many_to_one')

# Destination-side lookup: same attributes with a to_ prefix; dpcapacity is dropped
# so it does not collide with the origin station's capacity column.
to_stations = stations.drop(columns=['dpcapacity']).rename(columns={
    'id': 'to_station_id',
    'city': 'to_city',
    'latitude': 'to_station_latitude',
    'longitude': 'to_station_longitude',
}).drop_duplicates()

df = pd.merge(df, to_stations, how='left', on=['to_station_id'],
              sort=True, validate='many_to_one')

# Metadata for Stations:

Variables:

* id: ID attached to each station
* name: station name
* city: city in which the station is located
* latitude: station latitude
* longitude: station longitude
* dpcapacity: number of total docks at each station as of 12/31/2017
* online_date: date the station was created in the system

Notes:

* Divvy_Stations_2017_Q3Q4 has 586 rows

In [8]:
# Show the merged table: each trip row now carries origin and destination city/coordinates plus dpcapacity.
df

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_city,from_station_latitude,from_station_longitude,dpcapacity,to_city,to_station_latitude,to_station_longitude
0,16733952,9/30/2017 23:25:25,9/30/2017 23:27:14,5280,109,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.876393,-87.620328,27,Chicago,41.876393,-87.620328
1,16733950,9/30/2017 23:25:11,9/30/2017 23:27:34,5962,143,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.876393,-87.620328,27,Chicago,41.876393,-87.620328
2,16732033,9/30/2017 18:51:08,9/30/2017 19:39:47,361,2919,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.876393,-87.620328,27,Chicago,41.876393,-87.620328
3,16731858,9/30/2017 18:41:54,9/30/2017 19:40:31,1010,3517,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.876393,-87.620328,27,Chicago,41.876393,-87.620328
4,16704981,9/29/2017 09:51:47,9/29/2017 12:01:06,5439,7759,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.876393,-87.620328,27,Chicago,41.876393,-87.620328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277504,17534545,12/29/2017 21:11,12/29/2017 21:29,2928,1046,287,Franklin St & Monroe St,626,Delano Ct & Roosevelt Rd,Subscriber,Male,1992.0,Chicago,41.880317,-87.635185,27,Chicago,41.867491,-87.632190
2277505,17535695,12/31/2017 10:08,12/31/2017 10:14,6128,343,321,Wabash Ave & 9th St,626,Delano Ct & Roosevelt Rd,Subscriber,Male,1962.0,Chicago,41.870769,-87.625734,19,Chicago,41.867491,-87.632190
2277506,17532632,12/28/2017 17:05,12/28/2017 17:10,4907,296,321,Wabash Ave & 9th St,626,Delano Ct & Roosevelt Rd,Subscriber,Male,1992.0,Chicago,41.870769,-87.625734,19,Chicago,41.867491,-87.632190
2277507,17532112,12/28/2017 13:33,12/28/2017 13:39,1918,341,321,Wabash Ave & 9th St,626,Delano Ct & Roosevelt Rd,Subscriber,Male,1983.0,Chicago,41.870769,-87.625734,19,Chicago,41.867491,-87.632190


In [9]:
# Missing-value tally per column of the merged table; a non-zero count in any
# station-derived column would indicate a trip whose station id had no match.
# isna() is the canonical spelling of isnull(); sum() reduces down the rows (axis=0) by default.
df.isna().sum()

trip_id                        0
start_time                     0
end_time                       0
bikeid                         0
tripduration                   0
from_station_id                0
from_station_name              0
to_station_id                  0
to_station_name                0
usertype                       0
gender                    519960
birthyear                 520075
from_city                      0
from_station_latitude          0
from_station_longitude         0
dpcapacity                     0
to_city                        0
to_station_latitude            0
to_station_longitude           0
dtype: int64

In [10]:
# Persist the merged trips + stations table.
# index=True is the pandas default, spelled out here because it means the row index
# is written as a leading column with an empty header.
df.to_csv('Divvy_2017_Q3Q4.csv', index=True)