In [30]:
import pandas as pd

# Read each quarterly trip export and stack them into a single frame with a
# fresh 0..n-1 row index.
trips = pd.concat(
    [
        pd.read_csv(quarter_csv)
        for quarter_csv in ('Divvy_Trips_2017_Q1.csv', 'Divvy_Trips_2017_Q2.csv')
    ],
    ignore_index=True,
)

# Metadata for Trips:

Variables:

trip_id: ID attached to each trip taken

start_time: day and time trip started, in CST

end_time: day and time trip ended, in CST

bikeid: ID attached to each bike

tripduration: time of trip in seconds

from_station_name: name of station where trip originated

to_station_name: name of station where trip terminated

from_station_id: ID of station where trip originated

to_station_id: ID of station where trip terminated

usertype: "Customer" is a rider who purchased a 24-Hour Pass; "Subscriber" is a rider who purchased an Annual Membership

gender: gender of rider

birthyear: birth year of rider


Notes:

* First row contains column names
* Trips that did not include a start or end date are excluded
* Trips less than 1 minute in duration are excluded
* Trips greater than 24 hours in duration are excluded
* Gender and birth year are only available for Subscribers
* Divvy_Trips_2017_Q1 has 431,691 rows
* Divvy_Trips_2017_Q2 has 1,119,814 rows

In [31]:
# Preview the combined Q1 + Q2 trips frame (the rendered output is truncated
# to the first and last rows).
trips

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,13518905,3/31/2017 23:59:07,4/1/2017 00:13:24,5292,857,66,Clinton St & Lake St,171,May St & Cullerton St,Subscriber,Male,1989.0
1,13518904,3/31/2017 23:56:25,4/1/2017 00:00:21,4408,236,199,Wabash Ave & Grand Ave,26,McClurg Ct & Illinois St,Subscriber,Male,1990.0
2,13518903,3/31/2017 23:55:33,4/1/2017 00:01:21,696,348,520,Greenview Ave & Jarvis Ave,432,Clark St & Lunt Ave,Subscriber,Female,1979.0
3,13518902,3/31/2017 23:54:46,3/31/2017 23:59:34,4915,288,110,Dearborn St & Erie St,142,McClurg Ct & Erie St,Subscriber,Male,1985.0
4,13518901,3/31/2017 23:53:33,4/1/2017 00:00:28,4247,415,327,Sheffield Ave & Webster Ave,331,Halsted St & Blackhawk St (*),Subscriber,Female,1989.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1551500,13518910,4/1/2017 00:06:30,4/1/2017 00:11:56,1796,326,296,Broadway & Belmont Ave,232,Pine Grove Ave & Waveland Ave,Subscriber,Male,1960.0
1551501,13518909,4/1/2017 00:06:00,4/1/2017 00:20:53,22,893,199,Wabash Ave & Grand Ave,68,Clinton St & Tilden St,Subscriber,Male,1979.0
1551502,13518908,4/1/2017 00:04:13,4/1/2017 00:19:20,1296,907,56,Desplaines St & Kinzie St,69,Damen Ave & Pierce Ave,Subscriber,Male,1993.0
1551503,13518907,4/1/2017 00:03:08,4/1/2017 00:08:24,2695,316,332,Halsted St & Diversey Pkwy,226,Racine Ave & Belmont Ave,Subscriber,Male,1986.0


In [32]:
# Tally the missing values in every column of the raw trips frame.
trips.isna().sum()

trip_id                   0
start_time                0
end_time                  0
bikeid                    0
tripduration              0
from_station_id           0
from_station_name         0
to_station_id             0
to_station_name           0
usertype                  0
gender               316867
birthyear            316683
dtype: int64

In [33]:
# Load the Divvy station metadata for 2017 Q1-Q2 from its CSV export.
stations = pd.read_csv(filepath_or_buffer='Divvy_Stations_2017_Q1Q2.csv')

In [34]:
# Preview the stations table exactly as loaded from the CSV.
stations

Unnamed: 0,id,name,city,latitude,longitude,dpcapacity,online_date
0,456,2112 W Peterson Ave,Chicago,41.991178,-87.683593,15,2/10/2015 14:04:42
1,101,63rd St Beach,Chicago,41.781016,-87.576120,23,7/16/2013 01:27:50
2,109,900 W Harrison St,Chicago,41.874675,-87.650019,19,7/18/2013 16:45:02
3,21,Aberdeen St & Jackson Blvd,Chicago,41.877726,-87.654787,15,6/22/2013 19:07:12
4,80,Aberdeen St & Madison (Monroe) St,Chicago,41.881567,-87.655056,19,6/26/2013 19:00:18
...,...,...,...,...,...,...,...
577,610,Marion St & South Blvd,Oak Park,41.886810,-87.802870,19,6/23/2016 12:24:30
578,616,Oak Park Ave & Harrison St,Oak Park,41.872987,-87.793945,19,6/23/2016 12:27:31
579,611,Oak Park Ave & South Blvd,Oak Park,41.886923,-87.793899,19,6/23/2016 12:25:00
580,612,Ridgeland Ave & Lake St,Oak Park,41.888085,-87.785236,15,6/23/2016 12:25:30


In [37]:
# Attach station attributes to every trip twice: once keyed on the origin
# station (from_station_id) and once keyed on the destination station
# (to_station_id). The result is `df`, one row per trip.

# Keep only the station columns used in the joins. errors='ignore' makes the
# cell safe to re-run: without it a second execution raises KeyError because
# 'name' and 'online_date' were already removed from `stations`.
stations = stations.drop(columns=['name', 'online_date'], errors='ignore')

# Origin-side lookup table. `dpcapacity` is deliberately left unrenamed, so in
# `df` that column describes the origin station.
from_stations = stations.rename(columns={
    'id': 'from_station_id',
    'city': 'from_city',
    'latitude': 'from_station_latitude',
    'longitude': 'from_station_longitude',
}).drop_duplicates()

# sort=True orders the result by the join key (it determines the row order of
# `df`). validate='many_to_one' raises MergeError if a station id appears more
# than once in the lookup, instead of silently duplicating trip rows.
df = pd.merge(trips, from_stations, how='left', on=['from_station_id'],
              sort=True, validate='many_to_one')

# Destination-side lookup table; dock capacity is dropped here so it does not
# collide with the origin station's `dpcapacity` column.
to_stations = stations.drop(columns=['dpcapacity']).rename(columns={
    'id': 'to_station_id',
    'city': 'to_city',
    'latitude': 'to_station_latitude',
    'longitude': 'to_station_longitude',
}).drop_duplicates()

df = pd.merge(df, to_stations, how='left', on=['to_station_id'],
              sort=True, validate='many_to_one')

# Metadata for Stations:

Variables:

id: ID attached to each station
name: station name
city: city in which the station is located
latitude: station latitude
longitude: station longitude
dpcapacity: number of total docks at each station as of 6/30/2017
online_date: date the station was created in the system

Notes:

* Divvy_Stations_2017_Q1Q2 has 582 rows

In [38]:
# Preview the trips frame after the origin- and destination-station merges
# (the rendered output is truncated to the first and last rows).
df

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_city,from_station_latitude,from_station_longitude,dpcapacity,to_city,to_station_latitude,to_station_longitude
0,13518771,3/31/2017 22:02:23,3/31/2017 22:37:31,5453,2108,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.881060,-87.619486,27,Chicago,41.881060,-87.619486
1,13518768,3/31/2017 22:01:25,3/31/2017 22:37:31,5003,2166,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.881060,-87.619486,27,Chicago,41.881060,-87.619486
2,13496439,3/28/2017 12:28:25,3/28/2017 12:43:59,5789,934,2,Buckingham Fountain,2,Buckingham Fountain,Subscriber,Female,1969.0,Chicago,41.881060,-87.619486,27,Chicago,41.881060,-87.619486
3,13491648,3/27/2017 17:59:39,3/27/2017 18:02:00,5237,141,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.881060,-87.619486,27,Chicago,41.881060,-87.619486
4,13488885,3/27/2017 13:10:04,3/27/2017 13:38:57,1021,1733,2,Buckingham Fountain,2,Buckingham Fountain,Customer,,,Chicago,41.881060,-87.619486,27,Chicago,41.881060,-87.619486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551500,14543612,6/15/2017 06:12:28,6/15/2017 07:38:40,4401,5172,623,Michigan Ave & 8th St,623,Michigan Ave & 8th St,Customer,,,Chicago,41.872773,-87.623981,23,Chicago,41.872773,-87.623981
1551501,14542821,6/14/2017 22:20:51,6/14/2017 22:41:09,1288,1218,623,Michigan Ave & 8th St,623,Michigan Ave & 8th St,Customer,,,Chicago,41.872773,-87.623981,23,Chicago,41.872773,-87.623981
1551502,14542819,6/14/2017 22:20:35,6/14/2017 22:41:23,4906,1248,623,Michigan Ave & 8th St,623,Michigan Ave & 8th St,Customer,,,Chicago,41.872773,-87.623981,23,Chicago,41.872773,-87.623981
1551503,14536693,6/14/2017 12:45:18,6/14/2017 13:05:23,5415,1205,623,Michigan Ave & 8th St,623,Michigan Ave & 8th St,Customer,,,Chicago,41.872773,-87.623981,23,Chicago,41.872773,-87.623981


In [39]:
# Tally the missing values in every column of the merged frame; the station
# columns added by the left joins should show zero if every id matched.
df.isna().sum()

trip_id                        0
start_time                     0
end_time                       0
bikeid                         0
tripduration                   0
from_station_id                0
from_station_name              0
to_station_id                  0
to_station_name                0
usertype                       0
gender                    316867
birthyear                 316683
from_city                      0
from_station_latitude          0
from_station_longitude         0
dpcapacity                     0
to_city                        0
to_station_latitude            0
to_station_longitude           0
dtype: int64

In [41]:
# Persist the merged trips/stations frame. index=True is the pandas default;
# it is spelled out because the row index is written as an unnamed first
# column that readers of this file must account for.
df.to_csv('Divvy_2017_Q1Q2.csv', index=True)