In [1]:
import pandas as pd

# Input files for the Divvy 2014 Q1/Q2 data; paths are relative to the
# notebook's working directory.
TRIPS_CSV = 'Divvy_Trips_2014_Q1Q2.csv'
STATIONS_CSV = 'Divvy_Stations_2014-Q1Q2.csv'

# Load the trip records and the station lookup table.
trips = pd.read_csv(TRIPS_CSV)
stations = pd.read_csv(STATIONS_CSV)

# Metadata for Trips Table:

Variables:

trip_id: ID attached to each trip taken

starttime: day and time trip started, in CST

stoptime: day and time trip ended, in CST

bikeid: ID attached to each bike

tripduration: time of trip in seconds

from_station_name: name of station where trip originated

to_station_name: name of station where trip terminated

from_station_id: ID of station where trip originated

to_station_id: ID of station where trip terminated

usertype: "Customer" is a rider who purchased a 24-Hour Pass; "Subscriber" is a rider who purchased an Annual Membership

gender: gender of rider

birthyear: birth year of rider


Notes:

* First row contains column names
* Total records = 905,699
* Trips that did not include a start or end date were removed from original table.
* Gender and birth year are only available for Subscribers

In [4]:
# Display the trips table; pandas truncates the repr to the first and last rows.
trips

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,2355134,6/30/2014 23:57,7/1/2014 0:07,2006,604,131,Lincoln Ave & Belmont Ave,303,Broadway & Cornelia Ave,Subscriber,Male,1988.0
1,2355133,6/30/2014 23:56,7/1/2014 0:00,2217,263,282,Halsted St & Maxwell St,22,May St & Taylor St,Subscriber,Male,1992.0
2,2355130,6/30/2014 23:33,6/30/2014 23:35,2798,126,327,Sheffield Ave & Webster Ave,225,Halsted St & Dickens Ave,Subscriber,Male,1993.0
3,2355129,6/30/2014 23:26,7/1/2014 0:24,173,3481,134,Peoria St & Jackson Blvd,194,State St & Wacker Dr,Subscriber,Female,1988.0
4,2355128,6/30/2014 23:16,6/30/2014 23:26,173,638,320,Loomis St & Lexington St,134,Peoria St & Jackson Blvd,Subscriber,Female,1988.0
...,...,...,...,...,...,...,...,...,...,...,...,...
905694,1109432,1/1/2014 1:43,1/1/2014 1:53,823,652,113,Bissell St & Armitage Ave,94,Clark St & Armitage Ave,Subscriber,Male,1988.0
905695,1109431,1/1/2014 1:43,1/1/2014 1:53,348,650,113,Bissell St & Armitage Ave,94,Clark St & Armitage Ave,Subscriber,Male,1988.0
905696,1109427,1/1/2014 1:12,1/1/2014 1:18,1818,346,240,Sheridan Rd & Irving Park Rd,245,Clarendon Ave & Junior Ter,Subscriber,Male,1961.0
905697,1109421,1/1/2014 0:45,1/1/2014 0:55,2981,608,69,Damen Ave & Pierce Ave,216,California Ave & Division St,Customer,,


In [5]:
# Number of missing values in each column of the trips table
# (isna is the same check as isnull; summing down the rows is the default axis).
trips.isna().sum()

trip_id                   0
starttime                 0
stoptime                  0
bikeid                    0
tripduration              0
from_station_id           0
from_station_name         0
to_station_id             0
to_station_name           0
usertype                  0
gender               314022
birthyear            313977
dtype: int64

# Metadata for Stations table:

Variables:

id: ID attached to each station

name: station name

latitude: station latitude

longitude: station longitude

dpcapacity: number of total docks at each station as of 8/20/2014

online date: date the station went live in the system


In [6]:
# Preview the first five rows of the stations table.
stations.head()

Unnamed: 0,id,name,latitude,longitude,dpcapacity,online date
0,43,Michigan Ave & Washington St,41.883893,-87.624649,43,6/16/2013
1,44,State St & Randolph St,41.88473,-87.627734,27,6/16/2013
2,33,State St & Van Buren St,41.877181,-87.627844,27,6/25/2013
3,199,Wabash Ave & Grand Ave,41.891738,-87.626937,15,8/10/2013
4,51,Clark St & Randolph St,41.884576,-87.63189,31,6/17/2013


In [7]:
# Number of missing values in each column of the stations table
# (isna is the same check as isnull; summing down the rows is the default axis).
stations.isna().sum()

id             0
name           0
latitude       0
longitude      0
dpcapacity     0
online date    0
dtype: int64

In [8]:
# Keep only the station columns needed for the joins (id, latitude, longitude,
# dpcapacity). errors='ignore' lets this cell be re-run after the columns are
# already gone instead of raising KeyError.
stations = stations.drop(columns=['name', 'online date'], errors='ignore')

# Lookup table for the origin station of each trip. dpcapacity keeps its
# original name, so in the merged frame it is the dock capacity of the
# *origin* station.
from_stations = stations.rename(columns={
    'id': 'from_station_id',
    'latitude': 'from_station_latitude',
    'longitude': 'from_station_longitude',
}).drop_duplicates()

# Left join keeps every trip; sort=True orders the result by from_station_id.
# validate='many_to_one' raises MergeError if a station id appears more than
# once in the lookup (drop_duplicates only removes fully identical rows), which
# would otherwise silently duplicate trips.
df = pd.merge(trips, from_stations, how='left', on='from_station_id',
              sort=True, validate='many_to_one')
df.head()

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity
0,2355020,6/30/2014 21:16,6/30/2014 21:23,314,433,5,State St & Harrison St,52,Michigan Ave & Lake St,Customer,,,41.873958,-87.627739,19
1,2354741,6/30/2014 20:00,6/30/2014 20:16,200,930,5,State St & Harrison St,66,Clinton St & Lake St,Subscriber,Female,1982.0,41.873958,-87.627739,19
2,2354594,6/30/2014 19:08,6/30/2014 19:14,2824,410,5,State St & Harrison St,49,Dearborn St & Monroe St,Subscriber,Female,1988.0,41.873958,-87.627739,19
3,2354310,6/30/2014 18:42,6/30/2014 18:46,2070,218,5,State St & Harrison St,45,Michigan Ave & Congress Pkwy,Subscriber,Female,1989.0,41.873958,-87.627739,19
4,2354168,6/30/2014 18:34,6/30/2014 18:51,632,985,5,State St & Harrison St,84,Green St & Milwaukee Ave,Subscriber,Male,1988.0,41.873958,-87.627739,19


In [9]:
# Sanity check on the merge: count the distinct origin-station longitudes.
# Alternative (same result when there are no missing values):
#   len(pd.unique(df['from_station_longitude']))
unique_from_longitudes = df['from_station_longitude'].nunique()
print(unique_from_longitudes)

300


In [11]:
# Lookup table for the destination station: coordinates only. dpcapacity is
# dropped here, so the merged frame carries a single dpcapacity column coming
# from the origin-station join.
to_stations = (
    stations
    .drop(columns=['dpcapacity'])
    .rename(columns={
        'id': 'to_station_id',
        'latitude': 'to_station_latitude',
        'longitude': 'to_station_longitude',
    })
    .drop_duplicates()
)

# Remove destination coordinates left over from a previous run of this cell so
# a re-run does not produce suffixed _x/_y duplicates; a no-op on the first run.
to_coord_cols = ['to_station_latitude', 'to_station_longitude']

# Left join keeps every trip; sort=True orders the result by to_station_id.
# validate='many_to_one' raises MergeError if a station id is repeated in the
# lookup, which would otherwise silently duplicate trips.
df = pd.merge(df.drop(columns=to_coord_cols, errors='ignore'), to_stations,
              how='left', on='to_station_id', sort=True, validate='many_to_one')
df.head()

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity,to_station_latitude,to_station_longitude
0,2340422,6/29/2014 19:48,6/29/2014 21:41,2870,6835,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
1,2340356,6/29/2014 19:44,6/29/2014 21:42,2462,7070,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
2,2330419,6/29/2014 12:44,6/29/2014 14:18,2398,5640,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
3,2323885,6/28/2014 22:10,6/28/2014 22:50,2356,2425,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
4,2315406,6/28/2014 14:25,6/28/2014 14:36,2255,656,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739


In [12]:
# Display the current merged frame (trips with station columns joined in);
# pandas truncates the repr to the first and last rows.
df

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity,to_station_latitude,to_station_longitude
0,2340422,6/29/2014 19:48,6/29/2014 21:41,2870,6835,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
1,2340356,6/29/2014 19:44,6/29/2014 21:42,2462,7070,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
2,2330419,6/29/2014 12:44,6/29/2014 14:18,2398,5640,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
3,2323885,6/28/2014 22:10,6/28/2014 22:50,2356,2425,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
4,2315406,6/28/2014 14:25,6/28/2014 14:36,2255,656,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905694,1517029,4/27/2014 12:49,4/27/2014 13:10,1002,1274,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Subscriber,Male,1947.0,41.803038,-87.606615,15,41.803038,-87.606615
905695,1495231,4/24/2014 19:47,4/25/2014 9:28,2569,49255,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615
905696,1409078,4/13/2014 10:45,4/13/2014 11:32,451,2808,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615
905697,1315877,3/31/2014 14:51,3/31/2014 19:09,596,15474,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615


In [13]:
# List the distinct rider categories present in the usertype column.
df['usertype'].unique()

array(['Customer', 'Subscriber'], dtype=object)

In [14]:
# Relabel the rider categories so they match our own variable names:
# 'Customer' -> 'casual', 'Subscriber' -> 'member'.
usertype_labels = {'Customer': 'casual', 'Subscriber': 'member'}
df['usertype'] = df['usertype'].replace(usertype_labels)

# Confirm that only the new labels remain.
df['usertype'].unique()

array(['casual', 'member'], dtype=object)

In [15]:
# Display df again to inspect the usertype column after relabelling.
df

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity,to_station_latitude,to_station_longitude
0,2340422,6/29/2014 19:48,6/29/2014 21:41,2870,6835,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
1,2340356,6/29/2014 19:44,6/29/2014 21:42,2462,7070,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
2,2330419,6/29/2014 12:44,6/29/2014 14:18,2398,5640,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
3,2323885,6/28/2014 22:10,6/28/2014 22:50,2356,2425,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
4,2315406,6/28/2014 14:25,6/28/2014 14:36,2255,656,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905694,1517029,4/27/2014 12:49,4/27/2014 13:10,1002,1274,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,member,Male,1947.0,41.803038,-87.606615,15,41.803038,-87.606615
905695,1495231,4/24/2014 19:47,4/25/2014 9:28,2569,49255,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615
905696,1409078,4/13/2014 10:45,4/13/2014 11:32,451,2808,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615
905697,1315877,3/31/2014 14:51,3/31/2014 19:09,596,15474,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615


In [17]:
# Write the merged table to disk. With to_csv defaults the DataFrame index is
# written as an unnamed first column.
merged_csv_path = 'Divvy 2014 Q1Q2 merged.csv'
df.to_csv(merged_csv_path)