In [20]:
import pandas as pd

Q3 = pd.concat([pd.read_csv('Divvy_Trips_2014-Q3-07.csv'), pd.read_csv('Divvy_Trips_2014-Q3-0809.csv')], ignore_index=True)
Q3.to_csv('Divvy_Trips_2014_Q3.csv')
Q3Q4 = pd.concat([pd.read_csv('Divvy_Trips_2014_Q3.csv'), pd.read_csv('Divvy_Trips_2014-Q4.csv')], ignore_index=True)
Q3Q4.to_csv('Divvy_Trips_2014_Q3Q4.csv')

# Metadata for Trips Table:

Variables:

trip_id: ID attached to each trip taken

starttime: day and time trip started, in CST

stoptime: day and time trip ended, in CST

bikeid: ID attached to each bike

tripduration: time of trip in seconds

from_station_name: name of station where trip originated

to_station_name: name of station where trip terminated

from_station_id: ID of station where trip originated

to_station_id: ID of station where trip terminated

usertype: "Customer" is a rider who purchased a 24-Hour Pass; "Subscriber" is a rider who purchased an Annual Membership

gender: gender of rider

birthyear: birth year of rider


Notes:

* First row contains column names
* Total records = 
Q3-07 = 410,340
Q3-0809 = 700,630
Q4 = 437,965
* Trips that did not include a start or end date were removed from original table.
* Gender and birthday are only available for Subscribers

In [21]:
trips = Q3
stations = pd.read_csv('Divvy_Stations_2014-Q3Q4.csv')

In [22]:
trips

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,2886259,7/31/2014 23:56,8/1/2014 0:03,2602,386,291,Wells St & Evergreen Ave,53,Wells St & Erie St,Subscriber,Female,1979.0
1,2886258,7/31/2014 23:58,8/1/2014 0:07,2403,495,98,LaSalle St & Washington St,106,State St & Pearson St,Subscriber,Male,1974.0
2,2886257,7/31/2014 23:58,8/1/2014 2:10,669,7947,240,Sheridan Rd & Irving Park Rd,240,Sheridan Rd & Irving Park Rd,Customer,,
3,2886256,7/31/2014 23:58,8/1/2014 0:19,2431,1282,47,State St & Kinzie St,14,Morgan St & 18th St,Customer,,
4,2886255,7/31/2014 23:57,8/1/2014 2:10,2885,7972,240,Sheridan Rd & Irving Park Rd,240,Sheridan Rd & Irving Park Rd,Customer,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1110965,2886264,8/1/2014 0:01,8/1/2014 0:20,67,1159,310,Damen Ave & Charleston St,113,Bissell St & Armitage Ave,Customer,,
1110966,2886263,8/1/2014 0:00,8/1/2014 0:13,1452,797,330,Lincoln Ave & Addison St,239,Western Ave & Leland Ave,Subscriber,Male,1977.0
1110967,2886262,8/1/2014 0:00,8/1/2014 0:20,334,1172,310,Damen Ave & Charleston St,113,Bissell St & Armitage Ave,Customer,,
1110968,2886261,8/1/2014 0:00,8/1/2014 0:05,1645,299,166,Ashland Ave & Wrightwood Ave,152,Lincoln Ave & Diversey Pkwy,Subscriber,Male,1995.0


In [23]:
# count nulls in each column
trips.isnull().sum(axis=0)

trip_id                   0
starttime                 0
stoptime                  0
bikeid                    0
tripduration              0
from_station_id           0
from_station_name         0
to_station_id             0
to_station_name           0
usertype                  0
gender               407669
birthyear            407685
dtype: int64

# Metadata for Stations table:

Variables:

name: station name

latitude: station latitude

longitude: station longitude

dpcapacity: number of total docks at each station as of 12/31/2014

online date: date the station went live in the system

In [24]:
stations.head()

Unnamed: 0,id,name,latitude,longitude,dpcapacity,dateCreated
0,5,State St & Harrison St,41.873958,-87.627739,19,6/10/2013 10:46
1,13,Wilton Ave & Diversey Pkwy,41.9325,-87.652681,19,6/22/2013 18:29
2,14,Morgan St & 18th St,41.858086,-87.651073,15,6/22/2013 18:33
3,15,Racine Ave & 19th St,41.856453,-87.656471,15,6/22/2013 18:35
4,16,Wood St & North Ave,41.910329,-87.672516,15,6/22/2013 18:55


In [25]:
# count nulls in each column
stations.isnull().sum(axis=0)

id             0
name           0
latitude       0
longitude      0
dpcapacity     0
dateCreated    0
dtype: int64

In [26]:
stations = stations.drop(['name', 'dateCreated'], axis=1)
from_stations = stations
from_stations = from_stations.rename(columns = {'id': 'from_station_id', 'latitude' : 'from_station_latitude', 'longitude' : 'from_station_longitude'})
from_stations = from_stations.drop_duplicates()
df = pd.merge(trips, from_stations, how='left', on=['from_station_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True)

In [27]:
df.head()

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity
0,2885586,7/31/2014 22:17,7/31/2014 22:39,866,1297,5,State St & Harrison St,171,May St & Cullerton St,Subscriber,Female,1985.0,41.873958,-87.627739,19
1,2884955,7/31/2014 21:15,7/31/2014 21:22,2375,437,5,State St & Harrison St,77,Clinton St & Madison St,Subscriber,Male,1984.0,41.873958,-87.627739,19
2,2884939,7/31/2014 21:18,7/31/2014 21:24,1338,363,5,State St & Harrison St,168,Michigan Ave & 14th St,Subscriber,Male,1983.0,41.873958,-87.627739,19
3,2884725,7/31/2014 20:58,7/31/2014 21:09,1783,691,5,State St & Harrison St,273,Michigan Ave & 18th St,Subscriber,Female,1986.0,41.873958,-87.627739,19
4,2884640,7/31/2014 20:49,7/31/2014 21:15,1454,1557,5,State St & Harrison St,291,Wells St & Evergreen Ave,Subscriber,Male,1985.0,41.873958,-87.627739,19


In [28]:
# counting unique longitude values to make sure it merged correctly
# this works too: len(pd.unique(new['from_station_longitude']))
print(df.from_station_longitude.nunique())

299


In [33]:
# count nulls in each column
trips.isnull().sum(axis=0)

trip_id                   0
starttime                 0
stoptime                  0
bikeid                    0
tripduration              0
from_station_id           0
from_station_name         0
to_station_id             0
to_station_name           0
usertype                  0
gender               407669
birthyear            407685
dtype: int64

In [34]:
to_stations = stations.drop(['dpcapacity'],axis=1)
to_stations = to_stations.rename(columns = {'id': 'to_station_id', 'latitude' : 'to_station_latitude', 'longitude' : 'to_station_longitude'})
to_stations = to_stations.drop_duplicates()
df = pd.merge(df, to_stations, how='left', on=['to_station_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True)

In [36]:
df

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity,to_station_latitude,to_station_longitude
0,2857636,7/30/2014 14:25,7/30/2014 14:27,838,137,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
1,2857578,7/30/2014 14:21,7/30/2014 14:28,2835,370,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
2,2856592,7/30/2014 13:07,7/30/2014 13:16,2130,545,5,State St & Harrison St,5,State St & Harrison St,Subscriber,Male,1975.0,41.873958,-87.627739,19,41.873958,-87.627739
3,2838968,7/29/2014 11:55,7/29/2014 13:06,1117,4257,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
4,2838543,7/29/2014 11:24,7/29/2014 11:53,1971,1748,5,State St & Harrison St,5,State St & Harrison St,Customer,,,41.873958,-87.627739,19,41.873958,-87.627739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110965,2900341,8/1/2014 19:00,8/1/2014 19:16,2865,987,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615
1110966,2899805,8/1/2014 18:31,8/1/2014 18:59,2865,1654,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615
1110967,2899064,8/1/2014 17:57,8/1/2014 18:30,2865,1986,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615
1110968,2897582,8/1/2014 17:03,8/1/2014 17:32,2865,1726,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,Customer,,,41.803038,-87.606615,15,41.803038,-87.606615


In [35]:
pd.unique(df['usertype'])

array(['Customer', 'Subscriber'], dtype=object)

In [37]:
# changing boolean strings so that they are in line with our actual variable names

df.loc[df.usertype=='Customer', 'usertype'] = 'casual'
df.loc[df.usertype=='Subscriber', 'usertype'] = 'member'
pd.unique(df['usertype'])

array(['casual', 'member'], dtype=object)

In [38]:
df

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,from_station_latitude,from_station_longitude,dpcapacity,to_station_latitude,to_station_longitude
0,2857636,7/30/2014 14:25,7/30/2014 14:27,838,137,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
1,2857578,7/30/2014 14:21,7/30/2014 14:28,2835,370,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
2,2856592,7/30/2014 13:07,7/30/2014 13:16,2130,545,5,State St & Harrison St,5,State St & Harrison St,member,Male,1975.0,41.873958,-87.627739,19,41.873958,-87.627739
3,2838968,7/29/2014 11:55,7/29/2014 13:06,1117,4257,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
4,2838543,7/29/2014 11:24,7/29/2014 11:53,1971,1748,5,State St & Harrison St,5,State St & Harrison St,casual,,,41.873958,-87.627739,19,41.873958,-87.627739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110965,2900341,8/1/2014 19:00,8/1/2014 19:16,2865,987,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615
1110966,2899805,8/1/2014 18:31,8/1/2014 18:59,2865,1654,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615
1110967,2899064,8/1/2014 17:57,8/1/2014 18:30,2865,1986,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615
1110968,2897582,8/1/2014 17:03,8/1/2014 17:32,2865,1726,351,Cottage Grove Ave & 51st St,351,Cottage Grove Ave & 51st St,casual,,,41.803038,-87.606615,15,41.803038,-87.606615


In [40]:
df.to_csv('Divvy 2014 Q3Q4 merged.csv')