# Bay Area Bikeshare: Raw Data
Raw data archive (babs.zip): https://www.dropbox.com/s/jb3q97i27ujtxsg/babs.zip?dl=0

In [41]:
from datetime import datetime, date, time
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ggplot import *

In [42]:
# lambdas & functions
# Return the 'city' value of the station whose index label is `x`, read from
# the module-level `station_data` frame (expected to be indexed by station id).
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0; on an integer index both perform the same label-based lookup.
lambda_city = lambda x: station_data['city'].loc[x]

def convert_float(val):
    """Convert *val* to a float, falling back to 0.0 when that is impossible.

    Args:
        val: Any value; typically a raw CSV cell (string or number).

    Returns:
        ``float(val)`` when the conversion succeeds, otherwise ``0.0``.
        The fallback is a float (not the int ``0``) so a mapped column keeps
        a single numeric type.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None or other objects
        # that float() cannot accept at all.
        return 0.0

# dictionary reference
# Month number (1-12) -> season label; applied to `.dt.month` values.
# The tuple is positional: entry i is the label for month i + 1.
seasons_dict = dict(enumerate((
    'Winter',                        # 1
    'Spring', 'Spring', 'Spring',    # 2-4
    'Summer', 'Summer', 'Summer',    # 5-7
    'Autumn', 'Autumn', 'Autumn',    # 8-10
    'Winter', 'Winter',              # 11-12
), start=1))

# ZIP code -> city name; used to label rows by their 'ZIP' column.
city_dict = dict([
    (94107, 'San Francisco'),
    (94063, 'Redwood City'),
    (94301, 'Palo Alto'),
    (94041, 'Mountain View'),
    (95113, 'San Jose'),
])

# header reference
# Replacement column names for the station CSV, in file column order.
station_headers = [
    'station_id',
    'station_name',
    'lat',
    'long',
    'dock_count',
    'city',
    'install_date',
]

# Replacement column names for the status CSVs, in file column order.
status_headers = [
    'station_id',
    'bikes_free',
    'docks_free',
    'time',
]

# Replacement column names for the trip CSVs, in file column order.
trip_headers = [
    'trip_id',
    'trip_time',
    # trip start
    'start_dt',
    'start_station',
    'start_id',
    # trip end
    'end_dt',
    'end_station',
    'end_id',
    # bike / rider
    'bike_num',
    'user_type',
    'ZIP',
]

# Replacement column names for the weather CSVs, in file column order.
# Most measurements come as a max/mean/min triple.
weather_headers = [
    'date',
    'max_temp', 'mean_temp', 'min_temp',
    'max_dp', 'mean_dp', 'min_dp',
    'max_hum', 'mean_hum', 'min_hum',
    'max_sea', 'mean_sea', 'min_sea',
    'max_vis', 'mean_vis', 'min_vis',
    'max_wind', 'mean_wind', 'max_gust',
    'rain_inches',
    'cloud_cover',
    'events',
    'wind_dir_degrees',
    'ZIP',
]

In [43]:
# STATION_DATA
# Load the station table and apply the short column names.
station_data = pd.read_csv('201508_station_data.csv')
station_data.columns = station_headers
# current merged station dataset: ordered by station_id, which then becomes
# the index (and is removed from the columns)
station_data = (
    station_data
    .sort_values('station_id', ascending=True)
    .set_index('station_id', drop=True)
)

In [51]:
# STATUS_DATA
# Load the three status extracts, parsing the 'time' column as datetimes,
# and give each the same short column names so they can be stacked.
raw_data_1 = pd.read_csv('201402_status_data.csv', parse_dates=['time'])
raw_data_1.columns = status_headers
raw_data_2 = pd.read_csv('201408_status_data.csv', parse_dates=['time'])
raw_data_2.columns = status_headers
raw_data_3 = pd.read_csv('201508_status_data.csv', parse_dates=['time'])
raw_data_3.columns = status_headers
# current merged status dataset
# NOTE: concat keeps each file's own row labels, so index values repeat
# across the three extracts.
status_data = pd.concat([raw_data_1, raw_data_2, raw_data_3])
# added columns to status_data (all derived from the 'time' timestamp)
status_data['season'] = status_data['time'].dt.month.map(seasons_dict)
status_data['date'] = status_data['time'].dt.date
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns the
# same English day names.
status_data['weekday'] = status_data['time'].dt.day_name()

In [49]:
# TRIP_DATA
# Load the three trip extracts, parsing 'Start Date' as datetimes, and give
# each the same short column names so they can be stacked.
raw_data_1a = pd.read_csv('201402_trip_data.csv', parse_dates=['Start Date'])
raw_data_1a.columns = trip_headers
raw_data_2a = pd.read_csv('201408_trip_data.csv', parse_dates=['Start Date'])
raw_data_2a.columns = trip_headers
raw_data_3a = pd.read_csv('201508_trip_data.csv', parse_dates=['Start Date'])
raw_data_3a.columns = trip_headers
# current merged trip dataset
trip_data = pd.concat([raw_data_1a, raw_data_2a, raw_data_3a])
# added columns to trip_data
# Map station ids to cities through the station table's 'city' column in one
# vectorised step instead of a per-row lookup. Ids that are not present in
# station_data's index become NaN rather than raising KeyError.
station_city = station_data['city']
trip_data['start_city'] = trip_data['start_id'].map(station_city)
trip_data['end_city'] = trip_data['end_id'].map(station_city)
trip_data['date'] = trip_data['start_dt'].dt.date
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns the
# same English day names.
trip_data['weekday'] = trip_data['start_dt'].dt.day_name()
# dropped columns from trip_data
trip_data.drop('ZIP', axis=1, inplace=True)
# sorting & dropping & reformatting
# Sort on the 'date' column *before* it is also used as the index: once the
# index carries the name 'date' too, sort_values('date') is ambiguous and
# current pandas raises ValueError.
trip_data = trip_data.sort_values('date', ascending=True)
trip_data = trip_data.set_index(pd.DatetimeIndex(trip_data['date']))
# Remove all rows for the listed dates; real Timestamps are passed so the
# labels match the DatetimeIndex without relying on string coercion.
trip_data = trip_data.drop(
    pd.to_datetime(['2013-08-29', '2013-08-30', '2013-08-31']))

In [50]:
# WEATHER_DATA
# Load the three weather extracts. The date column is named 'Date' in the
# first file and 'PDT' in the other two; all are renamed to the same short
# column names so they can be stacked.
raw_data_1b = pd.read_csv('201402_weather_data.csv', parse_dates=['Date'])
raw_data_1b.columns = weather_headers
raw_data_2b = pd.read_csv('201408_weather_data.csv', parse_dates=['PDT'])
raw_data_2b.columns = weather_headers
raw_data_3b = pd.read_csv('201508_weather_data.csv', parse_dates=['PDT'])
raw_data_3b.columns = weather_headers
# current merged weather dataset
weather_data = pd.concat([raw_data_1b, raw_data_2b, raw_data_3b])
# added columns to weather_data
# Coerce 'date' to datetimes first so the `.dt` accessors below are valid
# even if a file's date column was not parsed on read.
weather_data['date'] = pd.to_datetime(weather_data['date'])
weather_data['season'] = weather_data['date'].dt.month.map(seasons_dict)
weather_data['city'] = weather_data['ZIP'].map(city_dict)
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns the
# same English day names.
weather_data['day_of_week'] = weather_data['date'].dt.day_name()
# scrubbed columns from weather_data: values that cannot be read as a float
# are replaced by convert_float's fallback
weather_data['rain_inches'] = weather_data['rain_inches'].map(convert_float)
# dropped columns from weather_data
weather_data.drop(['events'], axis=1, inplace=True)
# index on weather_data
weather_data.set_index('date', drop=True, inplace=True)
# sorting & dropping & reformatting
weather_data = weather_data.sort_index(axis=0, ascending=True)
# Remove all rows for the listed dates; real Timestamps are passed so the
# labels match the DatetimeIndex without relying on string coercion.
weather_data = weather_data.drop(
    pd.to_datetime(['2013-08-29', '2013-08-30', '2013-08-31']))
# fill NaN = ''
# NOTE: numeric columns that contained NaN hold a mix of numbers and ''
# after this step.
weather_data = weather_data.fillna('')

In [18]:
# ./data subdirectory
# Create it only when missing so this cell can be re-run safely
# (`!mkdir data` errors out when the directory already exists).
import os

if not os.path.isdir('data'):
    os.makedirs('data')

In [52]:
# final output: station table written with its index as the first CSV column
station_data.to_csv(path_or_buf='./data/station_data_new.csv')

In [53]:
# final output: weather table written with its index as the first CSV column
weather_data.to_csv(path_or_buf='./data/weather_data_new.csv')

In [56]:
# final output
# Write the whole trip table, derived columns included, with a header row.
# The former `cols=` and `parse_dates=` arguments are removed: neither is a
# DataFrame.to_csv parameter (`parse_dates` belongs to read_csv), and the
# column list named 'ZIP', which has already been dropped from trip_data.
trip_data.to_csv('./data/trip_data_new.csv', header=True)

In [57]:
# final output
# `parse_dates` is a read_csv option, not a DataFrame.to_csv parameter, so it
# is removed here; pass it when reading the file back instead.
status_data.to_csv('./data/status_data_new.csv')

In [45]:
# Reload the exported weather table, parsing its 'date' column as datetimes.
weathers = pd.read_csv(
    filepath_or_buffer='./data/weather_data_new.csv',
    parse_dates=['date'],
)