In [1]:
import csv
from datetime import datetime
import numpy as np
import pandas as pd 
%matplotlib inline

In [2]:
def create_station_mapping(station_data):
    """Build a lookup from station ID to the city ("landmark") it belongs to.

    Args:
        station_data: Iterable of paths to station CSV files. Each file must
            start with a header row that contains the columns ``station_id``
            and ``landmark``.

    Returns:
        dict mapping every ``station_id`` value (kept as the string read from
        the file) to its ``landmark`` value. If an ID occurs more than once,
        the last row read wins.

    Raises:
        KeyError: If a row has no ``station_id`` or ``landmark`` column.
    """
    station_map = {}
    for data_file in station_data:
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation asks; without it, line endings embedded in quoted
        # fields are rewritten by the file object before csv sees them.
        with open(data_file, 'r', newline='') as f_in:
            # DictReader treats the first line of the file as the header, so
            # each row is a dict keyed by column name.
            station_reader = csv.DictReader(f_in)
            station_map.update(
                (row['station_id'], row['landmark']) for row in station_reader
            )
    return station_map

In [3]:
def summarise_data(trip_in, station_data, trip_out):
    """Condense raw trip CSV files into one summary CSV file.

    Args:
        trip_in: Iterable of paths to trip CSV files. Each file needs the
            columns ``Duration``, ``Start Date`` (formatted as
            ``%m/%d/%Y %H:%M``), ``Start Terminal``, ``End Terminal``,
            ``Start Station``, ``End Station``, ``Bike #`` and either
            ``Subscription Type`` or ``Subscriber Type``.
        station_data: Iterable of station CSV paths, handed to
            ``create_station_mapping`` to translate terminal IDs into cities.
        trip_out: Path of the summary CSV to write; an existing file is
            overwritten.

    Returns:
        None. One output row is written per input row, with the columns
        listed in ``out_colnames`` below.

    Raises:
        KeyError: If a required column is missing, or a start/end terminal
            is not present in the station mapping.
        ValueError: If ``Duration`` is not numeric or ``Start Date`` does not
            match the expected format.
    """
    # Imported locally so the function keeps working even when the notebook
    # rebinds the global name `datetime` to the module (`import datetime`).
    from datetime import datetime

    # Station ID -> city lookup.
    station_map = create_station_mapping(station_data)

    out_colnames = ['duration', 'start_date', 'start_year',
                    'start_month', 'start_hour', 'weekday',
                    'start_city', 'end_city', 'start_station', 'end_station',
                    'Bike', 'subscription_type']

    # The csv module writes str, so on Python 3 the file must be opened in
    # text mode ('wb' raises TypeError). newline='' stops the file object from
    # translating the '\r\n' line terminator the writer emits.
    with open(trip_out, 'w', newline='') as f_out:
        trip_writer = csv.DictWriter(f_out, fieldnames=out_colnames)
        trip_writer.writeheader()

        for data_file in trip_in:
            with open(data_file, 'r', newline='') as f_in:
                trip_reader = csv.DictReader(f_in)
                # Collect and reshape the fields of every trip row.
                for row in trip_reader:
                    new_point = {}
                    # Duration divided by 60 (presumably seconds -> minutes;
                    # the unit is not stated in this file).
                    new_point['duration'] = float(row['Duration']) / 60

                    trip_date = datetime.strptime(row['Start Date'],
                                                  '%m/%d/%Y %H:%M')
                    new_point['start_date'] = trip_date.strftime('%m-%d-%Y')
                    new_point['start_year'] = trip_date.strftime('%Y')
                    new_point['start_month'] = trip_date.strftime('%m')
                    new_point['start_hour'] = trip_date.strftime('%H')
                    new_point['weekday'] = trip_date.strftime('%A')

                    # Map the start/end terminal IDs to their cities.
                    new_point['start_city'] = station_map[row['Start Terminal']]
                    new_point['end_city'] = station_map[row['End Terminal']]

                    new_point['Bike'] = row['Bike #']
                    new_point['start_station'] = row['Start Station']
                    new_point['end_station'] = row['End Station']

                    # The rider-type column goes by two different names
                    # depending on the input file.
                    if 'Subscription Type' in row:
                        new_point['subscription_type'] = row['Subscription Type']
                    else:
                        new_point['subscription_type'] = row['Subscriber Type']

                    trip_writer.writerow(new_point)
                    

In [4]:
# Run the processing functions defined above on the raw data files.
station_data = ['201408_station_data.csv']
trip_in = ['201408_trip_data.csv']
trip_out = '201408_trip_summary.csv'
summarise_data(trip_in=trip_in, station_data=station_data, trip_out=trip_out)

# Load the summary file that was just written and show its first rows.
sample_data = pd.read_csv(trip_out)
sample_data.head()

Unnamed: 0,duration,start_date,start_year,start_month,start_hour,weekday,start_city,end_city,start_station,end_station,Bike,subscription_type
0,6.766667,08-31-2014,2014,8,22,Sunday,Mountain View,Mountain View,Mountain View Caltrain Station,Castro Street and El Camino Real,17,Subscriber
1,7.8,08-31-2014,2014,8,22,Sunday,San Francisco,San Francisco,Beale at Market,Market at 4th,509,Customer
2,8.9,08-31-2014,2014,8,22,Sunday,San Francisco,San Francisco,Beale at Market,Market at 4th,342,Customer
3,17.35,08-31-2014,2014,8,21,Sunday,San Francisco,San Francisco,Embarcadero at Sansome,Steuart at Market,603,Customer
4,18.183333,08-31-2014,2014,8,21,Sunday,San Francisco,San Francisco,Embarcadero at Sansome,Steuart at Market,598,Customer


In [5]:
# Number of rows (trips) in the summary frame.
sample_data.shape[0]

171792

In [10]:
#sample_data['start_date']

In [7]:
# NOTE(review): this rebinds the global name `datetime` from the class brought
# in by `from datetime import datetime` in the first cell to the module itself.
# Any code that looks up `datetime.strptime` through this global name raises
# AttributeError once this cell has run. Nothing visible in this notebook uses
# the module form — confirm whether this import is needed at all.
import datetime

In [21]:
# Trips per calendar day: the number of non-null 'Bike' entries for each
# start_date. Despite the name, this holds counts, not a maximum.
Bike_max = sample_data['Bike'].groupby(sample_data['start_date']).count()

In [31]:
# Show the day(s) whose trip count is exactly 117.
# NOTE(review): 117 is hard-coded; presumably it was read off an earlier
# inspection of Bike_max — confirm, or derive it from the data instead.
Bike_max.loc[Bike_max == 117]

start_date
03-29-2014    117
Name: Bike, dtype: int64

In [32]:
# Sanity check: the per-day counts add up to the total number of rows in
# sample_data (both display 171792), i.e. no 'Bike' value is missing.
Bike_max.sum()

171792

In [25]:
# Trips that start in San Francisco.
sample_data1 = sample_data[sample_data['start_city'].eq('San Francisco')]

In [28]:
# Of the trips that start in San Francisco, keep those that also end there.
# The mask is built from sample_data1 itself so its index matches the frame
# being filtered exactly; a mask taken from the full sample_data only worked
# through implicit index alignment.
sample_data2 = sample_data1.loc[sample_data1['end_city'] == 'San Francisco']

In [29]:
# Trips per start station among trips that both start and end in San Francisco.
sample_data2['start_station'].groupby(sample_data2['start_station']).count()

start_station
2nd at Folsom                                     4165
2nd at South Park                                 4569
2nd at Townsend                                   6824
5th at Howard                                     3182
Beale at Market                                   4293
Broadway St at Battery St                         2433
Civic Center BART (7th at Market)                 3268
Clay at Battery                                   2535
Commercial at Montgomery                          3039
Davis at Jackson                                  2563
Embarcadero at Bryant                             3831
Embarcadero at Folsom                             3527
Embarcadero at Sansome                            7010
Embarcadero at Vallejo                            2770
Golden Gate at Polk                               1939
Grant Avenue at Columbus Avenue                   3965
Harry Bridges Plaza (Ferry Building)              8336
Howard at 2nd                                     3

In [30]:
# Trips per start station among all trips that start in San Francisco
# (whatever the destination city).
sample_data1['start_station'].groupby(sample_data1['start_station']).count()

start_station
2nd at Folsom                                     4165
2nd at South Park                                 4569
2nd at Townsend                                   6824
5th at Howard                                     3183
Beale at Market                                   4293
Broadway St at Battery St                         2433
Civic Center BART (7th at Market)                 3268
Clay at Battery                                   2535
Commercial at Montgomery                          3040
Davis at Jackson                                  2563
Embarcadero at Bryant                             3831
Embarcadero at Folsom                             3527
Embarcadero at Sansome                            7010
Embarcadero at Vallejo                            2770
Golden Gate at Polk                               1939
Grant Avenue at Columbus Avenue                   3965
Harry Bridges Plaza (Ferry Building)              8336
Howard at 2nd                                     3

In [31]:
# Number of departures per (start_city, start_station) pair.
sample_data3 = (
    sample_data
    .groupby(by=['start_city', 'start_station'])['start_station']
    .count()
)

In [32]:
# Display the departure counts, indexed by (start_city, start_station).
sample_data3

start_city     start_station                                
Mountain View  Castro Street and El Camino Real                   673
               Evelyn Park and Ride                               471
               Mountain View Caltrain Station                    1810
               Mountain View City Hall                            771
               Rengstorff Avenue / California Street              380
               San Antonio Caltrain Station                       694
               San Antonio Shopping Center                        641
Palo Alto      California Ave Caltrain Station                    329
               Cowper at University                               443
               Palo Alto Caltrain Station                         564
               Park at Olive                                      185
               University and Emerson                             473
Redwood City   Broadway at Main                                    22
               Franklin at Ma

In [33]:
# Number of arrivals per (end_city, end_station) pair.
sample_data4 = (
    sample_data
    .groupby(by=['end_city', 'end_station'])['end_station']
    .count()
)

In [37]:
# Arrival counts for stations in San Francisco: select the 'San Francisco'
# entry of the first index level (end_city). `.ix` was deprecated in pandas
# 0.20 and removed in 1.0; `.loc` is the label-based replacement.
sample_data4.loc['San Francisco']

end_station
2nd at Folsom                                     2414
2nd at South Park                                 4159
2nd at Townsend                                   7411
5th at Howard                                     3613
Beale at Market                                   3263
Broadway St at Battery St                         1994
Civic Center BART (7th at Market)                 3438
Clay at Battery                                   2604
Commercial at Montgomery                          3028
Davis at Jackson                                  2825
Embarcadero at Bryant                             3369
Embarcadero at Folsom                             3438
Embarcadero at Sansome                            8141
Embarcadero at Vallejo                            3162
Golden Gate at Polk                               1602
Grant Avenue at Columbus Avenue                   2241
Harry Bridges Plaza (Ferry Building)              7908
Howard at 2nd                                     352