In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Import Data

In [2]:
df_fhvhv = pd.read_parquet('../fhvhv_tripdata_2019-02.parquet', engine='pyarrow')
df_fhvhv.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02867,B02867,2019-02-01 00:01:26,2019-02-01 00:02:55,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,2.45,...,0.83,0.0,,0.0,7.48,Y,N,N,N,
1,HV0003,B02879,B02879,2019-02-01 00:26:08,2019-02-01 00:41:29,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,1.71,...,0.7,0.0,,2.0,7.93,N,N,N,N,
2,HV0005,B02510,,2019-02-01 00:48:58,NaT,2019-02-01 00:51:34,2019-02-01 01:28:29,261,234,5.01,...,3.99,0.0,,0.0,35.97,N,Y,N,N,
3,HV0005,B02510,,2019-02-01 00:02:15,NaT,2019-02-01 00:03:51,2019-02-01 00:07:16,87,87,0.34,...,0.64,0.0,,3.0,5.39,N,Y,N,N,
4,HV0005,B02510,,2019-02-01 00:06:17,NaT,2019-02-01 00:09:44,2019-02-01 00:39:56,87,198,6.84,...,2.16,0.0,,4.0,17.07,N,Y,N,N,


In [3]:
df_fhvhv.shape

(20159102, 24)

共有24個columns，分兩部分查看column info

In [4]:
df_fhvhv.iloc[:, :11].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20159102 entries, 0 to 20159101
Data columns (total 11 columns):
 #   Column                Non-Null Count     Dtype         
---  ------                --------------     -----         
 0   hvfhs_license_num     20159102 non-null  object        
 1   dispatching_base_num  20158697 non-null  object        
 2   originating_base_num  14483914 non-null  object        
 3   request_datetime      20050204 non-null  datetime64[ns]
 4   on_scene_datetime     13505053 non-null  datetime64[ns]
 5   pickup_datetime       20159102 non-null  datetime64[ns]
 6   dropoff_datetime      20159102 non-null  datetime64[ns]
 7   PULocationID          20159102 non-null  int64         
 8   DOLocationID          20159102 non-null  int64         
 9   trip_miles            20159102 non-null  float64       
 10  trip_time             20159102 non-null  int64         
dtypes: datetime64[ns](4), float64(1), int64(3), object(3)
memory usage: 1.7+ GB


In [5]:
df_fhvhv.iloc[:, 12:].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20159102 entries, 0 to 20159101
Data columns (total 12 columns):
 #   Column                Non-Null Count     Dtype  
---  ------                --------------     -----  
 0   tolls                 20159102 non-null  float64
 1   bcf                   20159102 non-null  float64
 2   sales_tax             20159102 non-null  float64
 3   congestion_surcharge  19646061 non-null  float64
 4   airport_fee           0 non-null         object 
 5   tips                  20159102 non-null  float64
 6   driver_pay            20159102 non-null  float64
 7   shared_request_flag   20159102 non-null  object 
 8   shared_match_flag     20159102 non-null  object 
 9   access_a_ride_flag    20159102 non-null  object 
 10  wav_request_flag      20159102 non-null  object 
 11  wav_match_flag        0 non-null         object 
dtypes: float64(6), object(6)
memory usage: 1.8+ GB


原始資料有兩千多萬筆，避免處理時間過長，僅擷取2/1資料

In [6]:
df_0201 = df_fhvhv[df_fhvhv['pickup_datetime'].dt.day == 1]
df_0201.shape

(860161, 24)

## 計算總車資

不包含driver_pay，driver_pay為該趟行程Uber支付給司機的費用

In [7]:
df_0201.total_amount = df_0201[['base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 'airport_fee', 'tips']].sum(axis=1)
df_0201.total_amount

  df_0201.total_amount = df_0201[['base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 'airport_fee', 'tips']].sum(axis=1)


0         10.41
1         10.81
2         50.07
3         11.01
4         31.13
          ...  
860156    47.22
860157    19.36
860158    11.35
860159    44.16
860160    13.71
Length: 860161, dtype: float64

稅的計算是根據哪些費用?

In [8]:
df_0201.cal_tax_perc = df_0201.sales_tax / (df_0201.total_amount - df_0201.sales_tax - df_0201.tips)
df_0201.cal_tax_perc

  df_0201.cal_tax_perc = df_0201.sales_tax / (df_0201.total_amount - df_0201.sales_tax - df_0201.tips)


0         0.086639
1         0.086313
2         0.086589
3         0.086839
4         0.086504
            ...   
860156    0.086516
860157    0.086420
860158    0.086124
860159    0.081823
860160    0.086371
Length: 860161, dtype: float64

## EDA

查看每種license_num各有多少record

In [9]:
print(df_0201.hvfhs_license_num.value_counts())
print(df_0201.hvfhs_license_num.value_counts(normalize=True))

HV0003    571580
HV0005    197357
HV0002     47785
HV0004     43439
Name: hvfhs_license_num, dtype: int64
HV0003    0.664504
HV0005    0.229442
HV0002    0.055554
HV0004    0.050501
Name: hvfhs_license_num, dtype: float64


由於Juno(HV0002)和Via(HV0004)資料量共占10%左右，與Uber(HV0003)、Lyft(HV0005)有較大差距，僅繼續分析**Uber**和**Lyft**的資料  
將Uber和Lyft的資料分開

In [10]:
df_uber = df_0201[df_0201['hvfhs_license_num'] == 'HV0003']
df_lyft = df_0201[df_0201['hvfhs_license_num'] == 'HV0005']

FHV，看星期幾的搭乘次數、小費最多  
行程距離與小費畫成散點圖