## Short, medium, and long taxi rides

In [36]:
import pandas as pd
import numpy as np

In [37]:
df = pd.read_csv('../data/nyc_taxi_2019-07.csv',
                 usecols=['tpep_pickup_datetime',
                          'tpep_dropoff_datetime',
                          'trip_distance',
                          'passenger_count',
                          'total_amount'],
                 parse_dates=['tpep_pickup_datetime',
                              'tpep_dropoff_datetime'])


In [38]:
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3


In [39]:
df.dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
total_amount                    float64
dtype: object

In [40]:
df['trip_time'] = (df['tpep_dropoff_datetime'] -df['tpep_pickup_datetime'])

In [41]:
df.loc[df['trip_time'] < '1 minute','trip_time'].count()

np.int64(70212)

In [42]:
df.loc[df['trip_time'] < '1 minute', 'trip_time'].count() / df['trip_time'].count() * 100

np.float64(1.1126361022936828)

In [43]:
df.loc[df['trip_time'] < '1 minute','total_amount'].mean()

np.float64(30.397584031219733)

In [44]:
df.loc[df['trip_time'] > '10 hours', 'trip_time'].count()

np.int64(16698)

In [45]:
df.loc[df['trip_time'] > '10 hours', 'trip_time'].count() / df['trip_time'].count() * 100

np.float64(0.2646100045020782)

In [46]:
df['trip_time_group'] = pd.cut(df['trip_time'],
                                bins=[pd.to_timedelta(arg) for arg in ['0 seconds','10 minutes','1 hour','100 hours']],
                                labels=['short', 'medium', 'long'])


In [47]:
df.groupby('trip_time_group', observed=False)['passenger_count'].mean()

trip_time_group
short     1.552411
medium    1.585806
long      1.700859
Name: passenger_count, dtype: float64

In [48]:
len(df[(df['tpep_pickup_datetime'] < '2019-07-01') | (df['tpep_pickup_datetime'] > '2019-07-31 23:59' )].index)

387

In [49]:
df['trip_time'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df.groupby('passenger_count')['trip_time'].mean()

passenger_count
0.0   0 days 00:14:18.929810752
1.0   0 days 00:17:46.148103924
2.0   0 days 00:18:34.024342704
3.0   0 days 00:19:02.079604271
4.0   0 days 00:20:10.057290100
5.0   0 days 00:22:29.870464324
6.0   0 days 00:20:54.109564300
7.0   0 days 00:16:38.206896551
8.0      0 days 00:11:00.500000
9.0      0 days 00:49:16.125000
Name: trip_time, dtype: timedelta64[ns]

In [50]:
df2 = pd.read_csv('../data/nyc_taxi_2020-07.csv',
                  usecols=['tpep_pickup_datetime',
                           'tpep_dropoff_datetime',
                           'trip_distance', 
                           'passenger_count', 
                           'total_amount'],
                    parse_dates=['tpep_pickup_datetime', 
                                 'tpep_dropoff_datetime'])

all_df = pd.concat([df,df2])

In [51]:
all_df.groupby([all_df['tpep_pickup_datetime'].dt.year, 'passenger_count'])['total_amount'].mean()

tpep_pickup_datetime  passenger_count
2002                  1.0                18.002500
                      2.0                18.800000
2008                  1.0                18.340000
                      2.0                42.860000
                      5.0                11.966667
2009                  1.0                23.923571
                      2.0                45.316000
2010                  2.0                18.360000
2019                  0.0                18.981793
                      1.0                19.284646
                      2.0                20.097442
                      3.0                20.208111
                      4.0                21.063172
                      5.0                19.419311
                      6.0                19.386516
                      7.0                70.080690
                      8.0                74.760455
                      9.0                93.509375
2020                  0.0                16.