In [138]:
import pandas as pd

### Import the taxi info from both January and July 2019.

In [139]:
files = [ '../pandas-workout-data/data/nyc_taxi_2019-01.csv', '../pandas-workout-data/data/nyc_taxi_2019-07.csv']
columns = ['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 
           'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge']

In [140]:
dfs = [pd.read_csv(filepath_or_buffer=file, usecols=columns, parse_dates=['tpep_pickup_datetime']) for file in files]

In [141]:
df = pd.concat(dfs)
df

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.50,7.00,0.50,0.5,1.65,0.00,0.3,9.95,
1,2019-01-01 00:59:47,1.0,2.60,14.00,0.50,0.5,1.00,0.00,0.3,16.30,
2,2018-12-21 13:48:30,3.0,0.00,4.50,0.50,0.5,0.00,0.00,0.3,5.80,
3,2018-11-28 15:52:25,5.0,0.00,3.50,0.50,0.5,0.00,0.00,0.3,7.55,
4,2018-11-28 15:56:57,5.0,0.00,52.00,0.00,0.5,0.00,0.00,0.3,55.55,
...,...,...,...,...,...,...,...,...,...,...,...
6310414,2019-07-29 16:34:53,,3.86,19.83,2.75,0.0,0.00,6.12,0.3,29.00,0.0
6310415,2019-07-29 16:07:57,,15.48,51.38,2.75,0.0,0.00,0.00,0.3,54.43,0.0
6310416,2019-07-29 16:01:31,,12.92,62.35,2.75,0.0,0.00,0.00,0.3,65.40,0.0
6310417,2019-07-29 16:58:00,,7.12,39.45,2.75,0.5,0.00,0.00,0.3,43.00,0.0


### Create a new column, pre_tip_amount, with all the payment columns except total_amount and tip_amount. (Note that total_amount is the sum of all the other payment columns, including tip_amount. It should be equivalent to calculating total_amount - tip_amount.)

In [142]:
df[['fare_amount','extra','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']].sum(axis=1)

0           8.30
1          15.30
2           5.80
3           4.80
4          52.80
           ...  
6310414    29.00
6310415    54.43
6310416    65.40
6310417    43.00
6310418    22.15
Length: 13978211, dtype: float64

In [143]:
df['pre_tip_amount'] = df[['fare_amount','extra','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']].sum(axis=1)

In [144]:
df.head(5)

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,8.3
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,15.3
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,,5.8
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,,4.8
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,,52.8


In [145]:
# df['pre_tip_amount'] = df['total_amount'] - df['tip_amount']
# df.head(5)

### Create a new column, tip_percentage, showing the percentage of pre_ tip_amount that the tip was.

In [146]:
df['tip_percentage'] = df['tip_amount']/df['pre_tip_amount']
df.head(5)

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,8.3,0.198795
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,15.3,0.065359
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,,5.8,0.0
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,,4.8,0.0
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,,52.8,0.0


### What was the mean tip percentage across all trips in the data set?

In [147]:
df['tip_percentage'].mean()

np.float64(0.13003974566357937)

In [148]:
# How many times did people tip more than 100%?
(df['tip_percentage'] > 1).value_counts()

tip_percentage
False    13970379
True         7832
Name: count, dtype: int64

In [149]:
(df['tip_amount'] == 0).value_counts(normalize=True)

tip_amount
False    0.679015
True     0.320985
Name: proportion, dtype: float64

### How many times did people tip more than the pretip amount?

In [150]:
df.loc[df['tip_amount'] > df['pre_tip_amount']]

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
119,2019-01-01 00:19:39,2.0,0.60,13.00,0.5,0.5,25.0,0.0,0.3,39.30,,14.30,1.748252
299,2019-01-01 00:24:24,2.0,1.13,5.50,0.5,0.5,13.2,0.0,0.3,20.00,,6.80,1.941176
663,2019-01-01 00:32:56,2.0,0.10,-2.50,-0.5,-0.5,0.0,0.0,-0.3,-3.80,,-3.80,-0.000000
947,2019-01-01 00:30:57,2.0,0.00,2.50,0.5,0.5,8.0,0.0,0.3,11.80,,3.80,2.105263
1131,2019-01-01 00:36:12,2.0,1.00,7.00,0.5,0.5,20.0,0.0,0.3,28.30,,8.30,2.409639
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6304692,2019-07-26 06:04:10,,0.01,-23.90,0.0,-0.5,0.0,0.0,-0.3,-24.70,0.0,-24.70,-0.000000
6304694,2019-07-26 06:00:10,,0.14,-23.34,0.0,-0.5,0.0,0.0,-0.3,-24.14,0.0,-24.14,-0.000000
6305581,2019-07-27 20:39:10,,0.05,-24.04,0.0,-0.5,0.0,0.0,-0.3,-24.84,0.0,-24.84,-0.000000
6307658,2019-07-30 07:01:10,,0.00,-16.89,0.0,-0.5,0.0,0.0,-0.3,-17.69,0.0,-17.69,-0.000000


In [151]:
df.loc[df['tip_amount'] > df['pre_tip_amount']].shape[0]

28232

### On which day of the week do people tip the greatest percentage of the fare, on average?


In [152]:
df['tpep_pickup_datetime'].dt.weekday

0          1
1          1
2          4
3          2
4          2
          ..
6310414    0
6310415    0
6310416    0
6310417    0
6310418    0
Name: tpep_pickup_datetime, Length: 13978211, dtype: int32

In [153]:
# We can run the groupby method without creating a new column, just using the the result we get back from dt.day_of_week
df.groupby(df['tpep_pickup_datetime'].dt.weekday)['tip_percentage'].mean()

tpep_pickup_datetime
0    0.128723
1    0.131424
2    0.132221
3    0.133970
4    0.129136
5    0.125801
6    0.126634
Name: tip_percentage, dtype: float64

In [154]:
df.groupby(df['tpep_pickup_datetime'].dt.weekday)['tip_percentage'].mean().sort_values().tail(1)

tpep_pickup_datetime
3    0.13397
Name: tip_percentage, dtype: float64

### At which hour do people tip the greatest percentage?

In [155]:
df['tpep_pickup_datetime'].dt.hour

0           0
1           0
2          13
3          15
4          15
           ..
6310414    16
6310415    16
6310416    16
6310417    16
6310418    16
Name: tpep_pickup_datetime, Length: 13978211, dtype: int32

In [159]:
df.groupby(df['tpep_pickup_datetime'].dt.hour)['tip_percentage'].mean()

tpep_pickup_datetime
0     0.131490
1     0.130710
2     0.130914
3     0.121053
4     0.118987
5     0.112028
6     0.119915
7     0.132134
8     0.137116
9     0.133017
10    0.127200
11    0.125022
12    0.124376
13    0.124567
14    0.123727
15    0.123547
16    0.124655
17    0.128640
18    0.133292
19    0.135174
20    0.138160
21    0.137685
22    0.138816
23    0.134978
Name: tip_percentage, dtype: float64

In [160]:
df.groupby(df['tpep_pickup_datetime'].dt.hour)['tip_percentage'].mean().sort_values(ascending=False)

tpep_pickup_datetime
22    0.138816
20    0.138160
21    0.137685
8     0.137116
19    0.135174
23    0.134978
18    0.133292
9     0.133017
7     0.132134
0     0.131490
2     0.130914
1     0.130710
17    0.128640
10    0.127200
11    0.125022
16    0.124655
13    0.124567
12    0.124376
14    0.123727
15    0.123547
3     0.121053
6     0.119915
4     0.118987
5     0.112028
Name: tip_percentage, dtype: float64

### Do people typically tip more in January or July?

In [163]:
df['tpep_pickup_datetime'].dt.month

0           1
1           1
2          12
3          11
4          11
           ..
6310414     7
6310415     7
6310416     7
6310417     7
6310418     7
Name: tpep_pickup_datetime, Length: 13978211, dtype: int32

In [170]:
df.groupby(df['tpep_pickup_datetime'].dt.month)['tip_percentage'].mean().sort_values(ascending=False) # the dataset is not cleaning at all, some data is scrambled

tpep_pickup_datetime
5     0.200000
8     0.158099
3     0.148046
9     0.141431
1     0.137011
2     0.132224
7     0.121570
12    0.109367
6     0.107354
10    0.100000
4     0.074877
11    0.046026
Name: tip_percentage, dtype: float64

In [176]:
df.groupby(df['tpep_pickup_datetime'].dt.month)['tip_percentage'].mean().iloc[[0, 6]]

tpep_pickup_datetime
1    0.137011
7    0.121570
Name: tip_percentage, dtype: float64

### What was the 1-day period in our data set when people tipped the greatest percentage?

In [177]:
df = df.set_index('tpep_pickup_datetime')

In [179]:
df.resample('D')['tip_percentage'].mean()

tpep_pickup_datetime
2001-02-02    0.0
2001-02-03    NaN
2001-02-04    NaN
2001-02-05    NaN
2001-02-06    NaN
             ... 
2088-01-20    NaN
2088-01-21    NaN
2088-01-22    NaN
2088-01-23    NaN
2088-01-24    0.0
Freq: D, Name: tip_percentage, Length: 31768, dtype: float64

In [181]:
df.resample('1D')['tip_percentage'].mean().sort_values(ascending=False).head(10)

tpep_pickup_datetime
2019-02-13    0.358127
2019-02-25    0.250000
2019-08-20    0.241865
2019-05-20    0.200000
2019-08-15    0.200000
2019-09-22    0.200000
2019-11-27    0.200000
2019-08-10    0.200000
2019-09-24    0.200000
2010-08-08    0.200000
Name: tip_percentage, dtype: float64

In [183]:
df.loc['2019-01']

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01 00:46:40,1.0,1.50,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,8.3,0.198795
2019-01-01 00:59:47,1.0,2.60,14.0,0.5,0.5,1.00,0.0,0.3,16.30,,15.3,0.065359
2019-01-01 00:21:28,1.0,1.30,6.5,0.5,0.5,1.25,0.0,0.3,9.05,,7.8,0.160256
2019-01-01 00:32:01,1.0,3.70,13.5,0.5,0.5,3.70,0.0,0.3,18.50,,14.8,0.250000
2019-01-01 00:57:32,2.0,2.10,10.0,0.5,0.5,1.70,0.0,0.3,13.00,,11.3,0.150442
...,...,...,...,...,...,...,...,...,...,...,...,...
2019-01-31 23:57:36,1.0,4.79,18.0,0.5,0.5,3.86,0.0,0.3,23.16,0.0,19.3,0.200000
2019-01-31 23:32:03,1.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,
2019-01-31 23:36:36,1.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,
2019-01-31 23:14:53,1.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,


In [189]:
df.sort_index().loc['2019-01-01':'2019-01-31'] # first sort or you are going to get a non-monotonic error

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01 00:00:00,2.0,7.37,23.5,0.5,0.5,0.00,0.0,0.3,24.80,,24.8,0.000000
2019-01-01 00:00:01,6.0,1.73,7.0,0.5,0.5,0.00,0.0,0.3,8.30,,8.3,0.000000
2019-01-01 00:00:03,1.0,0.60,5.0,0.5,0.5,0.00,0.0,0.3,6.30,,6.3,0.000000
2019-01-01 00:00:05,1.0,1.53,9.0,0.5,0.5,0.00,0.0,0.3,10.30,,10.3,0.000000
2019-01-01 00:00:06,1.0,3.20,26.0,0.5,0.5,5.45,0.0,0.3,32.75,,27.3,0.199634
...,...,...,...,...,...,...,...,...,...,...,...,...
2019-01-31 23:59:55,2.0,1.41,6.5,0.5,0.5,1.00,0.0,0.3,8.80,0.0,7.8,0.128205
2019-01-31 23:59:55,0.0,2.90,11.5,0.5,0.5,0.00,0.0,0.3,12.80,0.0,12.8,0.000000
2019-01-31 23:59:56,1.0,0.20,3.0,0.5,0.5,1.25,0.0,0.3,5.55,0.0,4.3,0.290698
2019-01-31 23:59:56,1.0,2.59,11.0,0.5,0.5,2.46,0.0,0.3,14.76,0.0,12.3,0.200000


In [190]:
df.loc['07-2019']

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-07-01 17:02:05,1.0,3.24,13.50,0.50,0.5,2.0,0.00,0.3,16.80,,14.80,0.135135
2019-07-01 17:21:00,1.0,0.73,4.50,0.50,0.5,0.0,0.00,0.3,5.80,,5.80,0.000000
2019-07-01 17:37:03,1.0,1.37,6.00,0.50,0.5,1.0,0.00,0.3,8.30,,7.30,0.136986
2019-07-01 17:44:00,1.0,1.45,7.00,0.50,0.5,1.0,0.00,0.3,9.30,,8.30,0.120482
2019-07-01 17:55:52,1.0,0.45,4.00,0.50,0.5,0.0,0.00,0.3,5.30,,5.30,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-29 16:34:53,,3.86,19.83,2.75,0.0,0.0,6.12,0.3,29.00,0.0,29.00,0.000000
2019-07-29 16:07:57,,15.48,51.38,2.75,0.0,0.0,0.00,0.3,54.43,0.0,54.43,0.000000
2019-07-29 16:01:31,,12.92,62.35,2.75,0.0,0.0,0.00,0.3,65.40,0.0,65.40,0.000000
2019-07-29 16:58:00,,7.12,39.45,2.75,0.5,0.0,0.00,0.3,43.00,0.0,43.00,0.000000


In [194]:
df.sort_index().loc['2019-07-01':'2019-07-31']

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-07-01 00:00:00,2.0,3.34,12.5,0.5,0.5,2.70,0.0,0.3,19.00,2.5,16.3,0.165644
2019-07-01 00:00:01,2.0,1.23,6.5,0.5,0.5,2.06,0.0,0.3,12.36,2.5,10.3,0.200000
2019-07-01 00:00:03,1.0,5.46,17.0,0.5,0.5,0.00,0.0,0.3,18.30,0.0,18.3,0.000000
2019-07-01 00:00:04,2.0,1.10,8.0,3.0,0.5,0.00,0.0,0.3,11.80,2.5,14.3,0.000000
2019-07-01 00:00:05,1.0,2.05,13.5,0.5,0.5,3.46,0.0,0.3,20.76,2.5,17.3,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-31 23:59:51,1.0,2.46,11.0,0.5,0.5,2.96,0.0,0.3,17.76,2.5,14.8,0.200000
2019-07-31 23:59:52,2.0,6.20,20.5,0.5,0.5,6.50,0.0,0.3,28.30,0.0,21.8,0.298165
2019-07-31 23:59:55,1.0,1.26,6.0,0.5,0.5,1.96,0.0,0.3,11.76,2.5,9.8,0.200000
2019-07-31 23:59:56,1.0,1.20,6.0,3.0,0.5,2.45,0.0,0.3,12.25,2.5,12.3,0.199187


In [198]:
df = pd.concat([df.loc['2019-01'], df.loc['2019-07']])
df

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01 00:46:40,1.0,1.50,7.00,0.50,0.5,1.65,0.00,0.3,9.95,,8.30,0.198795
2019-01-01 00:59:47,1.0,2.60,14.00,0.50,0.5,1.00,0.00,0.3,16.30,,15.30,0.065359
2019-01-01 00:21:28,1.0,1.30,6.50,0.50,0.5,1.25,0.00,0.3,9.05,,7.80,0.160256
2019-01-01 00:32:01,1.0,3.70,13.50,0.50,0.5,3.70,0.00,0.3,18.50,,14.80,0.250000
2019-01-01 00:57:32,2.0,2.10,10.00,0.50,0.5,1.70,0.00,0.3,13.00,,11.30,0.150442
...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-29 16:34:53,,3.86,19.83,2.75,0.0,0.00,6.12,0.3,29.00,0.0,29.00,0.000000
2019-07-29 16:07:57,,15.48,51.38,2.75,0.0,0.00,0.00,0.3,54.43,0.0,54.43,0.000000
2019-07-29 16:01:31,,12.92,62.35,2.75,0.0,0.00,0.00,0.3,65.40,0.0,65.40,0.000000
2019-07-29 16:58:00,,7.12,39.45,2.75,0.5,0.00,0.00,0.3,43.00,0.0,43.00,0.000000


In [202]:
df.resample('1D')['tip_percentage'].mean().sort_values(ascending=False).head(10)

tpep_pickup_datetime
2019-01-31    0.144351
2019-01-30    0.143530
2019-01-24    0.143434
2019-01-22    0.142769
2019-01-15    0.142329
2019-01-29    0.141330
2019-01-10    0.141291
2019-01-16    0.141147
2019-01-17    0.140356
2019-01-23    0.140309
Name: tip_percentage, dtype: float64

In [207]:
df.resample('1D')['tip_percentage'].mean().dropna() # using resample means we get NaN values for February, March, April, May, and June. We thus remove those with dropna. 

tpep_pickup_datetime
2019-01-01    0.120109
2019-01-02    0.123951
2019-01-03    0.125335
2019-01-04    0.129713
2019-01-05    0.130852
                ...   
2019-07-27    0.114696
2019-07-28    0.116914
2019-07-29    0.121732
2019-07-30    0.125061
2019-07-31    0.127564
Name: tip_percentage, Length: 62, dtype: float64

## Beyond the exercise

### You saw that 32% of riders don’t tip at all. Of those who do, what percentage do they tip, on average?

In [244]:
(df['tip_amount'] == 0).value_counts(normalize=True)

tip_amount
False    0.679025
True     0.320975
Name: proportion, dtype: float64

In [245]:
df.loc[df['tip_amount'] != 0, 'tip_percentage'].mean()

np.float64(0.19147066349422356)

### How many of the rides in the data set, supposedly from January and July 2019, are from outside of those dates?

In [246]:
dfs = [pd.read_csv(filepath_or_buffer=file, usecols=columns, parse_dates=['tpep_pickup_datetime']) for file in files]
df = pd.concat(dfs)
df.head(5)

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [247]:
df['pre_tip_amount'] = df[['fare_amount','extra','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']].sum(axis=1)
df['tip_percentage'] = df['tip_amount']/df['pre_tip_amount']

In [248]:
((df['tpep_pickup_datetime'].dt.month != 1) & (df['tpep_pickup_datetime'].dt.month != 7)).sum()

np.int64(728)

In [249]:
df.groupby(df['tpep_pickup_datetime'].dt.month)['tpep_pickup_datetime'].count().sort_values().iloc[0:10].sum()

np.int64(728)

### Looking only at dates in January and July, in what week did ``passengers`` tip the greatest percentage?

In [250]:
df = df.set_index('tpep_pickup_datetime')
df.index

DatetimeIndex(['2019-01-01 00:46:40', '2019-01-01 00:59:47',
               '2018-12-21 13:48:30', '2018-11-28 15:52:25',
               '2018-11-28 15:56:57', '2018-11-28 16:25:49',
               '2018-11-28 16:29:37', '2019-01-01 00:21:28',
               '2019-01-01 00:32:01', '2019-01-01 00:57:32',
               ...
               '2019-07-29 16:51:18', '2019-07-29 16:32:00',
               '2019-07-29 16:02:00', '2019-07-29 16:29:00',
               '2019-07-29 16:59:39', '2019-07-29 16:34:53',
               '2019-07-29 16:07:57', '2019-07-29 16:01:31',
               '2019-07-29 16:58:00', '2019-07-29 16:16:00'],
              dtype='datetime64[ns]', name='tpep_pickup_datetime', length=13978211, freq=None)

In [251]:
df = pd.concat([df.loc['2019-01'], df.loc['2019-07']])
df.index.month.unique()

Index([1, 7], dtype='int32', name='tpep_pickup_datetime')

In [252]:
df.resample('ME')['tip_percentage'].mean()

tpep_pickup_datetime
2019-01-31    0.137012
2019-02-28         NaN
2019-03-31         NaN
2019-04-30         NaN
2019-05-31         NaN
2019-06-30         NaN
2019-07-31    0.121570
Freq: ME, Name: tip_percentage, dtype: float64

In [253]:
df.resample('ME')['tip_percentage'].mean().dropna()

tpep_pickup_datetime
2019-01-31    0.137012
2019-07-31    0.121570
Name: tip_percentage, dtype: float64

In [256]:
df.resample('1W')['tip_percentage'].mean().dropna().sort_values(ascending=False)

tpep_pickup_datetime
2019-02-03    0.141979
2019-01-27    0.138930
2019-01-20    0.138536
2019-01-13    0.137901
2019-01-06    0.126983
2019-08-04    0.124910
2019-07-14    0.123459
2019-07-21    0.123341
2019-07-28    0.123036
2019-07-07    0.112952
Name: tip_percentage, dtype: float64