In [1]:
import pandas as pd
import numpy as np

In [2]:
path = '../../pandas-workout-data/data/nyc_taxi_2019-01.csv'
columns = ['passenger_count', 'trip_distance', 'total_amount']

In [3]:
df = pd.read_csv(filepath_or_buffer=path, usecols=columns)
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
2,3,0.00,5.80
3,5,0.00,7.55
4,5,0.00,55.55
...,...,...,...
7667787,1,4.79,23.16
7667788,1,0.00,0.00
7667789,1,0.00,0.00
7667790,1,0.00,0.00


Using a descending sort, find the average cost of the 20 longest (in distance) taxi rides in January 2019.

In [4]:
df['trip_distance'].sort_values(ascending=False).head(20)

6074021    831.80
4286612    700.70
6770897    214.01
4707513    211.36
4881766    201.27
4813319    160.52
2567394    144.20
4876401    143.63
1144878    142.88
4911293    132.80
5040285    131.47
11081      128.73
6079036    128.26
4999998    123.42
6119013    121.80
56477      119.80
6405247    115.85
1128002    113.71
6554697    113.58
7392997    112.82
Name: trip_distance, dtype: float64

To **sort** our ``data frame`` by the trip_distance column, we can say

In [5]:
df.sort_values(by='trip_distance', ascending=False) # This returns a new data frame identical to df, but with the rows sorted according to trip_distance in ascending order

Unnamed: 0,passenger_count,trip_distance,total_amount
6074021,1,831.80,11.76
4286612,1,700.70,9.00
6770897,5,214.01,761.80
4707513,2,211.36,56.56
4881766,1,201.27,152.46
...,...,...,...
4570324,2,0.00,12.80
4570397,2,0.00,63.35
5117570,1,0.00,3.30
6960464,1,0.00,3.34


Our analysis is of the total_amount column. With the data already sorted by trip_ distance, we can now retrieve just that one column using square brackets

In [6]:
df.sort_values(by='trip_distance',ascending=False)['total_amount'].iloc[:20]


6074021     11.76
4286612      9.00
6770897    761.80
4707513     56.56
4881766    152.46
4813319    143.56
2567394     18.80
4876401    456.56
1144878    327.38
4911293    238.70
5040285    614.30
11081      453.44
6079036    220.80
4999998    514.32
6119013    322.32
56477      330.30
6405247    230.30
1128002      0.00
6554697    518.32
7392997    419.52
Name: total_amount, dtype: float64

In [None]:
df.sort_values(by='trip_distance',ascending=False)['total_amount'].loc[:20] # In this case loc gives me more rows why?

6074021     11.76
4286612      9.00
6770897    761.80
4707513     56.56
4881766    152.46
            ...  
3195750      4.80
699753       3.30
3462122     78.96
3463637      4.30
20           3.80
Name: total_amount, Length: 7600174, dtype: float64

Notice that we have to use **iloc** here, **not loc**. ``That’s because loc works with the actual index values—which``, now that we’ve sorted the data frame by ``trip_distance``, **are unordered.** ``Asking for loc[:20] will return many more than 20 rows``

Having retrieved total_amount from the 20 longest-distance taxi rides, we can finally calculate the mean value:

In [8]:
df.sort_values(
    by='trip_distance', ascending=False
               )['total_amount'].iloc[0:20].mean()

np.float64(290.00999999999993)

Using an ascending sort, find the average cost of the 20 longest (in distance) taxi rides in January 2019. Are the results any different?

In [9]:
df.sort_values(
    by='trip_distance', ascending=True
               )['total_amount'].iloc[-20:].mean()

np.float64(290.01000000000005)

But let’s ignore the rounded results and look at the original results, 290.00999999999993 and 290.0100000000001. The differences are slight, but they’re real. 
Is there anything we can do to avoid such problems?

The answer is: sort of. ``If we use longer (i.e., more bits) floats, such problems will crop up less often.`` For example, we can instruct pandas to read the total_amount column into 128-bit floats, ``rather than 64-bit floats, which are the default``:

In [10]:
df['total_amount'].dtype

dtype('float64')

In [11]:
df = pd.read_csv(
    filepath_or_buffer=path,
    usecols=columns,
    dtype={
        'total_amount':np.longdouble
    }
)
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
2,3,0.00,5.80
3,5,0.00,7.55
4,5,0.00,55.55
...,...,...,...
7667787,1,4.79,23.16
7667788,1,0.00,0.00
7667789,1,0.00,0.00
7667790,1,0.00,0.00


In [12]:
df.dtypes

passenger_count      int64
trip_distance      float64
total_amount       float64
dtype: object

In [13]:
df['total_amount'].dtype

dtype('float64')

In [14]:
df.sort_values(
    by='trip_distance',
    ascending=False
    )['total_amount'].iloc[:20].mean()

np.float64(290.00999999999993)

In [15]:
df.sort_values(
    by='trip_distance',
    ascending=True
    )['total_amount'].iloc[-20:].mean()

np.float64(290.01000000000005)

In [16]:
# In this case is not working the 'total_amount':np.longdouble

Sort by ``ascending passenger count and descending trip distance.`` (So, start with the longest trip with 0 passengers and end with the shortest trip with 9 passengers.) What is the average price paid for the top 50 rides?


We should sort the rows of ``df`` in **ascending** order from the smallest number of passengers to the greatest number o passengers. ``And in resolving ties between rows with the same passenger count`` We can use the **trip_distance** column. However, whereas **passenger_count** is sorted in ascending order, I asked you to sort **trip_distance** in descending order 

In [17]:
df.sort_values(
    by=['passenger_count', 'trip_distance'],
    ascending=[True, False]           
               )

Unnamed: 0,passenger_count,trip_distance,total_amount
3528550,0,62.4,320.30
3296320,0,55.0,336.35
1878399,0,50.7,227.56
5549709,0,48.9,260.30
4176048,0,46.2,200.06
...,...,...,...
2883943,9,0.0,12.25
4534691,9,0.0,110.76
4852210,9,0.0,12.74
4997772,9,0.0,9.80


The first row of the returned data frame has the longest trip for the fewest passengers, and its final row has the shortest trip for the most passengers.

In [18]:
# Test of mine
df.sort_values(
    by=['passenger_count', 'trip_distance'],
    ascending=[True, True]           
               )

Unnamed: 0,passenger_count,trip_distance,total_amount
24423,0,0.00,3.80
30590,0,0.00,4.80
40644,0,0.00,24.35
41039,0,0.00,25.00
41984,0,0.00,54.36
...,...,...,...
4534691,9,0.00,110.76
4852210,9,0.00,12.74
4997772,9,0.00,9.80
7286548,9,0.00,10.30


In [19]:
(
    df
    .sort_values(['passenger_count',
                  'trip_distance'],
                  ascending=[True, False])
   ['total_amount']
   .iloc[:50]
   .mean()
)

np.float64(135.49739999999997)

# Beyond the exercise

### In which five rides did people pay the most per mile?


In [20]:
df = pd.read_csv(filepath_or_buffer=path, usecols=columns)
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
2,3,0.00,5.80
3,5,0.00,7.55
4,5,0.00,55.55
...,...,...,...
7667787,1,4.79,23.16
7667788,1,0.00,0.00
7667789,1,0.00,0.00
7667790,1,0.00,0.00


In [21]:
# First, remove 0-length trips
mask = df['trip_distance'] != 0

In [22]:
(df['trip_distance'] == 0).sum()


np.int64(54770)

In [23]:
df = df.loc[mask].copy()
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
7,1,1.30,9.05
8,1,3.70,18.50
9,2,2.10,13.00
...,...,...,...
7667783,2,4.15,15.80
7667784,1,1.34,9.30
7667785,1,1.45,14.16
7667786,2,4.28,21.96


In [24]:
(df['trip_distance'] == 0).sum()

np.int64(0)

In [25]:
df['cost_per_mile'] = df['total_amount'].divide(df['trip_distance'])

In [26]:
df.sort_values(by='cost_per_mile', ascending=False).iloc[slice(0,5)]

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile
2499600,1,2.4,623261.66,259692.358333
478791,1,0.1,6667.45,66674.5
7099014,4,0.01,415.3,41530.0
6403254,1,0.01,322.3,32230.0
4136499,1,0.01,273.96,27396.0


In [27]:
df.sort_values(by='cost_per_mile', ascending=True).iloc[slice(-5,None)]

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile
4136499,1,0.01,273.96,27396.0
6403254,1,0.01,322.3,32230.0
7099014,4,0.01,415.3,41530.0
478791,1,0.1,6667.45,66674.5
2499600,1,2.4,623261.66,259692.358333


### Let's assume that multi-passenger rides are split evenly among the passengers. Given that assumption, in which 10 rides did each individual pay the greatest amount? And again, how far did they travel?

In [28]:
mask = df['passenger_count'] >= 2

In [29]:
df = df.loc[mask].copy()
df

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile
9,2,2.10,13.00,6.190476
10,2,2.80,19.55,6.982143
27,3,0.60,8.15,13.583333
29,2,3.00,15.95,5.316667
33,2,19.13,70.27,3.673288
...,...,...,...,...
7667780,3,1.14,9.96,8.736842
7667781,3,1.89,10.80,5.714286
7667782,3,8.14,34.80,4.275184
7667783,2,4.15,15.80,3.807229


In [30]:
df['payment_per_person'] = df['total_amount'] / df['passenger_count']
df

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile,payment_per_person
9,2,2.10,13.00,6.190476,6.500000
10,2,2.80,19.55,6.982143,9.775000
27,3,0.60,8.15,13.583333,2.716667
29,2,3.00,15.95,5.316667,7.975000
33,2,19.13,70.27,3.673288,35.135000
...,...,...,...,...,...
7667780,3,1.14,9.96,8.736842,3.320000
7667781,3,1.89,10.80,5.714286,3.600000
7667782,3,8.14,34.80,4.275184,11.600000
7667783,2,4.15,15.80,3.807229,7.900000


In [31]:
df.sort_values('payment_per_person').tail(10)

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile,payment_per_person
5031491,2,64.3,343.32,5.339347,171.66
4563340,2,0.4,350.3,875.75,175.15
4202883,2,60.23,369.06,6.127511,184.53
4751745,2,100.78,403.5,4.003771,201.75
5726185,2,65.05,416.82,6.407686,208.41
149362,2,17.2,426.8,24.813953,213.4
7593395,2,83.61,449.32,5.373998,224.66
3842620,2,110.04,515.82,4.687568,257.91
3014027,2,16.6,560.76,33.780723,280.38
2972145,2,19.9,589.96,29.646231,294.98


### In the exercise solution, I showed that we needed to use iloc or head/tail to retrieve the first/last 20 rows because the index was scrambled after our sort operation. But you can pass ignore_index=True to sort_values: then the resulting data frame has a numeric index starting at 0. Use this option and loc to get the mean total_amount for the 20 longest trips.

In [32]:
df.head(5)

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile,payment_per_person
9,2,2.1,13.0,6.190476,6.5
10,2,2.8,19.55,6.982143,9.775
27,3,0.6,8.15,13.583333,2.716667
29,2,3.0,15.95,5.316667,7.975
33,2,19.13,70.27,3.673288,35.135


In [None]:
df.sort_values(by='trip_distance', ignore_index=True, ascending=False)

Unnamed: 0,passenger_count,trip_distance,total_amount,cost_per_mile,payment_per_person
0,5,214.01,761.80,3.559647,152.360000
1,2,211.36,56.56,0.267600,28.280000
2,4,160.52,143.56,0.894343,35.890000
3,2,144.20,18.80,0.130374,9.400000
4,3,142.88,327.38,2.291293,109.126667
...,...,...,...,...,...
2083560,2,0.01,4.30,430.000000,2.150000
2083561,2,0.01,3.30,330.000000,1.650000
2083562,6,0.01,3.30,330.000000,0.550000
2083563,3,0.01,3.30,330.000000,1.100000


In [36]:
df.sort_values(by='trip_distance', ignore_index=True, ascending=False).loc[slice(0,20), ['trip_distance', 'total_amount']]

Unnamed: 0,trip_distance,total_amount
0,214.01,761.8
1,211.36,56.56
2,160.52,143.56
3,144.2,18.8
4,142.88,327.38
5,121.8,322.32
6,115.85,230.3
7,110.04,515.82
8,102.2,11.3
9,100.78,403.5


In [44]:
df.sort_values(by='trip_distance', ignore_index=True, ascending=False).loc[slice(0,20), 'total_amount'].mean().round(4)
# df.sort_values('trip_distance',
#                 ascending=False,
#               ignore_index=True)['total_amount'].loc[:20].mean() # work too

np.float64(253.659)