In [1]:
import pandas as pd
import numpy as np

In [2]:
path = '../../pandas-workout-data/data/nyc_taxi_2019-01.csv'
columns = ['passenger_count', 'trip_distance', 'total_amount']

Load taxi data from January 2019 into a data frame using only the columns passenger_count, trip_distance, and total_amount.

In [3]:
df = pd.read_csv(filepath_or_buffer=path, usecols=columns)
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
2,3,0.00,5.80
3,5,0.00,7.55
4,5,0.00,55.55
...,...,...,...
7667787,1,4.79,23.16
7667788,1,0.00,0.00
7667789,1,0.00,0.00
7667790,1,0.00,0.00


For each number of passengers, find the mean cost of a taxi ride. Sort this result from lowest (i.e., cheapest) to highest (i.e., most expensive).

In [4]:
df.groupby('passenger_count', sort=False)['total_amount'].mean().sort_values(ascending=True) # From the lowest to the highest

passenger_count
6    15.437892
5    15.546940
3    15.604015
1    15.609601
4    15.650307
2    15.831294
0    18.663658
9    31.094444
7    48.278421
8    64.105517
Name: total_amount, dtype: float64

Sort the results again by increasing the number of passengers

In [5]:
df.groupby('passenger_count', sort=True)['total_amount'].mean()

passenger_count
0    18.663658
1    15.609601
2    15.831294
3    15.604015
4    15.650307
5    15.546940
6    15.437892
7    48.278421
8    64.105517
9    31.094444
Name: total_amount, dtype: float64

Create a new column, trip_distance_group, in which the values are short (< 2 miles), medium (≥ 2 miles and ≤ 10 miles), and long (> 10 miles). What is the average number of passengers per trip length category? Sort this result from highest (most passengers) to lowest (fewest passengers).

In [6]:
df.head(5)

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


In [7]:
df['total_amount'].isnull().sum()

np.int64(0)

In [8]:
# The only reason I’d avoid it here is performance: with ~7.6M rows, apply loops in Python and is much slower than a vectorized approach.
# def set_distance_value(distance): 
#     if distance > 10:
#         return 'long'
#     elif distance >= 2:
#         return 'medium'
#     else:
#         return 'short'

In [9]:
# The only reason I’d avoid it here is performance: with ~7.6M rows, apply loops in Python and is much slower than a vectorized approach.
# df['trip_distance_group'] = df['trip_distance'].apply(set_distance_value) 


In [18]:
# Usin cut method
# df['trip_distance_group'] = pd.cut(df['trip_distance'], 
#                                    [df['trip_distance'].min(), 2, 10, 
#                                     df['trip_distance'].max()],
#                                   labels=['short', 'medium', 'long'])
# df.groupby('trip_distance_group')['passenger_count'].mean().sort_values(ascending=False)

In [11]:
# bins = [df['trip_distance'].min(), 2, 10, df['trip_distance'].max()]
# labels = ['short', 'medium', 'long']

In [19]:
# Using cut method
# df['trip_distance_group'] = pd.cut(df['trip_distance'], bins, right=False) [0,2) [2,10) [10, 831.8)

In [13]:
# df.groupby('trip_distance_group').count()

In [14]:
condition_list = [df['trip_distance'] > 10, df['trip_distance'] >= 2] # order matters
choice_list = ['long', 'medium']

In [15]:
df['trip_distance_group'] = np.select(condition_list, choice_list, 'short')
df.head(5)

Unnamed: 0,passenger_count,trip_distance,total_amount,trip_distance_group
0,1,1.5,9.95,short
1,1,2.6,16.3,medium
2,3,0.0,5.8,short
3,5,0.0,7.55,short
4,5,0.0,55.55,short


In [16]:
df.groupby('trip_distance_group').count()

Unnamed: 0_level_0,passenger_count,trip_distance,total_amount
trip_distance_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,423719,423719,423719
medium,2494589,2494589,2494589
short,4749484,4749484,4749484


In [17]:
df.groupby('trip_distance_group')['passenger_count'].mean().sort_values(ascending=False)

trip_distance_group
long      1.590035
medium    1.576764
short     1.559943
Name: passenger_count, dtype: float64

# Beyond the exercise

### Create a single data frame containing rides from both January 2019 and January 2020, with a column year indicating which year the ride comes from. Use groupby to compare the average cost of a taxi in January from each of these two years.

In [22]:
path_january19 = '../../pandas-workout-data/data/nyc_taxi_2019-01.csv'
path_january20 = '../../pandas-workout-data/data/nyc_taxi_2020-01.csv'
columns = ['passenger_count','trip_distance', 'total_amount']

In [24]:
df19 = pd.read_csv(filepath_or_buffer=path_january19, usecols=columns)
df19.head(2)

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3


In [27]:
df19['year'] = 2019
df19.head(2)

Unnamed: 0,passenger_count,trip_distance,total_amount,year
0,1,1.5,9.95,2019
1,1,2.6,16.3,2019


In [25]:
df20 = pd.read_csv(filepath_or_buffer=path_january20, usecols=columns)
df20.head(2)

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1.0,1.2,11.27
1,1.0,1.2,12.3


In [28]:
df20['year'] = 2020
df20.head(2)

Unnamed: 0,passenger_count,trip_distance,total_amount,year
0,1.0,1.2,11.27,2020
1,1.0,1.2,12.3,2020


In [29]:
len(df19) + len(df20)

14072800

In [30]:
df = pd.concat([df19, df20])

In [31]:
len(df)

14072800

In [32]:
df.head(2)

Unnamed: 0,passenger_count,trip_distance,total_amount,year
0,1.0,1.5,9.95,2019
1,1.0,2.6,16.3,2019


In [33]:
df.tail(2)

Unnamed: 0,passenger_count,trip_distance,total_amount,year
6405006,,5.49,30.22,2020
6405007,,11.6,58.11,2020


In [35]:
group_by_year = df.groupby('year')

In [37]:
group_by_year['total_amount'].mean()

year
2019    15.682222
2020    18.663149
Name: total_amount, dtype: float64

### Create a two-level grouping, first by year and then by passenger_count.

In [None]:
# The result is a series with a multi-index

df.groupby(['year', 'passenger_count'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F2B8C1D370>

In [39]:
df.groupby(['year', 'passenger_count']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,total_amount
year,passenger_count,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,0.0,2.651561,18.663658
2019,1.0,2.779088,15.609601
2019,2.0,2.880572,15.831294
2019,3.0,2.840698,15.604015
2019,4.0,2.853084,15.650307
2019,5.0,2.865741,15.54694
2019,6.0,2.842335,15.437892
2019,7.0,2.561579,48.278421
2019,8.0,3.142759,64.105517
2019,9.0,1.486667,31.094444


In [40]:
df.groupby(['year', 'passenger_count'])['total_amount'].mean()

year  passenger_count
2019  0.0                18.663658
      1.0                15.609601
      2.0                15.831294
      3.0                15.604015
      4.0                15.650307
      5.0                15.546940
      6.0                15.437892
      7.0                48.278421
      8.0                64.105517
      9.0                31.094444
2020  0.0                18.059724
      1.0                18.343110
      2.0                19.050504
      3.0                18.736862
      4.0                19.128092
      5.0                18.234443
      6.0                18.367962
      7.0                71.143103
      8.0                58.197059
      9.0                81.244211
Name: total_amount, dtype: float64

### The **corr** method allows us to see how strongly two columns correlate with one another. Use corr and then sort_values to find which columns have the highest correlation.

In [41]:
# This exercise is totally copied from the author's github

In [42]:
# df.corr() returns a square, so we can look at just the first column
# and get everything we need

# When we sort, we see very clearly that there's basically no correlation
# between passenger_count and any other column. (Except for itself, of 
# course...)  So there really doesn't seem to be any financial advantage
# for a driver to take a larger group.
df.corr().sort_values('passenger_count')

Unnamed: 0,passenger_count,trip_distance,total_amount,year
year,-0.021602,0.00114,0.007657,1.0
total_amount,-0.000136,0.004331,1.0,0.007657
trip_distance,0.008974,1.0,0.004331,0.00114
passenger_count,1.0,0.008974,-0.000136,-0.021602
