# Analysis of taxi rides in New York

**Target:**

**Analysis answers:**
- Are there areas where the number of trips on weekends is higher than on weekdays?
- Select the areas with the most trips

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../../../data/01_module/02_part/taxi_nyc_simulated.csv', encoding='utf-8', sep=',')

In [4]:
df.head()

Unnamed: 0,pickup_dt,pickup_month,borough,pickups,hday,spd,vsb,temp,dewp,slp,pcp 01,pcp 06,pcp 24,sd
0,2022-01-01 00:00:00,1,Manhattan,178,Y,2.779914,8.300281,31.71988,34.109586,999.107727,0.0,0.0,0.0,0
1,2022-01-01 01:00:00,1,Staten Island,196,N,13.18535,4.046307,74.828167,46.940784,997.301476,0.0,0.3,0.0,0
2,2022-01-01 02:00:00,1,Queens,211,Y,3.258516,4.047667,89.941796,38.567821,1006.584357,0.0,0.0,0.0,0
3,2022-01-01 03:00:00,1,Brooklyn,185,Y,16.259801,5.502311,22.871306,24.78411,1028.56469,0.0,0.0,1.5,0
4,2022-01-01 04:00:00,1,Manhattan,160,Y,15.918842,8.700797,69.39162,63.404507,991.891116,0.0,0.0,0.0,0


In [5]:
print(f'Shape: {df.shape}')

df.info()

Shape: (26058, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26058 entries, 0 to 26057
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pickup_dt     26058 non-null  object 
 1   pickup_month  26058 non-null  int64  
 2   borough       26058 non-null  object 
 3   pickups       26058 non-null  int64  
 4   hday          26058 non-null  object 
 5   spd           26058 non-null  float64
 6   vsb           26058 non-null  float64
 7   temp          26058 non-null  float64
 8   dewp          26058 non-null  float64
 9   slp           26058 non-null  float64
 10  pcp 01        26058 non-null  float64
 11  pcp 06        26058 non-null  float64
 12  pcp 24        26058 non-null  float64
 13  sd            26058 non-null  int64  
dtypes: float64(8), int64(3), object(3)
memory usage: 2.8+ MB


## Data preprocessing

In [6]:
df = df.rename(columns={
    'pcp 01': 'pcp_01',
    'pcp 06': 'pcp_01',
    'pcp 24': 'pcp_24'
})

In [7]:
df.isna().sum()

pickup_dt       0
pickup_month    0
borough         0
pickups         0
hday            0
spd             0
vsb             0
temp            0
dewp            0
slp             0
pcp_01          0
pcp_01          0
pcp_24          0
sd              0
dtype: int64

In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
df.value_counts('borough')

borough
Manhattan        9814
Brooklyn         5816
Queens           4699
Bronx            3111
Staten Island    1559
EWR              1059
Name: count, dtype: int64

In [10]:
all_pickups = df.pickups.sum()
all_pickups

np.int64(4691981)

## Aggregation of pickups by boroughs of New Yors

In [11]:
pickups_by_borough = (
    df.groupby('borough', as_index=False)
    .agg({'pickups': 'sum'})
    .sort_values(by='pickups', ascending=False)
)
pickups_by_borough.head(3)

Unnamed: 0,borough,pickups
3,Manhattan,1767677
1,Brooklyn,1048459
4,Queens,846657


In [12]:
if pickups_by_borough.pickups.sum() == all_pickups:
    print('Ok')
else:
    print('We made a mistake')

Ok


## Getting the borough with the least number of pickups

In [13]:
smallest_pickups_index = pickups_by_borough.pickups.idxmin()

min_pickups = pickups_by_borough.loc[smallest_pickups_index].borough
min_pickups

'EWR'

## Group by borough and select observations where weekend pickups more than weekdays pickups

In [29]:
pickups_by_borough_and_hday = (
    df.groupby(['borough', 'hday'], as_index=False)
    .agg({'pickups': 'mean'})
    .sort_values(by=['borough', 'pickups'], ascending=[False, False])
    .groupby('borough')
    .head(1)
    .query("hday == 'Y'")
    .rename(columns={'pickups': 'mean_pickups'})
)

pickups_by_borough_and_hday

Unnamed: 0,borough,hday,mean_pickups
11,Staten Island,Y,180.583691
7,Manhattan,Y,180.134803
5,EWR,Y,180.403226
3,Brooklyn,Y,180.479394
1,Bronx,Y,179.672316


## Group data by borough and month and count number of pickups each month

In [33]:
pickups_by_mon_bor = (
    df
        .groupby(['borough', 'pickup_month'], as_index=False)
        .agg({'pickups': 'sum'})
        .sort_values(by='pickups', ascending=False)
)
pickups_by_mon_bor

Unnamed: 0,borough,pickup_month,pickups
43,Manhattan,8,155067
40,Manhattan,5,154222
36,Manhattan,1,152924
45,Manhattan,10,151479
41,Manhattan,6,150075
...,...,...,...
35,EWR,12,14436
24,EWR,1,14322
34,EWR,11,14079
27,EWR,4,13688


In [34]:
def temp_to_celcius(fahrengeit):
    return (fahrengeit - 32) * 5 / 9

In [38]:
df = df.rename(columns={'temp': 'temp_fahrengeit'})

In [39]:
df['temp_celsius'] = df['temp_fahrengeit'].apply(temp_to_celcius)

In [41]:
df.head()

Unnamed: 0,pickup_dt,pickup_month,borough,pickups,hday,spd,vsb,temp_fahrengeit,dewp,slp,pcp_01,pcp_01.1,pcp_24,sd,temp_C,temp_celsius
0,2022-01-01 00:00:00,1,Manhattan,178,Y,2.779914,8.300281,31.71988,34.109586,999.107727,0.0,0.0,0.0,0,-0.155622,-0.155622
1,2022-01-01 01:00:00,1,Staten Island,196,N,13.18535,4.046307,74.828167,46.940784,997.301476,0.0,0.3,0.0,0,23.793426,23.793426
2,2022-01-01 02:00:00,1,Queens,211,Y,3.258516,4.047667,89.941796,38.567821,1006.584357,0.0,0.0,0.0,0,32.189886,32.189886
3,2022-01-01 03:00:00,1,Brooklyn,185,Y,16.259801,5.502311,22.871306,24.78411,1028.56469,0.0,0.0,1.5,0,-5.071497,-5.071497
4,2022-01-01 04:00:00,1,Manhattan,160,Y,15.918842,8.700797,69.39162,63.404507,991.891116,0.0,0.0,0.0,0,20.773122,20.773122
