In [9]:
from pathlib import Path
import requests

def download_raw_data(year: int, month: int) -> Path:
    """Download one month of yellow_tripdata and store it under ../data/raw.

    Args:
        year: Year of the file to fetch, e.g. 2024.
        month: Month number; rendered as two digits in the URL and file name.

    Returns:
        Path of the parquet file that was written.

    Raises:
        Exception: If the server does not answer with HTTP status 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout).
    """
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=60)

    if response.status_code != 200:
        raise Exception(f'{URL} is not available')

    # The double dash in the file name is kept as-is: later cells read this exact path.
    path = Path(f'../data/raw/rides_{year}--{month:02d}.parquet')
    # Create the target directory on first use instead of failing with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_bytes opens, writes and closes the file, so no handle is leaked.
    path.write_bytes(response.content)
    return path

In [10]:
# Fetch the January 2024 file; the cell output is the location it was saved to
download_raw_data(year=2024, month=1)

'../data/raw/rides_2024--01.parquet'

In [11]:
# Read the downloaded January 2024 parquet file into a DataFrame and preview it
import pandas as pd

raw_rides_path = '../data/raw/rides_2024--01.parquet'
rides = pd.read_parquet(raw_rides_path)
rides.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [12]:
# Keep only the pickup timestamp and pickup location columns
kept_columns = ['tpep_pickup_datetime', 'PULocationID']
rides = rides.loc[:, kept_columns]

In [13]:
# Print the index range, column dtypes and memory usage of the two-column frame
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 2 columns):
 #   Column                Dtype         
---  ------                -----         
 0   tpep_pickup_datetime  datetime64[us]
 1   PULocationID          int32         
dtypes: datetime64[us](1), int32(1)
memory usage: 33.9 MB


In [15]:
# Give the two columns shorter snake_case names.
# rename() returns a new frame; rebinding the name avoids mutating in place
# a frame that was itself derived from a column selection.
rides = rides.rename(columns={
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id',
})

rides.head(20)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2024-01-01 00:57:55,186
1,2024-01-01 00:03:00,140
2,2024-01-01 00:17:06,236
3,2024-01-01 00:36:38,79
4,2024-01-01 00:46:51,211
5,2024-01-01 00:54:08,148
6,2024-01-01 00:49:44,138
7,2024-01-01 00:30:40,246
8,2024-01-01 00:26:01,161
9,2024-01-01 00:28:08,113


In [16]:
# Summary statistics of the pickup timestamps (min/max reveal out-of-range dates)
rides.pickup_datetime.describe()

count                       2964624
mean     2024-01-17 00:46:36.431092
min             2002-12-31 22:59:39
25%      2024-01-09 15:59:19.750000
50%      2024-01-17 10:45:37.500000
75%      2024-01-24 18:23:52.250000
max             2024-02-01 00:01:15
Name: pickup_datetime, dtype: object

In [17]:
# Keep only pickups inside January 2024, i.e. the half-open range
# [2024-01-01, 2024-02-01), then re-check the timestamp summary
on_or_after_start = rides.pickup_datetime >= '2024-01-01'
before_end = rides.pickup_datetime < '2024-02-01'
rides = rides[on_or_after_start & before_end]
rides['pickup_datetime'].describe()

count                       2964606
mean     2024-01-17 01:02:08.093335
min             2024-01-01 00:00:00
25%             2024-01-09 15:59:24
50%             2024-01-17 10:45:42
75%             2024-01-24 18:23:53
max             2024-01-31 23:59:55
Name: pickup_datetime, dtype: object

In [18]:
# Persist the date-filtered rides as a parquet file in the transformed-data folder
validated_rides_path = '../data/transformed/validated_rides_2024_01.parquet'
rides.to_parquet(validated_rides_path)