In [1]:
from pathlib import Path
import pandas as pd
import requests

def fetch_raw_data(year: int, month: int, timeout: float = 60.0) -> str:
    """Download one month of yellow trip data and save it under ../data/raw.

    The file is streamed to disk in chunks rather than buffered in memory,
    and is written to a temporary ``.part`` file first so an interrupted
    download never leaves a truncated parquet file at the final path.

    Args:
        year: Four-digit year of the file to download.
        month: Month of the file to download, 1-12.
        timeout: Seconds to wait for the server to connect / send data before
            giving up. Without a timeout a stalled connection would hang the
            kernel indefinitely.

    Returns:
        The path of the saved parquet file, as a string.

    Raises:
        ValueError: If ``month`` is outside 1-12 (checked before any network
            traffic).
        Exception: If the server does not answer with HTTP 200.
    """
    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month}")

    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02}.parquet"
    # Use pathlib to construct the destination path
    path = Path("..") / "data" / "raw" / f"rides_{year}_{month:02}.parquet"

    # stream=True defers the body download so it can be written chunk by chunk;
    # the context manager releases the connection even if an error occurs.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"{url} is not available")

        # Ensure the parent directory exists
        path.parent.mkdir(parents=True, exist_ok=True)

        partial_path = path.with_name(path.name + ".part")
        try:
            with partial_path.open("wb") as sink:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    sink.write(chunk)
            # Rename only after the full body has been written.
            partial_path.replace(path)
        finally:
            # No-op after a successful rename; removes the leftover on failure.
            partial_path.unlink(missing_ok=True)

    print(f"Successfully fetched: {str(path)}")
    return str(path)


In [2]:
# Download the January 2023 file; the returned path is shown as the cell output.
fetch_raw_data(2023, 1)

Successfully fetched: ..\data\raw\rides_2023_01.parquet


'..\\data\\raw\\rides_2023_01.parquet'

In [4]:
import pandas as pd

def show_columns_and_sample(file_path: str, num_records: int = 5):
    """Print the column index and the first rows of a parquet file.

    Args:
        file_path: Location of the parquet file to read.
        num_records: How many leading rows to print (default 5).
    """
    rides = pd.read_parquet(file_path)
    preview = rides.head(num_records)
    # One print call with a newline separator emits each item on its own line.
    print(
        "Columns in the dataset:",
        rides.columns,
        "\nSample records:",
        preview,
        sep="\n",
    )

# Example usage after fetching the file
# NOTE: fetch_raw_data issues the HTTP request on every call, so re-running
# this cell downloads the file again before previewing it.
file_path = fetch_raw_data(2023, 1)
show_columns_and_sample(file_path)


Successfully fetched: ..\data\raw\rides_2023_01.parquet
Columns in the dataset:
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

Sample records:
   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2023-01-01 00:32:10   2023-01-01 00:40:36              1.0   
1         2  2023-01-01 00:55:08   2023-01-01 01:01:27              1.0   
2         2  2023-01-01 00:25:04   2023-01-01 00:37:49              1.0   
3         1  2023-01-01 00:03:48   2023-01-01 00:13:25              0.0   
4         2  2023-01-01 00:10:29   2023-01-01 00:21:19              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLoc