In [19]:
from pathlib import Path
import pandas as pd
import requests
import zipfile
import io

def fetch_raw_data(year: int, month: int, timeout: float = 60.0) -> Path:
    """Download one month of Citi Bike trip data and save it as a single CSV.

    Two archive names are tried in order (``...-citibike-tripdata.csv.zip``,
    then ``...-citibike-tripdata.zip``); the first one that answers HTTP 200
    is used. Every data CSV inside the archive is read, columns whose name
    starts with "unnamed" (case-insensitive) are dropped, and the frames are
    concatenated row-wise into one file.

    Args:
        year: Year, interpolated as-is into the URL and the output filename.
        month: Month number; zero-padded to two digits in the URL and filename.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            fails instead of hanging the kernel.

    Returns:
        Path of the written file, ``../data/raw/rides_{year}_{month:02}.csv``,
        relative to the current working directory.

    Raises:
        RuntimeError: None of the candidate URLs returned HTTP 200.
        ValueError: The downloaded archive contains no data CSV.
    """
    period = f"{year}{month:02}"
    urls = [
        f"https://s3.amazonaws.com/tripdata/{period}-citibike-tripdata.csv.zip",
        f"https://s3.amazonaws.com/tripdata/{period}-citibike-tripdata.zip",
    ]

    failed_attempts = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            print(f"Successfully downloaded zip from: {url}")
            break
        failed_attempts.append(f"{url} (HTTP {response.status_code})")
    else:
        # Report every URL that was tried, not just the last loop value.
        raise RuntimeError(
            f"No trip data archive available for {period}; tried: "
            + ", ".join(failed_attempts)
        )

    # Extract CSVs from the zip
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        csv_names = []
        for name in archive.namelist():
            parts = name.split("/")
            # Archives created on macOS can carry "__MACOSX/._<file>.csv"
            # metadata entries; they end in .csv but are not tabular data.
            if "__MACOSX" in parts or parts[-1].startswith("._"):
                continue
            if name.lower().endswith(".csv"):
                csv_names.append(name)

        if not csv_names:
            # Fail with a clear message instead of pandas' generic
            # "No objects to concatenate".
            raise ValueError(f"No CSV files found in the archive for {period}")

        frames = []
        for csv_name in csv_names:
            with archive.open(csv_name) as handle:
                frame = pd.read_csv(handle, low_memory=False, encoding="latin1")
            # Drop columns like 'unnamed: 0', 'unnamed_1', etc.
            keep = ~frame.columns.str.lower().str.startswith("unnamed")
            frames.append(frame.loc[:, keep])

        combined_df = pd.concat(frames, ignore_index=True)

    # Save to CSV in the target directory, creating it on first use.
    path = Path("..") / "data" / "raw" / f"rides_{year}_{month:02}.csv"
    path.parent.mkdir(parents=True, exist_ok=True)
    combined_df.to_csv(path, index=False)
    print(f"Saved CSV file to: {path}")

    return path


In [20]:
# Fetch and save the June 2024 trips; the returned path is the cell's displayed value.
fetch_raw_data(year=2024, month=6)


Successfully downloaded zip from: https://s3.amazonaws.com/tripdata/202406-citibike-tripdata.zip
Saved CSV file to: ..\data\raw\rides_2024_06.csv


WindowsPath('../data/raw/rides_2024_06.csv')