In [5]:
import os
import glob
import pandas as pd
from datetime import datetime
import zipfile

# It's a good practice to set up your directory paths at the beginning.
# Replace these with the actual paths on your system.
RAW_DATA_DIR = '/Users/vpry/Downloads/bike_sharing_dataset'
PROCESSED_DATA_DIR = '/Users/vpry/Downloads/processed_data/'

# Create the processed data directory if it doesn't exist
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

def process_and_save_data(file_path):
    """
    Reads a single zip file, selects 'start_at' and 'end_at' columns,
    and saves a new compressed CSV file. This version correctly handles
    multiple files inside the zip archive and renames columns.

    Args:
        file_path (str): The full path to the input .zip file.
    """
    try:
        # Step 2a: Read the compressed CSV file directly into a pandas DataFrame.
        # pandas can handle reading compressed files automatically.
        print(f"Processing file: {os.path.basename(file_path)}...")

        # Use the zipfile library to open the archive and find the specific CSV file.
        # This handles cases where extra files like __MACOSX are present.
        with zipfile.ZipFile(file_path, 'r') as zf:
            csv_file_in_zip = None
            for name in zf.namelist():
                if name.endswith('.csv') and not name.startswith('__MACOSX'):
                    csv_file_in_zip = name
                    break
            
            if csv_file_in_zip:
                # Step 2b: Read the actual CSV data from the file inside the zip.
                # The nrows=0 has been removed to read all data.
                df = pd.read_csv(zf.open(csv_file_in_zip))

                # Step 2c: Rename columns for standardization. This is a great practice!
                df = df.rename(columns={'Start date': 'start_at', 'End date': 'end_at', 'started_at':'start_at', 'ended_at':'end_at'})

                # Step 2d: Ensure the standardized columns are present.
                required_columns = ['start_at', 'end_at']
                if not all(col in df.columns for col in required_columns):
                    print(f"Skipping {os.path.basename(file_path)}: Missing required columns after renaming.")
                    return

                # Per user request, the duration calculation has been removed from this step.
                # We will keep the 'start_at' and 'end_at' columns as strings for later processing.

                # Step 2e: Select the final columns to be saved.
                final_df = df[['start_at', 'end_at']].copy()

                # Step 2f: Construct the output file path.
                # We'll use the same base name but change the extension to .csv.gz
                file_name = os.path.basename(file_path).replace('.zip', '.csv.gz')
                output_file_path = os.path.join(PROCESSED_DATA_DIR, file_name)

                # Step 2g: Save the new DataFrame to a compressed CSV file.
                # compression='gzip' handles the compression automatically.
                final_df.to_csv(output_file_path, index=False, compression='gzip')
                print(f"Successfully saved cleaned data to: {output_file_path}")
            else:
                print(f"An error occurred while processing {file_name}: No CSV file found in the archive.")

    except Exception as e:
        print(f"An error occurred while processing {file_path}: {e}")

# 3. Main Script Execution
# ==============================================================================
if __name__ == '__main__':
    # Step 3a: Use glob to find all .zip files in the raw data directory.
    zip_files = glob.glob(os.path.join(RAW_DATA_DIR, '*.zip'))

    if not zip_files:
        print(f"No .zip files found in the directory: {RAW_DATA_DIR}. "
              "Please make sure your raw data files are there.")
    else:
        print(f"Found {len(zip_files)} files to process.")
        # Step 3b: Loop through each file and call the processing function.
        for file in zip_files:
            process_and_save_data(file)

        print("\nAll files have been processed.")


Found 97 files to process.
Processing file: 202210-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202210-capitalbikeshare-tripdata.csv.gz
Processing file: 202503-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202503-capitalbikeshare-tripdata.csv.gz
Processing file: 202012-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202012-capitalbikeshare-tripdata.csv.gz
Processing file: 201909-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/201909-capitalbikeshare-tripdata.csv.gz
Processing file: 202305-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202305-capitalbikeshare-tripdata.csv.gz
Processing file: 201802-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processe

  df = pd.read_csv(zf.open(csv_file_in_zip))


Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202102-capitalbikeshare-tripdata.csv.gz
Processing file: 202506-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202506-capitalbikeshare-tripdata.csv.gz
Processing file: 202009-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202009-capitalbikeshare-tripdata.csv.gz
Processing file: 201912-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/201912-capitalbikeshare-tripdata.csv.gz
Processing file: 202405-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/202405-capitalbikeshare-tripdata.csv.gz
Processing file: 201811-capitalbikeshare-tripdata.zip...
Successfully saved cleaned data to: /Users/vpry/Downloads/processed_data/201811-capitalbikeshare-tripdata.csv.gz
Processing file: 2010-capitalbikeshar