In [4]:
# data_column_lister.ipynb

# This script is designed to quickly list all column headers for each
# compressed CSV file (in .zip format) within a specified directory.
# It is useful for a quick check of data schema across multiple files.

# 1. Setup and Imports
# ==============================================================================
import os
import glob
import pandas as pd
import zipfile

# The directory containing your raw, compressed data files.
# Please ensure this path is correct.
RAW_DATA_DIR = '/Users/vpry/Downloads/bike_sharing_dataset'


# 2. Main Script Execution
# ==============================================================================
if __name__ == '__main__':
    # Step 2a: Use glob to find all .zip files in the specified directory.
    zip_files = glob.glob(os.path.join(RAW_DATA_DIR, '*.zip'))

    if not zip_files:
        print(f"No .zip files found in the directory: {RAW_DATA_DIR}. "
              "Please make sure your raw data files are there.")
    else:
        print(f"Found {len(zip_files)} files to check for columns.\n")
        
        # Step 2b: Loop through each file found.
        for file_path in zip_files:
            file_name = os.path.basename(file_path)
            try:
                # Step 2c: Open the zip file to inspect its contents.
                with zipfile.ZipFile(file_path, 'r') as zf:
                    # Find the actual CSV file inside the archive.
                    # This handles cases where extra files like __MACOSX are present.
                    csv_file_in_zip = None
                    for name in zf.namelist():
                        if name.endswith('.csv') and not name.startswith('__MACOSX'):
                            csv_file_in_zip = name
                            break
                    
                    if csv_file_in_zip:
                        # Step 2d: Read only the header of the specific CSV file.
                        # The 'nrows=0' parameter ensures that no data is loaded,
                        # making this process very fast.
                        df = pd.read_csv(zf.open(csv_file_in_zip), nrows=0)
                        
                        # Step 2e: Extract the column names.
                        columns = df.columns.tolist()
                        
                        # Step 2f: Print the file name and its columns in a readable format.
                        print(f"File: {file_name}")
                        print(f"  Columns: {', '.join(columns)}\n")
                    else:
                        print(f"An error occurred while processing {file_name}: No CSV file found in the archive.\n")

            except Exception as e:
                # Catch any errors that might occur during file reading
                # (e.g., file not a valid CSV, corruption, etc.).
                print(f"An error occurred while processing {file_name}: {e}\n")

Found 97 files to check for columns.

File: 202210-capitalbikeshare-tripdata.zip
  Columns: ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual

File: 202503-capitalbikeshare-tripdata.zip
  Columns: ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual

File: 202012-capitalbikeshare-tripdata.zip
  Columns: ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual

File: 201909-capitalbikeshare-tripdata.zip
  Columns: Duration, Start date, End date, Start station number, Start station, End station number, End station, Bike number, Member type

File: 202305-capitalbikeshare-tripdata.zip
  Columns: ride_id, rideable_type, started_at, ended_at,