In [2]:
import os
import zipfile
import pandas as pd
from pathlib import Path
import logging
import sys
import re

In [3]:
# Route INFO-and-above log records to stdout so they show up in the cell
# output, prefixed with a timestamp and the level name.
LOG_FORMAT = '%(asctime)s %(levelname)s:%(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, stream=sys.stdout)

In [9]:
# Pipeline locations. Each one can be overridden with an environment variable
# of the same name; otherwise the absolute default below is used.
DOWNLOAD_DIR = os.environ.get("DOWNLOAD_DIR", "/data/download_tripdata")      # searched for *.zip archives
PROCESSED_DIR = os.environ.get("PROCESSED_DIR", "/data/processed_tripdata")  # extraction target and output folder

In [10]:
# Make sure the output directory (and any missing parents) exists before
# anything is extracted into it; exist_ok=True keeps re-runs of this cell from
# failing when the directory is already there.
# NOTE: the recorded run of this cell failed with a read-only file system
# error for '/data' -- point PROCESSED_DIR at a writable location (via the
# environment variable) when running outside the environment that provides it.
os.makedirs(PROCESSED_DIR, exist_ok=True)

OSError: [Errno 30] Read-only file system: '/data'

In [None]:
def extract_zip(file_path, extract_to):
    """Extract every member of a zip archive into a directory.

    Errors are logged rather than raised so that one bad archive does not
    abort a batch, but the outcome is reported through the return value so
    callers can tell success from failure.

    Args:
        file_path: Path (str or path-like) of the zip archive to read.
        extract_to: Directory the archive members are extracted into.

    Returns:
        bool: True if the archive was extracted, False if it was not a valid
        zip file or any other error occurred (the error is logged).
    """
    try:
        with zipfile.ZipFile(file_path, 'r') as archive:
            archive.extractall(extract_to)
    except zipfile.BadZipFile as exc:
        logging.error(f"Bad zip file {file_path}: {exc}")
        return False
    except Exception as exc:
        # Deliberately broad: missing file, permission problems, full disk...
        # all are reported the same way and must not stop the batch.
        logging.error(f"Error extracting {file_path}: {exc}")
        return False

    logging.info(f"Extracted {file_path} to {extract_to}")
    return True

In [None]:
def process_csv(file_path):
    """Load one trip-data CSV and apply the missing-value rules.

    Rows with a missing ``ride_id``, ``started_at`` or ``ended_at`` are
    dropped; missing ``rideable_type`` / ``member_casual`` values are replaced
    with ``'unknown'``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The cleaned data, or an empty DataFrame if reading
        or cleaning raised any exception (the error is logged).
    """
    required_columns = ['ride_id', 'started_at', 'ended_at']
    placeholders = {'rideable_type': 'unknown', 'member_casual': 'unknown'}

    try:
        trips_raw = pd.read_csv(file_path)
        logging.info(f"Read CSV file {file_path} with {len(trips_raw)} records.")
        # Drop unusable rows first, then fill the optional categorical columns.
        return trips_raw.dropna(subset=required_columns).fillna(placeholders)
    except Exception as exc:
        logging.error(f"Error processing CSV file {file_path}: {exc}")
        # An empty frame signals failure to the caller.
        return pd.DataFrame()

In [None]:
def main():
    """Extract all downloaded archives, clean their CSVs and write one combined file.

    Steps:
      1. Extract every ``*.zip`` found in ``DOWNLOAD_DIR`` into ``PROCESSED_DIR``.
      2. Clean every top-level ``*.csv`` in ``PROCESSED_DIR`` with ``process_csv``.
      3. Concatenate the non-empty results and save them as
         ``combined_tripdata.csv`` inside ``PROCESSED_DIR``.

    When there is nothing to do (no archives, no CSVs, or no usable rows) the
    reason is logged and the function simply returns. ``return`` is used
    instead of ``sys.exit(0)`` so that calling ``main()`` from a notebook cell
    does not raise ``SystemExit`` in the kernel; when run as a script the
    process still ends with exit status 0.

    Returns:
        None
    """
    output_name = "combined_tripdata.csv"

    # Sorted so that the row order of the combined file is reproducible
    # (glob order depends on the file system).
    zip_files = sorted(Path(DOWNLOAD_DIR).glob("*.zip"))
    if not zip_files:
        logging.info("No zip files found to process.")
        return

    for zip_file in zip_files:
        extract_zip(zip_file, PROCESSED_DIR)

    # Find all CSV files after extraction. The combined output lives in the
    # same directory, so it must be skipped: otherwise every re-run would read
    # the previous output back in and duplicate all of its rows.
    csv_files = sorted(
        csv_file
        for csv_file in Path(PROCESSED_DIR).glob("*.csv")
        if csv_file.name != output_name
    )
    if not csv_files:
        logging.warning("No CSV files found after extraction.")
        return

    all_dfs = []
    for csv_file in csv_files:
        df = process_csv(csv_file)
        # process_csv returns an empty frame on failure; skip those.
        if not df.empty:
            all_dfs.append(df)

    if not all_dfs:
        logging.warning("No data to ingest after processing CSV files.")
        return

    # Concatenate all DataFrames
    combined_df = pd.concat(all_dfs, ignore_index=True)
    logging.info(f"Combined DataFrame has {len(combined_df)} records.")

    # Save the combined DataFrame to a single CSV file
    output_file = os.path.join(PROCESSED_DIR, output_name)
    combined_df.to_csv(output_file, index=False)
    logging.info(f"Saved combined data to {output_file}")

In [None]:
# Entry point: run the whole pipeline when this code is executed directly
# (as a script, or as a notebook cell where __name__ is "__main__").
if __name__ == "__main__":
    main()