In [2]:
import pandas as pd
import h3
import hashlib


### Data ingestion planning


We want to convert the ZIP files of CSVs from the Citi Bike trip data [website](https://s3.amazonaws.com/tripdata/index.html) into a data format we can work with.

Big picture, the plan is to take the CSVs of individual rides and upload each ride as a row in the `ride_data` database table.

There are a few reasons to store the trip data as individual rides before aggregating by month. The biggest one is that the file structure of the data dumps is not consistent. If we pulled the data down and converted it directly into aggregated monthly formats, the extraction logic and the aggregation logic would be tightly coupled: any change to the aggregation logic would have to be repeated in the code for every file structure. In more detail:

1. The file structure is inconsistent across years/months. Before 2024 all data is in one file
2. The `ride_id` field is not present until a certain year.
3. We want to easily be able to add more cities.
4. When there are multiple files for a single month we can process each one individually and send to the backend without having to load all of them each time. So our system doesn't need to **understand** the file structure. It can just load all the trips in the format we want then process them. 


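This table will have 7 fields:

`id`: int, incremental unique ID

`ride_id`: A unique ID for each ride

`locale`: Jersey or NYC for now

`start_date`: The date the ride started (the time of day is dropped).

`h3_cell_start`: H3 cell containing the ride's start coordinates

`h3_cell_end`: H3 cell containing the ride's end coordinates

`created_at`: timestamp for when the row was created.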



A few solutions:

1. Data before a certain year does not contain the ride_id field. We will create it for those dates by combining and hashing the `start_time` and the `bike_id` since that should be unique.
2. Duplicated trips across files. If they are from recent years they will have `ride_id` which will dedupe. Otherwise our generated `ride_id` will hopefully suffice.


### Process

1. Hit index page https://s3.amazonaws.com/tripdata/index.html

2. Get all file names/last modified dates. Compare against db to see if anything requires update.

3. For any file which has been modified or is new, run ingestion.

4. Download zip file, iterate through each sub-file.

In [3]:
import pandas as pd
import requests
import zipfile
import io
from datetime import datetime
from pathlib import Path
import os


def process_all_csvs_from_zip_url(zip_url, locale, timeout=60):
    """
    Download a ZIP file from a URL and process every CSV found in any folder/subfolder.

    Parameters:
    zip_url (str): URL to the ZIP file
    locale: Locale label passed through to the per-CSV processing step
    timeout (float | tuple): Timeout in seconds handed to ``requests.get`` so a
        stalled connection cannot hang the notebook forever

    Returns:
    dict | None: Whatever process_all_csvs_from_zip_data returns for the
        downloaded bytes (a dict keyed by CSV filename, or None when the
        download is not a valid ZIP archive)

    Raises:
    Exception: "Error downloading ZIP file: ..." when the HTTP request fails.
        Errors raised while processing the archive propagate unchanged.
    """

    try:
        # Download ZIP file into memory
        print(f"Downloading ZIP file from: {zip_url}")
        # The whole body is needed in memory for zipfile, so there is no point
        # in stream=True; the context manager releases the connection.
        with requests.get(zip_url, timeout=timeout) as response:
            response.raise_for_status()
            zip_data = io.BytesIO(response.content)
    except requests.RequestException as e:
        raise Exception(f"Error downloading ZIP file: {str(e)}") from e

    # Processing happens outside the try block: the callee already wraps its
    # own failures, and re-wrapping them here would duplicate the message prefix.
    return process_all_csvs_from_zip_data(zip_data, locale)


def process_all_csvs_from_local_zip(zip_file_path, locale="NYC"):
    """
    Process ALL CSV files from a local ZIP file, including subfolders.

    Parameters:
    zip_file_path (str): Path to local ZIP file
    locale: Locale label passed through to the per-CSV processing step.
        Defaults to "NYC".

    Returns:
    dict | None: Whatever process_all_csvs_from_zip_data returns for the file's
        bytes (a dict keyed by CSV filename, or None for an invalid archive)

    Raises:
    FileNotFoundError: If zip_file_path does not exist
    Exception: If the file cannot be read; errors raised while processing the
        archive propagate unchanged
    """

    try:
        with open(zip_file_path, "rb") as f:
            zip_data = io.BytesIO(f.read())
    except FileNotFoundError as e:
        raise FileNotFoundError(f"ZIP file not found: {zip_file_path}") from e
    except OSError as e:
        raise Exception(f"Error processing ZIP file: {str(e)}") from e

    # The processing step requires a locale; previously it was omitted here,
    # which made every call fail with a TypeError.
    return process_all_csvs_from_zip_data(zip_data, locale)


def _is_data_csv(member_name):
    """Return True when a ZIP member is a CSV data file rather than a directory or OS metadata."""
    # Directory entries end with a slash
    if member_name.endswith("/"):
        return False
    # macOS resource-fork folder
    if "__MACOSX" in member_name:
        return False
    # AppleDouble sidecar files are named "._<real name>" and can sit in any
    # subfolder, so the check has to look at the last path component.
    base_name = member_name.rsplit("/", 1)[-1]
    if base_name.startswith("._"):
        return False
    # Keep only CSV files (case insensitive)
    return base_name.lower().endswith(".csv")


def process_all_csvs_from_zip_data(zip_data, locale):
    """
    Process all CSV files from ZIP data (works with both URL and local files).

    Each CSV is read with pandas and normalised by process_dataframe when it has
    a "ride_id" column, otherwise by process_dataframe_old_format. A CSV that
    fails is reported and skipped; the remaining files are still processed.

    Parameters:
    zip_data (io.BytesIO): ZIP file data
    locale: Locale label passed to the per-file processing functions

    Returns:
    dict | None: Dictionary where keys are CSV filenames (prefixed with the
        folder path when the bare filename is already taken) and values are
        processed DataFrames. None when zip_data is not a valid ZIP archive.

    Raises:
    Exception: "Error processing ZIP file: ..." when the archive holds no CSV
        files, when no CSV could be processed, or on any other failure.
    """

    results = {}
    processed_count = 0
    failed_files = []

    try:
        with zipfile.ZipFile(zip_data, "r") as zip_ref:
            # All members, including those in subfolders, minus system/metadata files
            csv_files = [name for name in zip_ref.namelist() if _is_data_csv(name)]

            if not csv_files:
                raise ValueError("No CSV files found in ZIP archive")

            print(f"Found {len(csv_files)} CSV file(s) in ZIP archive:")
            for csv_file in csv_files:
                print(f"  - {csv_file}")

            # Process each CSV file
            for csv_file_path in csv_files:
                try:
                    print(f"\nProcessing: {csv_file_path}")

                    # Read CSV directly from ZIP
                    with zip_ref.open(csv_file_path) as csv_file:
                        df = pd.read_csv(csv_file)

                    # Newer dumps carry a ride_id column; older ones need one generated.
                    if "ride_id" in df.columns:
                        processed_df = process_dataframe(df, locale)
                    else:
                        processed_df = process_dataframe_old_format(df, locale)

                    # Use just the filename (without path) as key
                    filename_only = os.path.basename(csv_file_path)

                    # Handle duplicate filenames by adding folder info
                    if filename_only in results:
                        folder_path = os.path.dirname(csv_file_path)
                        unique_key = (
                            f"{folder_path}/{filename_only}"
                            if folder_path
                            else filename_only
                        )
                        results[unique_key] = processed_df
                    else:
                        results[filename_only] = processed_df

                    processed_count += 1
                    print(f"  ✓ Successfully processed {len(processed_df)} rows")

                except Exception as e:
                    # Best effort per file: record the failure and carry on.
                    error_msg = f"Failed to process {csv_file_path}: {str(e)}"
                    print(f"  ✗ {error_msg}")
                    failed_files.append((csv_file_path, str(e)))
                    continue

            # Summary
            print(f"\n{'='*50}")
            print(f"PROCESSING SUMMARY:")
            print(f"Successfully processed: {processed_count} files")
            print(f"Failed: {len(failed_files)} files")

            if failed_files:
                print(f"\nFailed files:")
                for failed_file, error in failed_files:
                    print(f"  - {failed_file}: {error}")

            if processed_count == 0:
                raise ValueError("No CSV files could be processed successfully")

            return results

    except zipfile.BadZipFile:
        # Callers treat a falsy result as "nothing to upload".
        print("File is not a valid ZIP archive, skipping.")
        return None
    except Exception as e:
        raise Exception(f"Error processing ZIP file: {str(e)}") from e


def combine_all_dataframes(results_dict):
    """
    Stack every processed DataFrame into one, tagging each row with its origin.

    Parameters:
    results_dict (dict): Mapping of filename -> DataFrame, as returned by the
        process_all_csvs_from_zip_* functions

    Returns:
    pd.DataFrame: All rows concatenated with a fresh 0..n-1 index and an extra
        'source_file' column holding the dictionary key each row came from

    Raises:
    ValueError: If results_dict is empty (or None)
    """

    if not results_dict:
        raise ValueError("No DataFrames to combine")

    # assign() works on a copy, so the caller's frames are left untouched
    tagged_frames = [
        frame.assign(source_file=source_name)
        for source_name, frame in results_dict.items()
    ]

    combined_df = pd.concat(tagged_frames, ignore_index=True)

    print(
        f"Combined {len(results_dict)} files into single DataFrame with {len(combined_df)} total rows"
    )

    return combined_df


def process_dataframe(df, locale):
    """
    Reduce a current-format trip DataFrame (one that has `ride_id`) to the upload shape.

    Parameters:
    df (pd.DataFrame): Input DataFrame; must contain ride_id, started_at,
        start_lat, start_lng, end_lat and end_lng
    locale: Value written to every row of the `locale` column

    Returns:
    pd.DataFrame: Columns ride_id, start_date, locale, start_lat, start_lng,
        end_lat, end_lng (in that order), indexed like the input

    Raises:
    ValueError: If any required column is missing
    """

    coordinate_columns = ("start_lat", "start_lng", "end_lat", "end_lng")
    required_columns = ["ride_id", "started_at", *coordinate_columns]

    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # ride_id is carried over unchanged; started_at is parsed and truncated to
    # its calendar date; locale is the same value on every row.
    result_df = df[["ride_id"]].assign(
        start_date=pd.to_datetime(df["started_at"]).dt.date,
        locale=locale,
    )

    # Coordinates are copied through as-is.
    for coordinate in coordinate_columns:
        result_df[coordinate] = df[coordinate]

    return result_df


# Derive a deterministic ride_id for rows that have none, from start time + bike id
def create_ride_id_hash(row):
    """
    Build a 16-character hex id from a row's `start_time` and `bike_id`.

    The two values are formatted as text, joined with an underscore, hashed
    with SHA-256, and the first 16 hex characters of the digest are returned.
    """
    key_material = "{}_{}".format(row["start_time"], row["bike_id"])
    digest = hashlib.sha256(key_material.encode()).hexdigest()
    # Truncated for a shorter ID
    return digest[:16]


def process_dataframe_old_format(df, locale):
    """
    Reduce a legacy-format trip DataFrame (one without `ride_id`) to the upload shape.

    Column headers are matched case-insensitively against a list of known
    spellings, and a ride_id is generated per row by create_ride_id_hash from
    the start time and bike id.

    Parameters:
    df (pd.DataFrame): Input DataFrame
    locale: Locale identifier written to every row

    Returns:
    pd.DataFrame: Columns ride_id, start_date, locale, start_lat, start_lng,
        end_lat, end_lng (in that order)

    Raises:
    ValueError: If any required field has no matching column
    """

    # Canonical field -> header spellings to try, in priority order
    # (all lowercase; comparison is case-insensitive)
    aliases_by_field = {
        "bikeid": ["bikeid", "bike_id", "bike id"],
        "starttime": ["starttime", "start time", "start_time"],
        "start_station_latitude": [
            "start station latitude",
            "start_station_latitude",
            "start station lat",
            "start_lat",
        ],
        "start_station_longitude": [
            "start station longitude",
            "start_station_longitude",
            "start station lng",
            "start station lon",
            "start_lng",
            "start_lon",
        ],
        "end_station_latitude": [
            "end station latitude",
            "end_station_latitude",
            "end station lat",
            "end_lat",
        ],
        "end_station_longitude": [
            "end station longitude",
            "end_station_longitude",
            "end station lng",
            "end station lon",
            "end_lng",
            "end_lon",
        ],
    }

    available = df.columns.tolist()
    available_lower = [name.lower() for name in available]

    # Resolve each canonical field to the header actually present (original case)
    resolved = {}
    unresolved = []
    for field, aliases in aliases_by_field.items():
        header = next(
            (
                available[available_lower.index(alias.lower())]
                for alias in aliases
                if alias.lower() in available_lower
            ),
            None,
        )
        if header:
            resolved[field] = header
        else:
            unresolved.append(f"{field} (tried: {', '.join(aliases)})")

    if unresolved:
        raise ValueError(
            f"Missing required columns: {unresolved}. Available columns: {df.columns.tolist()}"
        )

    start_times = df[resolved["starttime"]]

    # Two-column frame shaped the way create_ride_id_hash expects its rows
    hash_inputs = pd.DataFrame(
        {
            "start_time": start_times,
            "bike_id": df[resolved["bikeid"]],
        }
    )

    result_df = pd.DataFrame()
    result_df["ride_id"] = hash_inputs.apply(create_ride_id_hash, axis=1)

    # Keep only the calendar date of the start time
    result_df["start_date"] = pd.to_datetime(start_times).dt.date

    result_df["locale"] = locale

    # Coordinates, copied from whichever header matched
    for output_name, field in (
        ("start_lat", "start_station_latitude"),
        ("start_lng", "start_station_longitude"),
        ("end_lat", "end_station_latitude"),
        ("end_lng", "end_station_longitude"),
    ):
        result_df[output_name] = df[resolved[field]]

    return result_df


def download_and_save_zip(zip_url, local_path, timeout=60):
    """
    Download ZIP file and save locally.

    The body is streamed to disk in 8 KiB chunks, so the archive is never held
    in memory as a whole. Parent directories of local_path are created if needed.

    Parameters:
    zip_url (str): URL to download
    local_path (str): Local path to save the file
    timeout (float | tuple): Timeout in seconds handed to ``requests.get`` so a
        stalled connection cannot hang the notebook forever

    Returns:
    str: local_path, unchanged

    Raises:
    Exception: "Error downloading ZIP file: ..." on HTTP/network failures,
        "Error saving ZIP file: ..." on anything else (e.g. filesystem errors)
    """
    try:
        print(f"Downloading ZIP file to: {local_path}")
        # A streamed response holds its connection open until it is closed or
        # fully read; the context manager guarantees release on every path.
        with requests.get(zip_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Create directory if it doesn't exist
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)

            # Save file
            with open(local_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        print(f"ZIP file saved successfully to: {local_path}")
        return local_path

    except requests.RequestException as e:
        raise Exception(f"Error downloading ZIP file: {str(e)}") from e
    except Exception as e:
        raise Exception(f"Error saving ZIP file: {str(e)}") from e


# Example usage
# if __name__ == "__main__":

#     # Method 1: Process all CSVs directly from URL (RECOMMENDED)
#     try:
#         zip_url = "https://example.com/data.zip"  # Replace with actual URL

#         # Process all CSV files found anywhere in the ZIP
#         results = process_all_csvs_from_zip_url(zip_url, locale="NYC")

#         print(f"\nProcessed {len(results)} CSV files:")
#         for filename, df in results.items():
#             print(f"  {filename}: {len(df)} rows")
#             print(f"    Sample data:")
#             print(f"    {df.head(2).to_string()}")
#             print()

#         # Option A: Work with individual DataFrames
#         for filename, df in results.items():
#             # Save each processed file individually
#             output_filename = f"processed_{filename}"
#             # df.to_csv(output_filename, index=False)
#             print(f"Ready to save: {output_filename}")

#         # Option B: Combine all into single DataFrame
#         combined_df = combine_all_dataframes(results)
#         print(f"Combined DataFrame shape: {combined_df.shape}")
#         print("Combined sample:")
#         print(combined_df.head())

#         # Save combined data
#         # combined_df.to_csv('all_rides_combined.csv', index=False)

#     except Exception as e:
#         print(f"Error: {e}")

#     # Method 2: Download first, then process (for repeated processing)
#     try:
#         zip_url = "https://example.com/data.zip"  # Replace with actual URL
#         local_zip_path = "data/downloaded_file.zip"

#         # Download and save
#         download_and_save_zip(zip_url, local_zip_path)

#         # Process all CSVs from local ZIP file
#         results = process_all_csvs_from_local_zip(local_zip_path)

#         print(f"Successfully processed {len(results)} CSV files from local ZIP")

#     except Exception as e:
#         print(f"Method 2 error: {e}")

### Process

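1. Grab the processed DataFrames for each CSV and convert their start/end coordinates to H3 cells, dropping the raw lat/lng columns and any duplicate `ride_id` rows.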



In [4]:
def _h3_cells(lats, lngs, resolution):
    """Map paired lat/lng values to H3 cells, with None wherever either coordinate is missing."""
    return [
        (
            h3.latlng_to_cell(lat, lng, resolution)
            if pd.notnull(lat) and pd.notnull(lng)
            else None
        )
        for lat, lng in zip(lats, lngs)
    ]


def apply_h3_latlng_to_cell(df_by_file, resolution=9):
    """
    Replace start/end coordinates with H3 cells for every DataFrame in a dict.

    Parameters:
    df_by_file (dict): Mapping of file name -> DataFrame with columns ride_id,
        start_lat, start_lng, end_lat and end_lng (other columns are kept)
    resolution (int): H3 resolution passed to h3.latlng_to_cell. Defaults to 9.

    Returns:
    dict: Same keys, each mapped to a new DataFrame in which the four
        coordinate columns are replaced by h3_cell_start and h3_cell_end
        (None where a coordinate is missing) and repeated ride_id rows are
        dropped, keeping the first occurrence. Input frames are not modified.
    """
    coordinate_columns = ["start_lat", "start_lng", "end_lat", "end_lng"]

    output_obj = {}
    for key, df in df_by_file.items():
        print(f"[{key}] entries: {df.shape[0]}")

        # drop() returns a new frame, so the caller's data is left untouched.
        # Building the cell columns from plain lists (rather than a row-wise
        # DataFrame.apply) also works for empty frames.
        df_in_loop = df.drop(columns=coordinate_columns)
        df_in_loop["h3_cell_start"] = _h3_cells(
            df["start_lat"], df["start_lng"], resolution
        )
        df_in_loop["h3_cell_end"] = _h3_cells(
            df["end_lat"], df["end_lng"], resolution
        )

        # Remove duplicates. These are infrequent and probably just bad data.
        dupes = df_in_loop.duplicated(subset=["ride_id"])
        print(f"  - Removing {dupes.sum()} duplicate ride_id entries")
        df_in_loop = df_in_loop[~dupes]

        output_obj[key] = df_in_loop
    return output_obj

In [5]:
from supabase import create_client, Client
import dotenv
dotenv.load_dotenv()

# Connection settings come from the environment (populated from .env by
# dotenv.load_dotenv() above). API keys must never be written into the notebook:
# the key that used to be hard-coded here is in the file's history and should
# be rotated.
url = os.environ.get("URL")
key = os.environ.get("ANON_KEY")

# NOTE(review): the old hard-coded value looked like a secret key rather than
# the anon key; SUPABASE_SECRET_KEY is an assumed variable name for it — confirm
# and add it to .env. Falls back to ANON_KEY when it is not set.
_supabase_key = os.environ.get("SUPABASE_SECRET_KEY") or key

if not url or not _supabase_key:
    raise RuntimeError(
        "Supabase credentials missing: set URL and SUPABASE_SECRET_KEY (or ANON_KEY) in the environment"
    )

supabase: Client = create_client(url, _supabase_key)

In [6]:
import json
def upload_results(input):
    """
    Upsert every DataFrame in a dict into the `ride_data` table, one request per frame.

    Each frame is serialised to JSON records (ISO dates) and sent whole, with
    rows matched on (ride_id, locale). Nothing is returned.

    Parameters:
    input (dict): Mapping of file name -> DataFrame to upload
    """
    for frame in input.values():
        # Round-trip through to_json so values become plain JSON-safe types
        records = json.loads(frame.to_json(orient="records", date_format='iso'))
        supabase.table("ride_data").upsert(
            records, on_conflict="ride_id,locale"
        ).execute()



In [7]:
from datetime import datetime

# Day to inspect, written the way start_date values appear in the table
date_to_filter = "2025-07-21"  # Change this to your desired date

# Select every ride_data row whose start_date equals that day
rides_on_date_query = (
    supabase.table("ride_data").select("*").eq("start_date", date_to_filter)
)
response = rides_on_date_query.execute()

response.data

[{'id': 19299,
  'created_at': '2025-09-03T17:22:43.757629+00:00',
  'ride_id': '7255E3D93CEC0F45',
  'start_date': '2025-07-21',
  'locale': 'NYC',
  'h3_cell_start': '892a100ab23ffff',
  'h3_cell_end': '892a100aba3ffff'},
 {'id': 19353,
  'created_at': '2025-09-03T17:22:43.757629+00:00',
  'ride_id': 'CDC67D9888536234',
  'start_date': '2025-07-21',
  'locale': 'NYC',
  'h3_cell_start': '892a100da53ffff',
  'h3_cell_end': '892a100dadbffff'},
 {'id': 19368,
  'created_at': '2025-09-03T17:22:43.757629+00:00',
  'ride_id': '6EE71FE8C768DA2F',
  'start_date': '2025-07-21',
  'locale': 'NYC',
  'h3_cell_start': '892a100d60bffff',
  'h3_cell_end': '892a100d2b3ffff'},
 {'id': 19375,
  'created_at': '2025-09-03T17:22:43.757629+00:00',
  'ride_id': '71440AE5A20EF49A',
  'start_date': '2025-07-21',
  'locale': 'NYC',
  'h3_cell_start': '892a100d143ffff',
  'h3_cell_end': '892a100dc33ffff'},
 {'id': 19389,
  'created_at': '2025-09-03T17:22:43.757629+00:00',
  'ride_id': 'AF9F9B1DD9012092',
  's

In [8]:
# file_df.iloc[0]
# result = (
#         supabase.table("processed_files")
#         .upsert(
#             json.loads(file_df.iloc[0].to_json(date_format='iso')),
#             on_conflict="file_name,locale",
#         )
#         .execute()
#     )

In [9]:
def get_processed_file_record(file_df_entry):
    """
    Convert one entry (e.g. a DataFrame row) into a plain JSON-compatible object.

    The entry is serialised with its own to_json (ISO date format) and parsed
    back, so the result contains only JSON-native value types.
    """
    return json.loads(file_df_entry.to_json(date_format='iso'))
    

In [10]:
def process_file(file_obj, locale="NYC"):
    """
    Ingest one trip-data ZIP end to end: download, normalise, H3-encode, upload.

    After a successful upload the file's record is upserted into the
    `processed_files` table (matched on file_name + locale).

    Parameters:
    file_obj: Mapping-like record (e.g. a DataFrame row) with a 'file_name'
        entry naming the object under https://s3.amazonaws.com/tripdata/
    locale (str): Locale label written to every ride row. Defaults to "NYC".
        NOTE(review): file_obj presumably carries its own locale, since the
        processed_files upsert keys on it — confirm whether that should be
        used here instead.

    Returns:
    None. Returns early, without uploading or recording the file, when no data
    was produced for it.
    """
    file_name = file_obj["file_name"]
    url = f"https://s3.amazonaws.com/tripdata/{file_name}"
    df_by_file = process_all_csvs_from_zip_url(url, locale=locale)
    if not df_by_file:
        print(f"No data processed for {file_name}, skipping upload.")
        return
    output = apply_h3_latlng_to_cell(df_by_file)

    # Raises if any upload exhausts its retries, so the file is only recorded
    # as processed when every batch made it.
    upload_results_with_guaranteed_retry(output, max_workers=3)

    # Update supabase processed_files table
    print(f"Updating processed_files table for {file_name}")
    (
        supabase.table("processed_files")
        .upsert(
            get_processed_file_record(file_obj),
            on_conflict="file_name,locale",
        )
        .execute()
    )

In [11]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import time

def upload_results_with_guaranteed_retry(
    output, batch_size=1000, max_workers=2, max_retries=5
):
    """
    Upload every file's DataFrame concurrently, failing only once retries are exhausted.

    One upload_file_with_retry job is submitted per file to a thread pool. On
    the first job that fails, uploads that have not started yet are cancelled
    and no further results are collected. Uploads already running cannot be
    cancelled; the pool waits for them to finish before the error is raised.

    Parameters:
    output (dict): Mapping of file name -> DataFrame to upload
    batch_size (int): Rows per upsert request
    max_workers (int): Number of files uploaded in parallel
    max_retries (int): Attempts allowed per batch

    Raises:
    RuntimeError: If any file's upload failed; the message lists the file
        whose failure was observed
    """
    print(
        f"Starting upload process with {len(output)} files, max_retries={max_retries}"
    )

    failed_files = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {}

        # Submit all jobs with staggered starts
        for i, (file_name, df) in enumerate(output.items()):
            time.sleep(i * 0.2)  # Stagger submissions
            future = executor.submit(
                upload_file_with_retry, file_name, df, batch_size, max_retries
            )
            future_to_file[future] = file_name

        # Wait for all to complete
        for future in as_completed(future_to_file):
            file_name = future_to_file[future]
            try:
                future.result()  # This will raise if all retries failed
                print(f"🎉 Successfully completed {file_name}")
            except Exception as e:
                failed_files.append(file_name)
                print(f"💥 FINAL FAILURE for {file_name}: {e}")

                # Cancel remaining work. cancel() only succeeds for jobs that
                # have not started, so report a cancellation only when it
                # actually happened.
                for pending_future, pending_name in future_to_file.items():
                    if not pending_future.done() and pending_future.cancel():
                        print(f"⏹️  Cancelled pending upload for {pending_name}")
                break

    if failed_files:
        raise RuntimeError(f"Upload process failed. Failed files: {failed_files}")

    print("✅ All uploads completed successfully!")


def upload_file_with_retry(file_name: str, df, batch_size: int, max_retries: int):
    """
    Upload one file's rows in consecutive batches, retrying each batch as needed.

    The frame is serialised to JSON records (ISO dates), split into slices of
    batch_size, and each slice is handed to upload_batch_with_retry in order.

    Raises:
        RuntimeError: As soon as a batch reports failure; later batches are not sent.
    """
    print(f"📁 Processing {file_name} with {len(df)} records")

    records = json.loads(df.to_json(orient="records", date_format="iso"))
    # Ceiling division: a partial final slice still counts as a batch
    total_batches = (len(records) + batch_size - 1) // batch_size

    offsets = range(0, len(records), batch_size)
    for batch_num, offset in enumerate(offsets, start=1):
        chunk = records[offset : offset + batch_size]

        # Retries happen per batch, inside the helper
        if not upload_batch_with_retry(
            chunk, file_name, batch_num, total_batches, max_retries
        ):
            raise RuntimeError(f"Batch {batch_num} failed after {max_retries} retries")

        # Brief pause between batches of the same file
        time.sleep(0.1)

    print(f"✅ Completed all batches for {file_name}")


def upload_batch_with_retry(
    batch, file_name: str, batch_num: int, total_batches: int, max_retries: int
) -> bool:
    """
    Upsert one batch into `ride_data`, retrying with backoff on any exception.

    Rows are matched on (ride_id, locale, start_date). After a failed attempt
    the function sleeps for the delay given by calculate_backoff before trying
    again, up to max_retries attempts in total.

    Returns:
        True as soon as an attempt succeeds; False when every attempt failed
        (or when max_retries allows no attempt at all).
    """
    attempt = 0
    while attempt < max_retries:
        try:
            supabase.table("ride_data").upsert(
                batch, on_conflict="ride_id,locale,start_date"
            ).execute()

            print(
                f"[{file_name}] ✅ Batch {batch_num}/{total_batches}: {len(batch)} records"
            )
            return True

        except Exception as e:
            wait_time = calculate_backoff(attempt)
            attempts_left = max_retries - attempt - 1

            print(
                f"[{file_name}] ⚠️ Batch {batch_num} failed (attempt {attempt + 1}/{max_retries}): {e}"
            )

            # Out of attempts: report and give up without sleeping
            if attempts_left <= 0:
                print(
                    f"[{file_name}] 💥 Batch {batch_num} exhausted all {max_retries} retries"
                )
                return False

            print(
                f"[{file_name}] ⏳ Retrying in {wait_time:.1f}s... ({attempts_left} attempts left)"
            )
            time.sleep(wait_time)

        attempt += 1

    return False


def calculate_backoff(
    attempt: int, base_delay: float = 1.0, max_delay: float = 60.0
) -> float:
    """
    Return the wait time for a retry: capped exponential delay plus random jitter.

    The delay is base_delay doubled once per attempt, never more than
    max_delay; a further 10-30% of that delay is added on top so concurrent
    workers do not all retry at the same instant.
    """
    capped_delay = base_delay * (2**attempt)
    if max_delay < capped_delay:
        capped_delay = max_delay

    jitter = random.uniform(0.1, 0.3) * capped_delay
    return capped_delay + jitter

In [12]:
def process_files(file_df) -> None:
    """Run process_file on every row of file_df, one at a time, in row order."""
    file_rows = (row for _label, row in file_df.iterrows())
    for file_row in file_rows:
        process_file(file_row)

In [16]:
# Load the list of files to ingest and process them one after another.
# Each row must provide the 'file_name' that process_file downloads.
manifest_path = "./new_files_test.csv"
file_df = pd.read_csv(manifest_path)
process_files(file_df)

Downloading ZIP file from: https://s3.amazonaws.com/tripdata/202501-citibike-tripdata.zip
Found 3 CSV file(s) in ZIP archive:
  - 202501-citibike-tripdata_1.csv
  - 202501-citibike-tripdata_3.csv
  - 202501-citibike-tripdata_2.csv

Processing: 202501-citibike-tripdata_1.csv


  df = pd.read_csv(csv_file)


here
  ✓ Successfully processed 1000000 rows

Processing: 202501-citibike-tripdata_3.csv
here
  ✓ Successfully processed 124475 rows

Processing: 202501-citibike-tripdata_2.csv


  df = pd.read_csv(csv_file)


here
  ✓ Successfully processed 1000000 rows

PROCESSING SUMMARY:
Successfully processed: 3 files
Failed: 0 files
[202501-citibike-tripdata_1.csv] entries: 1000000
  - Removing 0 duplicate ride_id entries
[202501-citibike-tripdata_3.csv] entries: 124475
  - Removing 0 duplicate ride_id entries
[202501-citibike-tripdata_2.csv] entries: 1000000
  - Removing 0 duplicate ride_id entries
Starting upload process with 3 files, max_retries=5
📁 Processing 202501-citibike-tripdata_1.csv with 1000000 records
📁 Processing 202501-citibike-tripdata_3.csv with 124475 records
📁 Processing 202501-citibike-tripdata_2.csv with 1000000 records
[202501-citibike-tripdata_1.csv] ✅ Batch 1/1000: 1000 records
[202501-citibike-tripdata_3.csv] ✅ Batch 1/125: 1000 records
[202501-citibike-tripdata_2.csv] ✅ Batch 1/1000: 1000 records
[202501-citibike-tripdata_1.csv] ✅ Batch 2/1000: 1000 records
[202501-citibike-tripdata_2.csv] ⚠️ Batch 2 failed (attempt 1/5): EOF occurred in violation of protocol (_ssl.c:2427)[202

RuntimeError: Upload process failed. Failed files: ['202501-citibike-tripdata_1.csv']

In [8]:
def upload_results_batched(output, batch_size=1000):
    """
    Upsert each file's rows into `ride_data` in sequential batches, best effort.

    Rows are matched on (ride_id, locale). A batch that raises is reported and
    skipped; the remaining batches are still attempted and nothing is re-raised.

    Parameters:
    output (dict): Mapping of file name -> DataFrame to upload
    batch_size (int): Rows per upsert request
    """
    for file_name, df in output.items():
        print(f"Processing {file_name} with {len(df)} records")

        # Serialise once per file, then slice the records
        records = json.loads(df.to_json(orient="records", date_format='iso'))

        offsets = range(0, len(records), batch_size)
        for batch_no, offset in enumerate(offsets, start=1):
            chunk = records[offset:offset + batch_size]
            try:
                supabase.table("ride_data").upsert(
                    chunk, on_conflict="ride_id,locale"
                ).execute()
                print(f"Uploaded batch {batch_no}: {len(chunk)} records")
            except Exception as e:
                # Deliberately not re-raised: move on to the next batch
                print(f"Error uploading batch {batch_no}: {e}")

In [35]:
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor

def upload_results_concurrent(output, batch_size=1000, max_workers=3):
    """
    Upload every file's DataFrame in parallel, one upload_single_file job per file.

    Results are collected in submission order. The first job whose result
    raises is reported and its exception re-raised to the caller.

    Parameters:
    output (dict): Mapping of file name -> DataFrame to upload
    batch_size (int): Rows per upsert request, passed to each job
    max_workers (int): Size of the thread pool
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(upload_single_file, file_name, df, batch_size)
            for file_name, df in output.items()
        ]

        # Wait for all files to complete
        for future in futures:
            try:
                future.result()
            except Exception as e:
                print(f"File upload failed: {e}")
                raise

def upload_single_file(file_name, df, batch_size):
    """
    Upsert one file's rows into `ride_data` in sequential batches, stopping on the first error.

    Rows are matched on (ride_id, locale). A batch that raises is reported and
    its exception re-raised, so later batches are not sent.

    Parameters:
    file_name (str): Label used in progress messages
    df (pd.DataFrame): Rows to upload
    batch_size (int): Rows per upsert request
    """
    print(f"Processing {file_name} with {len(df)} records")
    records = json.loads(df.to_json(orient="records", date_format='iso'))

    offsets = range(0, len(records), batch_size)
    for batch_no, offset in enumerate(offsets, start=1):
        chunk = records[offset:offset + batch_size]
        try:
            supabase.table("ride_data").upsert(
                chunk, on_conflict="ride_id,locale"
            ).execute()
            print(f"[{file_name}] Uploaded batch {batch_no}: {len(chunk)} records")
        except Exception as e:
            print(f"[{file_name}] Error uploading batch {batch_no}: {e}")
            raise
