# CSV Splitter

- Splits a large CSV file into smaller chunks for uploading to GitHub and sharing

In [4]:
# NOTE(review): IPython runs `!` commands in a temporary subshell, so this activation
# presumably does not carry over to the running kernel — confirm the kernel already uses ../venv.
!source ../venv/bin/activate
# !pip install -r ../requirements.txt

In [3]:
import os
import pandas as pd
from math import ceil

In [5]:
def split_csv(input_file: str, output_prefix: str, max_size_mb: float = 4) -> None:
    """
    Split a CSV file into chunks of approximately max_size_mb each.

    The number of chunks is estimated from the on-disk size of the input and
    the data rows are then divided evenly between them, so an individual chunk
    can end up somewhat above or below max_size_mb. Every chunk repeats the
    header row. Cells are read as plain text, so values are not reinterpreted
    as numbers or missing values (e.g. "007" and "NA" are kept as written).

    If the input already fits within max_size_mb, or contains no data rows,
    a message is printed and no files are written.

    Args:
        input_file (str): Path to input CSV file
        output_prefix (str): Prefix for output files; chunks are written to
            "<output_prefix>_<n>.csv" with n starting at 1
        max_size_mb (float): Maximum size of each chunk in MB

    Raises:
        ValueError: If max_size_mb is not greater than zero
    """
    # Reject sizes that would otherwise surface as a ZeroDivisionError (0) or
    # silently produce no output at all (negative values).
    if max_size_mb <= 0:
        raise ValueError(f"max_size_mb must be greater than 0, got {max_size_mb}")

    # Convert MB to bytes
    max_size_bytes = max_size_mb * 1024 * 1024

    # Get the total size of the input file
    file_size = os.path.getsize(input_file)

    if file_size <= max_size_bytes:
        print(
            f"File size ({file_size / 1024 / 1024:.2f} MB) is less than or equal to {max_size_mb} MB. No splitting needed."
        )
        return

    # Calculate approximate number of chunks needed
    num_chunks = ceil(file_size / max_size_bytes)

    # Read every cell as text and disable NA detection so the chunks carry the
    # same values as the input instead of pandas' inferred representation.
    df = pd.read_csv(input_file, dtype=str, na_filter=False)
    total_rows = len(df)

    # Without data rows rows_per_chunk would be 0, which range() rejects.
    if total_rows == 0:
        print(f"No data rows found in {input_file}. Nothing to split.")
        return

    # Calculate approximate rows per chunk
    rows_per_chunk = ceil(total_rows / num_chunks)

    # Split the dataframe and save chunks, numbering the output files from 1
    chunk_starts = range(0, total_rows, rows_per_chunk)
    for chunk_number, start in enumerate(chunk_starts, start=1):
        chunk = df.iloc[start : start + rows_per_chunk]
        output_file = f"{output_prefix}_{chunk_number}.csv"
        chunk.to_csv(output_file, index=False)
        chunk_size = os.path.getsize(output_file) / 1024 / 1024
        print(f"Created {output_file} ({chunk_size:.2f} MB)")

In [9]:
# Split the mock dataset using the default max_size_mb (4); chunks are written
# next to the notebook as final_mock_data_<n>.csv.
split_csv("final_mock_data.csv", "final_mock_data")

Created final_mock_data_1.csv (2.62 MB)
Created final_mock_data_2.csv (2.70 MB)
Created final_mock_data_3.csv (2.78 MB)
