In [2]:
%pip install pandas pyarrow

import pandas as pd
import os
import pyarrow.parquet as pq
import pyarrow as pa



Note: you may need to restart the kernel to use updated packages.


In [11]:
# Split Parquet File
# ------------------
# Pass in a Parquet file to split it into chunks of roughly 99 MB each (the default target size; actual chunk sizes vary).

def split_parquet(input_file, chunk_size_mb=99):
    """Split a Parquet file into smaller Parquet files of roughly ``chunk_size_mb`` each.

    The chunks are written as ``chunk_0.parquet``, ``chunk_1.parquet``, ... into a
    directory named after the input file (without its extension), created next to
    the input file. The number of chunks is estimated from the on-disk size of the
    input and the rows are divided evenly between them, so the size of an individual
    chunk is approximate rather than a hard limit. The whole input is loaded with
    ``pq.read_table`` before it is sliced.

    Args:
        input_file: Path to the Parquet file to split.
        chunk_size_mb: Target chunk size in MiB. Must be positive; fractional
            values are accepted.

    Raises:
        ValueError: If ``chunk_size_mb`` is not positive.
    """
    if chunk_size_mb <= 0:
        raise ValueError(f"chunk_size_mb must be positive, got {chunk_size_mb!r}")

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_dir = os.path.join(os.path.dirname(input_file), base_name)
    os.makedirs(output_dir, exist_ok=True)

    table = pq.read_table(input_file)
    num_rows = len(table)

    # Estimate how many chunks are needed from the input's size on disk.
    # int() keeps the count usable by range() when chunk_size_mb is a float.
    chunk_size_bytes = chunk_size_mb * 1024 * 1024
    num_chunks = int(os.path.getsize(input_file) // chunk_size_bytes) + 1

    # Never ask for more chunks than there are rows: otherwise the per-chunk row
    # count rounds down to 0 and every chunk except the last would be empty.
    # An empty table still produces a single (empty) chunk.
    num_chunks = max(1, min(num_chunks, num_rows))

    rows_per_chunk = num_rows // num_chunks

    for i in range(num_chunks):
        start_idx = i * rows_per_chunk
        # The last chunk also takes the rows left over by the integer division.
        end_idx = (i + 1) * rows_per_chunk if i < num_chunks - 1 else num_rows
        chunk_table = table.slice(start_idx, end_idx - start_idx)
        chunk_file = os.path.join(output_dir, f'chunk_{i}.parquet')
        pq.write_table(chunk_table, chunk_file)
        print(f"Saved {chunk_file} ({os.path.getsize(chunk_file) / (1024 * 1024):.2f} MB)")

# Provide the Parquet file to split. The chunks are written to a folder named
# after the file without its extension (here "CLEANED_V9.1/").
input_file = "CLEANED_V9.1.parquet"
split_parquet(input_file)


Saved CLEANED_V9.1/chunk_0.parquet (93.30 MB)
Saved CLEANED_V9.1/chunk_1.parquet (95.10 MB)
Saved CLEANED_V9.1/chunk_2.parquet (93.54 MB)


In [9]:
# Merge Parquet Files
# ------------------
# Pass in the folder containing the split files to merge them back into a single Parquet file.

def _chunk_sort_key(file_name):
    """Order file names by their text prefix, then by their trailing number."""
    stem = os.path.splitext(file_name)[0]
    prefix = stem.rstrip("0123456789")
    digits = stem[len(prefix):]
    # Names without a trailing number sort ahead of numbered ones sharing the prefix;
    # the raw name is the final tie-breaker so the order is fully deterministic.
    return (prefix, int(digits) if digits else -1, file_name)


def merge_parquet(input_folder, output_file):
    """Concatenate every ``.parquet`` file in ``input_folder`` into one Parquet file.

    Files are merged in numeric order of their trailing index, so
    ``chunk_2.parquet`` comes before ``chunk_10.parquet``. A plain lexicographic
    sort would place ``chunk_10`` ahead of ``chunk_2`` and scramble the row order
    once there are more than ten chunks.

    Args:
        input_folder: Directory containing the chunk files.
        output_file: Path of the merged Parquet file to write.

    Raises:
        ValueError: If ``input_folder`` contains no ``.parquet`` files.
    """
    chunk_names = sorted(
        (name for name in os.listdir(input_folder) if name.endswith('.parquet')),
        key=_chunk_sort_key,
    )
    if not chunk_names:
        # pd.concat([]) would raise a ValueError with a message that does not
        # mention the folder; fail early with the actual cause instead.
        raise ValueError(f"No .parquet files found in {input_folder}")

    dfs = [pd.read_parquet(os.path.join(input_folder, name)) for name in chunk_names]
    merged_df = pd.concat(dfs, ignore_index=True)
    merged_df.to_parquet(output_file, index=False)
    print(f"Merged file saved as {output_file}")

# Provide the folder where the chunks are stored and the path of the merged file to write.
# NOTE(review): this reads "CLEANED_V6.1", whereas the split cell in this notebook writes
# its chunks to "CLEANED_V9.1" — confirm the folder name before re-running.
input_folder = "CLEANED_V6.1"
output_merged_file = "CLEANED_V6.1_merged.parquet"
merge_parquet(input_folder, output_merged_file)


Merged file saved as CLEANED_V6.1_merged.parquet
