In [1]:
# CSV to Parquet Conversion (Iterative)
**Dataset:** ~1,890 samples, stored as 189 CSV batch files of 10 samples each.
**Dimensions:** ~480k Columns (CpG Beta Values).
**Optimization:** Using `float32` for memory efficiency and `PyArrow` for iterative writing.

SyntaxError: invalid decimal literal (1681375674.py, line 3)

In [4]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import glob
import os
import time

# --- CONFIGURATION ---
# Glob pattern matching every CSV batch file.
# The default is the original machine-specific folder; set the CGP_CSV_GLOB
# environment variable to point the notebook at another location without
# editing this cell.
input_path = os.environ.get(
    "CGP_CSV_GLOB",
    "C:/Users/Stuti/Desktop/Projects/CGP/sample_batches/*.csv",
)

# Output filename (resolved relative to the kernel's working directory)
output_file = "genomics_data.parquet"

# glob returns matches in arbitrary order; sort for a reproducible run.
# NOTE: the sort is lexicographic, so e.g. "batch_10" sorts before "batch_2".
csv_files = sorted(glob.glob(input_path))

print(f"Target: {output_file}")
print(f"Found {len(csv_files)} CSV files to process.")

# Fail loudly (but without raising) when the pattern matches nothing, so a
# wrong path is noticed here rather than after the conversion step.
if not csv_files:
    print(f"WARNING: no files matched {input_path!r}; check the path before converting.")

Target: genomics_data.parquet
Found 189 CSV files to process.


In [None]:
# Stream every CSV batch into a single Parquet file, one write per CSV.
# Relies on `csv_files` and `output_file` from the configuration cell and on
# `pd`, `pa`, `pq` and `time` from the import cell.

# Initialize variables
parquet_writer = None
schema = None
expected_names = None   # column order fixed by the first file
files_written = 0       # number of CSVs successfully appended
conversion_ok = True    # set to False when a file fails and the loop stops
start_time = time.time()

print("Starting conversion...")

try:
    for i, file in enumerate(csv_files):
        try:
            # 1. Read CSV with the C parser
            df = pd.read_csv(file, engine='c')

            # 2. Optimization: downcast float columns to float32.
            # CpG Beta values (0-1) do not need 64-bit precision, and
            # float32 halves the in-memory size of those columns.
            fcols = df.select_dtypes('float').columns
            df[fcols] = df[fcols].astype('float32')

            # 3. Convert to PyArrow Table
            table = pa.Table.from_pandas(df)

            # 4. Initialize Writer (only on the first file); the first
            # file's schema becomes the schema of the whole output.
            if parquet_writer is None:
                schema = table.schema
                expected_names = schema.names
                parquet_writer = pq.ParquetWriter(output_file, schema, compression='zstd')

            # 5. Schema alignment.
            # Table.cast() does NOT reorder columns: it requires the field
            # names to match the target schema in order. So reorder by name
            # first, and refuse files whose column set actually differs
            # instead of silently dropping or inventing columns.
            actual_names = table.schema.names
            if actual_names != expected_names:
                missing = set(expected_names) - set(actual_names)
                extra = set(actual_names) - set(expected_names)
                if missing or extra:
                    raise ValueError(
                        "column set differs from the first file "
                        f"({len(missing)} missing, e.g. {sorted(missing)[:5]}; "
                        f"{len(extra)} unexpected, e.g. {sorted(extra)[:5]})"
                    )
                table = table.select(expected_names)

            # Same columns in the same order, but possibly different dtypes
            # (e.g. a column parsed as int64 in this batch): cast to match.
            if not table.schema.equals(schema):
                table = table.cast(schema)

            # 6. Write to Parquet
            parquet_writer.write_table(table)
            files_written += 1

            # Optional: Print progress every 10 files to keep log clean
            if (i + 1) % 10 == 0:
                print(f"Processed {i+1}/{len(csv_files)} files...")

        except Exception as e:
            print(f"Error processing file {file}: {e}")
            # Stop at the first bad file: better to fix the data issue than
            # to produce an output that silently lacks some batches.
            conversion_ok = False
            break
finally:
    # 7. Close Writer.
    # Done in `finally` so the Parquet footer is written even when the run
    # is interrupted (e.g. KeyboardInterrupt, which `except Exception` does
    # not catch); without the footer the file is unreadable.
    if parquet_writer is not None:
        parquet_writer.close()

end_time = time.time()
duration = (end_time - start_time) / 60

# Report what actually happened instead of unconditionally claiming success.
if not conversion_ok:
    print(f"--- ABORTED after {files_written}/{len(csv_files)} files ---")
elif files_written == 0:
    print("--- NOTHING TO DO: no CSV files were found ---")
else:
    print("--- DONE ---")
print(f"Total time: {duration:.2f} minutes")
if parquet_writer is None:
    print("No output file was written.")
elif conversion_ok:
    print(f"File saved as: {output_file}")
else:
    print(f"Partial file saved as: {output_file}")

Starting conversion...


In [None]:
# Sanity-check the Parquet file written by the conversion step.
# Uses `output_file`, `pq` and `pd` from earlier cells.
import os

# On-disk size in GB (1 GB taken as 1024**3 bytes)
bytes_per_gb = 1024 ** 3
file_size = os.path.getsize(output_file) / bytes_per_gb
print(f"Final File Size: {file_size:.2f} GB")

# Open the file to inspect its metadata only (fast)
parquet_file = pq.ParquetFile(output_file)
file_metadata = parquet_file.metadata
report = (
    ("Total Rows", file_metadata.num_rows),
    ("Total Columns", file_metadata.num_columns),
    ("Row Groups (Batches)", parquet_file.num_row_groups),
)
for label, value in report:
    print(f"{label}: {value}")

# Optional preview: first 5 rows of the first 5 columns.
# Passing an explicit column list avoids loading every column into RAM.
all_column_names = parquet_file.schema.names
first_few_cols = all_column_names[:5]
preview_frame = pd.read_parquet(output_file, columns=first_few_cols)
subset = preview_frame.head()
print("\nFirst 5 rows (subset):")
print(subset)