In [2]:
import pandas as pd
import pyarrow.parquet as pq
import os

In [16]:
# Load the sleep event annotations from CSV and preview the first five rows.
events = pd.read_csv(filepath_or_buffer="data/train_events.csv")
events.head(n=5)

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [8]:
# Parquet file that will be split into smaller pieces.
input_path = "dataOutput/chunk_0.parquet"
# Directory that receives the split files.
# NOTE(review): the directory is named "500mb" but the target size below is
# 100 MiB -- confirm which of the two is intended.
output_dir = "dataOutput/500mb"
# Target size of each output chunk, in bytes.
chunk_size = 100 * 1024 * 1024  # 100 MiB in bytes

In [9]:
# Split the Parquet file at `input_path` into several smaller Parquet files
# under `output_dir`, each holding a whole number of consecutive row groups.

# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Open the Parquet file (only metadata is read at this point)
parquet_file = pq.ParquetFile(input_path)
num_row_groups = parquet_file.num_row_groups

# Estimate how many row groups fit into one chunk from the size of the first
# row group. total_byte_size is the uncompressed column data size, so the
# files written to disk will usually be smaller than chunk_size.
# A file without row groups has nothing to measure (and nothing to write).
if num_row_groups > 0:
    row_group_size = parquet_file.metadata.row_group(0).total_byte_size
else:
    row_group_size = 0

# NOTE: despite its name, rows_per_chunk counts ROW GROUPS per output file,
# not rows. It is always at least 1 so an oversized (or zero-byte) first row
# group still makes progress instead of dividing by zero or stalling.
if row_group_size > 0:
    rows_per_chunk = max(1, chunk_size // row_group_size)
else:
    rows_per_chunk = 1

# Walk the row groups in strides of rows_per_chunk; enumerate provides the
# sequential number used in the output file name.
for chunk_index, first_group in enumerate(range(0, num_row_groups, rows_per_chunk)):
    # The last stride may be shorter than rows_per_chunk.
    last_group = min(first_group + rows_per_chunk, num_row_groups)
    # Read all row groups of this chunk as one table. Converting once avoids
    # pd.concat of per-group frames, which (without ignore_index) stacks each
    # frame's own index so labels can repeat across row groups.
    chunk = parquet_file.read_row_groups(list(range(first_group, last_group))).to_pandas()
    output_path = os.path.join(output_dir, f'chunk_{chunk_index}.parquet')
    chunk.to_parquet(output_path)
    print(f"Saved chunk {chunk_index} to {output_path}")

Saved chunk 0 to dataOutput/500mb\chunk_0.parquet
Saved chunk 1 to dataOutput/500mb\chunk_1.parquet
Saved chunk 2 to dataOutput/500mb\chunk_2.parquet
Saved chunk 3 to dataOutput/500mb\chunk_3.parquet
Saved chunk 4 to dataOutput/500mb\chunk_4.parquet
Saved chunk 5 to dataOutput/500mb\chunk_5.parquet
Saved chunk 6 to dataOutput/500mb\chunk_6.parquet
Saved chunk 7 to dataOutput/500mb\chunk_7.parquet
Saved chunk 8 to dataOutput/500mb\chunk_8.parquet
Saved chunk 9 to dataOutput/500mb\chunk_9.parquet


In [4]:
# Load the series chunk with the fastparquet engine and preview the first five rows.
series = pd.read_parquet(path="dataOutput/chunk_0.parquet", engine="fastparquet")
series.head(n=5)

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215
