In [15]:
import pandas as pd
import pyarrow.parquet as pq
import os

In [16]:
events = pd.read_csv('data/train_events.csv')
events.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [17]:
input_path = "data/train_series.parquet"
output_dir = "dataOutput/500mb"
chunk_size = 1000 * 1024 * 1024  # 100 MB in bytes

In [18]:
# Ensure the output directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Open the Parquet file
parquet_file = pq.ParquetFile(input_path)

# Calculate the number of rows to achieve the desired chunk size
row_group_size = parquet_file.metadata.row_group(0).total_byte_size
rows_per_chunk = chunk_size // row_group_size if row_group_size < chunk_size else 1

# Since the logic is now based on row groups, we need to adjust the loop to iterate over row groups
num_row_groups = parquet_file.num_row_groups

def apply_sliding_window(df):
    N = 10
    df['anglez_mean'] = df['anglez'].rolling(window=N).mean()
    df['anglez_std'] = df['anglez'].rolling(window=N).std()
    df['enmo_mean'] = df['enmo'].rolling(window=N).mean()
    df['enmo_std'] = df['enmo'].rolling(window=N).std()
    return df

# Iterate over the file in chunks
for i in range(0, num_row_groups, rows_per_chunk):
    # We might end up reading multiple row groups if they are smaller than the desired chunk size
    dataframes = [parquet_file.read_row_group(j).to_pandas() for j in range(i, min(i + rows_per_chunk, num_row_groups))]
    chunk = pd.concat(dataframes)
    chunk_sliding_window_applied = apply_sliding_window(chunk)
    output_path = os.path.join(output_dir, f'chunk_{i//rows_per_chunk}.parquet')
    chunk_sliding_window_applied.to_parquet(output_path)
    print(f"Saved chunk {i//rows_per_chunk} to {output_path}")

Saved chunk 0 to dataOutput/500mb\chunk_0.parquet
Saved chunk 1 to dataOutput/500mb\chunk_1.parquet
Saved chunk 2 to dataOutput/500mb\chunk_2.parquet
Saved chunk 3 to dataOutput/500mb\chunk_3.parquet
Saved chunk 4 to dataOutput/500mb\chunk_4.parquet
Saved chunk 5 to dataOutput/500mb\chunk_5.parquet
Saved chunk 6 to dataOutput/500mb\chunk_6.parquet


In [21]:
series = pd.read_parquet("dataOutput/500mb/chunk_2.parquet", engine='fastparquet')
series.head(100)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,anglez_mean,anglez_std,enmo_mean,enmo_std
0,1e6717d93c1d,461670,2019-01-14T04:42:30-0500,20.241501,0.0006,,,,
1,1e6717d93c1d,461671,2019-01-14T04:42:35-0500,20.241501,0.0006,,,,
2,1e6717d93c1d,461672,2019-01-14T04:42:40-0500,20.241501,0.0006,,,,
3,1e6717d93c1d,461673,2019-01-14T04:42:45-0500,20.241501,0.0006,,,,
4,1e6717d93c1d,461674,2019-01-14T04:42:50-0500,20.241501,0.0006,,,,
...,...,...,...,...,...,...,...,...,...
95,1e6717d93c1d,461765,2019-01-14T04:50:25-0500,8.686600,0.0003,8.6866,0.0,0.0003,0.0
96,1e6717d93c1d,461766,2019-01-14T04:50:30-0500,8.686600,0.0003,8.6866,0.0,0.0003,0.0
97,1e6717d93c1d,461767,2019-01-14T04:50:35-0500,8.686600,0.0003,8.6866,0.0,0.0003,0.0
98,1e6717d93c1d,461768,2019-01-14T04:50:40-0500,8.686600,0.0003,8.6866,0.0,0.0003,0.0
