In [1]:
import gc
import pathlib

import pandas as pd
import tqdm

In [2]:
# Locate the jump data directory.
# NOTE: this is a machine-specific absolute path. resolve(strict=True) makes the
# notebook fail right here (FileNotFoundError) if the directory does not exist,
# instead of silently producing an empty file list further down.
jump_data_path = pathlib.Path("/home/lippincm/Desktop/18TB/normalized_sc_data").resolve(
    strict=True
)

# Get the list of parquet files in the directory that are not aggregated.
# The "agg" check is applied to the file name only: testing the full path would
# exclude every file if any parent directory happened to contain "agg".
# Sorting makes the processing order reproducible, since glob order depends on
# the filesystem.
jump_data_files = sorted(
    x for x in jump_data_path.glob("*.parquet") if "agg" not in x.name
)

In [3]:
# Loop through the files and aggregate each one to a single mean profile per well.
for file in tqdm.tqdm(jump_data_files):
    # read file
    jump_df = pd.read_parquet(file)
    # the aggregate is written next to its source, with "_agg" appended to the stem
    agg_file_name = file.parent / f"{file.stem}_agg.parquet"

    # separate the columns by name: anything containing "Metadata" is metadata,
    # every other column is treated as a feature
    is_metadata = jump_df.columns.str.contains("Metadata")
    metadata_columns = jump_df.columns[is_metadata].tolist()
    feature_columns = jump_df.columns[~is_metadata].tolist()

    # aggregate the data: mean of every feature column per well.
    # Grouping the original frame and selecting the feature columns avoids
    # building a second full copy of the feature matrix just to attach the key.
    agg_df = jump_df.groupby("Metadata_Well")[feature_columns].mean()

    # keep one metadata row per well (drop_duplicates keeps the first occurrence)
    # NOTE(review): metadata columns that vary within a well will carry the value
    # of the first row of that well - confirm this is intended.
    metadata = jump_df[metadata_columns].drop_duplicates(subset="Metadata_Well")

    # add the metadata back to the aggregated data
    agg_df = agg_df.merge(metadata, on="Metadata_Well", how="left")

    # save the aggregated data
    agg_df.to_parquet(agg_file_name)
    print(agg_df.shape)

    # release the large frames before the next file is loaded
    del jump_df, agg_df, metadata
    gc.collect()

  2%|▏         | 1/51 [00:58<49:03, 58.88s/it]

(384, 5810)


  4%|▍         | 2/51 [02:44<1:10:18, 86.08s/it]

(384, 5810)


  6%|▌         | 3/51 [05:56<1:47:38, 134.56s/it]

(384, 5810)


  8%|▊         | 4/51 [07:00<1:23:32, 106.64s/it]

(384, 5810)


 10%|▉         | 5/51 [08:14<1:12:47, 94.95s/it] 

(384, 5806)


 12%|█▏        | 6/51 [09:20<1:03:49, 85.10s/it]

(384, 5810)
(384, 5806)


 16%|█▌        | 8/51 [11:55<57:26, 80.14s/it]  

(384, 5806)
(384, 5810)


 20%|█▉        | 10/51 [18:20<1:39:36, 145.76s/it]

(384, 5810)


 22%|██▏       | 11/51 [19:33<1:22:13, 123.35s/it]

(384, 5804)


 24%|██▎       | 12/51 [20:30<1:07:03, 103.18s/it]

(384, 5810)
(383, 5810)


 27%|██▋       | 14/51 [24:33<1:10:32, 114.39s/it]

(384, 5810)


 29%|██▉       | 15/51 [25:24<57:10, 95.28s/it]   

(384, 5806)


 31%|███▏      | 16/51 [26:11<47:12, 80.92s/it]

(384, 5806)


 33%|███▎      | 17/51 [27:36<46:27, 81.99s/it]

(384, 5806)


 35%|███▌      | 18/51 [28:54<44:24, 80.75s/it]

(384, 5804)
(384, 5806)


 39%|███▉      | 20/51 [31:21<39:13, 75.92s/it]

(384, 5810)


 41%|████      | 21/51 [33:01<41:33, 83.11s/it]

(384, 5806)
(384, 5810)


 43%|████▎     | 22/51 [34:56<44:51, 92.81s/it]

(383, 5810)


 47%|████▋     | 24/51 [37:47<39:25, 87.59s/it]

(383, 5806)


 49%|████▉     | 25/51 [38:46<34:17, 79.13s/it]

(384, 5806)


 51%|█████     | 26/51 [40:39<37:12, 89.28s/it]

(384, 5804)


 53%|█████▎    | 27/51 [42:03<35:04, 87.69s/it]

(384, 5806)


 55%|█████▍    | 28/51 [42:55<29:25, 76.78s/it]

(384, 5804)


 57%|█████▋    | 29/51 [44:17<28:46, 78.49s/it]

(384, 5806)


 59%|█████▉    | 30/51 [45:54<29:21, 83.89s/it]

(384, 5810)


 61%|██████    | 31/51 [47:11<27:19, 81.99s/it]

(384, 5804)


 63%|██████▎   | 32/51 [48:13<24:02, 75.91s/it]

(384, 5804)


 65%|██████▍   | 33/51 [49:56<25:10, 83.93s/it]

(384, 5810)


 67%|██████▋   | 34/51 [51:13<23:14, 82.04s/it]

(384, 5806)


 69%|██████▊   | 35/51 [51:52<18:25, 69.08s/it]

(384, 5806)


 71%|███████   | 36/51 [52:59<17:07, 68.49s/it]

(384, 5806)


 73%|███████▎  | 37/51 [54:19<16:47, 71.94s/it]

(384, 5804)


 75%|███████▍  | 38/51 [56:18<18:39, 86.13s/it]

(384, 5810)


 76%|███████▋  | 39/51 [57:26<16:08, 80.68s/it]

(384, 5806)


 78%|███████▊  | 40/51 [58:27<13:41, 74.68s/it]

(384, 5810)
(384, 5810)


 80%|████████  | 41/51 [1:00:17<14:12, 85.24s/it]

(384, 5804)


 84%|████████▍ | 43/51 [1:02:31<09:59, 74.90s/it]

(384, 5806)


 86%|████████▋ | 44/51 [1:03:09<07:27, 63.87s/it]

(384, 5806)
(384, 5810)


 90%|█████████ | 46/51 [1:08:00<08:07, 97.41s/it] 

(384, 5810)


 92%|█████████▏| 47/51 [1:08:57<05:41, 85.41s/it]

(384, 5810)


 94%|█████████▍| 48/51 [1:09:58<03:53, 77.95s/it]

(384, 5804)


 96%|█████████▌| 49/51 [1:10:57<02:24, 72.13s/it]

(384, 5810)


 98%|█████████▊| 50/51 [1:11:39<01:03, 63.32s/it]

(301, 5810)


100%|██████████| 51/51 [1:15:07<00:00, 88.37s/it] 

(384, 5810)



