In [8]:
import pathlib
import typing

import dask
import dask.array
import dask.dataframe
import dask.bag
import polyflexmd.data_analysis.data.read as read
import polyflexmd.data_analysis.data.constants as constants
import polyflexmd.data_analysis.transform.transform as transform
import polyflexmd.data_analysis.pipelines.trajectory

import pandas as pd

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
system = read.read_lammps_system_data(pathlib.Path(
    "/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/initial_system.data"))

In [12]:
p = "/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=2/j_d_end=1/polymer-2-1.out"

In [10]:
def read_lammps_custom_trajectory_file_generator(
        path: pathlib.Path,
        column_types: dict[str, typing.Any]
) -> typing.Generator[tuple[list[str], typing.Generator[list[typing.Any], None, None]], None, None]:
    df_bag = dask.bag.read_text(str(path)).to_dataframe()

    #df_bag = df_bag.dropna().map(pd.Series).to_bag()

    def process_timestep(df):
        timestep = df.iloc[1][0]
        particles_n = df.iloc[3][0]
        columns = df.iloc[5][0].split()[2:]
        columns_n = len(columns)

        if particles_n == 0 or columns_n == 0:
            raise StopIteration

        header = ["t", *columns]
        data = df.iloc[6:].apply(lambda x: x.str.split(), axis=0).to_records(index=False, column_dtypes=column_types)
        yield header, data

    for _, df in df_bag.groupby(df_bag[0].str.contains("ITEM: TIMESTEP").cumsum()):
        yield from process_timestep(df)

In [11]:
next(read_lammps_custom_trajectory_file_generator(path=pathlib.Path(p),
                                                  column_types=constants.RAW_TRAJECTORY_DF_COLUMN_TYPES
                                                  ))

NotImplementedError: Iteration of DataFrameGroupBy objects requires computing the groups which may be slow. You probably want to use 'apply' to execute a function for all the columns. To access individual groups, use 'get_group'. To list all the group names, use 'df[<group column>].unique().compute()'.

t      NaN
id     NaN
type   NaN
x      NaN
y      NaN
z      NaN
ix     NaN
iy     NaN
iz     NaN
dtype: float64

In [147]:
@dask.delayed
def process_timestep(df):
    timestep = df.iloc[1][0]
    columns = df.iloc[8][0].split()[2:]

    header = ["t", *columns]
    rows = []
    for _, row in df.iloc[9:].to_records(index=True):
        values = row.split()
        values.insert(0, timestep)
        rows.append(values)

    return pd.DataFrame(rows, columns=header).astype(constants.RAW_TRAJECTORY_DF_COLUMN_TYPES)


def read(path: pathlib.Path) -> dask.dataframe.DataFrame:
    df_bag = dask.bag.read_text(path, linedelimiter="\n").to_dataframe(columns=["row"])
    columns = df_bag.loc[df_bag["row"].str.contains("ITEM: ATOMS")].head(1).iloc[0]["row"].split()[2:]
    columns.insert(0, "t")
    return df_bag.groupby(df_bag["row"].str.contains("ITEM: TIMESTEP").cumsum()).apply(
        process_timestep,
        meta=pd.DataFrame(
            columns=columns,
        ).astype(constants.RAW_TRAJECTORY_DF_COLUMN_TYPES)
    ).reset_index(drop=True)

read(p).head(10)

Unnamed: 0,t,id,type,x,y,z,ix,iy,iz
0,1000000,1,1,0.0,0.0,0.0,0,0,0
1,1000000,2,1,-0.159927,-0.017451,-0.956566,0,0,0
2,1000000,3,2,0.394319,-0.204321,-1.71753,0,0,0
3,1000000,4,2,0.864422,-0.537296,-2.51628,0,0,0
4,1000000,5,2,1.67989,-0.475385,-3.00902,0,0,0
5,1000000,6,2,2.57993,-0.22791,-3.06611,0,0,0
6,1000000,7,2,3.45689,-0.431163,-3.46151,0,0,0
7,1000000,8,2,4.38175,-0.734328,-3.47958,0,0,0
8,1000000,9,2,5.0838,-1.06759,-2.88169,0,0,0
9,1000000,10,2,5.70817,-1.31065,-2.21273,0,0,0


In [8]:
kappas = [1.0 + i * 5 for i in range(2)]
d_ends = [1.2 + i * 0.2 for i in range(3)]

df_trajectories = polyflexmd.data_analysis.pipelines.trajectory.read_and_process_trajectories(
    trajectories=polyflexmd.data_analysis.data.read.get_experiment_trajectories_paths(
        experiment_raw_data_path=pathlib.Path(
            "/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw"),
        style="l_K+d_end",
        kappas=kappas,
        d_ends=d_ends,
        read_relax=True
    ),
    system=system
)

df_trajectories

Reading paths [PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=1/polymer_relax-1-1.out'), PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=1/polymer-1-1.out')] ...
Joining ...
Unfolding coordinates...
Reading paths [PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=2/polymer_relax-1-2.out'), PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=2/polymer-1-2.out')] ...
Joining ...
Unfolding coordinates...
Reading paths [PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=3/polymer_relax-1-3.out'), PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_

Unnamed: 0_level_0,t,id,type,x,y,z,molecule-ID,kappa,d_end
npartitions=672,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,uint64,uint16,uint8,float64,float64,float64,uint16,category[known],category[known]
,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


In [14]:
df_trajectories.to_csv(
    "/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/processed/trajectories.csv",
    single_file=True, index=False)

['/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/processed/trajectories.csv']

In [12]:
df_ete = transform.calc_end_to_end_df(
    df_trajectories,
    group_by_params=["kappa", "d_end"],
    parallel=False
)
df_ete

ValueError: Metadata inference failed in `groupby.apply(lambda)`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
IndexError('single positional indexer is out-of-bounds')

Traceback:
---------
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/dask/dataframe/utils.py", line 193, in raise_on_meta_error
    yield
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/dask/dataframe/groupby.py", line 2489, in apply
    meta = self._meta_nonempty.apply(func, *meta_args, **meta_kwargs)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/groupby/groupby.py", line 1353, in apply
    result = self._python_apply_general(f, self._selected_obj)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/groupby/groupby.py", line 1402, in _python_apply_general
    values, mutated = self.grouper.apply(f, data, self.axis)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/groupby/ops.py", line 767, in apply
    res = f(group)
  File "/home/egor/Projects/polyflexmd/src/polyflexmd/data_analysis/transform/transform.py", line 102, in <lambda>
    return gb.apply(lambda dfg: dfg.groupby(["t"]).apply(calculate_end_to_end))
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/groupby/groupby.py", line 1353, in apply
    result = self._python_apply_general(f, self._selected_obj)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/groupby/groupby.py", line 1402, in _python_apply_general
    values, mutated = self.grouper.apply(f, data, self.axis)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/groupby/ops.py", line 767, in apply
    res = f(group)
  File "/home/egor/Projects/polyflexmd/src/polyflexmd/data_analysis/transform/transform.py", line 66, in calculate_end_to_end
    leaf_atom_data: pd.Series = molecule_traj_step_df_unf \
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/indexing.py", line 1103, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/indexing.py", line 1656, in _getitem_axis
    self._validate_integer(key, axis)
  File "/home/egor/Projects/polyflexmd/.venv/lib/python3.10/site-packages/pandas/core/indexing.py", line 1589, in _validate_integer
    raise IndexError("single positional indexer is out-of-bounds")


In [10]:
df_trajectory_unfolded = transform.unfold_coordinates_df(
    trajectory_df=transform.join_raw_trajectory_df_with_system_data(
        raw_trajectory_df=read.read_multiple_raw_trajectory_dfs([
            pathlib.Path(
                "/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=1/polymer_relax-1-1.out"),
            pathlib.Path(
                "/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=1/polymer-1-1.out")
        ]),
        system_data=system
    ),
    system_data=system
)
df_trajectory_unfolded

Reading paths [PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=1/polymer_relax-1-1.out'), PosixPath('/home/egor/Projects/polyflexmd/data/test-5-FENE-beadspring-vary-l_K-vary-d_end/e296c212/data/raw/i_kappa=1/j_d_end=1/polymer-1-1.out')] ...
Joining ...
Unfolding coordinates...


Unnamed: 0_level_0,t,id,type,x,y,z,ix,iy,iz,molecule-ID
npartitions=19152,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,uint64,uint16,uint8,float64,float64,float64,int16,int16,int16,uint16
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
