In [1]:
import pandas as pd
from typing import Sequence, Dict

from diskos.sender_specific import ANAL_ID_COL
from diskos.utils import split_on_last_occurrence

In [2]:
# Demo source frame: a single row can carry several analysis column sets side
# by side. Sets after the first are distinguished by a numeric suffix (_2, _3).
# There must be exactly one column name per value in each row below.
src_cols = [
    'AnalID', 'ColA', 'ColN',
    'AnalID_2', 'ColA_2',
    'AnalID_3', 'ColA_3', 'ColN_3',
]
src_vals = [
    # all sets of cols in one row in src data
    ['001WXA', 0.05, ''  , '001WYA', 1.35 , ''      , ''  , ''  ],
    ['002WXA', ''  , 0.15, ''      , ''   , '002WZA', 2.31, 2.32],
]
df_src = pd.DataFrame(src_vals, columns=src_cols)

In [3]:
def build_column_groups(df: pd.DataFrame) -> Sequence[Sequence[str]]:
    """Partition the frame's column names into consecutive groups.

    Columns are scanned left to right. A column whose name starts with
    ANAL_ID_COL opens a new group; every other column is appended to the
    group that was opened most recently.

    Args:
        df: frame whose column names are grouped. Names must be strings,
            because ``startswith`` is called on each of them.

    Returns:
        A list of groups, each a list of column names in original order.

    Raises:
        IndexError: if a column that does not start with ANAL_ID_COL appears
            before any group has been opened.
    """
    groups = []
    for name in df.columns:
        if not name.startswith(ANAL_ID_COL):
            # Not an ID column: it belongs to the latest group.
            groups[-1].append(name)
            continue
        groups.append([name])
    return groups

In [4]:
# Preview the grouping of the demo frame's columns: one list per column that
# starts with ANAL_ID_COL, followed by the columns up to the next such column.
build_column_groups(df_src)

[['AnalID', 'ColA', 'ColN'],
 ['AnalID_2', 'ColA_2'],
 ['AnalID_3', 'ColA_3', 'ColN_3']]

In [5]:
def build_col_map(df: pd.DataFrame) -> Dict[str, Sequence[str]]:
    """Map each column-set suffix to the column names belonging to that set.

    Columns are scanned left to right. For a column whose name starts with
    ANAL_ID_COL, the suffix is the second item returned by
    ``split_on_last_occurrence(name, '_')`` and a new entry is started under
    that suffix. Every other column is appended to the entry of the suffix
    seen most recently.

    Args:
        df: frame whose column names are mapped. Names must be strings.

    Returns:
        Dict of suffix -> list of column names, in column order. If the same
        suffix is produced by two ID columns, the later entry replaces the
        earlier one.

    Raises:
        KeyError: if a non-ID column appears before the first ID column
            (there is no entry yet for the initial '' suffix).
    """
    cols_by_suffix = {}
    active_suffix = ''
    for name in df.columns:
        if not name.startswith(ANAL_ID_COL):
            cols_by_suffix[active_suffix].append(name)
            continue
        # An ID column starts a new set; remember its suffix for what follows.
        _, active_suffix = split_on_last_occurrence(name, '_')
        cols_by_suffix[active_suffix] = [name]
    return cols_by_suffix

In [6]:
# Build the suffix -> column-names mapping for the demo frame and keep it in
# `col_map`; the bare name on the last line displays it as the cell output.
col_map = build_col_map(df_src)
col_map

{'': ['AnalID', 'ColA', 'ColN'],
 '2': ['AnalID_2', 'ColA_2'],
 '3': ['AnalID_3', 'ColA_3', 'ColN_3']}

In [7]:
# TODO: is col_map needed, or could the cleaned column names be derived directly?
def split_cols(df: pd.DataFrame, col_map: Dict[str, Sequence[str]]) -> Sequence[pd.DataFrame]:
    """Split a wide frame into one frame per column set, ready for row-wise concat.

    For every entry of ``col_map`` the listed columns are selected, renamed to
    the first item of ``split_on_last_occurrence(name, '_')`` so that all sets
    share column names, and rows whose ANAL_ID_COL value is the empty string
    are dropped. Afterwards every frame is given any column that some other
    frame has and it lacks, filled with empty strings.

    Args:
        df: source frame holding all column sets side by side. Not modified.
        col_map: suffix -> column names of that set (e.g. from build_col_map).
            Only the values are used.

    Returns:
        A list with one new DataFrame per entry of ``col_map``, in the same
        order. An empty ``col_map`` yields an empty list.

    Raises:
        KeyError: if a listed column is missing from ``df`` or a set has no
            column that is renamed to ANAL_ID_COL.
    """
    frames = []  # one frame per column set

    for cols in col_map.values():
        # Work on explicit copies: these frames are modified below, and writing
        # to a filtered slice would otherwise raise SettingWithCopyWarning.
        part = df[list(cols)].copy()
        # ensure matching col names ready for concat
        part.columns = [split_on_last_occurrence(c, '_')[0] for c in cols]
        # filter out empty rows
        part = part[part[ANAL_ID_COL] != ""].copy()
        frames.append(part)

    # Union of all column names in first-seen order, so every frame ends up
    # with the same set of columns regardless of which set is listed first.
    all_cols = list(dict.fromkeys(c for part in frames for c in part.columns))

    # fill any missing cols with empties (all dfs need to be same shape to concat rows)
    for part in frames:
        for col in all_cols:
            if col not in part.columns:
                part[col] = ""

    return frames

In [8]:
# Stack the per-set frames into one long frame with a fresh 0..n-1 row index.
pd.concat(split_cols(df_src, col_map), ignore_index=True)

Unnamed: 0,AnalID,ColA,ColN
0,001WXA,0.05,
1,002WXA,,0.15
2,001WYA,1.35,
3,002WZA,2.31,2.32
