In [88]:
import pandas as pd
from utils import get_dirs, get_files
from os.path import join
from tqdm import tqdm

In [89]:
def read_metadata(path):
    """Read a metadata workbook and return only its PDX rows.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file; handed straight to ``pd.read_excel``
        together with ``skiprows=1`` and ``header=1``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``'Model Type'`` column equals ``"PDX"``. The index
        labels of the surviving rows are left as read from the sheet.

    Raises
    ------
    KeyError
        If the sheet has no ``'Model Type'`` column.
    """
    sheet = pd.read_excel(path, skiprows=1, header=1)
    is_pdx = sheet['Model Type'] == "PDX"
    return sheet[is_pdx]

def pdxnet_model_ids(df):
    """Derive a normalised ``model_id`` column, one row per model id.

    The ``'ContributorPDX.ID'`` value of every row is split on ``,`` and
    ``;`` and the frame is exploded so that each id gets its own row. Each id
    is then cleaned up:

    * a trailing ``" (...)"`` annotation (whitespace followed by a
      parenthesised group) is removed,
    * ``"HCI0"`` is rewritten to ``"HCI-0"``,
    * all space characters are removed,
    * for rows whose ``'Contributor'`` is ``'BCM'``, purely numeric ids get a
      ``'BCM-'`` prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string column ``'ContributorPDX.ID'`` and the column
        ``'Contributor'``. The frame is **not** modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index and the extra (or replaced)
        ``'model_id'`` column.
    """
    id_col = 'model_id'

    # Build the result on a copy (``assign`` never touches its input) so the
    # caller's frame does not silently gain a list-valued ``model_id`` column.
    split_ids = df['ContributorPDX.ID'].str.split('[,;]')
    result = df.assign(**{id_col: split_ids}).explode(id_col).reset_index(drop=True)

    result[id_col] = (
        result[id_col]
        # Raw string: ``\s`` and ``\(`` are regex escapes, not Python ones.
        .str.replace(r'\s+\(.*\)', '', regex=True)
        # Literal replacements; ``regex=False`` keeps them literal on every
        # pandas version regardless of the changing default.
        .str.replace("HCI0", "HCI-0", regex=False)
        .str.replace(" ", '', regex=False)
    )

    # ``astype(str)`` turns missing ids into 'nan', which is not numeric and
    # therefore never receives the prefix.
    ids_as_text = result[id_col].astype(str)
    needs_prefix = (
        (result['Contributor'] == 'BCM')
        & ~ids_as_text.str.startswith('BCM-')
        & ids_as_text.str.isnumeric()
    )
    result.loc[needs_prefix, id_col] = 'BCM-' + ids_as_text[needs_prefix]
    return result

In [90]:
# Input locations for the PDXNet submission. Both entries are absolute paths
# on one specific machine and have to be edited before running elsewhere.
files = dict(
    # CSV export that is loaded with pd.read_csv.
    raw_all='/Users/tushar/CancerModels/submission/PDXNet/raw/PDXNet Portal - Seven Bridges.csv',
    # Directory that is scanned for per-provider sub-directories.
    raw_dir='/Users/tushar/CancerModels/submission/PDXNet/raw/',
)

In [91]:
# Provider names are taken from the entries get_dirs() reports for the raw
# directory; each one is also matched against the 'Contributor' column below.
# NOTE(review): presumably get_dirs returns bare sub-directory names -- confirm
# against utils.
providers = get_dirs(files['raw_dir'])

# Portal export with one row per model id; the 'Unnamed: 0' column is dropped
# before the ids are normalised.
raw_data = pdxnet_model_ids(pd.read_csv(files['raw_all']).drop('Unnamed: 0', axis=1))

for provider in providers:
    p_path = join(files['raw_dir'], provider)
    raw_files = [f for f in get_files(p_path) if f.endswith('.xlsx')]

    # The portal rows of a provider are the same for all of its workbooks, so
    # select them once per provider instead of once per file.
    temp = raw_data[raw_data['Contributor'] == provider]

    for f in tqdm(raw_files, desc=f'For {provider} processing {len(raw_files)} excel files'):
        df = read_metadata(join(p_path, f))
        # Outer join keeps models that appear on only one side, so rows
        # missing from either source stay visible in the merged file.
        df = df.merge(temp, left_on='Model ID', right_on='model_id', how='outer')
        # Written next to the workbook as '<workbook name>-merged.tsv'.
        df.to_csv(join(p_path, f + '-merged.tsv'), sep='\t', index=False)

For HCI processing 1 excel files: 100%|██████████| 1/1 [00:00<00:00, 17.21it/s]
  warn(msg)
For WUSTL processing 1 excel files: 100%|██████████| 1/1 [00:00<00:00,  9.33it/s]
For MDACC processing 2 excel files: 100%|██████████| 2/2 [00:00<00:00, 13.94it/s]
For BCM processing 1 excel files: 100%|██████████| 1/1 [00:00<00:00, 12.38it/s]
For WISTAR processing 1 excel files: 100%|██████████| 1/1 [00:00<00:00, 14.49it/s]
