In [1]:
import pandas as pd
from tqdm import tqdm
from utils import read_metadata_without_fields, get_files, get_dirs, read_metadata_with_fields
from pandas import read_csv, DataFrame, concat
from os.path import join, dirname, exists
from time import ctime, time
from subprocess import run, check_output, CalledProcessError
from numpy import isnan
import logging

logging.basicConfig(filename='{}.log'.format('data_overview_file_generation'), filemode='a+', level=logging.DEBUG)

In [2]:
def get_git_tags():
    try:
        tags = check_output([f'cd {home}; git tag'], shell=True, text=True).splitlines()
        return [t for t in tags if t.__contains__('PDCM')]
    except CalledProcessError as e:
        print("Error:", e)
        return None

def checkout_and_run_function(tag, git_tags):
    try:
        logging.info(f"{ctime()}: Running pipeline for tag: {tag}")
        run([f'cd {home}; git checkout {tag}'], shell=True, check=True)
        mol_data, total_models, phenomics_dp, provider_country_df = get_mol_data(home, git_tags[tag])
        phenomics_dp.to_csv(f"phenomics_data_points_{str(git_tags[tag]).replace('PDCM_', '')}.csv", index=False)
        provider_country_df.to_csv(f"provider_country_{str(git_tags[tag]).replace('PDCM_', '')}.csv", index=False)
        mol_data.to_csv(f"{str(git_tags[tag]).replace('PDCM_', '')}.csv", index=False)
        total_models.to_csv(f"total_models_{str(git_tags[tag]).replace('PDCM_', '')}.csv", index=False)
    except CalledProcessError as e:
        print("Error:", e)

In [3]:
# Weights for fields that are common in all types of models
common_weights = {
    "sex": 1,
    "ethnicity": 0.5,
    "sample_id": 1,
    "age_in_years_at_collection": 1,
    "diagnosis": 1,
    "tumour_type": 1,
    "primary_site": 1,
    "collection_site": 0.5,
    "stage": 0.5,
    "grade": 0.5,
    "treated_at_collection": 0.5,
    "treated_prior_to_collection": 0.5,
    "validation_technique": 1,
    "description": 1,
}

# Weights for fields that only apply to PDX models
pdx_only_weights = {
    "host_strain_name": 1,
    "host_strain_nomenclature": 1,
    "engraftment_site": 1,
    "engraftment_type": 1,
    "engraftment_sample_type": 1,
    "engraftment_sample_state": 0.5,
    "passage_number": 1,
    "passages_tested": 1,
    "validation_host_strain_nomenclature": 1,
}

# Weights for fields that only apply to In Vitro models
in_vitro_only_weights = {
    "model_name": 1,
    "model_name_aliases": 0.5,
    "growth_properties": 1,
    "growth_media": 1,
    "media_id": 1,
    "plate_coating": 1,
    "other_plate_coating" :1,
    "passage_number": 1,
    "contaminated": 1,
    "contamination_details": 0.5,
    "supplements": 0.5,
    "drug": 0.5,
    "drug_concentration": 0.5,
    "morphological_features": 1,
    "tumour_status": 1,
    "model_purity": 1,
}

output_columns = [['model_id', 'provider', 'release', 'score', 'mcs', 'data', 'model_type'], common_weights.keys(), pdx_only_weights.keys(), in_vitro_only_weights.keys()]
output_columns = [l for sl in output_columns for l in sl]
pdx_max_score = sum([sl for l in [common_weights.values(), pdx_only_weights.values()] for sl in l]) + 7
in_vitro_max_score = sum([sl for l in [common_weights.values(), in_vitro_only_weights.values()] for sl in l]) + 7
def get_mcs_for_releases(mcs, tag, git_tags):
    try:
        logging.info(f"{ctime()}: Running pipeline for tag: {tag}")
        run([f'cd {home}; git checkout {tag} -f'], shell=True, check=True)
        mc, tsv = calculate_mcs(home, mcs, git_tags[tag])
        mcs = pd.concat([mcs, mc])
        tsv.to_csv(f"MCS_raw_{str(git_tags[tag]).replace('PDCM_', '')}.csv", index=False)
    except CalledProcessError as e:
        print("Error:", e)
    return mcs

def calculate_mcs(home, mcs, tag):
    cm_providers = sorted(get_dirs(home))
    raw = pd.DataFrame()
    for pro_num in range(len(cm_providers)):
        provider = cm_providers[pro_num]
        provider_path = join(home, provider)
        tsv = read_tsvs_into_one_df(provider_path, provider)
        if tsv is not None:
            tsv['release'] = tag
            tsv['score'] = 0
            tsv['mcs'] = 0
            for _, row in tsv.iterrows():
                model_type = row['model_type']
                score = calculate_mcs_per_model(model_type, row)
                #print(len(row['data']))
                score += len(row['data'])
                tsv.at[_, 'score'] = score
                if model_type == 'pdx':
                    max_score = pdx_max_score
                elif model_type != 'other':
                    max_score = in_vitro_max_score
                else:
                    max_score = 1
                tsv.at[_, 'mcs'] = (score / max_score) * 100
            for col in output_columns:
                if col not in tsv.columns:
                    tsv[col] = ''
            raw = pd.concat([raw, tsv])
    raw['mcs'] = raw['mcs'].astype(int)
    mcs = pd.concat([mcs, raw[['model_id', 'model_type','provider', 'release', 'score', 'mcs']]]).reset_index(drop=True)
    return mcs, raw[output_columns]

def add_to_score(score, weight, row):
    for col in weight.keys():
        if col in row.index and  row[col] != '':
            score += weight[col]
    return score

def calculate_mcs_per_model(model_type, row):
    score = 0
    score = add_to_score(score, common_weights, row)
    if model_type == 'pdx':
        score = add_to_score(score, pdx_only_weights, row)
    elif model_type == 'organoid' or model_type == 'cell line':
        score = add_to_score(score, in_vitro_only_weights, row)
    return score

def read_tsvs_into_one_df(provider_path, provider):
    if not exists(join(provider_path, f'{provider}_metadata-patient_sample.tsv')):
        logging.info(f"{ctime()}: {provider}: Patient sample missing.")
        return None
    if not (exists(join(provider_path, f'{provider}_metadata-pdx_model.tsv')) or  exists(join(provider_path, f'{provider}_metadata-cell_model.tsv'))):
        logging.info(f"{ctime()}: {provider}: Model sheet missing.")
        return None
    patient_sample = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-patient_sample.tsv'))
    patient = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-patient.tsv'))
    sharing = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-sharing.tsv'))
    patient['patient_id'] = patient['patient_id'].astype(str)
    patient_sample['patient_id'] = patient_sample['patient_id'].astype(str)
    model = patient_sample.merge(patient, on ='patient_id', how='left')
    model_temp = DataFrame()
    if exists(join(provider_path, f'{provider}_metadata-pdx_model.tsv')):
        pdx = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-pdx_model.tsv'))
        pdx['model_type'] = 'PDX'
        model_temp = pd.concat([model_temp, pdx])
    if exists(join(provider_path, f'{provider}_metadata-cell_model.tsv')):
        cell = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-cell_model.tsv'))
        cell['model_type'] = cell['type']
        model_temp = pd.concat([model_temp, process_model_type(cell)])
    model = model.merge(model_temp, on='model_id', how='left')
    model = model.merge(sharing, on ='model_id', how='left')
    if exists(join(provider_path, f'{provider}_metadata-model_validation.tsv')):
        model_validation = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-model_validation.tsv'))
        model = model.merge(model_validation, on ='model_id', how='left')
    model['provider'] = provider
    model = model.drop_duplicates(['model_id'])
    model = get_data_types(model, provider_path, provider)
    return model.applymap(lambda x: x.lower() if isinstance(x, str) else x).replace('not provided', '').replace('not collected', '').replace('unknown', '')

def get_data_types(df, provider_path, provider):
    if exists(join(provider_path, f'{provider}_molecular_metadata-platform.tsv')):
        platform = read_metadata_without_fields(join(provider_path, f'{provider}_molecular_metadata-platform.tsv'))[['platform_id','molecular_characterisation_type']]
        platform['molecular_characterisation_type'] = platform['molecular_characterisation_type'].str.replace('cytogenetics', 'biomarker')
        sample = read_metadata_without_fields(join(provider_path, f'{provider}_molecular_metadata-sample.tsv'))
        sample = sample[['platform_id','model_id','sample_id']]
        sample['sample_id'] = sample['sample_id'].astype(str)
        pmd = sample.merge(platform, on='platform_id', how='left')
        pmd = df[['model_id', 'sample_id']].merge(pmd, on='model_id', how='inner').groupby('model_id')['molecular_characterisation_type'].apply(lambda x: [item for item in x if item in ['mut', 'biomarker', 'cna', 'expression', 'immunemarker', 'drug', 'treatment']]).reset_index().rename(columns={'molecular_characterisation_type': 'data'})
        df = df.merge(pmd[['model_id', 'data']], on='model_id', how='left')
        df['data'] = df['data'].fillna(0).apply(lambda x: [] if x == 0 else x)
    else:
        df['data'] = [[] for m in df['model_id']]
    df['data'] = [list(set(d)) for d in df['data']]
    return df

def process_model_type(data):
    data.loc[:, 'model_type'] = ['Organoid' if str(t).lower().__contains__('organoid') else t for t in
                                     data['model_type']]
    data.loc[:, 'model_type'] = [
        'Cell Line' if str(t).lower().__contains__('cell') or str(t).lower().__contains__('pdc') or str(
            t).lower().__contains__('2d') or str(t).lower().__contains__('2-d') else t for t in data['model_type']]
    data.loc[:, 'model_type'] = [
        'Other' if str(t).lower().__contains__('other') or str(t).lower().__contains__('mixed') else t for t in
        data['model_type']]
    return data


In [11]:
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG"
data_types = ['mut', 'cna', 'expression', 'immunemarker', 'biomarker', 'images', 'cyto']
usa = "United States of America"
provider_country = {"BROD": usa, "DFCI-CPDM": usa, "MDAnderson-CCH": usa, "UOC-BC": "United Kingdom", "CCIA": "Australia", "GCCRI": usa, "NKI": "Netherlands", "UOM-BC": "United Kingdom", "CHOP": usa, "HCI-BCM": usa, "PDMR": usa, "VHIO-BC": "Spain", "CMP": "United Kingdom", "HKU": "Hong Kong", "PMLB": "Canada", "VHIO-CRC": "Spain", "CRL": usa, "IRCCS-CRC": "Italy", "PMLB-Organoid": "Canada", "VHIO-PC": "Spain", "CSHL": usa, "IRCCS-GC": "Italy", "SANG": "United Kingdom", "WCMC": usa, "CUIMC": usa, "JAX": usa, "SJCRH": usa, "WUSTL": usa, "Curie-BC": "France", "LIH": "Luxembourg", "TRACE": "Belgium", "Wistar-MDAnderson-Penn": usa, "Curie-LC": "France", "LurieChildrens": usa, "UCD": usa, "Curie-OC": "France", "MDAnderson": usa, "UMCG": "Netherlands", "UMCU": "Netherlands", "HCMI": usa, "PPTC": usa, "PIVOT": usa, 'CDH': 'Singapore', 'IRCCS-IDI': 'Italy', 'IRCCS-DEB': "Italy", 'IRCCS-HSM-GE': 'Italy', 'IRCCS-ICH': "Italy", 'IRCCS-IOR': 'Italy', 'IRCCS-IRE': "Italy", 'IRCCS-IRFMN': 'Italy', 'IRCCS-IRST': "Italy", 'IRCCS-ITGPII': 'Italy', 'IRCCS-OPBG': "Italy", 'IRCCS-PAS': "Italy",'IRCCS-CSS': "Italy",'IRCCS-CRO': "Italy", 'TTUHSC': usa,  'MBDI': 'Australia', 'VHIO-PMP': 'Spain'}

## EDA 1: Data overview
- Percentage of models that have data based on the data types
- Data types based on the protocol used
- Data types based on the technology (Platform used)

In [12]:
def get_mol_data(home, release):
    cm_providers = sorted(get_dirs(home))
    mol_data = DataFrame()
    total_model = DataFrame()
    phenomics_dp = DataFrame()
    provider_country_df = DataFrame()

    for pro_num in tqdm(range(len(cm_providers)), "Generating data for analysis"):
        phenomics_mol_data_dp = DataFrame()
        pmd = DataFrame()
        provider = cm_providers[pro_num]
        provider_country_df = pd.concat([provider_country_df, DataFrame([[release, provider, provider_country[provider]]])])
        logging.info(f"{ctime()}: Processing: {provider}")
        provider_path = join(home, provider)
        if not exists(join(provider_path, f'{provider}_metadata-patient_sample.tsv')):
            logging.info(f"{ctime()}: {provider}: Patient sample missing.")
            continue
        if not (exists(join(provider_path, f'{provider}_metadata-pdx_model.tsv')) or
                exists(join(provider_path, f'{provider}_metadata-cell_model.tsv'))):
            logging.info(f"{ctime()}: {provider}: Model sheet missing.")
            continue
        start = time()
        provider_mol_data = DataFrame()
        provider_data_types = get_dirs(provider_path)
        model = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-patient_sample.tsv'))
        patient = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-patient.tsv'))[['patient_id', 'sex', 'ethnicity']]
        patient['patient_id'] = patient['patient_id'].astype(str)
        model['patient_id'] = model['patient_id'].astype(str)
        model = model.merge(patient, on ='patient_id', how='left')
        model_temp = DataFrame()
        if exists(join(provider_path, f'{provider}_metadata-pdx_model.tsv')):
            pdx = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-pdx_model.tsv'))[['model_id', 'publications']]
            pdx['model_type'] = 'PDX'
            model_temp = pd.concat([model_temp, pdx[['model_id', 'model_type', 'publications']]])
        if exists(join(provider_path, f'{provider}_metadata-cell_model.tsv')):
            cell = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-cell_model.tsv'))[['model_id', 'type', 'publications']]
            cell['model_type'] = cell['type']
            model_temp = pd.concat([model_temp, cell[['model_id', 'model_type', 'publications']]])
        model = model.merge(model_temp, on='model_id', how='left')
        model['provider'] = provider
        model = model.drop_duplicates(['model_id'])
        total_model = concat([total_model, model[['model_id', 'provider', 'model_type', 'publications', 'age_in_years_at_collection', 'diagnosis', 'tumour_type', 'primary_site', 'sex', 'ethnicity']]]).reset_index(drop=True)
        for data_type in data_types:
            if data_type in provider_data_types:
                data_path = join(provider_path, data_type)
                files = [f for f in get_files(data_path) if f.endswith('.tsv')]
                if len(files)==0:
                    dirs = get_dirs(data_path)
                    files = list()
                    for dir in dirs:
                        f = get_files(join(data_path, dir))
                        f = [join(dir, i) for i in f if i.endswith('.tsv')]
                        files.append(f)
                    files = [item for sublist in files for item in sublist]
                for file_name in files:
                    temp = read_csv(join(data_path, file_name), sep='\t', usecols=['sample_id', 'platform_id'])
                    if data_type == 'cyto':
                        data_type = 'biomarker'
                    temp['type'] = data_type
                    temp['provider'] = provider
                    phenomics_mol_data_dp = pd.concat([phenomics_mol_data_dp, DataFrame([['molecular_data_'+data_type, provider, temp.shape[0]]])]).reset_index(drop=True)
                    temp = temp.drop_duplicates()
                    provider_mol_data = concat([provider_mol_data, temp]).reset_index(drop=True)
        if len(provider_mol_data) > 0:
            platform = read_metadata_without_fields(join(provider_path, f'{provider}_molecular_metadata-platform.tsv'))[['platform_id','molecular_characterisation_type','instrument_model','library_strategy']]
            platform['molecular_characterisation_type'] = platform['molecular_characterisation_type'].str.replace('cytogenetics', 'biomarker')
            sample = read_metadata_without_fields(join(provider_path, f'{provider}_molecular_metadata-sample.tsv'))
            sample = sample[['platform_id','model_id','sample_id', 'sample_origin']]
            sample['sample_id'] = sample['sample_id'].astype(str)
            provider_mol_data['sample_id'] = provider_mol_data['sample_id'].astype(str)
            pmd = provider_mol_data.merge(platform, on='platform_id', how='left').merge(sample, on='sample_id', how='left')
            for ind, row in pmd.iterrows():
                model_type = model[model['model_id'] == row['model_id']]['model_type'].reset_index(drop=True)
                if len(model_type) > 0:
                    model_type = model_type[0]
                    if model_type != 'PDX':
                        sample_origin = 'cell'
                    else:
                        sample_origin = 'xenograft'
                    if row['sample_origin'] == '':
                        row['sample_origin'] = sample_origin
            pmd['sample_origin'] = pmd['sample_origin'].str.replace('\s$', '', regex=True).str.lower()

            mol_data = concat([mol_data, pmd]).reset_index(drop=True)

        if f'{provider}_metadata-model_image.tsv' in get_files(provider_path):
            images = read_metadata_without_fields(join(provider_path, f'{provider}_metadata-model_image.tsv'))
            phenomics_mol_data_dp = pd.concat([phenomics_mol_data_dp, DataFrame([['image_data_points', provider, images.shape[0]]])]).reset_index(drop=True)
            images['provider'] = provider
            images['type'] = 'images'
            images['molecular_characterisation_type'] = 'images'
            images = images[['model_id', 'provider', 'type', 'molecular_characterisation_type']].drop_duplicates()
            for col in mol_data.columns:
                if col not in images.columns:
                    images[col] = ""
            mol_data = concat([mol_data, images]).reset_index(drop=True)


        if 'treatment' in get_dirs(provider_path):
            treatment = read_metadata_with_fields(join(provider_path, f'treatment/{provider}_patienttreatment-Sheet1.tsv'))
            phenomics_mol_data_dp = pd.concat([phenomics_mol_data_dp, DataFrame([['treatment_data_points', provider, treatment.shape[0]]])]).reset_index(drop=True)
            treatment = treatment[['patient_id']].drop_duplicates().merge(model.drop_duplicates(['patient_id']), on='patient_id', how='left')
            treatment['provider'] = provider
            treatment['type'] = 'treatment'
            treatment['molecular_characterisation_type'] = 'treatment'
            treatment = treatment[['model_id', 'provider', 'type', 'molecular_characterisation_type', 'sample_id']].drop_duplicates()
            for col in mol_data.columns:
                if col not in treatment.columns:
                    treatment[col] = ""
            mol_data = concat([mol_data, treatment]).reset_index(drop=True)


        if 'drug' in get_dirs(provider_path):
            drug = read_metadata_with_fields(join(provider_path, f'drug/{provider}_drugdosing-Sheet1.tsv'))
            phenomics_mol_data_dp = pd.concat([phenomics_mol_data_dp, DataFrame([['drug_data_points', provider, drug.shape[0]]])]).reset_index(drop=True)
            drug = drug.drop_duplicates(['model_id'])
            drug['provider'] = provider
            drug['type'] = 'drug'
            drug['molecular_characterisation_type'] = 'drug'
            drug = drug[['model_id', 'provider', 'type', 'molecular_characterisation_type']].drop_duplicates()
            for col in mol_data.columns:
                if col not in drug.columns:
                    drug[col] = ""
            mol_data = concat([mol_data, drug]).reset_index(drop=True)


        if 0 in phenomics_mol_data_dp.columns:
            phenomics_mol_data_dp = phenomics_mol_data_dp.reset_index(drop=True).groupby(by=0).sum().reset_index().rename({0:'type', 2:'counts'},axis=1)
            total_model_id = phenomics_mol_data_dp['counts'].sum()
            total_row = {'type': 'molecular_data_points_total', 'counts': total_model_id}
            phenomics_mol_data_dp = phenomics_mol_data_dp.append(total_row, ignore_index=True)

        total_model.loc[:, 'mt'] = ['Organoid' if str(t).lower().__contains__('organoid') else t for t in total_model['model_type']]
        total_model.loc[:, 'mt'] = ['Cell Line' if str(t).lower().__contains__('cell') or str(t).lower().__contains__('pdc') or str(t).lower().__contains__('2d') or str(t).lower().__contains__('2-d') else t for t in total_model['mt']]
        total_model.loc[:, 'mt'] = ['Other' if str(t).lower().__contains__('other') or str(t).lower().__contains__('mixed') else t for t in total_model['mt']]

        df = total_model.groupby('mt').count()['model_id'].reset_index().rename({'mt': 'type', 'model_id': 'counts'}, axis=1)
        total_model_id = df['counts'].sum()
        total_row = {'type': 'model_type_total', 'counts': total_model_id}
        df['type'] = ['model_type_'+f.lower().replace(' ', '_' ) for f in df['type']]
        df = df.append(total_row, ignore_index=True)
        #.str.replace('\s$', '', regex=True)
        if pmd.shape[0]>0:
            pmd_counts = pmd.groupby(by='sample_origin').count()['model_id'].reset_index(name='counts').rename({'sample_origin': 'type'}, axis=1)
            total_model_id = pmd_counts['counts'].sum()
            total_row = {'type': 'sample_type_total', 'counts': total_model_id}
            pmd_counts['type'] = ['sample_type_'+f.lower().replace(' ', '_' ) for f in pmd_counts['type']]
            pmd_counts = pmd_counts.append(total_row, ignore_index=True)
        else:
            pmd_counts = DataFrame()

        phenomics_mol_data_dp = pd.concat([phenomics_mol_data_dp, df, pmd_counts])

        phenomics_mol_data_dp = phenomics_mol_data_dp.transpose()
        phenomics_mol_data_dp.columns = phenomics_mol_data_dp.iloc[0]
        phenomics_mol_data_dp = phenomics_mol_data_dp[1:]

        phenomics_mol_data_dp['release'] = release
        phenomics_mol_data_dp['provider'] = provider
        if phenomics_dp.shape[0]>0:
            for col in phenomics_dp.columns:
                if col not in phenomics_mol_data_dp.columns:
                    phenomics_mol_data_dp[col] = 0
        try:
            phenomics_dp = pd.concat([phenomics_dp, phenomics_mol_data_dp]).reset_index(drop=True)
        except Exception as e:
            pass
        logging.info(f"{ctime()}: {provider}: took {round(time()-start, 2)} secs")
    return mol_data, total_model, phenomics_dp, provider_country_df


In [13]:
current_dir = check_output('pwd', shell=True, text=True)[:-1]
git_tags = {'44fdc438': 'PDCM_DR_v6.7'}
for tag in git_tags.keys():
    checkout_and_run_function(tag, git_tags)

HEAD is now at 44fdc438c Merge branch 'dev' into 'master'
  phenomics_mol_data_dp = phenomics_mol_data_dp.reset_index(drop=True).groupby(by=0).sum().reset_index().rename({0:'type', 2:'counts'},axis=1)
  phenomics_mol_data_dp = phenomics_mol_data_dp.append(total_row, ignore_index=True)
  df = df.append(total_row, ignore_index=True)
  pmd_counts = pmd_counts.append(total_row, ignore_index=True)
  phenomics_mol_data_dp = phenomics_mol_data_dp.reset_index(drop=True).groupby(by=0).sum().reset_index().rename({0:'type', 2:'counts'},axis=1)
  phenomics_mol_data_dp = phenomics_mol_data_dp.append(total_row, ignore_index=True)
  df = df.append(total_row, ignore_index=True)
  pmd_counts = pmd_counts.append(total_row, ignore_index=True)
  df = df.append(total_row, ignore_index=True)
  phenomics_mol_data_dp = phenomics_mol_data_dp.reset_index(drop=True).groupby(by=0).sum().reset_index().rename({0:'type', 2:'counts'},axis=1)
  phenomics_mol_data_dp = phenomics_mol_data_dp.append(total_row, ignore_ind

In [14]:
#mol_data.to_csv('DR_v5.2.csv', index=False)

In [15]:
current_dir = check_output('pwd', shell=True, text=True)[:-1]
git_tags = {'00f46821': 'PDCM_DR_v1.0',
            'e2e44469': 'PDCM_DR_v2.0',
            'fcd095df': 'PDCM_DR_v2.1',
            '041be7d2': 'PDCM_DR_v3.0',
            '597fd689': 'PDCM_DR_v3.1',
            'cfb23e3c': 'PDCM_DR_v4.0',
            'b4f0203a': 'PDCM_DR_v5.0',
            '2e67bc5e': 'PDCM_DR_v5.1',
            '3b29259f': 'PDCM_DR_v5.2',
            'c3c428a3': 'PDCM_DR_v5.3',
            'b742f61c': 'PDCM_DR_v6.0',
            '34a403df': 'PDCM_DR_v6.1',
            '0db07c62': 'PDCM_DR_v6.2',
            'cfd59f5e': 'PDCM_DR_v6.3',
            '866de2cb': 'PDCM_DR_v6.4',
            'ff21dcf3': 'PDCM_DR_v6.5',
            '9e5c1a37': 'PDCM_DR_v6.6', '44fdc438': 'PDCM_DR_v6.7'}
mcs = pd.DataFrame()
for tag in git_tags.keys():
    mcs = get_mcs_for_releases(mcs, tag, git_tags)

HEAD is now at 44fdc438c Merge branch 'dev' into 'master'


In [16]:
mcs_files = sorted([f for f in get_files('/Users/tushar/CancerModels/utils/PdcmDataTransformers/Visualization') if f.__contains__('MCS_raw')])
mcs = pd.DataFrame()
for f in mcs_files:
    t = pd.read_csv(f)[['model_id', 'model_type','provider', 'release', 'score', 'mcs']]
    mcs = pd.concat([mcs, t]).reset_index(drop=True)

In [17]:
def compute_updated_mcs(r):
    if r['model_type'] == 'pdx':
        new_score = (r['score']/pdx_max_score) * 100
    elif r['model_type'] == 'organoid' or r['model_type'] == 'cell line':
        new_score = (r['score']/in_vitro_max_score) * 100
    else:
         new_score = 0
    r['new_score'] = new_score
    return r
mcs = mcs.apply(lambda x: compute_updated_mcs(x), axis=1)
mcs['new_score'] = mcs['new_score'].astype(int)

In [18]:
pivot_df = mcs.pivot(index=['model_id', 'provider', 'model_type'], columns='release', values='new_score')
pivot_df.columns.name = None
pivot_df = pivot_df.reset_index()

In [19]:
pivot_df

Unnamed: 0,model_id,provider,model_type,PDCM_DR_v1.0,PDCM_DR_v2.0,PDCM_DR_v2.1,PDCM_DR_v3.0,PDCM_DR_v3.1,PDCM_DR_v4.0,PDCM_DR_v5.0,...,PDCM_DR_v5.2,PDCM_DR_v5.3,PDCM_DR_v6.0,PDCM_DR_v6.1,PDCM_DR_v6.2,PDCM_DR_v6.3,PDCM_DR_v6.4,PDCM_DR_v6.5,PDCM_DR_v6.6,PDCM_DR_v6.7
0,,irccs-irst,,,,,,,,,...,,,,,,,0.0,0.0,0.0,0.0
1,,irccs-itgpii,,,,,,,,,...,,,,,,,0.0,0.0,0.0,0.0
2,111316-319-r,pdmr,pdx,,,,,,,,...,,,,,,,,,,69.0
3,112475-105-r,pdmr,pdx,56.0,56.0,52.0,52.0,49.0,49.0,49.0,...,49.0,49.0,52.0,52.0,52.0,52.0,52.0,56.0,56.0,66.0
4,114348-004-r,pdmr,pdx,56.0,56.0,54.0,54.0,50.0,50.0,50.0,...,50.0,50.0,54.0,54.0,54.0,54.0,54.0,58.0,58.0,67.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12037,z542,umcu,organoid,,,,,,,,...,,,,,,,28.0,28.0,28.0,28.0
12038,z583,umcu,organoid,,,,,,,,...,,,,,,,28.0,28.0,28.0,28.0
12039,z589,umcu,organoid,,,,,,,,...,,,,,,,28.0,28.0,28.0,28.0
12040,z599,umcu,organoid,,,,,,,,...,,,,,,,28.0,28.0,28.0,28.0


In [20]:
pivot_df[~pivot_df['model_id'].isna()].to_csv('metadata-scores-all-release.csv', index=False)