In [1]:
from os import listdir, getcwd, rename, makedirs, remove
from os.path import isfile, join, isdir, exists
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import requests
from utils import *

### Common functions

In [2]:
def get_dirs(path):
    return [f for f in listdir(path) if isdir(join(path, f))]

def get_files(path):
    return [join(path, f) for f in listdir(path) if isfile(join(path, f)) and f.endswith(".tsv")]

def read_metadata_without_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    if 'Field' in metadata.columns:
        metadata = metadata.loc[metadata.Field.str.startswith('#') != True,].reset_index(drop=True)
        metadata = metadata.drop('Field', axis=1)
    return metadata

def read_metadata_with_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    return metadata

def sort_case_insensitive(sort_list):
    return sorted(sort_list, key=str.casefold)


In [2]:
start_dir = getcwd()
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/"
providers = sorted(get_dirs(home))
API_scores = "https://www.cancermodels.org/api/search_index?data_source=eq."
API_select = "&select=pdcm_model_id,external_model_id,patient_sample_id,data_source,scores,model_type,dataset_available"

In [4]:
def fetch(url):
    response = requests.get(url)
    # Check if the API request was successful (status code 200)
    if response.status_code != 200:
        return None
    # Flatten the JSON data
    data = json.loads(response.text)
    flattened_data = pd.json_normalize(data)
    # Create a DataFrame from the flattened JSON data
    df = pd.DataFrame(flattened_data)
    return df

def group_df(df):
    # Group by "model_type" and calculate counts, min, and max scores
    grouped = df.groupby("model_type").agg(
        data_source=pd.NamedAgg(column="data_source", aggfunc=lambda x: x.value_counts().idxmax()),
        model_count=pd.NamedAgg(column="model_type", aggfunc="count"),
        min_data_score=pd.NamedAgg(column="scores.data_score", aggfunc="min"),
        max_data_score=pd.NamedAgg(column="scores.data_score", aggfunc="max"),
        min_pdx_metadata_score=pd.NamedAgg(column="scores.pdx_metadata_score", aggfunc="min"),
        max_pdx_metadata_score=pd.NamedAgg(column="scores.pdx_metadata_score", aggfunc="max"),
        dataset_available=pd.NamedAgg(column="dataset_available", aggfunc=lambda x: x.value_counts().idxmax())
    )
    # Reset the index for a cleaner DataFrame
    grouped = grouped.reset_index()
    return grouped.sort_values(by=["max_data_score", "max_pdx_metadata_score"], ascending=[False, False])

In [5]:
def assess(api_url):
    assessment = fetch(api_url).fillna("")
    if assessment is None:
        return None
    return group_df(assessment)

def get_dataset_assessment(providers, scores, select):
    da = pd.DataFrame()
    for i in tqdm(range(0, len(providers)), desc ="Generating data assessment report: "): ## get_dirs will get the provider dirs in updog
        provider = providers[i]
        url = scores+provider+select
        da = pd.concat([da, assess(url)], ignore_index=True)
    return da

In [6]:
assessment = get_dataset_assessment(providers, API_scores, API_select)
assessment

Generating data assessment report: 100%|██████████| 38/38 [00:06<00:00,  5.87it/s]


Unnamed: 0,model_type,data_source,model_count,min_data_score,max_data_score,min_pdx_metadata_score,max_pdx_metadata_score,dataset_available
0,cell line,BROD,107,0,71,0,0,"[mutation, copy number alteration, expression]"
1,other,BROD,30,0,71,0,0,"[mutation, copy number alteration, expression]"
2,organoid,BROD,94,0,57,0,0,
3,PDX,CCIA,90,42,71,62,69,"[mutation, copy number alteration, expression,..."
4,PDX,CHOP,35,57,71,65,86,"[mutation, copy number alteration, expression,..."
5,cell line,CMP,1881,0,71,0,0,"[mutation, copy number alteration, expression,..."
6,organoid,CMP,44,28,42,0,0,"[mutation, expression, immune markers]"
7,PDX,CRL,539,0,85,59,90,"[mutation, copy number alteration, expression,..."
8,organoid,CSHL,221,0,71,0,0,
9,organoid,CUIMC,32,57,71,0,0,"[mutation, copy number alteration, immune mark..."


In [5]:
pivot = ["GCCRI", "CHOP", "CCIA", "LurieChildrens", "MDAnderson-CCH", "SJCRH"]
contacts = pd.DataFrame()
for provider in sorted(pivot):
    provider_path = join(home, provider)
    contact = read_metadata_without_fields(join(provider_path, f"{provider}_metadata-sharing.tsv"))[['email', 'name']].drop_duplicates()
    contact['provider'] = provider
    contacts = pd.concat([contacts, contact]).reset_index(drop=True)
contacts

Unnamed: 0,email,name,provider
0,RLock@ccia.org.au,Richard Lock,CCIA
1,"patrick.reynolds@ttuhsc.edu,maris@email.chop.edu","Reynolds, Maris",CHOP
2,HoughtonP@uthscsa.edu,Peter J Houghton,GCCRI
3,xli@luriechildrens.org,Li Xiao-Nan,LurieChildrens
4,RGorlick@mdanderson.org,Richard Gorlick,MDAnderson-CCH
5,cstn@stjude.org,Childhood Solid Tumor Network,SJCRH


In [4]:
cna_symbol_locations = pd.DataFrame()
for provider in sorted(get_dirs(home)):
    if exists(join(home, provider, 'cna')):
        files = [f for f in get_files(join(home, provider, 'cna')) if f.endswith(".tsv")]
        if len(files) == 0:
            dir = get_dirs(join(home, provider, 'cna'))
            files = [get_files(join(home, provider, 'cna', d)) for d in dir]
            files = [x for xs in files for x in xs]
            files = [f for f in files if f.endswith(".tsv")]
        for file in files:
            temp = read_metadata_with_fields(join(home, provider, 'cna', file))[['symbol', 'chromosome', 'strand','seq_start_position', 'seq_end_position', 'ncbi_gene_id', 'ensembl_gene_id']]
            cna_symbol_locations = pd.concat([cna_symbol_locations, temp]).drop_duplicates(subset=['symbol']).reset_index(drop=True)
cna_symbol_locations

AttributeError: 'list' object has no attribute 'endswith'