In [1]:
import os

from utils import read_metadata_without_fields, read_metadata_with_fields,  request
from os import getcwd
import pandas as pd
from tqdm import tqdm
import re
from bs4 import BeautifulSoup

In [2]:
start_dir = getcwd()
model_metadata = pd.read_json(f"https://www.cancermodels.org/api/model_metadata?data_source=eq.PDMR&select=model_id,type,source_database_url")

Unnamed: 0,model_id,type,source_database_url
0,485977-131-R-V2-organoid,organoid,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
1,874868-142-R-J2-PDC,cell line,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
2,445649-340-R,PDX,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
3,994459-135-R,PDX,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
4,677734-321-R-J1-PDC,cell line,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
...,...,...,...
1126,275375-350-R-V1-organoid,organoid,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
1127,649442-058-R,PDX,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
1128,285369-178-R,PDX,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...
1129,381356-305-R,PDX,https://pdmdb.cancer.gov/pls/apex/f?p=101:4:0:...


In [3]:
def generate_image_sheet(model_metadata):
    images = pd.DataFrame()
    for i in tqdm(range(model_metadata.shape[0]), "Fetch from PDMR"):
        originator_df = pd.DataFrame()
        pdx_df = pd.DataFrame() 
        invitro_df = pd.DataFrame() 
        default_url = "https://pdmdb.cancer.gov/web/apex/"
        mid = model_metadata.iloc[i, 0]
        type = model_metadata.iloc[i, 1]
        link = model_metadata.iloc[i, 2]
        response = request(link, False, 'get')
        try:
            table = get_table_of_samples(response)
            ## originator
            originator_url = table[table['sampleID'] == 'ORIGINATOR'].reset_index(drop=True)['url']
            if originator_url.shape[0] > 0:
                originator_url = default_url + originator_url[0]
                originator_df = pd.concat([originator_df, process_originator_images(request(originator_url, False, 'get'), mid)])
            if type == "PDX":
                ## PDX
                pdx = table[table['pdm_type'] == 'PDX'].reset_index(drop=True)
                pdx['url'] = default_url + pdx['url']
                pdx_df = process_pdx_images(pdx, mid)
            if type != "PDX":
                sampleID = mid.split('-', 3)[-1]
                invitro = table[table['sampleID'] == sampleID].reset_index(drop=True)
                invitro['url'] = default_url + invitro['url']
                invitro_df = process_invitro_images(invitro, mid)
                invitro_df['type'] = type
                invitro_df['Field'] = ""
            temp = generate_image_df(originator_df, pdx_df, invitro_df)
            images = pd.concat([images, temp]).reset_index(drop=True)
        except Exception as e:
            #print(f"error for {mid}: {e}")
            continue
    return images

def get_table_of_samples(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    data_rows = []
    for div in soup.find_all('div'):
        attrs = div.attrs
        if 'class' in attrs.keys() and attrs['class'][0] == 'rc-title':
            header = div.text.strip()
            if header == "Sample (PDX)":
                table = div.findNext().find_all('table')[0]
                for row in table.find_all('tr')[2:]:  # Skip the first two rows (headers and separator)
                    #print(row.find_all('td'))
                    data = [td.find('a').attrs['href'] if td.find('a') else td.text.strip()  for td in row.find_all('td') if 'class' in td.attrs.keys() and td.attrs['class'][0] == 'data']
                    if len(data)>0 :
                        data_rows.append(data)
    return pd.DataFrame(data_rows, columns=['url', 'pdm_type', 'sampleID', 'patient_origin', 'pdx_passage', 'images', 'mutation', 'wes', 'rnaseq'])


def process_originator_images(response, mid):
    soup = BeautifulSoup(response.text, 'html.parser')    
    divs = [div.findNext().find_all('div') for div in soup.find_all('div') if
            'class' in div.attrs.keys() and div.attrs['class'][0] == 'rc-title' and div.text.strip() == 'Sample']
    table = [div.findNext().find_all('table') for div in divs[0] if
             'class' in div.attrs.keys() and div.attrs['class'][0] == 'rc-title' and div.text.strip() == 'Pathology Data'][
        0][0]
    tt = pd.DataFrame([[td.find('a').attrs['href'] if td.find('a') else td.find('img').attrs['src'] if td.find('img') else td.text.strip() for row in table.find_all('tr') for td in row.find_all('td') if 'class' in td.attrs.keys() and td.attrs['class'][0] == 'data'][0:8]], columns=["View","TumorGrade","TumorContent","Necrosis","Stromal","InflammatoryCells","Low_res_image", "High_res_image"]) 
    tt['sample_id'] = "ORIGIN"
    tt['model_id'] = mid
    tt['passage'] = "ORIGIN"
    return tt

def process_pdx_images(df, mid):
    temp = pd.DataFrame()
    for index, row in df.iterrows():
        response = request(row['url'], False, 'get')
        soup = BeautifulSoup(response.text, 'html.parser')    
        divs = [div.findNext().find_all('div') for div in soup.find_all('div') if
                'class' in div.attrs.keys() and div.attrs['class'][0] == 'rc-title' and div.text.strip() == 'Sample']
        table = [div.findNext().find_all('table') for div in divs[0] if
                 'class' in div.attrs.keys() and div.attrs['class'][0] == 'rc-title' and div.text.strip() == 'Pathology Data'][
            0][0]
        df_list = [td.find('a').attrs['href'] if td.find('a') else td.find('img').attrs['src'] if td.find('img') else td.text.strip() for row in table.find_all('tr') for td in row.find_all('td') if 'class' in td.attrs.keys() and td.attrs['class'][0] == 'data']
        df_list = [df_list[i:i + 8] for i in range(0, len(df_list), 8)]
        tt = pd.DataFrame(df_list, columns=["View","TumorGrade","TumorContent","Necrosis","Stromal","InflammatoryCells","Low_res_image", "High_res_image"])
        tt['sample_id'] = row['sampleID']
        tt['model_id'] = mid
        tt['passage'] = row['pdx_passage']
        temp = pd.concat([temp, tt])
    return temp.drop_duplicates()

def list_to_dataframe(flat_list):
    # Step 1: Convert flat list to 2D list with 4 elements per sub-list
    two_d_list = [flat_list[i:i + 4] for i in range(0, len(flat_list), 4)]
    
    # Step 2: Convert 2D list to DataFrame
    df = pd.DataFrame(two_d_list, columns=['Column1', 'Column2', 'Column3', 'Column4'])
    
    return df

def process_invitro_images(df, mid):
    temp = pd.DataFrame()
    for index, row in df.iterrows():
        response = request(row['url'], False, 'get')
        soup = BeautifulSoup(response.text, 'html.parser')
        divs = [div.findNext().find_all('div') for div in soup.find_all('div') if
                'class' in div.attrs.keys() and div.attrs['class'][0] == 'rc-title' and div.text.strip() == 'Sample']
        table = [div.findNext().find_all('table') for div in divs[0] if
                 'class' in div.attrs.keys() and div.attrs['class'][0] == 'rc-title' and div.text.strip() == 'In Vitro Images'][
            0][0]
        df_list = [td.find('a').attrs['href'] if td.find('a') else td.find('img').attrs['src'] if td.find('img') else td.text.strip() for row in table.find_all('tr') for td in row.find_all('td') if 'class' in td.attrs.keys() and td.attrs['class'][0] == 'data']
        df_list = [df_list[i:i + 4] for i in range(0, len(df_list), 4)]
        tt = pd.DataFrame(df_list, columns=["View","image_type","notes","image"])
        tt['sample_id'] = row['sampleID']
        tt['model_id'] = mid
        tt['passage'] = row['pdx_passage']
        temp = pd.concat([temp, tt])
    return temp.drop_duplicates()

def melt_df(df):
    df_melted = df.melt(id_vars=['View', 'TumorGrade', 'TumorContent', 'Necrosis', 'Stromal', 'InflammatoryCells', 'sample_id', 'model_id', 'passage'], value_vars=['Low_res_image', 'High_res_image'], var_name='magnification', value_name='URL')
    df_melted['URL'] = "https://pdmdb.cancer.gov/web/apex/" + df_melted['URL']
    df_melted['magnification'] = df_melted['magnification'].str.replace('Low_res_image', 'Low Magnification Image').str.replace('High_res_image', 'High Magnification Image')
    df_melted['Field'] = ""
    return df_melted


def generate_image_df(og, pdx, invitro):
    if og.shape[0] > 0:
        og = melt_df(og)
        og['description'] = og.apply(lambda x: f"Tumor Grade: {x['TumorGrade']}, Tumor Content: {x['TumorContent']}, Necrosis: {x['Necrosis']}, Stromal: {x['Stromal']}, Inflammatory Cells: {x['InflammatoryCells']}, Sample Type: Patient, Staining: H&E", axis=1)
        og['sample_type'] = "patient"    
        og['passage'] = ""
        og['staining'] = "H&E"
        og = og[['Field', 'model_id', 'URL', 'description', 'sample_type', 'passage', 'magnification', 'staining']]
    else:
        og = pd.DataFrame(columns=['Field', 'model_id', 'URL', 'description', 'sample_type', 'passage', 'magnification', 'staining'])
    if pdx.shape[0] != 0:
        pdx = melt_df(pdx)
        pdx['description'] = pdx.apply(lambda x: f"Tumor Grade: {x['TumorGrade']}, Tumor Content: {x['TumorContent']}, Necrosis: {x['Necrosis']}, Stromal: {x['Stromal']}, Inflammatory Cells: {x['InflammatoryCells']}, Sample Type: Patient, Staining: H&E", axis=1)
        pdx['sample_type'] = "xenograft"
        pdx['staining'] = "H&E"
        og = pd.concat([og, pdx[og.columns]]).reset_index(drop=True)
    if invitro.shape[0] != 0:
        invitro['URL'] = "https://pdmdb.cancer.gov/web/apex/" + invitro['image']
        invitro['description'] =  invitro.apply(lambda x: f"Staining: {x['image_type']}, Pathology notes: {x['notes']}", axis=1)
        invitro['sample_type'] = invitro['type']
        invitro['staining'] = invitro['image_type']
        invitro['passage'] = ""
        invitro['magnification'] = ""
        og = pd.concat([og, invitro[og.columns]]).reset_index(drop=True)
    return og

In [None]:
image_df = generate_image_sheet(model_metadata)
image_df

Fetch from PDMR:   1%|          | 9/1131 [00:32<51:12,  2.74s/it]  

error for 946996-313-R: list index out of range


Fetch from PDMR:   2%|▏         | 19/1131 [01:13<1:04:13,  3.47s/it]

error for 223117-310-R: list index out of range


Fetch from PDMR:   3%|▎         | 30/1131 [02:04<51:26,  2.80s/it]  

error for 748385-122-R: list index out of range


Fetch from PDMR:   3%|▎         | 32/1131 [02:11<57:42,  3.15s/it]

error for 728994-017-R: list index out of range


Fetch from PDMR:   5%|▍         | 54/1131 [03:44<59:25,  3.31s/it]  

error for 381356-305-R-J1-PDC: list index out of range


Fetch from PDMR:   5%|▍         | 56/1131 [03:48<44:21,  2.48s/it]

error for 671287-276-R: list index out of range


Fetch from PDMR:   5%|▌         | 61/1131 [04:01<45:50,  2.57s/it]

error for K57222-313-R-J1-PDC: list index out of range


Fetch from PDMR:   7%|▋         | 84/1131 [05:49<1:26:03,  4.93s/it]

error for 967376-340-R: list index out of range


Fetch from PDMR:   8%|▊         | 90/1131 [06:24<1:23:49,  4.83s/it]

error for 361931-004-R: list index out of range


Fetch from PDMR:   8%|▊         | 94/1131 [06:43<1:14:39,  4.32s/it]

error for 987419-145-R-V1-organoid: list index out of range


Fetch from PDMR:  10%|▉         | 109/1131 [07:47<51:25,  3.02s/it]  

error for 276233-004-R: list index out of range


Fetch from PDMR:  10%|▉         | 110/1131 [07:55<1:15:53,  4.46s/it]

error for 919269-233-R1: list index out of range


Fetch from PDMR:  11%|█▏        | 128/1131 [09:26<1:45:21,  6.30s/it]

error for 636577-100-R: list index out of range


Fetch from PDMR:  12%|█▏        | 138/1131 [10:21<1:32:01,  5.56s/it]

error for K14711-072-R: list index out of range


Fetch from PDMR:  12%|█▏        | 140/1131 [10:28<1:16:58,  4.66s/it]

error for 941425-263-T: list index out of range


Fetch from PDMR:  15%|█▍        | 165/1131 [12:23<1:03:25,  3.94s/it]

error for 185487-224-T: list index out of range


Fetch from PDMR:  16%|█▌        | 177/1131 [13:11<48:27,  3.05s/it]  

error for 559244-337-R-J2-PDC: list index out of range


Fetch from PDMR:  16%|█▌        | 180/1131 [13:38<1:48:25,  6.84s/it]

error for 521955-158-R7: list index out of range


Fetch from PDMR:  16%|█▋        | 185/1131 [14:03<1:23:46,  5.31s/it]

error for 259778-044-R: list index out of range


Fetch from PDMR:  17%|█▋        | 194/1131 [14:44<1:06:41,  4.27s/it]

error for 521955-158-R5-V10-organoid: list index out of range


Fetch from PDMR:  17%|█▋        | 196/1131 [14:50<56:20,  3.62s/it]  

error for 669591-354-R: list index out of range


Fetch from PDMR:  18%|█▊        | 199/1131 [15:00<47:52,  3.08s/it]  

error for 381356-305-R-V2-organoid: list index out of range


Fetch from PDMR:  19%|█▉        | 214/1131 [16:25<1:29:08,  5.83s/it]

error for K41856-061-R2: list index out of range


Fetch from PDMR:  19%|█▉        | 218/1131 [16:54<1:48:28,  7.13s/it]

error for 295223-140-R: list index out of range


Fetch from PDMR:  19%|█▉        | 219/1131 [17:02<1:51:55,  7.36s/it]

error for 952719-076-R: list index out of range


Fetch from PDMR:  20%|█▉        | 224/1131 [17:28<1:17:23,  5.12s/it]

error for 967376-340-R-J2-PDC: list index out of range


Fetch from PDMR:  20%|█▉        | 225/1131 [17:29<58:15,  3.86s/it]  

error for K98449-230-R: list index out of range


Fetch from PDMR:  21%|██        | 239/1131 [18:38<1:06:11,  4.45s/it]

error for 756784-338-R: list index out of range


Fetch from PDMR:  22%|██▏       | 252/1131 [19:32<1:00:41,  4.14s/it]

error for 713683-067-R: list index out of range


Fetch from PDMR:  23%|██▎       | 255/1131 [19:40<43:44,  3.00s/it]  

error for 947725-317-R-V3-organoid: list index out of range


Fetch from PDMR:  23%|██▎       | 260/1131 [19:58<49:56,  3.44s/it]  

error for K79811-243-R: list index out of range


Fetch from PDMR:  25%|██▍       | 278/1131 [21:09<1:16:59,  5.42s/it]

error for 245324-029-R: list index out of range


Fetch from PDMR:  26%|██▋       | 298/1131 [22:36<1:10:17,  5.06s/it]

error for 241356-047-R: list index out of range


Fetch from PDMR:  27%|██▋       | 311/1131 [23:22<1:05:26,  4.79s/it]

error for 961994-133-R: list index out of range


Fetch from PDMR:  28%|██▊       | 315/1131 [23:49<1:24:51,  6.24s/it]

error for 936559-144-R: list index out of range


Fetch from PDMR:  28%|██▊       | 319/1131 [24:05<1:03:35,  4.70s/it]