In [329]:
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from utils import get_files
import fitz  # PyMuPDF
import pandas as pd
import re
from utils import templates, join, read_metadata_with_fields

In [46]:
"""
# URL of the PDF files
url = 'https://cccells.org/PDF_Files/All/'

# Create a directory to save the downloaded PDFs
os.makedirs('pdfs', exist_ok=True)

# Fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all links that end with .pdf
pdf_links = [a['href'] for a in soup.find_all('a') if a['href'].endswith('.pdf')]

# Download each PDF
for pdf_link in pdf_links:
    pdf_url = pdf_link if pdf_link.startswith('http') else url + pdf_link
    pdf_response = requests.get(pdf_url)

    # Extract the PDF filename
    pdf_filename = os.path.join('pdfs', pdf_link.split('/')[-1].replace("%20", "_"))

    # Save the PDF
    with open(pdf_filename, 'wb') as pdf_file:
        pdf_file.write(pdf_response.content)
    
    print(f'Downloaded: {pdf_filename}')

print("All PDFs downloaded.")
"""

'\n# URL of the PDF files\nurl = \'https://cccells.org/PDF_Files/All/\'\n\n# Create a directory to save the downloaded PDFs\nos.makedirs(\'pdfs\', exist_ok=True)\n\n# Fetch the content of the page\nresponse = requests.get(url)\nsoup = BeautifulSoup(response.content, \'html.parser\')\n\n# Find all links that end with .pdf\npdf_links = [a[\'href\'] for a in soup.find_all(\'a\') if a[\'href\'].endswith(\'.pdf\')]\n\n# Download each PDF\nfor pdf_link in pdf_links:\n    pdf_url = pdf_link if pdf_link.startswith(\'http\') else url + pdf_link\n    pdf_response = requests.get(pdf_url)\n\n    # Extract the PDF filename\n    pdf_filename = os.path.join(\'pdfs\', pdf_link.split(\'/\')[-1].replace("%20", "_"))\n\n    # Save the PDF\n    with open(pdf_filename, \'wb\') as pdf_file:\n        pdf_file.write(pdf_response.content)\n    \n    print(f\'Downloaded: {pdf_filename}\')\n\nprint("All PDFs downloaded.")\n'

In [307]:
def extract_text_from_pdf(pdf_path):
    headers = []
    data = []
    doc = fitz.open(pdf_path)
    for page in doc:
        text = page.get_text("text")
        lines = text.split('\n')
        skip_header = ["data sheet", 'Childhood Cancer Repository', 'Powered by Alex’s Lemonade Stand', "COG resource Laboratory", "label contains the cell line name", "date frozen. All", "Magnification", 'reference']
        include_line = ["The Childhood Cancer Repository has"]
        #print(lines)
        i = 0
        #print(lines)
        while i < len(lines):
            line = lines[i]
            if any([True if h.lower() in line.lower() else False for h in include_line]):
                line = line + lines[i+1]
                i += 1
            if i > 1 and lines[i-1].lower().__contains__('references'):
                string = "".join(lines[i:])
                pmid_pattern = r'(?:PubMed ID:|PMID)\s*(\d+)'
                pmids = re.findall(pmid_pattern, string)
                if pmids:
                    pmid_string = 'PMID: '+', PMID: '.join(pmids)
                else: pmid_string = ''
                line = pmid_string
                i = len(lines)
            if line.lower().__contains__('this pdx model has been tested and confirmed over multiple passages to be'):
                line = line + lines[i+1]
                i +=1
            if any([True if h.lower() in line.lower() else False for h in skip_header ]) and any([False if h.lower() in line.lower() else True for h in include_line ]) :  # Example header keyword; adjust to your file
                headers.append(line)  # Adjust split logic to match your headers format
            else:
                data.append(line.replace('MYCN Cell Line', 'MYCN Cell Line:').replace('Age at sample', 'Age at sample:').replace('Growth Properties', 'Growth Properties:').replace('Telomere Mechanism', 'Telomere Mechanism:').replace('Human vs. Mouse ', 'Human vs. Mouse:').replace(': ', ':').replace('::', ':'))  # Adjust the split as per value structure
            i +=1
    doc.close()
    text = []
    page_start, blanks = False, False
    for d in data:
        if page_start:
            if d == ' ':
                blanks = True
            else:
                blanks = False
                page_start = False
        if d.__contains__(' / '):
            page_start = True
        if not page_start and not blanks:
            text.append(d)
    return headers, text

def process_text_to_data(lines):
    #lines = text.replace('\n ', '').split('\n')
    header = []
    data = []
    for line in lines:
        # Assuming tabular data is separated by whitespace or a specific delimiter
        if line.strip():
            if (not line.__contains__("µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)") and
                        not line.__contains__("supplements (to a final concentration)") and
                        not line.__contains__("Cells are grown in a base medium") and
                        not line.__contains__("Medium plus the following supplements") and
                        not line.__contains__("Fetal Bovine Serum, 4mM L-Glutamine") and
                        not line.__contains__("1X ITS (5 µg/mL insulin") and
                        not line.__contains__("see Protocols section") and
                        not line.__contains__("protocol") and
                        not line.__contains__('Magnification') and
                        not line.__contains__('label contains the cell line name') and
                        not line.__contains__('date frozen. All cell') and
                        not line.__contains__('www.cccells.org')):
                #print(line)
                if not line.__contains__(':') or line.__contains__('https') or line.__contains__("PMID") or line.__contains__("PubMed ID"):  # Avoid empty lines
                    #print(line)
                    line = line.replace("Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following", "Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)")
                    data.append(''.join(line.strip()))  # Adjust the split method as needed
                else:
                    header.append(line.replace(": ", "").replace(":", ""))
    return header, data

def process_data_to_dict(data):
    length = len(data)
    i = 0
    temp = {}
    while i < length:
        if i == 0:
            model_id = data[i].replace('PDX Data Sheet for ', '')
            temp['model_id'] = model_id
        elif data[i].__contains__(':'):
            key = data[i].replace(':', '')
            if i < length - 1:
                if data[i + 1].__contains__(':'):
                    temp[key] = ''
                    i += 1
                else:
                    temp[key] = data[i+1]
            else:
                temp[key] = data[i]
            i += 1
        elif not data[i].__contains__(':'):
            key = data[i-2].replace(':', '')
            if i < length - 1 and data[i + 1].__contains__(':'):
                temp[key] = ''
            else:
                if key in temp.keys():
                    temp[key] = temp[key] + data[i]
                else:
                    temp[key] = data[i]
        i+=1
    return temp

def process_pdf_into_tsv(path):
    pdf_files = sorted(['pdfs/'+f for f in get_files(path)])
    model_data = []
    fine_files = ['pdfs/COG-N-415x_Cell_Line_Data_Sheet.pdf']
    for i in tqdm(range(len(pdf_files))):
        pdf = pdf_files[i]
        if pdf not in model_page.keys():
            _, text = extract_text_from_pdf(pdf)
            #print(pdf)
            #print(text)
            header, data = process_text_to_data(text)
            processed_data = dict(zip(header, data))
            processed_data['file'] = pdf
            processed_data['model_id'] = pdf.replace('pdfs/', '').replace('_Cell_Line_Data_Sheet', '').replace('.pdf', '')
            #processed_data = process_data_to_dict(data)
            #print(processed_data)
            model_data.append(processed_data)
            if pdf.__contains__('COG-N') and processed_data['Disease'] == 'Growth Properties' and pdf not in fine_files:
                print(processed_data)
                break
        else:
            model_data.append(model_page[pdf])
    df = pd.DataFrame(model_data)
    return df

In [308]:
model_page = {
    "pdfs/76-COG-LY-465x_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'COG-LY-465x', 'PDX Name': 'COG-LY-465x', 'Disease': "Burkitt's Lymphoma", 'Phase of Therapy': 'Post-Chemotherapy (Progressive Disease), Post-mortem', 'Treatment': 'N/A', 'Disease Stage': 'N/A', 'Source of Culture': 'Tumor (floating tumor in central vein draw)', 'Primary Tumor Site': 'N/A', 'Date Established': 'June 2012', 'Status': '', 'expression': '', 'Gender': 'N/A', 'Age': '10 years', 'Race': 'N/A', 'Strain of Mice': 'NSG only', 'Injection Type': 'Subcutaneous', 'Growth Properties': 'Was originally established from intraperitoneal fluid, then expanded into NSG mice via subcutaneous injections; Grows fast, 1 month to 1500mm3. Please see Protocols section at https//www.cccells.org/protocols.php', 'Human vs. Mouse': 'Pending', 'STR Profile': 'May be obtained at https//strdb.cccells.org/', 'Notes': '', 'References': ''},
    "pdfs/BT-12.pdf": 
        {'model_id': 'BT-12','Disease': 'Atypical teratoid/rhabdoid tumor', 'Phase of Therapy': 'Diagnosis', 'Treatment': 'None', 'Disease Stage': '4', 'Gender': 'Female', 'Age at diagnosis': '2 months', 'Race': 'Caucasian', 'Age at sample collection': '2 months', 'Source of Culture': 'Solid tumor', 'Primary Tumor Site': 'Posterior fossa', 'Date Established': 'N/A', 'MYCN Patient': 'N/A', 'MYCN Cell line': 'Relative copy number - N/A', 'TH mRNA': 'N/A', 'p53 functionality': 'Non-functional', 'Telomere Mechanism': 'N/A', 'ALK': 'N/A', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': '27 hours', 'Growth Properties': 'Adherent and suspended, grow mostly in clusters', 'Notes': 'All cell lines are antibiotic-free, mycoplasma-free, and cryopreserved in 50% FBS / 7.5% DMSO. Each vial label contains the cell line name, passage number, total viable cell count (usually 5-10e6), the overall cell viability, and date frozen. All cell lines are validated with original patient sample by STR analysis.', 'STR Profile': 'May be obtained at https//strdb.cccells.org/ ', 'Cell Line Name': 'BT-12', 'References': 'PMID: 20922763'},
    "pdfs/CHLA-108_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'CHLA-108', 'Disease': 'Neuroblastoma', 'Phase of Therapy': 'Progressive Disease', 'Treatment': 'N/A', 'Disease Stage': 'N/A', 'Gender': 'Male', 'Age at diagnosis': 'N/A', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'Bone Marrow', 'Primary Tumor Site': 'N/A', 'Date Established': 'January 1992', 'MYCN Patient': 'N/A', 'MYCN Cell line': 'N/A', 'THmRNA': 'N/A', 'p53 functionality': 'N/A', 'Telomere Mechanism': 'N/A', 'ALK': 'N/A', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': '63 hours', 'Growth Properties': 'Adherent', 'Notes': 'The Childhood Cancer Repository has a matching cell line available from this same patient – CHLA-83. The repository has a matching EBV lymphoblastoid cell line – COG-V-538.', 'STR Profile': 'May be obtained at https//strdb.cccells.org/', 'Cell Line Name': 'CHLA-108', 'References': ''},
    "pdfs/CHLA-10_Cell_Line_Data_Sheet.pdf":  
        {'model_id': 'CHLA-10','Disease': 'Primitive neuroectodermal tumor (PNET)', 'Phase of Therapy':  'Post-Chemotherapy (Progressive Disease)', 'Treatment': '4 cycles of cisplatin, doxorubicin, cyclophosphamide, etoposide','Disease Stage': '', 'Gender': 'Female', 'Age at diagnosis': '168 years', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'Solid tumor (thoracic lymph node)', 'Primary Tumor Site': 'Thorax', 'Date Established': '1987', 'EWS/FLI1 Status': 'FLI1', 'p53 functionality': 'Non-Functional', 'Karyotype': '', 'Modal No': '', 'R-IC50 (DIMSCAN*)': 'Vincristine (ng/ml): 0.39 ± 0.05, Melphalan (µg/ml): 7.27 ± 1.20, Etoposide (ng/ml): 0.12 ± 0.01, Rapamycin (ng/ml): 0.62 ± 0.08', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C',  'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Growth Properties': 'Teardrop-shaped cells with processes, adherent, grow mostly in clumps', 'Doubling Time': '32 hours', 'STR Profile': 'May be obtained at https//strdb.cccells.org/', 'Notes': 'The Childhood Cancer Repository has a matching direct-to-culture diagnosis cell line available from this same patient – CHLA-9', 'References': 'PMID: 11212268, PMID: 15289350, PMID: 20922763', 'Cell Line Name': 'CHLA-10'},
    "pdfs/CHLA-119_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'CHLA-119','Disease': 'Neuroblastoma', 'Phase of Therapy': 'Progressive disease', 'Treatment': 'N/A', 'p53 functionality': 'N/A', 'Disease Stage': '4', 'Age at diagnosis': 'N/A', 'Gender': 'Male', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'Blood', 'Primary Tumor Site': 'N/A', 'Date Established': 'N/A', 'MYCN Patient': 'Amplified', 'MYCN Cell line': 'N/A', 'TH mRNA': 'Expressed', 'Telomere Mechanism': 'N/A', 'ALK': 'WT', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': 'N/A', 'Growth Properties': 'Adherent', 'STR Profile': 'May be obtained at https://strdb.cccells.org/', 'Notes': 'The repository has a matching EBV lymphoblastoid cell line – COG-V-483. The repository has generated a matching fibroblast line – COG-FB-498.', 'References': 'PMID: 31484706', 'Cell Line Name': 'CHLA-119'},
    "pdfs/CHLA-11_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'CHLA-11', 'Disease': 'Neuroblastoma', 'Phase of Therapy': 'Progressive disease at bone marrow transplant, post mortem', 'Treatment': 'N/A', 'Disease Stage': '4', 'Gender': 'Female', 'Age at diagnosis': 'NA', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'Blood', 'Primary Tumor Site': 'N/A', 'Date Established': 'March 1988', 'MYCN Patient': 'Amplified', 'MYCN Cell line': 'N/A', 'THmRNA': 'Expressed', 'p53 functionality': 'Functional', 'Telomere Mechanism': 'N/A', 'ALK': 'WT', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': '53 hours', 'Growth Properties': 'Adherent', 'STR Profile': 'May be obtained at https//strdb.cccells.org/', 'Notes': 'The Childhood Cancer Repository has a matching direct-to-culture diagnosis cell line available from this same patient – CHLA-8 and CHLA-12. The repository has a matching EBV lymphoblastoid cell line – COG-V-437.', 'Cell Line Name': 'CHLA-11', 'References': ''},
    "pdfs/CHLA-122_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'CHLA-122', 'Disease': 'Neuroblastoma', 'Phase of Therapy': 'Diagnosis', 'Treatment': 'None', 'Disease Stage': '4', 'Gender': 'Female', 'Age at diagnosis': '24 months', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'Bone Marrow', 'Primary Tumor Site': 'N/A', 'Date Established': 'November 1992', 'MYCN Patient': 'Amplified', 'MYCN Cell line': 'N/A', 'TH mRNA': 'Expressed', 'p53 status': 'Functional', 'Telomere Mechanism': 'N/A', 'ALK': 'WT', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration)20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': '72 hours', 'Growth Properties': 'Suspended, grow mostly in tight clumps', 'STR Profile': 'May be obtained at https//strdb.cccells.org/ ', 'Notes': 'COGcell.org has a post-treatment cell line available from this same patient (CHLA-136). The repository has a matching EBV lymphoblastoid cell line – COG-V-450. The repository has a matching fibroblast line – COG-FB-451.', 'References': 'PMID: 11507071, PMID: 20922763, PMID: 17623797, PMID: 18223229',  'Cell Line Name': 'CHLA-122'},
    "pdfs/CHLA-12_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'CHLA-12', 'Disease': 'Neuroblastoma', 'Phase of Therapy': 'Progressive disease at bone marrow transplant, post mortem', 'Treatment': 'N/A', 'Disease Stage': '4', 'Gender': 'Female', 'Age at diagnosis': 'N/A', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'N/A', 'Primary Tumor Site': 'Bone Marrow', 'Date Established': 'N/A', 'MYCN Patient': 'Amplified', 'MYCN Cell line': 'N/A', 'THmRNA': 'Expressed', 'p53 functionality': 'N/A', 'Telomere Mechanism': 'N/A', 'ALK': 'WT', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': '65 hours', 'Growth Properties': 'Floating, some clumping/adhering together', 'STR Profile': 'May be obtained at https//strdb.cccells.org/', 'Notes': 'The Childhood Cancer Repository has a matching direct-to-culture diagnosis cell line available from this same patient – CHLA-8 and CHLA-12. The repository has a matching EBV lymphoblastoid cell line – COG-V-437', 'Cell Line Name': 'CHLA-12'},
    "pdfs/CHLA-132_Cell_Line_Data_Sheet.pdf": 
        {'model_id': 'CHLA-132', 'Disease': 'Neuroblastoma', 'Phase of Therapy': 'Progressive Disease', 'Treatment': 'N/A', 'Disease Stage': '4', 'Gender': 'Female', 'Age at diagnosis': 'N/A', 'Race': 'N/A', 'Age at sample collection': 'N/A', 'Source of Culture': 'Bone Marrow', 'Primary Tumor Site': 'N/A', 'Date Established': 'May 1993', 'MYCN Patient': 'Non-amplified', 'MYCN Cell line': 'N/A', 'THmRNA': 'Expressed', 'p53 functionality': 'N/A', 'Telomere Mechanism': 'N/A', 'ALK': 'N/A', 'RNAseq': 'N/A', 'WES': 'N/A', 'Growth Conditions': '5% CO2, 20% O2, 37.0°C', 'Media Formulation': 'Cells are grown in a base medium of Iscove’s Modified Dulbecco’s Medium plus the following supplements (to a final concentration): 20% Fetal Bovine Serum, 4mM L-Glutamine, 1X ITS (5 µg/mL insulin, 5 µg/mL transferrin, 5 ng/mL selenous acid)', 'Doubling Time': 'N/A', 'Growth Properties': 'N/A', 'STR Profile': 'May be obtained at https://strdb.cccells.org/', 'Notes': 'None', 'Cell Line Name': 'CHLA-132', 'References': 'PMID: 32291317'}

}

In [309]:
df = process_pdf_into_tsv('pdfs')
df.to_csv('cccells.csv', index=False)
df

100%|██████████| 271/271 [00:06<00:00, 44.28it/s]


Unnamed: 0,model_id,PDX Name,Disease,Phase of Therapy,Treatment,Disease Stage,Source of Culture,Primary Tumor Site,Date Established,Status,...,PDX,P53 status,Preclinical Testing ProgramModel Description for In Vitro Cytotoxicity Testing. Pediatr Blood Cancer. 56,"B. Koneru, G. Lopez,A170E170 A. Farooqi, K. L. Conkrite, T. H. Nguyen, S. J. Macha, A. Modi, J. L. Rokita, E. Urias,",Histology Subtype,PAX-FKHR Status,2017;4170033.,"RT, Burgess SW, Shaw WA, Reynolds CPImproved oral delivery of N-(4-hydroxyphenyl)retinamide with","8. Peng H, Sohara Y, Moats RA, Nelson MD Jr, Groshen SG, Ye W, Reynolds CP, DeClerck YAThe activity of","tumor cell survival and proliferation. Cancer Res. 679346- 55, 2007."
0,COG-LY-465x,COG-LY-465x,Burkitt's Lymphoma,"Post-Chemotherapy (Progressive Disease), Post-...",,,Tumor (floating tumor in central vein draw),,June 2012,,...,,,,,,,,,,
1,BT-12,,Atypical teratoid/rhabdoid tumor,Diagnosis,,4,Solid tumor,Posterior fossa,,,...,,,,,,,,,,
2,CHLA-108,,Neuroblastoma,Progressive Disease,,,Bone Marrow,,January 1992,,...,,,,,,,,,,
3,CHLA-10,,Primitive neuroectodermal tumor (PNET),Post-Chemotherapy (Progressive Disease),"4 cycles of cisplatin, doxorubicin, cyclophosp...",,Solid tumor (thoracic lymph node),Thorax,1987,,...,,,,,,,,,,
4,CHLA-119,,Neuroblastoma,Progressive disease,,4,Blood,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,SMS-SB,,Pre-B Acute Lymphoblastic Leukemia (ALL),,,,Tumor,,April 1983,,...,,,,,,,,,,
267,TC-32,,Primitive neuroectodermal tumor (PNET),Diagnosis,,Female,ERG,Non-Functional,"+5,+10,i(1q),t(11;22)(q24;q12)",,...,,,,,,,,,,
268,TC-32,,Primitive neuroectodermal tumor (PNET),Diagnosis,,Female,ERG,Non-Functional,"+5,+10,i(1q),t(11;22)(q24;q12)",,...,,,,,,,,,,
269,TC-71,,Ewing’s Sarcoma,Post-Chemotherapy (Progressive Disease),Male,22 years,ERG,Non-Functional,"-Y,8,t(1;7)(q25;p11),del(2)(q36),t(2;14)(q12;q...",,...,,,,,,,,,,


In [363]:
column_mapper = {'model_id': 'model_id', 'PDX Name': 'pdx_model_name', 'Cell Line Name': 'in_vitro_model_name', 'Disease': 'diagnosis', 'Phase of Therapy': 'response_to_treatment', 'Treatment': 'treatment', 'Disease Stage': 'stage', 'Source of Culture': 'collection_site', 'Primary Tumor Site': 'primary_site', 'Date Established': 'collection_date', 'Gender': 'sex', 'Race': 'ethnicity', 'Strain of Mice': 'host_strain_name', 'Injection Type': 'engraftment_type', 'Growth Properties': 'growth_properties', 'Growth Conditions': 'growth_conditions', 'Media Formulation': 'growth_media', 'Doubling Time': 'doubling_time', 'Human vs. Mouse': 'pdx_validation', 'STR Profile': 'str_profile', 'Notes': 'comments', 'References': 'references', 'Age at diagnosis': 'age_at_initial_diagnosis', 'Age at sample collection': 'age_in_years_at_collection', 'MYCN Patient': 'mycn_status_patient', 'MYCN Cell line': 'mycn_status_cell', 'MYCN PDX': 'mycn_status_pdx', 'TH mRNA': 'th_mrna', 'p53 functionality': 'tp53', 'Telomere Mechanism': 'telomere', 'ALK': 'alk', 'Histology Subtype': 'histological_subtype', 'PAX-FKHR Status': 'pax', 'EWS/FLI1 Status': 'ews', 'Karyotype': 'karyotype', 'Modal No': 'modal_no'}

process_df = pd.read_csv('cccells.csv').rename(columns=column_mapper)
process_df = process_df.drop_duplicates(['model_id'])

In [364]:
process_df['collection_date'].unique()

array(['Jun-12', nan, 'Jan-92', '1987', 'Mar-88', 'Nov-92', 'May-93',
       'Nov-93', 'Dec-93', 'Apr-88', 'Nov-96', 'Nov-97', 'Oct-88',
       'Jul-99', 'Apr-89', 'Jan-01', 'May-01', 'May-89', 'Jun-89',
       'Sep-89', 'Feb-90', 'Jan-91', 'Jul-91', 'Feb-07', 'Oct-17',
       'Jul-16', 'Dec-04', '2005', 'Mar-05', 'Nov-09', 'Jun-07', 'Apr-08',
       'Jan-06', '2006', 'May-06', 'Sep-07', 'Dec-06', 'Dec-07', 'Sep-09',
       'Jan-12', 'Dec-11', 'Apr-12', 'Jul-13', 'Aug-13', 'Jul-14',
       'Apr-14', 'Aug-07', 'Sep-11', 'July 2001 (surgery May 2001)',
       'August 2001 (surgery May 2001)',
       'September 2001 (surgery July 2001)',
       'January 2002 (surgery November 2001)',
       'March 2002 (surgery February 2002)',
       'October 2002 (surgery October 2002)',
       'June 2003 (surgery April 2003)', 'July 2003 (surgery May 2003)',
       'August 2003 (surgery July 2003)',
       'September 2003 (surgery July 2003)',
       'January 2004 (surgery October 2003)',
       'March

In [370]:
def create_empty_columns(df, cols):
    for c in cols:
        if c not in df.columns:
            df[c] = 'Not provided'
    return df

def convert_dates(date):
    try:
        # Handle different formats
        if 'Not provided' in date:
            return None  # or any placeholder you prefer
        elif len(date) == 4:  # Just the year
            return pd.to_datetime(date).strftime('%b %Y')
        else:
            return pd.to_datetime(date, format='%B %Y', errors='coerce').strftime('%b %Y') or \
                   pd.to_datetime(date, format='%b-%y', errors='coerce').strftime('%b %Y')
    except Exception as e:
        return date  # Handle any unexpected formats

def merge_selected_columns(row):
    merged_values = []
    columns_to_merge = [
    'mycn_status_patient', 
    'mycn_status_cell', 
    'mycn_status_pdx',
    'Morphology', 
    'th_mrna', 
    'tp53', 
    'telomere', 
    'alk'
    ]
    for col in columns_to_merge:
        value = row[col]
        if pd.notna(value):
            merged_values.append(f"{col}: {value}")
    
    return ' | '.join(merged_values)

def generate_patient_sheet(df):
    template = read_metadata_with_fields(join(templates, 'metadata', 'metadata_template-patient.tsv'))
    df['patient_id'] = df['model_id']
    df['initial_diagnosis'] = df['diagnosis']
    df = create_empty_columns(df, template.columns)
    df = df.fillna('Not provided')
    pd.concat([template, df[template.columns]]).to_csv('CCC_metadata-patient.tsv', sep='\t', index=False)

def generate_patient_sample(df):
    template = read_metadata_with_fields(join(templates, 'metadata', 'metadata_template-patient_sample.tsv'))
    df['patient_id'] = df['model_id']
    df['sample_id'] = df['model_id']
    df['staging_system'] = 'Not provided'
    df['tumour_type'] = 'primary'
    df['sharable'] = 'yes'
    df['virology_status'] = 'PCR evaluation of EBV, HAdV, Hantaan, HCMV, Hepatitis A, Hepatitis B, Hepatitis C, HHV 6, HHV8, HIV1, HIV2, HSV 1, HSV 2, HTLV 1, HTLV2, LCMV, Mycoplasma sp., Seoul, Sin Nombre, VZV were all negative.'
    df['collection_method'] = ['Surgery' if d.__contains__('surgery') else 'Not provided' for d in df['collection_date'].fillna('Not provided') ]
    df['collection_date'] = [d.split('(')[0] for d in df['collection_date'].fillna('Not provided')]
    df['collection_date'] = df['collection_date'].apply(convert_dates)
    df['treatment_naive_at_collection'] = ['Yes' if tnc == 'Yes' else 'No'for tnc in df['response_to_treatment'].fillna('Yes')]
    df['treated_prior_to_collection'] = ['No' if tnc == 'Yes' else 'Yes'for tnc in df['response_to_treatment'].fillna('Yes')]
    df['treated_at_collection'] = ['No' if tnc == 'Yes' else 'Yes'for tnc in df['response_to_treatment'].fillna('Yes')]
    df['gene_mutation_status'] = df.apply(merge_selected_columns, axis=1)
    df = create_empty_columns(df, template.columns)
    df = df.fillna('Not provided')
    pd.concat([template, df[template.columns]]).to_csv('CCC_metadata-patient_sample.tsv', sep='\t', index=False)
    
def generate_sharing_sheet(df):
    template = read_metadata_with_fields(join(templates, 'metadata', 'metadata_template-sharing.tsv'))
    df['database_url'] = 'https://www.cccells.org/PDF_Files/' + df['file'].str.replace('pdfs/', '')
    df = create_empty_columns(df, template.columns)
    pd.concat([template, df[template.columns]]).to_csv('CCC_metadata-sharing.tsv', sep='\t', index=False)

def generate_model_validation_sheet(df):
    df = df.drop_duplicates(['model_id'])
    df['validation_host_strain_nomenclature'] = df['host_strain_name'] 
    df['description'] = df['pdx_validation']
    df['STR_analysis'] = 'yes'
    template = read_metadata_with_fields(join(templates, 'metadata', 'metadata_template-model_validation.tsv'))
    df = create_empty_columns(df, template.columns)
    pd.concat([template, df[template.columns]]).to_csv('CCC_metadata-model_validation.tsv', sep='\t', index=False)

def get_models(df):
    cell_df = df[df['pdx_model_name'].isna()]
    pdx_df = df[~df['pdx_model_name'].isna()]
    generate_cell_sheet(cell_df)
    generate_pdx_sheet(pdx_df)

def generate_pdx_sheet(df):
    template = read_metadata_with_fields(join(templates, 'metadata', 'metadata_template-pdx_model.tsv'))
    df['sample_type'] = 'tissue fragment'
    df['host_strain_nomenclature'] = 'NOD.Cg-Prkdc<sup>scid</sup> Il2rg<sup>tm1Wjl</sup>/SzJ'
    df['host_strain_name'] = 'NSG'
    df['sample_state'] = 'Fresh'
    df['supplier'] = 'CCCells'
    df['supplier_type'] = 'Academic'
    df['vendor_link'] = 'https://www.cccells.org/RCLP.php'
    df['parent_id'] = df['comments']
    df['publications'] = df['references']
    df['catalog_number'] = df['model_id'].astype(str)
    df = create_empty_columns(df, template.columns)
    pd.concat([template, df[template.columns]]).to_csv('CCC_metadata-pdx_model.tsv', sep='\t', index=False)
    
    
    
    
    
def generate_cell_sheet(df):
    template = read_metadata_with_fields(join(templates, 'metadata', 'metadata_template-cell_model.tsv'))
    df['model_name'] = df['in_vitro_model_name']
    df['publications'] = df['references']
    df['catalog_number'] = df['model_id'].astype(str)
    df['type'] = 'Cell line'
    df['plate_coating'] = 'None'
    df['other_plate_coating'] = 'None'
    df['contaminated'] = 'No'
    df['contamination_details'] = 'None'
    df['supplier'] = 'CCCells'
    df['supplier_type'] = 'Academic'
    df['vendor_link'] = 'https://www.cccells.org/RCLP.php'
    df['parent_id'] = df['comments']
    df['comments'] = "Doubling time: " + df['doubling_time'] + "; Growth conditions: " + df['growth_conditions'] + "Notes: " + df['comments']
    df = create_empty_columns(df, template.columns)
    pd.concat([template, df[template.columns]]).to_csv('CCC_metadata-cell_model.tsv', sep='\t', index=False)
    
#generate_patient_sheet(process_df)
#generate_patient_sample(process_df)
#generate_sharing_sheet(df)
#generate_model_validation_sheet(process_df)
get_models(process_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model_name'] = df['in_vitro_model_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['publications'] = df['references']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['catalog_number'] = df['model_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try u

In [371]:
ps = read_metadata_with_fields('CCC/CCC_metadata-patient_sample.tsv')

In [383]:
[round(float(d.split(' months')[0])/12,2) if d.__contains__('months') else d if d.__contains__('days') else d for d in ps['age_in_years_at_collection']]

['Patient age at collection. Can be exact age or binned in 10 y group (1-9, 10-19, 20-29, ...)',
 '70',
 'numerical',
 'essential',
 '10',
 0.17,
 'Not provided',
 14.0,
 'Not provided',
 'Not provided',
 2.0,
 'Not provided',
 'Not provided',
 3.0,
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 1.5,
 8.42,
 'Not provided',
 'Not provided',
 12.0,
 2.0,
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 14.0,
 'Not provided',
 'Not provided',
 'Not provided',
 8.5,
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 8.5,
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 'Not provided',
 '4',
 'Not provided',
 '

In [384]:
ps['age_in_years_at_collection'] = [round(float(d.split(' months')[0])/12,2) if d.__contains__('months') else round(float(d.split(' days')[0])/(30*12),2) if d.__contains__('days') else d for d in ps['age_in_years_at_collection']]

In [385]:
ps.to_csv('CCC/CCC_metadata-patient_sample.tsv', sep='\t', index=False)