In [1]:
import pandas as pd
from Bio import Entrez
from pycparser.ply.yacc import token

from utils import get_files, join
from requests import post, exceptions, get
import xml.etree.ElementTree as ET
import os
import re

In [24]:
def replace_html_tags(text):
    text = re.sub(r'<sub>(.*?)</sub>', r'_\1_', text)  # Replace <sub> tags with underscores for subscript
    return text

def get_abstract_from_api(path):
    papers = pd.read_csv(path, sep='\t')
    if 'pid' not in papers.columns:
        papers['pid'] = papers['pubmed_ids'].str.replace(' ', '')
    papers['pmid'] = papers['pid'].astype(str).str.replace('PMID:', '')
    ids = ','.join(papers['pmid'].to_list())
    payload = {'db':'pubmed', 'id':ids}
    response = post(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", data=payload)
    #print(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={ids}")
    root = ET.fromstring(response.text)

    # Extract information
    articles = []
    for article in root.findall('PubmedArticle'):
        pmid = article.find('.//PMID').text
        title = article.find('.//ArticleTitle').text
        journal_title = article.find('.//Journal//Title').text
        abstract = article.findall('.//AbstractText')
        abstract_text = ""
        for abs in abstract:
            if abs.text is not None:
                text = ET.tostring(abs, encoding='unicode', method='text')
                abstract_text += replace_html_tags(text)
        pub_date_element = article.find('.//JournalIssue/PubDate')
        year = pub_date_element.find('Year').text if pub_date_element.find('Year') is not None else ''
        month = pub_date_element.find('Month').text if pub_date_element.find('Month') is not None else ''
        day = pub_date_element.find('Day').text if pub_date_element.find('Day') is not None else ''

        # Format the publication date
        publication_date = f"{month}-{year}"
        articles.append({
            'PMID': pmid,
            'Title': title,
            'Journal': journal_title,
            'Abstract': abstract_text,
            'Date': publication_date
            
        })
    abstract_df = pd.DataFrame(articles)
    abstract_df = papers.merge(abstract_df, left_on='pmid', right_on='PMID', how='left')
    abstract_df.to_csv('corpus_negative_abtract.tsv', index=False, sep='\t')
    for pubmed_article in abstract_df.iterrows():
        filename = f"corpus/{pubmed_article[1]['pid'].replace(':', '_')}.txt"
        with open(filename, 'w') as file:
            file.write(str(pubmed_article[1]['Abstract']))
    return abstract_df

def read_pmid_get_abstract(path):
    papers = pd.read_csv(path, sep='\t')
    Entrez.email = 'tushar@ebi.ac.uk'
    if 'pid' not in papers.columns:
        papers['pid'] = papers['pubmed_ids'].str.replace(' ', '')
    papers['pmid'] = papers['pid'].astype(str).str.replace('PMID:', '')
    pmids = papers['pid'].str.replace('PMID:', '').to_list()
    handle = Entrez.efetch(db="pubmed", id=','.join(map(str, pmids)),
                           rettype="xml", retmode="text")
    records = Entrez.read(handle)
    abstracts = [(pubmed_article['MedlineCitation']['PMID'].strip(), pubmed_article['MedlineCitation']['Article']['Abstract']['AbstractText']) if 'Abstract' in pubmed_article['MedlineCitation']['Article'].keys() else (pubmed_article['MedlineCitation']['PMID'].strip(), 'Not provided')
                 for pubmed_article in records['PubmedArticle']]
    abstract_dict = dict(abstracts)
    abstract_df = pd.DataFrame([abstract_dict.keys(), abstract_dict.values()]).transpose()
    abstract_df = abstract_df.rename({0: 'pmid', 1: 'abstract'},axis=1)
    papers = papers.merge(abstract_df, on='pmid', how='outer')
    papers.to_csv('abstract_pmids_api_full.tsv', index=False, sep='\t')
    for pubmed_article in papers.iterrows():
        filename = f"new_abstracts_full/{pubmed_article[1]['pid'].replace(':', '_')}.txt"
        with open(filename, 'w') as file:
            file.write(str(pubmed_article[1]['abstract']))
    return papers

In [27]:
abstract_df = get_abstract_from_api("corpus_negative.tsv")
abstract_df

Unnamed: 0,pubmed_ids,pid,pmid,PMID,Title,Journal,Abstract,Date
0,29860986,29860986,29860986,29860986,Immunotherapy and Prevention of Pancreatic Can...,Trends in cancer,Pancreatic cancer is the third-leading cause o...,Jun-2018
1,27741350,27741350,27741350,27741350,Measuring cancer evolution from the genome.,The Journal of pathology,The temporal dynamics of cancer evolution rema...,Jan-2017
2,28574057,28574057,28574057,28574057,New methods in the diagnosis of cancer and gen...,Cancer gene therapy,Cancer is one of the leading cause of death in...,Jun-2017
3,33504580,33504580,33504580,33504580,Metabolic Codependencies in the Tumor Microenv...,Cancer discovery,Metabolic reprogramming enables cancer cell gr...,May-2021
4,27839715,27839715,27839715,27839715,[Lucy's cancer(s): A prehistorical origin?].,"Gynecologie, obstetrique & fertilite",The recent discovery of the earliest hominin c...,Dec-2016
...,...,...,...,...,...,...,...,...
126,PMID: 28232476,PMID:28232476,28232476,28232476,Potential Targets' Analysis Reveals Dual PI3K/...,Clinical cancer research : an official journal...,,Mar-2017
127,PMID: 27815673,PMID:27815673,27815673,27815673,Erratum to: Patient-Derived Mammosphere and Xe...,Journal of mammary gland biology and neoplasia,,Dec-2016
128,PMID: 29079660,PMID:29079660,29079660,29079660,Dual mTOR Kinase Inhibitor MLN0128 Sensitizes HR,Clinical cancer research : an official journal...,,Jan-2018
129,PMID:28473534,PMID:28473534,28473534,28473534,"Elacestrant (RAD1901), a Selective Estrogen Re...",Clinical cancer research : an official journal...,,Aug-2017


In [18]:
abstract_df = get_abstract_from_api("PMIDs_from_API.tsv")
abstract_df

Unnamed: 0,pubmed_ids,type,data_source,external_model_id,pid,pmid,PMID,Title,Journal,Abstract,Date
0,PMID: 19240712,PDX,Curie-BC,['HBCx-8'],PMID:19240712,19240712,19240712,CD44 targeting reduces tumour growth and preve...,British journal of cancer,CD44 is a marker of tumour-initiating cells an...,Mar-2009
1,PMID: 21081655,PDX,PMLB,"['PHLC412', 'PHLC344', 'PHLC134', 'PHLC432', '...",PMID:21081655,21081655,21081655,The ability to form primary tumor xenografts i...,Clinical cancer research : an official journal...,Primary tumor xenografts (PTXG) established di...,Jan-2011
2,PMID: 21321221,PDX,LIH,['P3'],PMID:21321221,21321221,21321221,Anti-VEGF treatment reduces blood supply and i...,Proceedings of the National Academy of Science...,"Bevacizumab, an antibody against vascular endo...",Mar-2011
3,PMID: 22247967,PDX,Curie-BC,"['HBCx-2', 'HBCx-31', 'HBCx-8']",PMID:22247967,22247967,22247967,Molecular profiling of patient-derived breast ...,Breast cancer research : BCR,Identification of new therapeutic agents for b...,Jan-2012
4,PMID: 23460667,PDX,LIH,"['T341', 'T331', 'T251', 'P3', 'P8', 'T16', 'T...",PMID:23460667,23460667,23460667,Side population in human glioblastoma is non-t...,Brain : a journal of neurology,The identification and significance of cancer ...,May-2013
...,...,...,...,...,...,...,...,...,...,...,...
3505,PMID:17606733,PDX,Curie-BC,"['HBCx-5', 'HBCx-17', 'HBCx-3', 'HBCx-14', 'HB...",PMID:17606733,17606733,17606733,A new model of patient tumor-derived breast ca...,Clinical cancer research : an official journal...,To establish a panel of human breast cancer (H...,Jul-2007
3506,PMID:17606733,PDX,Curie-BC,"['HBCx-5', 'HBCx-17', 'HBCx-3', 'HBCx-14', 'HB...",PMID:17606733,17606733,17606733,A new model of patient tumor-derived breast ca...,Clinical cancer research : an official journal...,To establish a panel of human breast cancer (H...,Jul-2007
3507,PMID:26440065,PDX,UMCG,"['UMCGOVPDX68a', 'UMCGOVPDX70', 'UMCGOVPDX56',...",PMID:26440065,26440065,26440065,Biobanking of patient and patient-derived xeno...,Scientific reports,Using patient-derived xenografts (PDXs) for pr...,Oct-2015
3508,PMID:28522592,PDX,Curie-LC,"['LCF9', 'LCF4', 'LCF15']",PMID:28522592,28522592,28522592,Sensitization of EGFR Wild-Type Non-Small Cell...,Molecular cancer therapeutics,The benefit of EGFR-TKI in non-small cell lung...,Aug-2017


In [8]:
abstract_df = read_pmid_get_abstract("pmid-cancer.tsv")

KeyboardInterrupt: 

In [9]:
publications = pd.read_json("https://www.cancermodels.org/api/publication_group")
model_information = pd.read_json("https://www.cancermodels.org/api/model_information?select=external_model_id,type,data_source,publication_group_id")

In [28]:
df = model_information.merge(publications, left_on='publication_group_id', right_on='id', how='left')[['external_model_id','type','data_source','pubmed_ids']]
df = df[~df['pubmed_ids'].isna()].reset_index(drop=True)
df['pubmed_ids'] = df['pubmed_ids'].str.split(',')
df = df.explode('pubmed_ids')
df = df[df['pubmed_ids'] != ""].reset_index(drop=True)
df = df.groupby(['pubmed_ids', 'type', 'data_source'])['external_model_id'].apply(list).reset_index()

In [29]:
df.to_csv('PMIDs_from_API.tsv', sep='\t', index=False)

In [16]:
from requests import get
import xml.etree.ElementTree as ET

response = get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=36264037,4357757,4353636")
# Parse the XML content
root = ET.fromstring(response.text)

# Extract information
articles = []
for article in root.findall('PubmedArticle'):
    pmid = article.find('.//PMID').text
    title = article.find('.//ArticleTitle').text
    journal_title = article.find('.//Journal//Title').text
    abstract = article.findall('.//AbstractText')
    abstract_text = ""
    for abs in abstract:
        text = ET.tostring(abs, encoding='unicode', method='text')
        abstract_text += replace_html_tags(text)
    pub_date_element = article.find('.//DateCompleted')
    year = pub_date_element.find('Year').text if pub_date_element.find('Year') is not None else ''
    month = pub_date_element.find('Month').text if pub_date_element.find('Month') is not None else ''
    publication_date = f"{month}-{year}"
    articles.append({
        'PMID': pmid,
        'Title': title,
        'Journal': journal_title,
        'Abstract': abstract_text,
        'Date': publication_date
        
    })
articles

[{'PMID': '36264037',
  'Title': 'Chemical proteomics reveals interactors of the alarmone diadenosine triphosphate in the cancer cell line H1299.',
  'Journal': 'Journal of peptide science : an official publication of the European Peptide Society',
  'Abstract': 'Intracellular dinucleoside polyphosphates (Npn Ns) have been known for decades but the functional role remains enigmatic. Diadenosine triphosphate (Ap3 A) is one of the most prominent examples, and its intercellular concentration was shown to increase upon cellular stress. By employment of previously reported Ap3 A-based photoaffinity-labeling probes (PALPs) in chemical proteomics, we investigated the Ap3 A interactome in the human lung carcinoma cell line H1299. The cell line is deficient of the fragile histidine triade (Fhit) protein, a hydrolase of Ap3 A and tumor suppressor. Overall, the number of identified potential interaction partners was significantly lower than in the previously investigated HEK293T cell line. Gene o

In [113]:
article.findall('.//AbstractText')

[<Element 'AbstractText' at 0x13e347680>]

In [35]:
from requests import get
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

def check_keywords(text, keywords):
    return any(keyword.lower() in text.lower() for keyword in keywords)

def get_article_details(papers):
    ids = ','.join(papers['pmid'].to_list())
    payload = {'db':'pubmed', 'id':ids}
    response = post(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", data=payload)
    root = ET.fromstring(response.text)
    articles = []
    for article in root.findall('PubmedArticle'):
        pmid = article.find('.//PMID').text
        title = article.find('.//ArticleTitle').text
        journal_title = article.find('.//Journal//Title').text
        abstract = article.findall('.//AbstractText')
        abstract_text = ""
        for abs in abstract:
            if abs.text is not None:
                text = ET.tostring(abs, encoding='unicode', method='text')
                abstract_text += replace_html_tags(text)
        pub_date_element = article.find('.//JournalIssue/PubDate')
        year = pub_date_element.find('Year').text if pub_date_element.find('Year') is not None else ''
        month = pub_date_element.find('Month').text if pub_date_element.find('Month') is not None else ''
        publication_date = f"{month}-{year}"
        articles.append({
            'PMID': pmid,
            'Title': title,
            'Journal': journal_title,
            'Abstract': abstract_text,
            'Date': publication_date
            
        })
    abstract_df = pd.DataFrame(articles)
    abstract_df = papers.merge(abstract_df, left_on='pmid', right_on='PMID', how='left')[['filename','PubMedID', 'Title', 'Journal', 'Date', 'abstract', 'pmid']]
    xenograft_keywords = ['xenograft', 'PDX']
    organoid_keywords = ['organoid', 'PDO']
    cell_keywords = ['cell line']
    abstract_df['xenograft_pdx'] = abstract_df['abstract'].apply(lambda x: 'PDX' if check_keywords(x, xenograft_keywords) else '')
    abstract_df['organoid_pdo'] = abstract_df['abstract'].apply(lambda x: 'Organoid' if check_keywords(x, organoid_keywords) else '')
    abstract_df['cell_line_pdo'] = abstract_df['abstract'].apply(lambda x: 'Cell line' if check_keywords(x, cell_keywords) else '')
    
    abstract_df['stratify'] = abstract_df['xenograft_pdx'] + '_' + abstract_df['organoid_pdo']
    train, temp = train_test_split(abstract_df, test_size=0.4, stratify=abstract_df['stratify'], random_state=42)
    dev, test = train_test_split(temp, test_size=0.5, stratify=temp['stratify'], random_state=42)
    train['split'] = 'train'
    dev['split'] = 'dev'
    test['split'] = 'test'
    df_split = pd.concat([train, dev, test]).reset_index(drop=True)
    df_split = df_split.drop(columns=['stratify'])
    return df_split

corpus_path = "corpus-txt"
corpus_files = [join(corpus_path, f) for f in get_files(corpus_path) if f.endswith('.txt')]
corpus_df = pd.DataFrame()
temp = pd.DataFrame()
for f in corpus_files:
    d = open(f, "r")
    txt = d.read()
    temp['PubMedID'] = [f.replace(corpus_path, '').replace('/', '').replace('.txt', '').replace('_', ':')]
    temp['filename'] = f.replace(corpus_path, '').replace('/', '')
    temp['pmid'] = f.replace(corpus_path, '').replace('/', '').replace('.txt', '').replace('_', ':').replace('PMID:', '')
    temp['abstract'] = txt
    corpus_df = pd.concat([corpus_df, temp])

detailed_df = get_article_details(corpus_df)
detailed_df.to_csv('corpus_abtract_detailed.tsv', index=False, sep='\t')
detailed_df
#abstract_df.to_csv('corpus_abtract_detailed.tsv', index=False, sep='\t')

Unnamed: 0,filename,PubMedID,Title,Journal,Date,abstract,pmid,xenograft_pdx,organoid_pdo,cell_line_pdo,split
0,PMID_30146332.txt,PMID:30146332,Identification of Therapeutic Targets in Rhabd...,Cancer cell,Sep-2018,Personalized cancer therapy targeting somatic ...,30146332,,,,train
1,PMID_32553164.txt,PMID:32553164,Patient-Derived Ovarian Cancer Organoids Mimic...,Cell reports,Jun-2020,There remains an unmet need for preclinical mo...,32553164,,Organoid,,train
2,PMID_34020697.txt,PMID:34020697,Analysis of genomic and non-genomic signaling ...,Breast cancer research : BCR,May-2021,Endocrine therapies targeting estrogen signali...,34020697,PDX,,,train
3,PMID_29242316.txt,PMID:29242316,Colorectal Cancer Consensus Molecular Subtypes...,Clinical cancer research : an official journal...,Feb-2018,Purpose: Response to standard oncologic treatm...,29242316,PDX,,Cell line,train
4,PMID_25863122.txt,PMID:25863122,Development and characterization of a human or...,Developmental biology,Nov-2015,Neuroblastoma is a pediatric cancer of the dev...,25863122,PDX,,,train
...,...,...,...,...,...,...,...,...,...,...,...
95,PMID_37452026.txt,PMID:37452026,Oxidative phosphorylation is a metabolic vulne...,Nature communications,Jul-2023,Resistance to endocrine treatments and CDK4/6 ...,37452026,PDX,,Cell line,test
96,PMID_36657446.txt,PMID:36657446,Using patient-derived organoids to predict loc...,Cell reports. Medicine,Feb-2023,Predicting the clinical response to chemothera...,36657446,,Organoid,,test
97,PMID_32183023.txt,PMID:32183023,The Second Generation Antibody-Drug Conjugate ...,Cancers,Mar-2020,Trastuzumab-emtansine (T-DM1) is an antibody-d...,32183023,,,,test
98,PMID_28232476.txt,PMID:28232476,Potential Targets' Analysis Reveals Dual PI3K/...,Clinical cancer research : an official journal...,Mar-2017,Purpose: Uterine sarcomas are rare and heterog...,28232476,PDX,,,test


In [27]:
detailed_df.abstract.str.contains('PDX')

Index(['PubMedID', 'pmid', 'abstract', 'PMID', 'Title', 'Journal', 'Abstract',
       'Date'],
      dtype='object')

In [59]:
import os
import json
from requests import post, exceptions, get
from bs4 import BeautifulSoup


def pmid_to_pmcid(pmids):
    pmc_ids = {}
    
    for pmid in pmids:
        try:
            # Fetch metadata for the given PMID
            response = get(f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&format=json")
            response.raise_for_status()
            data = response.json()
            
            # Check if the data contains results and a PMCID
            if data.get("resultList", {}).get("result"):
                result = data["resultList"]["result"][0]
                pmcid = result.get("pmcid")
                
                # Store in dictionary
                pmc_ids[pmid] = str(pmcid) if pmcid else ""
            else:
                pmc_ids[pmid] = ''
        
        except exceptions.RequestException as e:
            print(f"Error retrieving PMCID for PMID {pmid}: {e}")
            pmc_ids[pmid] = "Error occurred"
    
    return pmc_ids

def fetch_full_text(pmids, output_folder="articles"):
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    for pmid in pmids:
        if pmid == '':
            continue
        try:
            # Fetch article details
            response = get(f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmid}/fullTextXML")
            response.raise_for_status()
            
            if response.text:
                xml_content = response.text
                
                # Parse XML content with BeautifulSoup
                soup = BeautifulSoup(xml_content, "xml")
                
                # Extract and concatenate all text elements
                article_text = " ".join([p.get_text() for p in soup.find('body').find_all("p")])
                
                # Save parsed text to file
                file_path = os.path.join(output_folder, f"{pmid}.txt")
                with open(file_path, "w", encoding="utf-8") as file:
                    file.write(article_text)
                
                
                print(f"Full text for PMID {pmid} saved successfully.")
            else:
                print(f"No full text available for PMID {pmid}.")
        
        except exceptions.RequestException as e:
            print(f"Error retrieving PMID {pmid}: {e}")
    return xml_content

def read_in_full_text(input_dir):
    full_id2text = {}
    for fl in os.listdir(input_dir):
        abstract_id = re.search(r"PMC\d+", fl, flags=0).group()
        with open(os.path.join(input_dir, fl), "r") as fr:
            text = fr.read()
        full_id2text[abstract_id] = text
    return full_id2text


def tokenize_full_text(fulltext_dir):
    full_texts = read_in_full_text(fulltext_dir)
    tokenized_full_texts = {}
    for id, text in full_texts.items():
        tokenized = text.split('. ')
        tokenized_full_text = get_sentence_character_positions(tokenized)
        tokenized_full_texts[id] = tokenized_full_text
    return tokenized_full_texts
    
        
    
def get_sentence_character_positions(sentences):
    result = []
    char_start = 0  # Initialize character start position

    for idx, sentence in enumerate(sentences):
        char_end = char_start + len(sentence)  # Calculate character end position
        # Create the sentence dictionary
        sentence_info = {
            "sent_idx": idx,  # Indexing starts from 1
            "char_start": char_start,
            "char_end": char_end,
            "sent_text": sentence
        }
        result.append(sentence_info)  # Add to result list
        char_start = char_end + 1  # Update character start for next sentence

    return result

def save_sentences_to_json(sentences_with_positions, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(sentences_with_positions, json_file, indent=4)

# Example list of PMIDs
pmid_list = [pmid.replace('.txt', '').replace('PMID_', '') for pmid in os.listdir('/Users/tushar/CancerModels/set-aside/sa-y4/prompt_pdcr_ebi/abstract_texts_updated_split/dev') if pmid.__contains__('PMID')]
pmcid_list = pmid_to_pmcid(pmid_list)
xml = fetch_full_text(pmcid_list.values())
tokenized = tokenize_full_text('articles/')
save_sentences_to_json(tokenized, 'fulltext_tokenized_dev.json')

Full text for PMID PMC4815803 saved successfully.
Full text for PMID PMC9631073 saved successfully.
Error retrieving PMID PMC7556703: 404 Client Error: Not Found for url: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC7556703/fullTextXML
Error retrieving PMID PMC4301580: 404 Client Error: Not Found for url: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC4301580/fullTextXML
Full text for PMID PMC8132120 saved successfully.
Error retrieving PMID PMC8662740: 404 Client Error: Not Found for url: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC8662740/fullTextXML
Full text for PMID PMC5499209 saved successfully.
Error retrieving PMID PMC6342199: 404 Client Error: Not Found for url: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC6342199/fullTextXML
Full text for PMID PMC7471705 saved successfully.
Full text for PMID PMC7612638 saved successfully.
Error retrieving PMID PMC7350550: 404 Client Error: Not Found for url: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC73505

In [58]:
read_in_full_text('articles/').keys()

dict_keys(['PMC9631073', 'PMC7612638', 'PMC4815803', 'PMC5499209', 'PMC8132120', 'PMC8134568', 'PMC7471705', 'PMC9270001'])

In [40]:
re.search(r"PMC\d+", fl, flags=0)

<re.Match object; span=(0, 10), match='PMC5499209'>

In [64]:
pmcid_list

{'26695443': 'PMC4815803',
 '36058001': 'PMC9631073',
 '33421710': '',
 '31761724': '',
 '31883794': 'PMC7556703',
 '25444907': 'PMC4301580',
 '34027491': 'PMC8132120',
 '32075943': 'PMC8662740',
 '28561063': 'PMC5499209',
 '30395907': 'PMC6342199',
 '32884042': 'PMC7471705',
 '35410383': 'PMC7612638',
 '32294323': 'PMC7350550',
 '30146332': 'PMC6158019',
 '34011980': 'PMC8134568',
 '29458007': '',
 '35802820': 'PMC9270001',
 '30380421': '',
 '35508177': 'PMC9177814',
 '30017245': ''}

In [63]:
import json
import pandas as pd
from collections import defaultdict

# Load JSON data
with open('/Users/tushar/CancerModels/set-aside/sa-y4/prompt_pdcr_ebi/gpt-4o_dev_output_parsed.json', 'r') as file:
    data = json.load(file)

# Prepare a dictionary to hold lists of tags for each PMID
df_dict = defaultdict(lambda: defaultdict(list))

# Populate dictionary by iterating over each PMID
for pmid, annotations in data.items():
    for annotation in annotations:
        tag = annotation['tags'][0]  # Take the first tag from the list
        text_provided = annotation['textProvided']
        df_dict[pmid][tag].append(text_provided)  # Append text under the tag column for each PMID

# Convert to DataFrame
df = pd.DataFrame({pmid: {k: v for k, v in tags.items()} for pmid, tags in df_dict.items()}).T

# Optional: Join lists into comma-separated strings
#df = df.applymap(lambda x: ', '.join(x) if isinstance(x, list) else x)

df.to_csv('parsed_llm_output_gpt4o_combined.tsv', sep='\t')


In [16]:
soup = BeautifulSoup(xml, "xml")

In [20]:
body = soup.find('body')

In [31]:
for token in body.find_all('p')[0].get_text().split('. '):
    print(len(token))

182
233
265


In [32]:
body.find_all('p')[0].get_text().split('. ')

['A number of classification systems based on gene expression have been proposed that stratify colorectal cancer (CRC) in subgroups with distinct molecular and clinical features1234567',
 'Comparative analyses in different data sets have revealed substantial classification coherence across the various signatures, particularly in the case of a ‘Stem/Serrated/Mesenchymal’ (SSM) subtype endowed with negative prognosis8910',
 'These classification efforts have been recently consolidated by a multi-institutional initiative that comprehensively cross compared the different subtype assignments on a common set of samples, leading to the definition of the consensus molecular subtypes11 (CMS).']

In [3]:
pmids_positive = ['32553164', '34020697', '29242316', '25863122', '28854174', '30146162', '37610680', '29431699', '33776923', '31434953', '29054837', '30015632', '33889306', '28522592', '34445380', '36702949', '26479923', '35075805', '37944531', '37000626', '34605222', '37143108', '28797031', '29472484', '31694835', '26926157', '30282693', '33199443', '35221336', '28473534', '35914528', '30596880', '29245952', '35972511', '34740372', '31488816', '26696773', '25437539', '36852691', '30629588', '37170307', '33602919', '37178682', '26846818', '37029129', '32641470', '32042320', '30992437', '30344100', '28581516', '27613526', '37116492', '35365682', '31693904', '36864254', '26095073', '33009951', '26270481', '30244973', '28232476', '34027491', '30380421', '28561063', '29458007', '26695443', '35802820', '31761724', '35410383', '25444907', '32884042', '32294323', '35508177', '30017245', '30146332', '34011980', '31883794', '30395907', '36058001', '33421710', '32075943', '29079660', '31036555', '29093017', '35383171', '30232224', '33852917', '32792481', '34250755', '32414908', '29625057', '26124487', '35180770', '30213835', '27381626', '27750381', '37452026', '36657446', '32183023', '30859564', '36223547']

def get_pmids_from_cm():
    url = "https://www.cancermodels.org/api/model_metadata?select=model_id,data_source,provider_name,pubmed_ids"
    df = pd.read_json(url)
    df = df[~df['pubmed_ids'].isna()]
    df['pubmed_ids'] = df['pubmed_ids'].str.split(',')
    df = df.explode('pubmed_ids')
    df['pubmed_ids'] = df['pubmed_ids'].str.strip()
    df = df.drop_duplicates('pubmed_ids')
    df = df[df['pubmed_ids'] != ""]
    df['pmid'] = [pmid.replace('PMID: ', '').replace('PMID:', '') for pmid in df['pubmed_ids']]
    df = df.drop_duplicates('pmid')
    return df

df = get_pmids_from_cm()

In [16]:
for pmid in pmids_positive:
    if pmid not in df['pmid'].to_list():
        print(f"{pmid} is missing in CancerModels.Org")

32553164 is missing in CancerModels.Org
33776923 is missing in CancerModels.Org
34445380 is missing in CancerModels.Org
36702949 is missing in CancerModels.Org
35075805 is missing in CancerModels.Org
37944531 is missing in CancerModels.Org
37000626 is missing in CancerModels.Org
34605222 is missing in CancerModels.Org
37143108 is missing in CancerModels.Org
29472484 is missing in CancerModels.Org
33199443 is missing in CancerModels.Org
30596880 is missing in CancerModels.Org
35972511 is missing in CancerModels.Org
34740372 is missing in CancerModels.Org
31488816 is missing in CancerModels.Org
37170307 is missing in CancerModels.Org
33602919 is missing in CancerModels.Org
37178682 is missing in CancerModels.Org
32641470 is missing in CancerModels.Org
36864254 is missing in CancerModels.Org
35802820 is missing in CancerModels.Org
31761724 is missing in CancerModels.Org
32294323 is missing in CancerModels.Org
35508177 is missing in CancerModels.Org
31883794 is missing in CancerModels.Org


'32990596'