In [1]:
from tqdm import tqdm
import pandas as pd
from datetime import datetime
from Bio import Entrez

In [2]:
def fetch_title_and_date(pmid):
    Entrez.email = "tushar@ebi.ac.uk"  # Always provide an email for NCBI API
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="text")
        records = Entrez.read(handle)
        title = records['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        
        # Extract publication date
        pub_date = records['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
        if 'Year' in pub_date and 'Month' in pub_date:
            pub_date_str = f"{pub_date['Year']}-{pub_date['Month']}-01"
        else:
            pub_date_str = f"{pub_date['Year']}-Jan-01"
        pub_date = datetime.strptime(pub_date_str, '%Y-%b-%d')
    except Exception as e:
        print(f"Error fetching data for PMID {pmid}: {e}")
        title, pub_date = None, None
    return title, pub_date

# Function to fetch citing publications by PMID
def fetch_citing_pmids(pmid):
    Entrez.email = "tushar@ebi.ac.uk"
    try:
        handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed_citedin")
        records = Entrez.read(handle)
        citing_pmids = [link['Id'] for link in records[0]['LinkSetDb'][0]['Link']]
    except Exception as e:
        print(f"Error fetching citing PMIDs for PMID {pmid}: {e}")
        citing_pmids = []
    return citing_pmids

In [3]:
model_publications = pd.read_json("https://www.cancermodels.org/api/model_information?publication_group_id=not.is.null")
publication_id = pd.read_json("https://www.cancermodels.org/api/publication_group")
model_publications = model_publications[['external_model_id', 'type', 'data_source', 'publication_group_id']].merge(publication_id, how='left', left_on='publication_group_id', right_on='id')


In [7]:
df = model_publications[['external_model_id', 'data_source', 'pubmed_ids']].drop_duplicates()
df = df.assign(pubmed_id=df['pubmed_ids'].str.replace(', ', ',').str.split(',')).explode('pubmed_id')
df['pubmed_id'] = df['pubmed_id'].str.strip().str.replace('PMID: ', '').str.replace('PMID:', '')
df.to_csv('data_source_publications.csv', index=False)
# Collect all PMIDs in the data
all_pmids = set(df['pubmed_id'])
df

Unnamed: 0,external_model_id,data_source,pubmed_ids,pubmed_id
0,HBCx-2,Curie-BC,"PMID: 17606733, PMID: 27374081, PMID: 29463559...",17606733
0,HBCx-2,Curie-BC,"PMID: 17606733, PMID: 27374081, PMID: 29463559...",27374081
0,HBCx-2,Curie-BC,"PMID: 17606733, PMID: 27374081, PMID: 29463559...",29463559
0,HBCx-2,Curie-BC,"PMID: 17606733, PMID: 27374081, PMID: 29463559...",36852691
0,HBCx-2,Curie-BC,"PMID: 17606733, PMID: 27374081, PMID: 29463559...",37029129
...,...,...,...,...
3794,TM00037,JAX,PMID: 26270481,26270481
3795,ICb-1078MB,LurieChildrens,PMID: 31693904,31693904
3796,PAC0015PR,TRACE,PMID: 27626319,27626319
3797,IC-1499EPB,LurieChildrens,PMID: 31693904,31693904


In [13]:
pmid_and_model_id = df.groupby('pubmed_id')['external_model_id'].agg(list).to_dict()

{'': ['T470',
  'T101',
  'OPBG-GBM001 Multifluo',
  'T192',
  'P3',
  'P13',
  'T233',
  'T158',
  'T16',
  'P8'],
 '1000501': ['SIDM00837',
  'SIDM00840',
  'SIDM00836',
  'SIDM00832',
  'SIDM00835',
  'SIDM00841',
  'SIDM00833'],
 '1000504': ['SIDM00146', 'SIDM01042', 'SIDM00148'],
 '1000505': ['SIDM00146', 'SIDM01042', 'SIDM00148'],
 '1000506': ['SIDM00146', 'SIDM00148'],
 '10022529': ['SIDM01033'],
 '10027410': ['SIDM00505',
  'SIDM00664',
  'SIDM00899',
  'SIDM00934',
  'SIDM00130',
  'SIDM00610',
  'SIDM00943',
  'SIDM00132',
  'SIDM01188'],
 '10037197': ['SIDM00792',
  'SIDM00767',
  'SIDM00927',
  'SIDM01131',
  'SIDM01121',
  'SIDM00915',
  'SIDM00719',
  'SIDM00706',
  'SIDM01739',
  'SIDM01128'],
 '10051639': ['SIDM01549',
  'SIDM00537',
  'SIDM00826',
  'SIDM00822',
  'SIDM01547',
  'SIDM01483',
  'SIDM00823',
  'SIDM01548',
  'SIDM01532',
  'SIDM00536'],
 '10069537': ['SIDM00590', 'SIDM00588', 'SIDM00489', 'SIDM00589'],
 '10070891': ['SIDM01907', 'SIDM01355', 'SIDM01356',

In [19]:
# Initialize lists to store results
results = []
citations = list()
# Set the filter date as April 1st, 2022
filter_date = datetime(2022, 4, 1)

# Process each PMID
for i in tqdm(range(1, len(all_pmids))):
    pmid = list(all_pmids)[i]
    title, pub_date = fetch_title_and_date(pmid)
    citing_pmids = fetch_citing_pmids(pmid)
    
    # Filter out PMIDs that are already in the DataFrame
    filtered_citing_pmids = [citing_pmid for citing_pmid in citing_pmids if citing_pmid not in all_pmids]
    
    # Check publication date of each citing PMID
    citing_after_filter = []
    for citing_pmid in filtered_citing_pmids:
        if citing_pmid not in citations:
            citations.append(citing_pmid)
            _, citing_pub_date = fetch_title_and_date(citing_pmid)
            if citing_pub_date and citing_pub_date > filter_date:
                citing_after_filter.append(citing_pmid)
    
    # Remove duplicates within the list
    citing_after_filter = list(set(citing_after_filter))
    
    # Append results
    results.append({
        'pubmed_id': pmid,
        'title': title,
        'cited_pmids': citing_after_filter,
        'citation_count': len(citing_after_filter)
    })

  0%|          | 3/3478 [03:58<76:35:00, 79.34s/it]


KeyboardInterrupt: 

In [None]:
# Create DataFrame from results
results_df = pd.DataFrame(results)

# Remove duplicates within 'cited_pmids'
results_df['cited_pmids'] = results_df['cited_pmids'].apply(lambda x: list(set(x)))

# Create a set of all unique cited PMIDs across the DataFrame
all_cited_pmids = set(pm for sublist in results_df['cited_pmids'] for pm in sublist)

# Create 'unique_citations' column with unique cited PMIDs
results_df['unique_citations'] = results_df['cited_pmids'].apply(lambda x: list(set(x) & all_cited_pmids))

# Add a new column for the count of unique citations
results_df['unique_citation_count'] = results_df['unique_citations'].apply(len)


In [47]:
results_df

Unnamed: 0,pubmed_id,title,cited_pmids,citation_count,unique_citations,unique_citation_count
0,19363654,Establishment and characterization of a new hi...,"[26913720, 33408328, 28783167, 28281638, 27253...",49,"[26913720, 33408328, 28783167, 28281638, 27253...",49
1,5668122,Glucose-6-phosphate dehydrogenase isoenzymes i...,"[1260149, 33805570, 29642437, 30641914, 793309...",20,"[1260149, 33805570, 29642437, 30641914, 793309...",20
2,8528516,"Morphologic, immunologic, biochemical, and cyt...","[10912355, 31662513, 34065977, 11817704, 16133...",8,"[10912355, 31662513, 34065977, 11817704, 16133...",8
3,1851909,Establishment and characterization of an Epste...,[7735567],1,[7735567],1
4,17121789,Functional p53 signaling in Kaposi's sarcoma-a...,"[30523620, 26109723, 17699571, 18625847, 33411...",73,"[30523620, 26109723, 17699571, 18625847, 33411...",73
...,...,...,...,...,...,...
3473,17171682,Molecular characterization of human multiple m...,"[33291672, 27699258, 26305418, 19196658, 25353...",30,"[33291672, 27699258, 26305418, 19196658, 25353...",30
3474,8386741,"Characterization of a novel Hodgkin cell line,...","[11696441, 14657218, 9399941, 25436766, 864486...",12,"[11696441, 14657218, 9399941, 25436766, 864486...",12
3475,10969801,Comparative genomic hybridization analysis of ...,"[12519956, 27900500, 15918899, 19259408, 12869...",91,"[12519956, 27900500, 15918899, 19259408, 12869...",91
3476,9573483,Effects of feeder cells (human cancer cell lin...,"[12971624, 12971621, 12889859]",3,"[12971624, 12971621, 12889859]",3


In [48]:
results_df.to_csv('citations_updated.csv', index=False)