In [1]:
import numpy as np
import pandas as pd
import re
import json
import requests
from xml.etree import ElementTree
import time
from dotenv import load_dotenv
import ast  # To safely evaluate string representation of a list
import bdikit as bdi
import os
from experiment_utils import *

In [2]:
pmid_doi_mapping_file = "exp_output/pmid_doi_mapping.json"
if os.path.exists(pmid_doi_mapping_file): 
    with open(pmid_doi_mapping_file, "r") as f:
        pmid_doi_mapping = json.load(f)  # read json file
else:
    pmid_doi_mapping = {}

In [3]:
pmid_pmcid_mapping_file = "exp_output/pmid_pmcid_mapping.json"
if os.path.exists(pmid_pmcid_mapping_file): 
    with open(pmid_pmcid_mapping_file, "r") as f:
        pmid_pmcid_mapping = json.load(f)  # read json file
else:
    pmid_pmcid_mapping = {}

In [4]:
doi_pmid_mapping_file = "exp_output/doi_pmid_mapping.json"
if os.path.exists(doi_pmid_mapping_file): 
    with open(doi_pmid_mapping_file, "r") as f:
        doi_pmid_mapping = json.load(f)  # read json file
else:
    doi_pmid_mapping = {}

In [5]:
doi_to_pmcid_mapping_file = "exp_output/doi_pmcid_mapping.json"
if os.path.exists(doi_to_pmcid_mapping_file):
    with open(doi_to_pmcid_mapping_file, "r") as f:
        doi_pmcid_mapping = json.load(f)  # read json file
else:
    doi_pmcid_mapping = {}

In [6]:
len(pmid_doi_mapping), len(pmid_pmcid_mapping), len(doi_pmid_mapping), len(doi_pmcid_mapping)

(144993, 144993, 125273, 125011)

## Extract the datasets data from ProteomeCentral and GEO

Start from the export file from ProteomeCentral: https://proteomecentral.proteomexchange.org/

In [7]:
export_df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
print(f"number of dataset ids in export dataframe from ProteomeCentral: {len(export_df)}")

number of dataset ids in export dataframe from ProteomeCentral: 40137


to get an idea of the most frequent values in df publication, for all the dataset ids from the source file

In [8]:
export_df['publication'].value_counts().head()

publication
Dataset with its publication pending                                                                 11719
no publication                                                                                        2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>          56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>       28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>          18
Name: count, dtype: int64

Clean the `publication` column by filtering out unwanted values like `Dataset with its publication pending`, `no publication`. Remove rows with unwanted values

In [9]:
filtered_df = export_df[~export_df['publication'].isin(["Dataset with its publication pending", "no publication"])]
print(len(filtered_df),'\n',filtered_df.columns)

25868 
 Index(['identifier', 'title', 'repository', 'species', 'instrument',
       'publication', 'lab_head', 'announce_date', 'keywords'],
      dtype='object')


Normalize the `publication` column by extracting the links from the html tags

In [10]:
filtered_df = filtered_df.copy()

filtered_df.loc[:, 'citing_publications_links'] = None

for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Ensure string type
    if "href" in pub:
        match = re.findall(r'href=[\'"]([^\'"]+)[\'"]', pub)  # Extract href links
        filtered_df.at[i, 'citing_publications_links'] = match if match else None
    else:
        filtered_df.at[i, 'citing_publications_links'] = None

# Drop rows with missing links safely
filtered_df = filtered_df.dropna(subset=['citing_publications_links']).reset_index(drop=True)
filtered_df.drop(columns=['publication'], inplace=True)  # Drop original column

print(f"# n datasets with publications: {len(filtered_df)}") 
print(filtered_df.columns)

# n datasets with publications: 25696
Index(['identifier', 'title', 'repository', 'species', 'instrument',
       'lab_head', 'announce_date', 'keywords', 'citing_publications_links'],
      dtype='object')


Normalize links:


- if pubmed link, find the pmcid and the doi mapped to the pmid


- if doi link, find the doi and the pmcid mapped to the doi

In [11]:
filtered_df = filtered_df.explode('citing_publications_links', ignore_index=True)  # Split lists into rows
print(f"Length: {len(filtered_df)}")

filtered_df = extract_publication_ids_from_PX_export(filtered_df, pmid_doi_mapping, pmid_pmcid_mapping, doi_pmid_mapping, doi_pmcid_mapping)
print(f"Length should be the same: {len(filtered_df)}")

filtered_df = add_citing_publication_link_columns(filtered_df)
print(f"Length should be greater: {len(filtered_df)}")

filtered_df["citing_publications_links"] = filtered_df["citing_publications_links"].astype(str)
filtered_df["DOI"] = filtered_df["DOI"].astype(str)
filtered_df["PMCID"] = filtered_df["PMCID"].astype(str)

# Now drop duplicates
filtered_df.drop_duplicates(subset=['citing_publications_links', 'DOI', 'PMCID'], inplace=True)
print(f"Length could be a bit smaller: {len(filtered_df)}")

filtered_df[['citing_publications_links','DOI','PMID','PMCID']].sample(10)

Length: 39347
Length should be the same: 39347
Before explode sample:                           DOI        PMCID  \
0  10.1038/S41467-025-56720-1         None   
1           10.6019/PXD051312         None   
2      10.1002/prca.202400095  PMC11895760   
3      10.1002/prca.202400095  PMC11895760   
4    10.17159/SAJS.2025/18571         None   
5   10.1101/2024.04.03.587901         None   
6   10.1101/2025.02.05.636703         None   
7  10.1186/S13100-024-00339-4         None   
8  10.1186/s12964-025-02046-w  PMC11773904   
9  10.1186/s12964-025-02046-w  PMC11773904   

                           citing_publications_links  
0    [https://dx.doi.org/10.1038/S41467-025-56720-1]  
1             [https://dx.doi.org/10.6019/PXD051312]  
2  [https://dx.doi.org/10.1002/prca.202400095, ht...  
3  [https://dx.doi.org/10.1002/prca.202400095, ht...  
4      [https://dx.doi.org/10.17159/SAJS.2025/18571]  
5     [https://dx.doi.org/10.1101/2024.04.03.587901]  
6     [https://dx.doi.org/10.1101/2025

Unnamed: 0,citing_publications_links,DOI,PMID,PMCID
23039,https://dx.doi.org/10.1038/S41467-024-44988-8,10.1038/S41467-024-44988-8,,
20273,https://dx.doi.org/10.1038/s41423-024-01150-0,10.1038/s41423-024-01150-0,38689020.0,PMC11143349
41660,https://dx.doi.org/10.3390/cells10061306,10.3390/cells10061306,34074003.0,PMC8225046
43111,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,10.1161/ATVBAHA.120.315007,33356393.0,PMC8105275
1351,https://dx.doi.org/10.6019/PXD046271,10.6019/PXD046271,,
28362,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9...,10.1038/s41586-023-05780-8,36859550.0,PMC9995272
54036,https://dx.doi.org/10.1038/s41467-018-07321-8,10.1038/s41467-018-07321-8,30451861.0,PMC6242847
34061,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,10.1038/s42003-022-03259-2,35393494.0,PMC8991201
54198,https://dx.doi.org/10.6019/PXD009649,10.6019/PXD009649,,
13141,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,10.3389/fcimb.2019.00104,31069174.0,PMC6491454


Extraction of data from GEO

In [12]:
GEO_extract_file = "exp_input/GEO_data.csv"
GEO_df = pd.read_csv(GEO_extract_file, low_memory=False)
print(f"number of dataset ids in GEO dataframe: {len(GEO_df)}")

number of dataset ids in GEO dataframe: 248748


Extracting all the datasets with eutils api and get all the uids of GEO series datasets

In [13]:
root = ElementTree.fromstring(requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", 
                                           params={"db": "gds", "term": '"gse"[Entry Type]', "retmax": "1000000",  "retmode": "xml"}
                                           ).content)

gse_ids = [id_elem.text for id_elem in root.findall(".//Id")]
missing_ids = set(gse_ids)-set([str(x) for x in GEO_df.uid])

print(f"Total datasets found: {len(gse_ids)}")
print("Missing GSE IDs:", len(missing_ids))  # Show difference with data we already got

Total datasets found: 249060
Missing GSE IDs: 326


Add new datasets from GEO to the GEO dataframe

In [14]:
new_GEO_series_data = {}
i, mxm = 0, 1
while True:
    print(f"Progress: {min(i / len(missing_ids), 1) * 100} %")
    if i > len(missing_ids):
        break
    try:
        new_data = fetch_GEO_data(list(missing_ids),"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",i,mxm)
        for uid, details in new_data["result"].items():
            if uid == "uids":  # Ignore metadata key
                continue
            new_GEO_series_data[uid] = details
    except Exception as e:
        print(f"Error at {i}, {mxm} with error: {e}")
            
    i += 1
    mxm += 1
    time.sleep(0.005) 

add_new_data = False # add new datasets to the GEO_df
if add_new_data:
    GEO_df = pd.concat([GEO_df, pd.DataFrame(new_GEO_series_data).T], axis=0)

Progress: 0.0 %
Progress: 0.3067484662576687 %
Progress: 0.6134969325153374 %
Progress: 0.9202453987730062 %
Error at 3, 4 with error: 'result'
Progress: 1.2269938650306749 %
Error at 4, 5 with error: 'result'
Progress: 1.5337423312883436 %
Error at 5, 6 with error: 'result'
Progress: 1.8404907975460123 %
Error at 6, 7 with error: 'result'
Progress: 2.147239263803681 %
Error at 7, 8 with error: 'result'
Progress: 2.4539877300613497 %
Progress: 2.7607361963190185 %
Progress: 3.067484662576687 %
Error at 10, 11 with error: 'result'
Progress: 3.374233128834356 %
Error at 11, 12 with error: 'result'
Progress: 3.6809815950920246 %
Error at 12, 13 with error: 'result'
Progress: 3.9877300613496933 %
Error at 13, 14 with error: 'result'
Progress: 4.294478527607362 %
Error at 14, 15 with error: 'result'
Progress: 4.601226993865031 %
Error at 15, 16 with error: 'result'
Progress: 4.9079754601226995 %
Progress: 5.214723926380368 %
Error at 17, 18 with error: 'result'
Progress: 5.521472392638037 %

We will update the filtered_df with the new datasets from GEO, after matching the schema

Add the url to the GEO datasets by reconstruction from pubmedids

In [15]:
# eval string to list
GEO_df['pubmedids'] = GEO_df['pubmedids'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
print(len(GEO_df))
GEO_df = GEO_df[GEO_df['pubmedids'].astype(bool)]  # Drop rows with empty lists
print(len(GEO_df))
GEO_df = GEO_df.explode('pubmedids', ignore_index=True)  # Split lists into rows
print(len(GEO_df))

248748
194093
208151


- if pubmed link, find the pmcid and the doi mapped to the pmid


- Create Links for the publications

In [16]:
GEO_df['PMCID'] = GEO_df['pubmedids'].map(
    lambda x: pmid_pmcid_mapping.get(x, None) if isinstance(x, str) else None )
GEO_df['DOI'] = GEO_df['pubmedids'].map(
    lambda x: pmid_doi_mapping.get(x, None) if isinstance(x, str) else None)
GEO_df[['pubmedids','PMCID','DOI']].sample(10)

Unnamed: 0,pubmedids,PMCID,DOI
160949,25204429,PMC4177250,10.1186/1476-4598-13-209
72254,34017133,,
75702,22955616,PMC3439153,10.1038/nature11247
10356,38922876,,
203094,17910761,PMC2078596,10.1186/1471-2164-8-349
41741,36428978,PMC9688917,10.3390/cells11223549
32969,37108567,PMC10138523,10.3390/ijms24087403
54127,34010639,PMC9678241,10.1016/j.celrep.2021.109136
146641,27058788,PMC4826471,10.1016/j.molcel.2016.03.001
168889,24317252,PMC4018771,10.1038/nmeth.2762


In [17]:
# Generate citing_publications_links as a list of links
GEO_df = add_citing_publication_link_columns(GEO_df)

GEO_df[['pubmedids','PMCID','DOI','citing_publications_links']].sample(10)

Before explode sample:                        DOI        PMCID  \
0                     None         None   
1   10.1093/immhor/vlae005  PMC11841973   
2      10.1093/nar/gkaf109  PMC11840560   
3                     None         None   
4                     None         None   
5                     None         None   
6                     None         None   
7                     None         None   
8                     None         None   
9  10.1073/pnas.2416384122  PMC11892594   

                           citing_publications_links  
0                                                 []  
1  [https://dx.doi.org/10.1093/immhor/vlae005, ht...  
2  [https://dx.doi.org/10.1093/nar/gkaf109, https...  
3                                                 []  
4                                                 []  
5                                                 []  
6                                                 []  
7                                                 []  
8       

Unnamed: 0,pubmedids,PMCID,DOI,citing_publications_links
112075,35184570,PMC8923958,10.1161/CIRCULATIONAHA.121.057613,https://dx.doi.org/10.1161/CIRCULATIONAHA.121....
171466,33420030,PMC7794439,10.1038/s41467-020-20447-y,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
39904,38278841,PMC10817939,10.1038/s41467-024-45141-1,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
250313,28334889,PMC5449619,10.1093/nar/gkx168,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
86351,36073526,PMC9553214,10.7554/eLife.78074,https://dx.doi.org/10.7554/eLife.78074
247816,31677552,PMC6920132,10.1016/j.redox.2019.101359,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
181385,37172728,PMC10267520,10.1016/j.jbc.2023.104805,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
113274,35140216,PMC8828724,10.1038/s41467-022-28454-x,https://dx.doi.org/10.1038/s41467-022-28454-x
38338,39041647,PMC11605269,10.1002/art.42958,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
156674,33804598,PMC8003772,10.3390/ijms22063164,https://dx.doi.org/10.3390/ijms22063164


In [18]:
# drop NaN citing_publications_links
GEO_df = GEO_df.dropna(subset=['citing_publications_links'])  # Remove NaNs
print(len(GEO_df))

352982


Union of the two dataframes

In [19]:
print(f"len filtered_df: {len(filtered_df)}")
print(f"len GEO_df: {len(GEO_df)}")

len filtered_df: 38688
len GEO_df: 352982


In [20]:
column_mappings = { # Manually map columns
    'accession': 'identifier',
    'taxon': 'species',
    'repository': 'entrytype',
    'publication_date': 'pdat',
    'pubmedids': 'PMID',
}

In [21]:
GEO_df = GEO_df.rename(columns=column_mappings)  # Rename columns
GEO_df['repository'] = 'GEO'  # Add repository column
filtered_df = pd.concat([filtered_df, GEO_df], axis=0, ignore_index=True)  # Concatenate dataframes

In [22]:
filtered_df.iloc[::10000].head(10)  # Show every 10000th row

Unnamed: 0,identifier,title,repository,species,instrument,lab_head,announce_date,keywords,citing_publications_links,PMID,...,pdat,suppfile,samples,relations,extrelations,n_samples,projects,ftplink,geo2r,bioproject
0,PXD059466,Endothelial SHANK3 Regulates Tight Junctions i...,PRIDE,Mus musculus,Orbitrap Fusion Lumos,Il Hwan Kim,2025-02-07,"BioID2, SHANK3, bEnd3. Cells",https://dx.doi.org/10.1038/S41467-025-56720-1,,...,,,,,,,,,,
10000,PXD049417,Multiomics approach to decipher pathophysiolog...,PRIDE,Mus musculus,Orbitrap Fusion,Jiri Petrak,2024-07-03,"Cystathionine beta-synthase, homocystinuria, m...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,38843767.0,...,,,,,,,,,,
20000,PXD031142,Marine Synechococcus sp. strain WH7803 shows s...,PRIDE,"Bacteria, Synechococcus sp. WH 7803",Orbitrap Fusion,José Manuel García Fernández,2022-07-11,"Orbitrap Fusion, Synechococcus WH7803, nitroge...",https://dx.doi.org/10.6019/PXD031142,,...,,,,,,,,,,
30000,PXD014667,Listeria monocytogenes exploits the MICOS comp...,PRIDE,Homo sapiens,Q Exactive,Pascale Cossart,2020-02-17,"Listeria monocytogenes, MICOS complex, Mic10, ...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,32019800.0,...,,,,,,,,,,
40000,GSE268565,BMPR1A-mediated BMP signalling ensures correct...,GEO,Mus musculus,,,,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,39817676.0,...,2025/01/16,TXT,"[{'accession': 'GSM8294327', 'title': 'E14.5 f...",[],[],8.0,[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE268nn...,no,PRJNA1117811
50000,GSE240634,Identification of IKKβ Inhibitor by utilizing ...,GEO,Homo sapiens,,,,,https://dx.doi.org/10.1681/ASN.0000000000000460,39133556.0,...,2024/08/15,TXT,"[{'accession': 'GSM7706552', 'title': 'Sample ...",[],[],18.0,[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE240nn...,yes,PRJNA1004406
60000,GSE245526,Replication-competent HIV-1 gRNA Constructs fo...,GEO,Human immunodeficiency virus 1,,,,,https://dx.doi.org/10.1038/s41467-024-48228-x,38714682.0,...,2024/04/15,"TAB, XLSX","[{'accession': 'GSM7844529', 'title': 'NL4.3 C...",[],[],49.0,[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE245nn...,no,PRJNA1028668
70000,GSE255988,Polygenic Risk for Alcohol Use Disorder Affect...,GEO,Homo sapiens,,,,,https://dx.doi.org/10.1126/sciadv.ado5820,39514655.0,...,2024/02/19,TSV,"[{'accession': 'GSM8083849', 'title': 'Microgl...",[],[],18.0,[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE255nn...,yes,PRJNA1077355
80000,GSE236263,MicroRNA-1 protects the endothelium in acute l...,GEO,Homo sapiens,,,,,https://dx.doi.org/10.1172/jci.insight.164816,37737266.0,...,2023/10/25,TXT,"[{'accession': 'GSM7524125', 'title': 'HPMEC c...",[],[],3.0,[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE236nn...,yes,PRJNA989591
90000,GSE230619,Gene expression data from human adipose-derive...,GEO,Homo sapiens,,,,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,37445596.0,...,2023/07/19,"CEL, CHP, XLSX","[{'accession': 'GSM7230019', 'title': 'hASCs, ...",[],[],9.0,[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE230nn...,yes,PRJNA962011


In [23]:
filtered_df.to_parquet('exp_input/Table_datasets.parquet', index=False)

## Creating Publications Data Table

We start from ProteomeCentral and GEO table we created before

In [34]:
filtered_df = pd.read_parquet('exp_input/Table_datasets.parquet')

In [35]:
print(len(filtered_df))
filtered_df.head(3)

391670


Unnamed: 0,identifier,title,repository,species,instrument,lab_head,announce_date,keywords,citing_publications_links,PMID,...,pdat,suppfile,samples,relations,extrelations,n_samples,projects,ftplink,geo2r,bioproject
0,PXD059466,Endothelial SHANK3 Regulates Tight Junctions i...,PRIDE,Mus musculus,Orbitrap Fusion Lumos,Il Hwan Kim,2025-02-07,"BioID2, SHANK3, bEnd3. Cells",https://dx.doi.org/10.1038/S41467-025-56720-1,,...,,,,,,,,,,
1,PXD051312,Systemic changes in early pregnancy in the mar...,PRIDE,Equus caballus,"Exactive Plus, Orbitrap Exploris 480",Dr Aleona,2025-02-07,"MRP, TTR, biomarker, early pregnancy loss, sec...",https://dx.doi.org/10.6019/PXD051312,,...,,,,,,,,,,
2,PXD051312,Systemic changes in early pregnancy in the mar...,PRIDE,Equus caballus,"Exactive Plus, Orbitrap Exploris 480",Dr Aleona,2025-02-07,"MRP, TTR, biomarker, early pregnancy loss, sec...",https://dx.doi.org/10.1002/prca.202400095,39912552.0,...,,,,,,,,,,


Some statistics on our publications data

look at the publications one by one

In [37]:
filtered_df = filtered_df.explode('citing_publications_links')  # Split lists into rows

In [41]:
df_publications = filtered_df[['citing_publications_links','DOI','PMID','PMCID']]

df_publications = df_publications.sort_values(by='citing_publications_links', key=lambda x: x.str.contains('doi', na=False), ascending=False)

df_publications = df_publications.drop_duplicates(subset=['PMCID', 'DOI'], keep='first')

len(df_publications)

126461

In [42]:
print(len(df_publications))
print(df_publications.dtypes)
df_publications.head(3)

126461
citing_publications_links    object
DOI                          object
PMID                         object
PMCID                        object
dtype: object


Unnamed: 0,citing_publications_links,DOI,PMID,PMCID
0,https://dx.doi.org/10.1038/S41467-025-56720-1,10.1038/S41467-025-56720-1,,
313946,https://dx.doi.org/10.1038/srep10438,10.1038/srep10438,26039282.0,PMC4453922
163224,https://dx.doi.org/10.1038/nature11247,10.1038/nature11247,22955616.0,PMC3439153


In [44]:
df_publications.astype(str).to_parquet('exp_input/Table_publications.parquet', index=False)

## Create the ground truth table

each row is a record of a dataset citation in a doi

In [45]:
Publications_table = pd.read_parquet('exp_input/Table_publications.parquet')
Datasets_table = pd.read_parquet('exp_input/Table_datasets.parquet')

We iterate over all the datasets and use their publication urls to find the corresponding dois. Since not all papers had dois, we will create a Publication Identifier table

In [47]:
Publication_Dataset_Citations_Ground_Truth = Datasets_table[['identifier','repository','citing_publications_links']].copy()

In [49]:
Publication_Dataset_Citations_Ground_Truth.to_parquet('exp_input/dataset_citation_records_Table.parquet', index=False)