# Extract Ground Truth

This Notebook takes as input the Export File from ProteomeCentral: https://proteomecentral.proteomexchange.org/: proteomexchange_search.tsv


The final output will be a json file with ground truth public information on datasets

In [1]:
import pandas as pd
import re
import json
import requests
from xml.etree import ElementTree
import time
from dotenv import load_dotenv
import ast  # To safely evaluate string representation of a list
import bdikit as bdi
import requests
import os

In [2]:
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
print(f"number of dataset ids: {len(df)}")
datasets = df[['publication','identifier','repository','title','keywords']]

number of dataset ids: 40137


## publications values frequencies

to get an idea of the most frequent values in df publication, for all the dataset ids from the source file

In [3]:
datasets['publication'].value_counts()

publication
Dataset with its publication pending                                                                                                                                                                      11719
no publication                                                                                                                                                                                             2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>                                                                                                               56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>                                                                                                            28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>                                                              

Clean the `publication` column by filtering out unwanted values like `Dataset with its publication pending`, `no publication`. Remove rows with unwanted values

In [4]:
filtered_df = datasets[~datasets['publication'].isin(["Dataset with its publication pending", "no publication"])]
len(filtered_df)

25868

## extract the citing Publications for each dataset

Create a new column for the links of the citing publications for each dataset

In [5]:
filtered_df = filtered_df.copy()

filtered_df.loc[:, 'citing_publications_links'] = None

for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Ensure string type
    if "href" in pub:
        match = re.findall(r'href=[\'"]([^\'"]+)[\'"]', pub)  # Extract href links
        filtered_df.at[i, 'citing_publications_links'] = match if match else None
    else:
        filtered_df.at[i, 'citing_publications_links'] = None

# Drop rows with missing links safely
filtered_df = filtered_df.dropna(subset=['citing_publications_links']).reset_index(drop=True)

In [6]:
len(filtered_df) # n datasets with publications

25696

In [7]:
m = 0
for i,row in filtered_df.iterrows():
    id = row['identifier']
    m+=len(row['citing_publications_links'])
print(f"Total number of publications: {m}")
print(f"Average number of publications per dataset: {m/len(filtered_df)}")

Total number of publications: 39347
Average number of publications per dataset: 1.53125


In [8]:
filtered_df.columns

Index(['publication', 'identifier', 'repository', 'title', 'keywords',
       'citing_publications_links'],
      dtype='object')

In [9]:
filtered_df = filtered_df.explode('citing_publications_links')  # Split lists into rows

df_grouped = filtered_df.groupby('citing_publications_links').apply(lambda x: {
    "citing_publication_link": x.name,  # Grouped key
    "cited_datasets_count": len(x),
    "cited_datasets": x.apply(lambda row: {
        "dataset_id": row["identifier"] if pd.notna(row["identifier"]) else "n/a",
        "dataset_repo": row["repository"] if pd.notna(row["repository"]) else "n/a",
        "dataset_keywords": row["keywords"] if pd.notna(row["keywords"]) else "n/a",
        "dataset_title": row["title"] if pd.notna(row["title"]) else "n/a",
        # count of dataset_ids
        
    }, axis=1).tolist()
}).tolist()

json_output = json.dumps(df_grouped, indent=4)

with open("exp_ground_truth/publications_ground_truth.json", "w") as json_file:
    json_file.write(json_output)

## Integration of dataset references from Gene Expression Omnibus   

We want to integrate the GEO datasets

In [10]:
input_ground_truth_file = "exp_ground_truth/publications_ground_truth.json"
output_ground_truth_file = "exp_ground_truth/publications_ground_truth_PXD_GSE.json"
GEO_extract = "exp_input/GEO_data.csv"

### Extracting all the datasets with eutils api

We get all the uids of GEO series datasets

In [11]:
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "gds",  # Search in GEO database
    "term": '"gse"[Entry Type]',  # Only fetch GEO Series (GSE)
    "retmax": "1000000",  # Maximum records to retrieve (adjust as needed)
    "retmode": "xml"
}

root = ElementTree.fromstring(requests.get(esearch_url, params=params).content)

gse_ids = [id_elem.text for id_elem in root.findall(".//Id")]

print(f"Total datasets found: {len(gse_ids)}")
print("Sample GSE IDs:", gse_ids[:10])  # Show first 10 results

Total datasets found: 248808
Sample GSE IDs: ['200291629', '200291614', '200291440', '200291300', '200291298', '200291297', '200291295', '200291292', '200291265', '200291226']


In [12]:
def fetch_GEO_data(IDs,request_url,start,stop):
    params = {
        "db": "gds",
        "id": ",".join(IDs[start:stop]),  # Query window
        "retmode": "json"
    }
    response = requests.get(request_url, params=params)
    
    try:
        data = response.json()
    except:
        raise ValueError("Failed to parse JSON response! Please check the response content.")
    
    return data

In [70]:
fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",0,10)

{'header': {'type': 'esummary', 'version': '0.3'},
 'esummaryresult': ['Empty id list - nothing todo']}

In [16]:
data = {}
i = 0
mxm = 300
while True:
    print(f"Progress: {i / len(gse_ids) * 100} %")
    if i > len(gse_ids):
        break
    try:
        new_data = fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",i,mxm)
        #print(f"Data: {new_data}")
        for uid, details in new_data["result"].items():
            if uid == "uids":  # Ignore metadata key
                continue
            data[uid] = details
    except:
        print(f"Error at {i}, {mxm}")
            
    i += 300
    mxm += 300
    time.sleep(0.05) 

Progress: 0.0 %
Progress: 0.12057490112858107 %
Progress: 0.24114980225716215 %
Progress: 0.3617247033857432 %
Progress: 0.4822996045143243 %
Progress: 0.6028745056429053 %
Progress: 0.7234494067714864 %
Progress: 0.8440243079000675 %
Progress: 0.9645992090286486 %
Progress: 1.0851741101572296 %
Progress: 1.2057490112858107 %
Progress: 1.326323912414392 %
Progress: 1.4468988135429728 %
Progress: 1.567473714671554 %
Progress: 1.688048615800135 %
Progress: 1.808623516928716 %
Progress: 1.9291984180572972 %
Progress: 2.0497733191858782 %
Progress: 2.1703482203144593 %
Progress: 2.2909231214430403 %
Progress: 2.4114980225716214 %
Progress: 2.5320729237002024 %
Progress: 2.652647824828784 %
Progress: 2.7732227259573645 %
Progress: 2.8937976270859456 %
Progress: 3.014372528214527 %
Progress: 3.134947429343108 %
Progress: 3.255522330471689 %
Progress: 3.37609723160027 %
Progress: 3.4966721327288517 %
Progress: 3.617247033857432 %
Progress: 3.7378219349860133 %
Progress: 3.8583968361145944 %
P

In [23]:
def PMID_to_url(pmid):
    base_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    return base_url + str(pmid)

def PMID_to_doi(pmid):
    base = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    params = {"tool": "mytool", "email": "myemail@example.com", "ids": pmid, "format": "json"}    
    
    response = requests.get(base, params=params)
    
    if response.status_code == 200:
        data = response.json()
        records = data.get("records", [])
        if records and "doi" in records[0]:
            return records[0]["pmid"]
        else:
            return None  # No PMID found
    else:
        return None  # Request failed
    
def batch_PMID_to_doi(pmids, batch_size=100):
    base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    results = {}

    for i in range(0, len(pmids), batch_size):
        progress = i / len(pmids) * 100
        print(f"Processing batch {i}-{i+batch_size} ({progress:.2f}%)")
        batch = pmids[i:i+batch_size]  # Get a batch of PMIDs
        params = {"tool": "mytool", "email": "myemail@example.com", "ids": ",".join(batch), "format": "json"}
        
        response = requests.get(base_url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            records = data.get("records", [])

            for record in records:
                pmid = record.get("pmid")
                doi = record.get("doi", None)  # Get DOI if available
                
                if pmid and pmid not in results:
                    results[pmid] = doi  # Store in dictionary
        
        else:
            print(f"API request failed for batch {i}-{i+batch_size}: {response.status_code}")
        
        time.sleep(0.5)  # Prevent hitting API rate limits (adjust as needed)

    return results

In [71]:
pmids_to_gse_mapping = {} # mapping of pubmed ids to accession codes of GEO datasets
not_matched_GSEs = []
no_pmid = 0
pmids_found = []
cnt=0

for entry in data: # iterate dataset entries
    if type(data[entry]['pubmedids']) == list and len(data[entry]['pubmedids']) == 0:
        not_matched_GSEs.append(data[entry]['accession'])
        
    elif type(data[entry]['pubmedids']) == list:
        pmids_found.extend(data[entry]['pubmedids'])
        cnt += 1
        for pmid in data[entry]['pubmedids']:
            if pmid not in pmids_to_gse_mapping:
                pmids_to_gse_mapping[pmid] = [data[entry]['accession']]
            else:
                pmids_to_gse_mapping[pmid].append(data[entry]['accession'])  # Append separately

    else:
        print(f"Check why pmid not extracted at: {entry}!")

print(f"Unique PMIDs found: {len(pmids_to_gse_mapping)}")
print(f"# of Datasets with a pmid citation: {cnt}")
print(f"# of Datasets without a pmid citation: {len(not_matched_GSEs)}")

Unique PMIDs found: 127307
# of Datasets with a pmid citation: 194205
# of Datasets without a pmid citation: 54303


In [53]:
pmid_doi_mapping_file = "exp_output/pmid_doi_mapping.json"
if os.path.exists("exp_output/pmid_doi_mapping.json"):
    
    with open(pmid_doi_mapping_file, "r") as f:
        pmid_doi_mapping = json.load(f)  # read json file
else:
    pmid_doi_mapping = batch_PMID_to_doi(list(set(pmids_found)))
    
    os.makedirs(os.path.dirname(pmid_doi_mapping_file), exist_ok=True)
    with open(pmid_doi_mapping_file, "w") as f:
        json.dump(pmid_doi_mapping, f, indent=4) # Save to JSON

In [54]:
doi_to_pmid_mapping = {v: k for k, v in pmid_doi_mapping.items()} # reverse the mapping

In [72]:
# count how many times dx.doi.org is in df_groud_truth['publication']
with open(input_ground_truth_file, "r") as f:
    input_ground_truth = json.load(f)

count, pmcnt = 0,0

for entry in input_ground_truth: # Iterate through the JSON data
    publication = entry.get("citing_publication_link", "")  # Safely get citing_publication_link field

    if isinstance(publication, str):  # Ensure publication is a string
        if "dx.doi.org" in publication:
            count += 1
        elif "pubmed" in publication:
            pmcnt += 1

print(f"Total dx.doi.org found: {count}")
print(f"Total pubmed found: {pmcnt}")
print(f"Total publications: {len(input_ground_truth)}")
print(f"Total matches: {count + pmcnt}")

Total dx.doi.org found: 14887
Total pubmed found: 19992
Total publications: 34879
Total matches: 34879


In [76]:
# for each entry in ground truth, we will create a column with GSE ids. We can use the publication_col to find the DOI or the PM id
# then we can use the mapping {pmid -> doi} to find the PM id for doi
# then we can use the mapping {pmid -> gse} to find the GSE id for the PM id
import re

# Convert `data` into a lookup dictionary using `accession` as the key
gse_lookup = {v["accession"]: v for v in data.values()}

count, pmcnt = 0, 0
cnt_tot_pm, cnt_tot_doi = 0, 0

for entry in input_ground_truth:

    publication = entry.get("citing_publication_link", "")

    if publication and "dx.doi.org" in publication:
        match = re.search(r'dx.doi.org/([a-zA-Z0-9./-]+)', publication)
        if match:
            doi = match.group(1)
            count += 1
            pmid = doi_to_pmid_mapping.get(doi)
            if pmid:
                gse_ids = pmids_to_gse_mapping.get(pmid, [])  # Ensure it's a list

                for gse_id in gse_ids:  # Iterate over multiple GSEs
                    cnt_tot_doi += 1
                    dataset_info = gse_lookup.get(gse_id, {})  # Get dataset info

                    entry.setdefault("cited_datasets", []).append({
                        "dataset_id": gse_id,
                        "dataset_repo": "GEO",
                        "dataset_summary": dataset_info.get("summary", "N/A"),
                        "dataset_title": dataset_info.get("title", "N/A")
                    })

    elif publication and "pubmed" in publication:
        match = re.search(r'pubmed/([0-9]+)', publication)
        if match:
            pmid = match.group(1)
            pmcnt += 1
            gse_ids = pmids_to_gse_mapping.get(pmid, [])  # Ensure it's a list

            for gse_id in gse_ids:  # Iterate over multiple GSEs
                cnt_tot_pm += 1
                dataset_info = gse_lookup.get(gse_id, {})

                entry.setdefault("cited_datasets", []).append({
                    "dataset_id": gse_id,
                    "dataset_repo": "GEO",
                    "dataset_summary": dataset_info.get("summary", "N/A"),
                    "dataset_title": dataset_info.get("title", "N/A")
                })

# Print results
print(f"Total dx.doi.org found: {count}")
print(f"Total pubmed found: {pmcnt}")
print(f"Total matches of gse accession codes: in DOI: {cnt_tot_doi}, in PMID: {cnt_tot_pm}")

Total dx.doi.org found: 14887
Total pubmed found: 19987
Total matches of gse accession codes: in DOI: 1497, in PMID: 3410


In [67]:
with open(output_ground_truth_file, 'w') as f:
    json.dump(input_ground_truth, f, indent=4) # Save to JSON

In [69]:
len(input_ground_truth)

34879