In [1]:
import re
import requests
from xml.etree import ElementTree
import json
import pandas as pd
import time
from data_fetcher import *
from parser import *
from orchestrator import *
from dotenv import load_dotenv
import ast  # To safely evaluate string representation of a list

In [2]:
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "gds",  # Search in GEO database
    "term": '"gse"[Entry Type]',  # Only fetch GEO Series (GSE)
    "retmax": "1000000",  # Maximum records to retrieve (adjust as needed)
    "retmode": "xml"
}

# Request dataset IDs from NCBI
response = requests.get(esearch_url, params=params)
root = ElementTree.fromstring(response.content)

# Extract dataset IDs
gse_ids = [id_elem.text for id_elem in root.findall(".//Id")]

print(f"Total datasets found: {len(gse_ids)}")
print("Sample GSE IDs:", gse_ids[:10])  # Show first 10 results

Total datasets found: 248333
Sample GSE IDs: ['200290925', '200290913', '200290912', '200290830', '200290828', '200290819', '200290818', '200290788', '200290787', '200290706']


In [3]:
def fetch_GEO_data(IDs,request_url,start,stop):
    params = {
        "db": "gds",
        "id": ",".join(IDs[start:stop]),  # Query for first 10 datasets
        "retmode": "json"
    }

    response = requests.get(request_url, params=params)
    
    try:
        data = response.json()
    except:
        raise ValueError("Failed to parse JSON response! Please check the response content.")
    
    return data

In [4]:
pd.DataFrame(fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",0,2))

Unnamed: 0,header,result
type,esummary,
version,0.3,
uids,,"[200290925, 200290913]"
200290925,,"{'uid': '200290925', 'accession': 'GSE290925',..."
200290913,,"{'uid': '200290913', 'accession': 'GSE290913',..."


In [5]:
# load the data from local file
print("Loading local data...")
try:
    df0 = pd.read_csv("exp_input/GEO_data.csv")
    df0 = df0.drop(columns=["Unnamed: 0"])
    print(f"Rows: {len(df0)}")
    print(f"Columns: {df0.columns}")
except:
    df0 = None
    print("No local data found.")

Loading local data...


  df0 = pd.read_csv("exp_input/GEO_data.csv")


Rows: 248333
Columns: Index(['uid', 'accession', 'gds', 'title', 'summary', 'gpl', 'gse', 'taxon',
       'entrytype', 'gdstype', 'ptechtype', 'valtype', 'ssinfo', 'subsetinfo',
       'pdat', 'suppfile', 'samples', 'relations', 'extrelations', 'n_samples',
       'seriestitle', 'platformtitle', 'platformtaxa', 'samplestaxa',
       'pubmedids', 'projects', 'ftplink', 'geo2r', 'bioproject'],
      dtype='object')


In [6]:
# update dataset with new data whose id is not in the local dataset
if df0 is not None:
    all_ids = set(gse_ids)
    old_ids = set(list(df0["uid"].apply(str)))
    missing_ids = all_ids - old_ids
    
    print(f"Total missing datasets found: {len(missing_ids)}")
    
    data = {}
    step = 1
    
    for i, id in enumerate(missing_ids):
        if i % (len(missing_ids)//10) == 0 and i > 0:
            print(f"Iter {i}")
        try:
            new_data = fetch_GEO_data(list(missing_ids),"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi", i, i+step)
            # print(f"Data: {type(new_data)}")
            # print(f"Data: {new_data.keys()}")
            if "result" not in new_data:
                print(f"new_data: {new_data}")
            for uid, details in new_data['result'].items():
                if uid == "uids":  # Ignore metadata key
                    continue
                data[uid] = details
        except Exception as e:
            raise ValueError(f"Failed to fetch data for {id}! Error: {e}")
        
        time.sleep(0.33)  # Be nice to the server

else:
    data = {}
    i = 0
    mxm = 300
    while True:
        print(i)
        if i > len(gse_ids):
            break
        try:
            new_data = fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",i,mxm)
            #print(f"Data: {new_data}")
            for uid, details in new_data["result"].items():
                if uid == "uids":  # Ignore metadata key
                    continue
                data[uid] = details
        except:
            print(f"Error at {i}, {mxm}")
            
        i += 300
        mxm += 300
        time.sleep(0.1) 

Total missing datasets found: 0


In [7]:
df = pd.DataFrame(data).T
print(f"Rows: {len(df)}")
print(f"Columns: {df.columns}")

Rows: 0
Columns: RangeIndex(start=0, stop=0, step=1)


In [8]:
# union of both dataframes
df = pd.concat([df0,df])
print(f"Rows: {len(df)}")
print(f"Columns: {df.columns}")

Rows: 248333
Columns: Index(['uid', 'accession', 'gds', 'title', 'summary', 'gpl', 'gse', 'taxon',
       'entrytype', 'gdstype', 'ptechtype', 'valtype', 'ssinfo', 'subsetinfo',
       'pdat', 'suppfile', 'samples', 'relations', 'extrelations', 'n_samples',
       'seriestitle', 'platformtitle', 'platformtaxa', 'samplestaxa',
       'pubmedids', 'projects', 'ftplink', 'geo2r', 'bioproject'],
      dtype='object')


In [44]:
df.to_csv("exp_input/GEO_data.csv")

In [22]:
# use False Positives from a data-gatherer model run
gse_FPs = []
# Sample false positives load from txt file where separated by newline
with open("exp_output/model_FPs.txt", "r") as file:
    false_positives = file.read().splitlines()

for fp in false_positives:
    if 'gse' in fp and 'https' in fp:
        fp = fp.split("=")[-1]
        gse_FPs.append(fp)
    elif 'gse' in fp:
        gse_FPs.append(fp)

print(f"Total false positives found: {len(gse_FPs)}")

Total false positives found: 38


In [23]:
# no capital case in accession
df["accession"] = df["accession"].astype(str).str.lower()
# iterate
i = 0
gse_uids = []
for fp in gse_FPs:
    # check if false positive is in the dataset
    if fp in df["accession"].values:
        i+=1
        # drop false positive from dataset
        #print(f"False positive {fp} FOUND in dataset.")
        gse_uids.append(fp)
    else:
        print(f"False positive {fp} NOT FOUND in dataset.")

print(f"Total false positives found also in local data: {i}")
#print(f"Unique false positives found: {len(set(gse_uids))}")

Total false positives found also in local data: 38


In [24]:
# get all false positives from dataframe
df_FPs = df[df["accession"].isin(gse_uids)]
df_FPs

Unnamed: 0,uid,accession,gds,title,summary,gpl,gse,taxon,entrytype,gdstype,...,n_samples,seriestitle,platformtitle,platformtaxa,samplestaxa,pubmedids,projects,ftplink,geo2r,bioproject
1472,200277464,gse277464,,Impact of oxygen availability on the organelle...,The yeast Komagataella phaffii (syn. Pichia pa...,34912,277464,Komagataella phaffii CBS 7435,GSE,Expression profiling by high throughput sequen...,...,16.0,,,,,['39937160'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE277nn...,no,PRJNA1162479
1936,200247291,gse247291,,Adipocyte-specific Steap4 deficiency reduced t...,"Steap4, highly expressed in adipose tissue, is...",24247,247291,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,6.0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE247nn...,no,PRJNA1037060
2349,200195983,gse195983,,Transcriptome profile of EZR_KO cells,To analyze gene expression differences between...,24247,195983,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,8.0,,,,,['39937579'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE195nn...,no,PRJNA802830
2749,200272353,gse272353,,Genetic context modulates aging and degenerati...,Background Age is the principal risk factor fo...,24247,272353,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,201.0,,,,,['39833899'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE272nn...,no,PRJNA1136503
2759,200252821,gse252821,,Sphingolipid metabolism orchestrates the estab...,Bioactive sphingolipids serve as an essential ...,19057,252821,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,4.0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE252nn...,no,PRJNA1062771
2850,200266649,gse266649,,Contribution of hypoxia-inducible 1alpha to pa...,Aims: Hypertrophic cardiomyopathy (HCM) caused...,19057,266649,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,4.0,,,,,['39820339'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE266nn...,no,PRJNA1108190
2890,200287583,gse287583,,Dysfunctional b-cell autophagy induces b-cell ...,"We knocked out the critical autophagy enzyme, ...",24247,287583,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,6.0,,,,,['39944686'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE287nn...,no,PRJNA1213782
6823,200283213,gse283213,,Comparative proteomic landscapes provide insig...,Understanding mammalian preimplantation develo...,21273,283213,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,24.0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE283nn...,no,PRJNA1191977
14685,200273052,gse273052,,Brown Adipose Tissue undergoes pathological pe...,Amyotrophic lateral sclerosis (ALS) is a progr...,24247,273052,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,8.0,,,,,['39916853'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE273nn...,no,PRJNA1139843
16154,200269782,gse269782,,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,14550,269782,Homo sapiens,GSE,Expression profiling by array,...,9.0,,,,,['39251588'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nn...,yes,PRJNA1123684


In [25]:
# function to get PMCid and doi from pubmed id
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)
orchestrator.setup_data_fetcher()

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp
orchestrator.py - line 45 - INFO - Data fetcher setup completed.


<selenium.webdriver.firefox.webdriver.WebDriver (session="f789bd8b-35c5-42a4-989a-b656774221da")>

In [26]:
for i, row in df_FPs.iterrows():
    pubmed_id = row["pubmedids"]
    print(pubmed_id)  
    
    # Convert string representation of list to an actual list
    if isinstance(pubmed_id, str) and pubmed_id.startswith("["):
        pubmed_id = ast.literal_eval(pubmed_id)  # Converts to a real list
        pubmed_id = pubmed_id[0] if pubmed_id else None  # Extract the first element
        
    if pubmed_id is None:
        continue

    print(type(pubmed_id))  # Should now be <class 'str'>
    print(pubmed_id)  # Should print only the number
    
    pmc_id, doi = orchestrator.data_fetcher.get_opendata_from_pubmed_id(pubmed_id)
        
    print(f"PMC ID: {pmc_id}")
    print(f"DOI: {doi}")
    
    # Store results
    df_FPs.at[i, "pmcid"] = pmc_id
    df_FPs.at[i, "doi"] = doi

orchestrator.data_fetcher.quit()

data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39937160/


['39937160']
<class 'str'>
39937160


data_fetcher.py - line 207 - INFO - PMCID: PMC11816699
data_fetcher.py - line 215 - INFO - DOI: 10.1111/1751-7915.70106
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_FPs.at[i, "pmcid"] = pmc_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_FPs.at[i, "doi"] = doi
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39937579/


PMC ID: PMC11816699
DOI: 10.1111/1751-7915.70106
[]
['39937579']
<class 'str'>
39937579


data_fetcher.py - line 207 - INFO - PMCID: PMC11820125
data_fetcher.py - line 215 - INFO - DOI: 10.7554/eLife.98523
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39833899/


PMC ID: PMC11820125
DOI: 10.7554/eLife.98523
['39833899']
<class 'str'>
39833899


data_fetcher.py - line 207 - INFO - PMCID: PMC11744848
data_fetcher.py - line 215 - INFO - DOI: 10.1186/s13024-025-00800-9
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39820339/


PMC ID: PMC11744848
DOI: 10.1186/s13024-025-00800-9
[]
['39820339']
<class 'str'>
39820339


data_fetcher.py - line 207 - INFO - PMCID: PMC11739497
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41598-025-85187-9
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39944686/


PMC ID: PMC11739497
DOI: 10.1038/s41598-025-85187-9
['39944686']
<class 'str'>
39944686


data_fetcher.py - line 207 - INFO - PMCID: PMC11814175
data_fetcher.py - line 215 - INFO - DOI: 10.3389/fimmu.2025.1504583
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39916853/


PMC ID: PMC11814175
DOI: 10.3389/fimmu.2025.1504583
[]
['39916853']
<class 'str'>
39916853


data_fetcher.py - line 207 - INFO - PMCID: PMC11800085
data_fetcher.py - line 215 - INFO - DOI: 10.1016/j.heliyon.2025.e41801
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39251588/


PMC ID: PMC11800085
DOI: 10.1016/j.heliyon.2025.e41801
['39251588']
<class 'str'>
39251588


data_fetcher.py - line 207 - INFO - PMCID: PMC11385228
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41467-024-51831-7
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39198643/


PMC ID: PMC11385228
DOI: 10.1038/s41467-024-51831-7
['39198643']
<class 'str'>
39198643


data_fetcher.py - line 207 - INFO - PMCID: PMC11424477
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41586-024-07873-4
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/37291385/


PMC ID: PMC11424477
DOI: 10.1038/s41586-024-07873-4
['37291385']
<class 'str'>
37291385


data_fetcher.py - line 207 - INFO - PMCID: PMC10307624
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41590-023-01522-0
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/37024923/


PMC ID: PMC10307624
DOI: 10.1038/s41590-023-01522-0
['37024923']
<class 'str'>
37024923


data_fetcher.py - line 207 - INFO - PMCID: PMC10080794
data_fetcher.py - line 215 - INFO - DOI: 10.1186/s13059-023-02918-9
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/36074823/


PMC ID: PMC10080794
DOI: 10.1186/s13059-023-02918-9
['36074823']
<class 'str'>
36074823


data_fetcher.py - line 207 - INFO - PMCID: None
data_fetcher.py - line 215 - INFO - DOI: 10.1126/science.abo7923
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/37130519/


PMC ID: None
DOI: 10.1126/science.abo7923
['37130519']
<class 'str'>
37130519


data_fetcher.py - line 207 - INFO - PMCID: None
data_fetcher.py - line 215 - INFO - DOI: 10.1016/j.chembiol.2023.04.005
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/35697785/


PMC ID: None
DOI: 10.1016/j.chembiol.2023.04.005
['35697785']
<class 'str'>
35697785


data_fetcher.py - line 207 - INFO - PMCID: None
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41556-022-00928-6
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/35108058/


PMC ID: None
DOI: 10.1038/s41556-022-00928-6
['35108058', '36333315']
<class 'str'>
35108058


data_fetcher.py - line 207 - INFO - PMCID: PMC8809684
data_fetcher.py - line 215 - INFO - DOI: 10.1126/sciadv.abj3967
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/32461551/


PMC ID: PMC8809684
DOI: 10.1126/sciadv.abj3967
['32461551']
<class 'str'>
32461551


data_fetcher.py - line 207 - INFO - PMCID: PMC7253418
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41467-020-16444-w
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/32832598/


PMC ID: PMC7253418
DOI: 10.1038/s41467-020-16444-w
['32832598']
<class 'str'>
32832598


data_fetcher.py - line 207 - INFO - PMCID: PMC7439444
data_fetcher.py - line 215 - INFO - DOI: 10.1126/sciadv.aba1972
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/30609749/


PMC ID: PMC7439444
DOI: 10.1126/sciadv.aba1972
['30609749']
<class 'str'>
30609749


data_fetcher.py - line 207 - INFO - PMCID: PMC6356910
data_fetcher.py - line 215 - INFO - DOI: 10.3390/cancers11010036
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/31109297/


PMC ID: PMC6356910
DOI: 10.3390/cancers11010036
['31109297']
<class 'str'>
31109297


data_fetcher.py - line 207 - INFO - PMCID: PMC6528255
data_fetcher.py - line 215 - INFO - DOI: 10.1186/s12864-019-5775-1
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/29170441/


PMC ID: PMC6528255
DOI: 10.1186/s12864-019-5775-1
['29170441']
<class 'str'>
29170441


data_fetcher.py - line 207 - INFO - PMCID: PMC5701008
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41467-017-01981-8
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/30291293/


PMC ID: PMC5701008
DOI: 10.1038/s41467-017-01981-8
['30291293']
<class 'str'>
30291293


data_fetcher.py - line 207 - INFO - PMCID: PMC6173712
data_fetcher.py - line 215 - INFO - DOI: 10.1038/s41598-018-33190-8
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/28622766/


PMC ID: PMC6173712
DOI: 10.1038/s41598-018-33190-8
['28622766']
<class 'str'>
28622766


data_fetcher.py - line 207 - INFO - PMCID: PMC5473967
data_fetcher.py - line 215 - INFO - DOI: 10.1186/s13059-017-1222-2
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/28541577/


PMC ID: PMC5473967
DOI: 10.1186/s13059-017-1222-2
['28541577']
<class 'str'>
28541577


data_fetcher.py - line 207 - INFO - PMCID: PMC5570006
data_fetcher.py - line 215 - INFO - DOI: 10.1093/nar/gkx469
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/26893308/


PMC ID: PMC5570006
DOI: 10.1093/nar/gkx469
['26893308']
<class 'str'>
26893308


data_fetcher.py - line 207 - INFO - PMCID: PMC4770386
data_fetcher.py - line 215 - INFO - DOI: 10.15252/msb.20156662
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/26430118/


PMC ID: PMC4770386
DOI: 10.15252/msb.20156662
['26430118']
<class 'str'>
26430118


data_fetcher.py - line 207 - INFO - PMCID: None
data_fetcher.py - line 215 - INFO - DOI: 10.1126/science.aac7368
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/27135913/


PMC ID: None
DOI: 10.1126/science.aac7368
['27135913']
<class 'str'>
27135913


data_fetcher.py - line 207 - INFO - PMCID: PMC4802414
data_fetcher.py - line 215 - INFO - DOI: 10.1016/j.cels.2015.08.012
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/25871388/


PMC ID: PMC4802414
DOI: 10.1016/j.cels.2015.08.012
['25871388']
<class 'str'>
25871388


data_fetcher.py - line 207 - INFO - PMCID: PMC4496364
data_fetcher.py - line 215 - INFO - DOI: 10.18632/oncotarget.3389
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/25870145/


PMC ID: PMC4496364
DOI: 10.18632/oncotarget.3389
[]
['25870145']
<class 'str'>
25870145


data_fetcher.py - line 207 - INFO - PMCID: PMC4605607
data_fetcher.py - line 215 - INFO - DOI: 10.1158/0008-5472.CAN-14-3167
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/25215492/


PMC ID: PMC4605607
DOI: 10.1158/0008-5472.CAN-14-3167
['25215492', '27161320']
<class 'str'>
25215492


data_fetcher.py - line 207 - INFO - PMCID: PMC4163055
data_fetcher.py - line 215 - INFO - DOI: 10.1016/j.cell.2014.08.012
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/23934149/


PMC ID: PMC4163055
DOI: 10.1016/j.cell.2014.08.012
['23934149', '32968278']
<class 'str'>
23934149


data_fetcher.py - line 207 - INFO - PMCID: None
data_fetcher.py - line 215 - INFO - DOI: 10.1038/nsmb.2660
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/22961667/


PMC ID: None
DOI: 10.1038/nsmb.2660
['22961667']
<class 'str'>
22961667


data_fetcher.py - line 207 - INFO - PMCID: PMC3477553
data_fetcher.py - line 215 - INFO - DOI: 10.1158/2159-8290.CD-12-0103
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/22751098/


PMC ID: PMC3477553
DOI: 10.1158/2159-8290.CD-12-0103
['22751098']
<class 'str'>
22751098


data_fetcher.py - line 207 - INFO - PMCID: PMC3408577
data_fetcher.py - line 215 - INFO - DOI: 10.1038/ng.2330
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/22552098/


PMC ID: PMC3408577
DOI: 10.1038/ng.2330
['22552098']
<class 'str'>
22552098


data_fetcher.py - line 207 - INFO - PMCID: PMC3347774
data_fetcher.py - line 215 - INFO - DOI: 10.1038/nature11083
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/22080568/


PMC ID: PMC3347774
DOI: 10.1038/nature11083
['22080568', '23028479']
<class 'str'>
22080568


data_fetcher.py - line 207 - INFO - PMCID: None
data_fetcher.py - line 215 - INFO - DOI: 10.1158/0008-5472.CAN-11-1403


PMC ID: None
DOI: 10.1158/0008-5472.CAN-11-1403


data_fetcher.py - line 255 - INFO - WebScraper driver quit.


In [19]:
# for i, row in df_FPs.iterrows():
#     pubmed_id = row["pubmedids"]
#     print(pubmed_id)  
#     
#     # Convert string representation of list to an actual list
#     if isinstance(pubmed_id, str) and pubmed_id.startswith("["):
#         pubmed_id = ast.literal_eval(pubmed_id)  # Converts to a real list
#         pubmed_id = pubmed_id[0] if pubmed_id else None  # Extract the first element
#         
#     if pubmed_id is None:
#         continue
# 
#     print(type(pubmed_id))  # Should now be <class 'str'>
#     print(pubmed_id)  # Should print only the number
#     
#     url = orchestrator.data_fetcher.get_url_from_pubmed_id(pubmed_id)
#     print(url)
#     
#     html = orchestrator.data_fetcher.fetch_data(url)
#     
#     # Parse PMC ID and DOI from the HTML content
#     soup = BeautifulSoup(html, 'html.parser')
# 
#     # Extract PMC ID
#     pmc_tag = soup.find("a", {"data-ga-action": "PMCID"})
#     pmc_id = pmc_tag.text.strip() if pmc_tag else None  # Extract text safely
# 
#     # Extract DOI
#     doi_tag = soup.find("a", {"data-ga-action": "DOI"})
#     doi = doi_tag.text.strip() if doi_tag else None  # Extract text safely
#         
#     print(f"PMC ID: {pmc_id}")
#     print(f"DOI: {doi}")
#     
#     # Store results
#     df_FPs.at[i, "pmcid"] = pmc_id
#     df_FPs.at[i, "doi"] = doi
# 
# orchestrator.data_fetcher.quit()

<class 'str'>
39937160
https://pubmed.ncbi.nlm.nih.gov/39937160/
PMC ID: PMC11816699
DOI: 10.1111/1751-7915.70106
<class 'NoneType'>
None
https://pubmed.ncbi.nlm.nih.gov/None/
PMC ID: None
DOI: None
<class 'str'>
39937579
https://pubmed.ncbi.nlm.nih.gov/39937579/
PMC ID: PMC11820125
DOI: 10.7554/eLife.98523
<class 'NoneType'>
None
https://pubmed.ncbi.nlm.nih.gov/None/
PMC ID: None
DOI: None
<class 'str'>
39820339
https://pubmed.ncbi.nlm.nih.gov/39820339/
PMC ID: PMC11739497
DOI: 10.1038/s41598-025-85187-9
<class 'str'>
39944686
https://pubmed.ncbi.nlm.nih.gov/39944686/
PMC ID: PMC11814175
DOI: 10.3389/fimmu.2025.1504583
<class 'str'>
39790481
https://pubmed.ncbi.nlm.nih.gov/39790481/
PMC ID: PMC11707872
DOI: 10.1093/femsml/uqae026
<class 'str'>
39831303
https://pubmed.ncbi.nlm.nih.gov/39831303/
PMC ID: PMC11744191
DOI: 10.1093/nar/gkaf011
<class 'str'>
39898029
https://pubmed.ncbi.nlm.nih.gov/39898029/
PMC ID: PMC11787672
DOI: 10.1016/j.isci.2025.111745
<class 'str'>
39747972
https://p

data_fetcher.py - line 227 - INFO - WebScraper driver quit.


In [74]:
df_FPs.head()

Unnamed: 0,uid,accession,gds,title,summary,gpl,gse,taxon,entrytype,gdstype,...,platformtitle,platformtaxa,samplestaxa,pubmedids,projects,ftplink,geo2r,bioproject,pmcid,doi
1472,200277464,gse277464,,Impact of oxygen availability on the organelle...,The yeast Komagataella phaffii (syn. Pichia pa...,34912,277464,Komagataella phaffii CBS 7435,GSE,Expression profiling by high throughput sequen...,...,,,,['39937160'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE277nn...,no,PRJNA1162479,PMC11816699,10.1111/1751-7915.70106
1936,200247291,gse247291,,Adipocyte-specific Steap4 deficiency reduced t...,"Steap4, highly expressed in adipose tissue, is...",24247,247291,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE247nn...,no,PRJNA1037060,,
2349,200195983,gse195983,,Transcriptome profile of EZR_KO cells,To analyze gene expression differences between...,24247,195983,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39937579'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE195nn...,no,PRJNA802830,PMC11820125,10.7554/eLife.98523
2749,200272353,gse272353,,Genetic context modulates aging and degenerati...,Background Age is the principal risk factor fo...,24247,272353,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39833899'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE272nn...,no,PRJNA1136503,PMC11744848,10.1186/s13024-025-00800-9
2759,200252821,gse252821,,Sphingolipid metabolism orchestrates the estab...,Bioactive sphingolipids serve as an essential ...,19057,252821,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE252nn...,no,PRJNA1062771,,


In [89]:
df1 = pd.read_csv("exp_input/PRIDEid_HTML_data.csv")
df1 = df1.drop(columns=["Unnamed: 0"])
print(f"Rows: {len(df1)}")
print(f"Columns: {df1.columns}")

Rows: 236
Columns: Index(['publication', 'dataset_uid', 'repo_name', 'raw_html', 'publisher',
       'smallest_elements'],
      dtype='object')


In [90]:
df1.head()

Unnamed: 0,publication,dataset_uid,repo_name,raw_html,publisher,smallest_elements
0,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,PRIDE,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,na
1,https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The mass spectrometry proteomics data ha...
2,https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The data that support the findings of th...
3,https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>Generation of the protein library and SW...
4,https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>The mass spectrometry proteomics data ha...


In [91]:
df_FPs

Unnamed: 0,uid,accession,gds,title,summary,gpl,gse,taxon,entrytype,gdstype,...,platformtitle,platformtaxa,samplestaxa,pubmedids,projects,ftplink,geo2r,bioproject,pmcid,doi
1472,200277464,gse277464,,Impact of oxygen availability on the organelle...,The yeast Komagataella phaffii (syn. Pichia pa...,34912,277464,Komagataella phaffii CBS 7435,GSE,Expression profiling by high throughput sequen...,...,,,,['39937160'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE277nn...,no,PRJNA1162479,PMC11816699,10.1111/1751-7915.70106
1936,200247291,gse247291,,Adipocyte-specific Steap4 deficiency reduced t...,"Steap4, highly expressed in adipose tissue, is...",24247,247291,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE247nn...,no,PRJNA1037060,,
2349,200195983,gse195983,,Transcriptome profile of EZR_KO cells,To analyze gene expression differences between...,24247,195983,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39937579'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE195nn...,no,PRJNA802830,PMC11820125,10.7554/eLife.98523
2749,200272353,gse272353,,Genetic context modulates aging and degenerati...,Background Age is the principal risk factor fo...,24247,272353,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39833899'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE272nn...,no,PRJNA1136503,PMC11744848,10.1186/s13024-025-00800-9
2759,200252821,gse252821,,Sphingolipid metabolism orchestrates the estab...,Bioactive sphingolipids serve as an essential ...,19057,252821,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE252nn...,no,PRJNA1062771,,
2850,200266649,gse266649,,Contribution of hypoxia-inducible 1alpha to pa...,Aims: Hypertrophic cardiomyopathy (HCM) caused...,19057,266649,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39820339'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE266nn...,no,PRJNA1108190,PMC11739497,10.1038/s41598-025-85187-9
2890,200287583,gse287583,,Dysfunctional b-cell autophagy induces b-cell ...,"We knocked out the critical autophagy enzyme, ...",24247,287583,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39944686'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE287nn...,no,PRJNA1213782,PMC11814175,10.3389/fimmu.2025.1504583
6823,200283213,gse283213,,Comparative proteomic landscapes provide insig...,Understanding mammalian preimplantation develo...,21273,283213,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE283nn...,no,PRJNA1191977,,
14685,200273052,gse273052,,Brown Adipose Tissue undergoes pathological pe...,Amyotrophic lateral sclerosis (ALS) is a progr...,24247,273052,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,,,,['39916853'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE273nn...,no,PRJNA1139843,PMC11800085,10.1016/j.heliyon.2025.e41801
16154,200269782,gse269782,,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,14550,269782,Homo sapiens,GSE,Expression profiling by array,...,,,,['39251588'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nn...,yes,PRJNA1123684,PMC11385228,10.1038/s41467-024-51831-7


In [94]:
df_FPs.loc[:, "doi"] = df_FPs["doi"].astype(str).str.lower()
df1.loc[:, "publication"] = df1["publication"].astype(str).str.lower()

print(f'FP DOI: {df_FPs["doi"]}')
print(f'df1 publication{df1["publication"]}')

FP DOI: 1472             10.1111/1751-7915.70106
1936                                 nan
2349                 10.7554/elife.98523
2749          10.1186/s13024-025-00800-9
2759                                 nan
2850          10.1038/s41598-025-85187-9
2890          10.3389/fimmu.2025.1504583
6823                                 nan
14685      10.1016/j.heliyon.2025.e41801
16154         10.1038/s41467-024-51831-7
17444         10.1038/s41586-024-07873-4
46541         10.1038/s41590-023-01522-0
51322         10.1186/s13059-023-02918-9
62852            10.1126/science.abo7923
71503     10.1016/j.chembiol.2023.04.005
74285         10.1038/s41556-022-00928-6
82664             10.1126/sciadv.abj3967
124924        10.1038/s41467-020-16444-w
130045            10.1126/sciadv.aba1972
139212           10.3390/cancers11010036
145418         10.1186/s12864-019-5775-1
153914        10.1038/s41467-017-01981-8
156237        10.1038/s41598-018-33190-8
164315         10.1186/s13059-017-1222-2
168059  

In [95]:
# Create mapping of DOI to concatenated GSE accession codes
doi_to_gse = (
    df_FPs.groupby("doi")["accession"]
    .apply(lambda x: ",".join(set(x.astype(str))))  # Ensures unique values
    .to_dict()
)
doi_to_gse

{'10.1016/j.cell.2014.08.012': 'gse53743',
 '10.1016/j.cels.2015.08.012': 'gse66715',
 '10.1016/j.chembiol.2023.04.005': 'gse202350',
 '10.1016/j.heliyon.2025.e41801': 'gse273052',
 '10.1038/nature11083': 'gse36892',
 '10.1038/ng.2330': 'gse38121',
 '10.1038/nsmb.2660': 'gse36552',
 '10.1038/s41467-017-01981-8': 'gse94460',
 '10.1038/s41467-020-16444-w': 'gse138760',
 '10.1038/s41467-024-51831-7': 'gse269782',
 '10.1038/s41556-022-00928-6': 'gse165782',
 '10.1038/s41586-024-07873-4': 'gse268813',
 '10.1038/s41590-023-01522-0': 'gse229376',
 '10.1038/s41598-018-33190-8': 'gse106765',
 '10.1038/s41598-025-85187-9': 'gse266649',
 '10.1093/nar/gkx469': 'gse87328',
 '10.1111/1751-7915.70106': 'gse277464',
 '10.1126/sciadv.aba1972': 'gse135893',
 '10.1126/sciadv.abj3967': 'gse169632',
 '10.1126/science.aac7368': 'gse72064',
 '10.1126/science.abo7923': 'gse197265',
 '10.1158/0008-5472.can-11-1403': 'gse31210',
 '10.1158/0008-5472.can-14-3167': 'gse59239',
 '10.1158/2159-8290.cd-12-0103': 'gse

In [96]:
# Function to find matching DOIs for a given publication
def find_matching_gse(publication, doi_dict):
    matched_gse_codes = []
    for doi, gse_codes in doi_dict.items():
        if pd.notna(publication) and doi in publication:  # Substring match
            matched_gse_codes.append(gse_codes)

    return ",".join(set(matched_gse_codes)) if matched_gse_codes else None

# Apply the function to find matches and update dataset_uid
df1["new_dataset_uids"] = df1["publication"].apply(lambda pub: find_matching_gse(pub, doi_to_gse))

# Concatenate new dataset IDs with existing ones (if any)
df1["dataset_uid"] = df1.apply(
    lambda row: f"{row['dataset_uid']},{row['new_dataset_uids']}" if pd.notna(row["new_dataset_uids"]) else row["dataset_uid"], axis=1
)

# Remove unnecessary column and strip redundant commas
df1.drop(columns=["new_dataset_uids"], inplace=True)
df1["dataset_uid"] = df1["dataset_uid"].str.strip(",")

In [100]:
df1[["publication","dataset_uid"]]

Unnamed: 0,publication,dataset_uid
0,https://dx.doi.org/10.1001/jamaneurol.2024.4763,PXD056570
1,https://dx.doi.org/10.1002/cbic.202400831,PXD055649
2,https://dx.doi.org/10.1002/cbic.202400882,PXD060372
3,https://dx.doi.org/10.1002/prca.202300107,PXD028078
4,https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045"
...,...,...
231,https://www.ncbi.nlm.nih.gov/pubmed/39884247,PXD054970
232,https://www.ncbi.nlm.nih.gov/pubmed/39900909,PXD058955
233,https://www.ncbi.nlm.nih.gov/pubmed/39910101,PXD060051
234,https://www.ncbi.nlm.nih.gov/pubmed/39910614,PXD060193


In [101]:
df1.to_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")

In [None]:
# we have dataframe df1 with all our scraped Ground Truth data (raw_html, publication, dataset_uid, ...)
# we have dataframe df_FPs with all our (supposedly) false positives on Ground Truth (accession, pmcid, doi)
# with the previous steps in the GEO dataset creation notebook, we now have ground truth for these "False Positives"
# look for matches, then update ground truth with the new data

# for i, row in df1.iterrows():
#     publication = row["publication"].lower()
#     for fp in df_FPs["doi"]:
#         fp = fp.lower()
#         if fp is not None and (fp in publication or fp == publication):
#             print(f"Publication: {publication}")
#             print(f"DOI of ex-false positive: {fp}")
#             vals = df_FPs[df_FPs["doi"] == fp]
#             print(f"ex-False Positive id: {vals['accession'].values}")
#             addenda = ',' + df_FPs[df_FPs["doi"] == fp]["accession"].values
#             #df1.at[i, "dataset_uid"] += addenda
#             #print(df1.at[i, "dataset_uid"])
#             #print(addenda)
#         elif fp is None:
#             print(None)