In [72]:
import re
import requests
from xml.etree import ElementTree
import json
import pandas as pd
import time
from data_fetcher import *
from parser import *
from orchestrator import *
from dotenv import load_dotenv
import ast  # To safely evaluate string representation of a list
import bdikit as bdi

In [73]:
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "gds",  # Search in GEO database
    "term": '"gse"[Entry Type]',  # Only fetch GEO Series (GSE)
    "retmax": "1000000",  # Maximum records to retrieve (adjust as needed)
    "retmode": "xml"
}

# Request dataset IDs from NCBI
response = requests.get(esearch_url, params=params)
root = ElementTree.fromstring(response.content)

# Extract dataset IDs
gse_ids = [id_elem.text for id_elem in root.findall(".//Id")]

print(f"Total datasets found: {len(gse_ids)}")
print("Sample GSE IDs:", gse_ids[:10])  # Show first 10 results

Total datasets found: 248384
Sample GSE IDs: ['200291028', '200290741', '200290722', '200290718', '200290678', '200290658', '200290614', '200290157', '200289999', '200289807']


In [74]:
def fetch_GEO_data(IDs,request_url,start,stop):
    params = {
        "db": "gds",
        "id": ",".join(IDs[start:stop]),  # Query for first 10 datasets
        "retmode": "json"
    }

    response = requests.get(request_url, params=params)
    
    try:
        data = response.json()
    except:
        raise ValueError("Failed to parse JSON response! Please check the response content.")
    
    return data

In [75]:
pd.DataFrame(fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",0,2))

Unnamed: 0,header,result
type,esummary,
version,0.3,
uids,,"[200291028, 200290741]"
200291028,,"{'uid': '200291028', 'accession': 'GSE291028',..."
200290741,,"{'uid': '200290741', 'accession': 'GSE290741',..."


In [76]:
# load the data from local file
print("Loading local data...")
try:
    df0 = pd.read_csv("exp_input/GEO_data.csv")
    df0 = df0.drop(columns=["Unnamed: 0"])
    # drop null columns
    df0 = df0.dropna(axis=1, how='all')
    print(f"Shape: {df0.shape}")
    print(f"Columns: {df0.columns}")
except:
    df0 = None
    print("No local data found.")

Loading local data...


  df0 = pd.read_csv("exp_input/GEO_data.csv")


Shape: (248387, 21)
Columns: Index(['uid', 'accession', 'gds', 'title', 'summary', 'gpl', 'gse', 'taxon',
       'entrytype', 'gdstype', 'pdat', 'suppfile', 'samples', 'relations',
       'extrelations', 'n_samples', 'pubmedids', 'projects', 'ftplink',
       'geo2r', 'bioproject'],
      dtype='object')


In [77]:
# update dataset with new data whose id is not in the local dataset
if df0 is not None:
    all_ids = set(gse_ids)
    old_ids = set(list(df0["uid"].apply(str)))
    missing_ids = all_ids - old_ids
    
    print(f"Total missing datasets found: {len(missing_ids)}")
    
    data = {}
    step = 1
    
    for i, id in enumerate(missing_ids):
        if i % (len(missing_ids)//10) == 0 and i > 0:
            print(f"Iter {i}")
        try:
            new_data = fetch_GEO_data(list(missing_ids),"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi", i, i+step)
            # print(f"Data: {type(new_data)}")
            # print(f"Data: {new_data.keys()}")
            if "result" not in new_data:
                print(f"new_data: {new_data}")
            for uid, details in new_data['result'].items():
                if uid == "uids":  # Ignore metadata key
                    continue
                data[uid] = details
        except Exception as e:
            raise ValueError(f"Failed to fetch data for {id}! Error: {e}")
        
        time.sleep(0.33)  # Be nice to the server

else:
    data = {}
    i = 0
    mxm = 300
    while True:
        print(i)
        if i > len(gse_ids):
            break
        try:
            new_data = fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",i,mxm)
            #print(f"Data: {new_data}")
            for uid, details in new_data["result"].items():
                if uid == "uids":  # Ignore metadata key
                    continue
                data[uid] = details
        except:
            print(f"Error at {i}, {mxm}")
            
        i += 300
        mxm += 300
        time.sleep(0.1) 

Total missing datasets found: 0


In [78]:
df = pd.DataFrame(data).T
df = df.dropna(axis=1, how='all')
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")

Shape: (0, 0)
Columns: RangeIndex(start=0, stop=0, step=1)


In [79]:
df

In [80]:
# union of both dataframes
df = pd.concat([df0,df])
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")

Shape: (248387, 21)
Columns: Index(['uid', 'accession', 'gds', 'title', 'summary', 'gpl', 'gse', 'taxon',
       'entrytype', 'gdstype', 'pdat', 'suppfile', 'samples', 'relations',
       'extrelations', 'n_samples', 'pubmedids', 'projects', 'ftplink',
       'geo2r', 'bioproject'],
      dtype='object')


In [81]:
df.to_csv("exp_input/GEO_data.csv")

In [82]:
# use False Positives from a data-gatherer model run
gse_FPs = []
# Sample false positives load from txt file where separated by newline
with open("exp_output/model_FPs.txt", "r") as file:
    false_positives = file.read().splitlines()

for fp in false_positives:
    if 'gse' in fp and 'https' in fp:
        fp = fp.split("=")[-1]
        gse_FPs.append(fp)
    elif 'gse' in fp:
        gse_FPs.append(fp)

print(f"Total false positives found: {len(gse_FPs)}")

Total false positives found: 36


In [83]:
# no capital case in accession
df["accession"] = df["accession"].astype(str).str.lower()
# iterate
i = 0
gse_uids = []
for fp in gse_FPs:
    # check if false positive is in the dataset
    if fp in df["accession"].values:
        i+=1
        # drop false positive from dataset
        #print(f"False positive {fp} FOUND in dataset.")
        gse_uids.append(fp)
    else:
        print(f"False positive {fp} NOT FOUND in dataset.")

print(f"Total false positives found also in local GEO data: {i}")
#print(f"Unique false positives found: {len(set(gse_uids))}")

Total false positives found also in local GEO data: 36


In [84]:
# get all false positives from dataframe
df_FPs = df[df["accession"].isin(gse_uids)]
df_FPs

Unnamed: 0,uid,accession,gds,title,summary,gpl,gse,taxon,entrytype,gdstype,...,suppfile,samples,relations,extrelations,n_samples,pubmedids,projects,ftplink,geo2r,bioproject
1936,200247291,gse247291,,Adipocyte-specific Steap4 deficiency reduced t...,"Steap4, highly expressed in adipose tissue, is...",24247,247291,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,TXT,"[{'accession': 'GSM7886717', 'title': 'Steap4f...",[],[],6.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE247nn...,no,PRJNA1037060
2349,200195983,gse195983,,Transcriptome profile of EZR_KO cells,To analyze gene expression differences between...,24247,195983,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,TXT,"[{'accession': 'GSM5857265', 'title': 'MEFs, w...",[],[],8.0,['39937579'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE195nn...,no,PRJNA802830
2759,200252821,gse252821,,Sphingolipid metabolism orchestrates the estab...,Bioactive sphingolipids serve as an essential ...,19057,252821,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,"H5AD, MTX, TSV, TXT","[{'accession': 'GSM8008506', 'title': 'Epiderm...",[],[],4.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE252nn...,no,PRJNA1062771
4767,200267957,gse267957,,Transcriptome analyses of Leishmania donovani ...,This study was performed with the goal of unde...,34491,267957,Leishmania donovani,GSE,Expression profiling by high throughput sequen...,...,CSV,"[{'accession': 'GSM8282498', 'title': 'exp 2 s...",[],[],6.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE267nn...,no,PRJNA1113700
6823,200283213,gse283213,,Comparative proteomic landscapes provide insig...,Understanding mammalian preimplantation develo...,21273,283213,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,TXT,"[{'accession': 'GSM8658087', 'title': 'Zfp622_...",[],[],24.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE283nn...,no,PRJNA1191977
16154,200269782,gse269782,,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,14550,269782,Homo sapiens,GSE,Expression profiling by array,...,TXT,"[{'accession': 'GSM8327196', 'title': 'AS2-Vec...",[],[],9.0,['39251588'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nn...,yes,PRJNA1123684
46541,200229376,gse229376,,Defining blood-induced microglia functions in ...,This SuperSeries is composed of the SubSeries ...,21103;24247,229376,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,"MTX, TSV","[{'accession': 'GSM7159423', 'title': 'RNAseq_...",[],[],63.0,['37291385'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE229nn...,no,PRJNA954299
51322,200228201,gse228201,,Reading and writing of mRNA m6A modification o...,Background: Maternal and zygotic mRNAs play cr...,21273,228201,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,"TSV, TXT","[{'accession': 'GSM7116568', 'title': 'IP-L2C-...",[],[],24.0,['37024923'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE228nn...,no,PRJNA947496
62852,200197265,gse197265,,Translatome and transcriptome co-profiling rev...,Translational regulation plays a critical role...,24676,197265,Homo sapiens,GSE,Expression profiling by high throughput sequen...,...,"BEDGRAPH, BW, TXT","[{'accession': 'GSM5911848', 'title': 'h_siTPR...",[],[],226.0,['36074823'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nn...,yes,PRJNA809587
74285,200165782,gse165782,,Ultrasensitive Ribo-seq reveals translational ...,"In mammals, translational control plays critic...",18480;29177,165782,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,"BW, TXT","[{'accession': 'GSM5049855', 'title': 'mES_sma...",[],"[{'relationtype': 'SRA', 'targetobject': 'SRP3...",105.0,['35697785'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE165nn...,no,PRJNA697913


In [85]:
# function to get PMCid and doi from pubmed id
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)
orchestrator.setup_data_fetcher()

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp
orchestrator.py - line 45 - INFO - Data fetcher setup completed.


<selenium.webdriver.firefox.webdriver.WebDriver (session="f8062c9f-b6a3-46a0-9c08-cdcb65ee380a")>

In [86]:
for i, row in df_FPs.iterrows():
    pubmed_id = row["pubmedids"]
    orchestrator.logger.info(f"PubMed id: {pubmed_id} --item {i+1}")  
    
    # Convert string representation of list to an actual list
    if isinstance(pubmed_id, str) and pubmed_id.startswith("["):
        pubmed_id = ast.literal_eval(pubmed_id)  # Converts to a real list
        pubmed_ids = pubmed_id if pubmed_id else None  # Extract the first element
        
    if pubmed_ids is None:    
        orchestrator.logger.info(f"pubmed_ids is None")  
        continue

    orchestrator.logger.info(f"Type of elements in pubmed_ids <class 'list'>: {[type(item) for item in pubmed_ids]}")  # Should now be <class 'str'>
    
    pmc_ids, dois = [], []
    
    for pmid in pubmed_ids:    
        pmc_id, doi = orchestrator.data_fetcher.get_opendata_from_pubmed_id(pmid)
        pmc_ids.append(pmc_id)
        dois.append(doi)
        
    orchestrator.logger.info(f"PMC ID: {pmc_ids}")
    orchestrator.logger.info(f"DOI: {dois}\n")
    
    #Store results
    df_FPs.loc[i, "pmcids"] = ",".join([str(x) for x in pmc_ids if x]) if any(pmc_ids) else None
    df_FPs.loc[i, "dois"] = ",".join([str(x) for x in dois if x]) if any(dois) else None

orchestrator.data_fetcher.quit()

328664979.py - line 3 - INFO - PubMed id: [] --item 1937
328664979.py - line 11 - INFO - pubmed_ids is None
328664979.py - line 3 - INFO - PubMed id: ['39937579'] --item 2350
328664979.py - line 14 - INFO - Type of elements in pubmed_ids <class 'list'>: [<class 'str'>]
data_fetcher.py - line 220 - INFO - Reconstructed URL: https://pubmed.ncbi.nlm.nih.gov/39937579/
data_fetcher.py - line 207 - INFO - PMCID: PMC11820125
data_fetcher.py - line 215 - INFO - DOI: 10.7554/eLife.98523
328664979.py - line 23 - INFO - PMC ID: ['PMC11820125']
328664979.py - line 24 - INFO - DOI: ['10.7554/eLife.98523']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_FPs.loc[i, "pmcids"] = ",".join([str(x) for x in pmc_ids if x]) if any(pmc_ids) else None
A value is trying to be set on a copy of 

In [87]:
print(f"Shape: {df_FPs.shape}")
print(f"Columns: {df_FPs.columns}")

Shape: (36, 23)
Columns: Index(['uid', 'accession', 'gds', 'title', 'summary', 'gpl', 'gse', 'taxon',
       'entrytype', 'gdstype', 'pdat', 'suppfile', 'samples', 'relations',
       'extrelations', 'n_samples', 'pubmedids', 'projects', 'ftplink',
       'geo2r', 'bioproject', 'pmcids', 'dois'],
      dtype='object')


In [88]:
df_FPs_copy = df_FPs.copy()

In [89]:
df1 = pd.read_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")
df1 = df1.drop(columns=["Unnamed: 0"]) if "Unnamed: 0" in df1.columns else df1
print(f"Shape: {df1.shape}")
print(f"Columns: {df1.columns}")

Shape: (239, 7)
Columns: Index(['publication', 'dataset_uid', 'repo_name', 'doi', 'raw_html',
       'publisher', 'smallest_elements'],
      dtype='object')


In [90]:
df1[['publication', 'dataset_uid', 'repo_name', 'doi']]

Unnamed: 0,publication,dataset_uid,repo_name,doi
0,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,PRIDE,10.1001/jamaneurol.2024.4763
1,https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,10.1002/cbic.202400831
2,https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,PRIDE,10.1002/cbic.202400882
3,https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,10.1002/prca.202300107
4,https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,10.1002/anie.202420149
...,...,...,...,...
234,https://www.ncbi.nlm.nih.gov/pubmed/39884247,PXD054970,PRIDE,
235,https://www.ncbi.nlm.nih.gov/pubmed/39900909,PXD058955,PRIDE,
236,https://www.ncbi.nlm.nih.gov/pubmed/39910101,PXD060051,iProX,
237,https://www.ncbi.nlm.nih.gov/pubmed/39910614,PXD060193,iProX,


In [91]:
df_FPs.loc[:, "dois"] = df_FPs["dois"].astype(str).str.lower()
df1.loc[:, "doi"] = df1['doi'].astype(str).str.lower()

print(f'FP DOI: {df_FPs["dois"]}')

for dois in df_FPs["dois"]:
    for doi in dois.split(","):
        if doi == "nan":
            continue
        if doi in df1["doi"].values:
            print(f"DOI {doi} found in df1")
        else:
            continue

FP DOI: 1936                                                    nan
2349                                    10.7554/elife.98523
2759                                                    nan
4767                                                    nan
6823                                                    nan
16154                            10.1038/s41467-024-51831-7
46541                            10.1038/s41590-023-01522-0
51322                            10.1186/s13059-023-02918-9
62852                               10.1126/science.abo7923
74285                            10.1038/s41556-022-00928-6
82664     10.1126/sciadv.abj3967,10.1038/s41467-022-34427-x
113698    10.1038/s41467-020-18231-z,10.1016/j.celrep.20...
115816                                                  nan
118628    10.1038/s41586-020-2266-0,10.1016/j.cell.2020....
124924                           10.1038/s41467-020-16444-w
129003                                                  nan
130045                          

In [92]:
# explode the dois column which is a string of multiple dois separated by commas
df_FPs = df_FPs.assign(dois=df_FPs["dois"].str.split(",")).explode("dois")
df_FPs

Unnamed: 0,uid,accession,gds,title,summary,gpl,gse,taxon,entrytype,gdstype,...,relations,extrelations,n_samples,pubmedids,projects,ftplink,geo2r,bioproject,pmcids,dois
1936,200247291,gse247291,,Adipocyte-specific Steap4 deficiency reduced t...,"Steap4, highly expressed in adipose tissue, is...",24247,247291,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],[],6.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE247nn...,no,PRJNA1037060,,
2349,200195983,gse195983,,Transcriptome profile of EZR_KO cells,To analyze gene expression differences between...,24247,195983,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],[],8.0,['39937579'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE195nn...,no,PRJNA802830,PMC11820125,10.7554/elife.98523
2759,200252821,gse252821,,Sphingolipid metabolism orchestrates the estab...,Bioactive sphingolipids serve as an essential ...,19057,252821,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],[],4.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE252nn...,no,PRJNA1062771,,
4767,200267957,gse267957,,Transcriptome analyses of Leishmania donovani ...,This study was performed with the goal of unde...,34491,267957,Leishmania donovani,GSE,Expression profiling by high throughput sequen...,...,[],[],6.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE267nn...,no,PRJNA1113700,,
6823,200283213,gse283213,,Comparative proteomic landscapes provide insig...,Understanding mammalian preimplantation develo...,21273,283213,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],[],24.0,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE283nn...,no,PRJNA1191977,,
16154,200269782,gse269782,,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,14550,269782,Homo sapiens,GSE,Expression profiling by array,...,[],[],9.0,['39251588'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nn...,yes,PRJNA1123684,PMC11385228,10.1038/s41467-024-51831-7
46541,200229376,gse229376,,Defining blood-induced microglia functions in ...,This SuperSeries is composed of the SubSeries ...,21103;24247,229376,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],[],63.0,['37291385'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE229nn...,no,PRJNA954299,PMC10307624,10.1038/s41590-023-01522-0
51322,200228201,gse228201,,Reading and writing of mRNA m6A modification o...,Background: Maternal and zygotic mRNAs play cr...,21273,228201,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],[],24.0,['37024923'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE228nn...,no,PRJNA947496,PMC10080794,10.1186/s13059-023-02918-9
62852,200197265,gse197265,,Translatome and transcriptome co-profiling rev...,Translational regulation plays a critical role...,24676,197265,Homo sapiens,GSE,Expression profiling by high throughput sequen...,...,[],[],226.0,['36074823'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE197nn...,yes,PRJNA809587,,10.1126/science.abo7923
74285,200165782,gse165782,,Ultrasensitive Ribo-seq reveals translational ...,"In mammals, translational control plays critic...",18480;29177,165782,Mus musculus,GSE,Expression profiling by high throughput sequen...,...,[],"[{'relationtype': 'SRA', 'targetobject': 'SRP3...",105.0,['35697785'],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE165nn...,no,PRJNA697913,,10.1038/s41556-022-00928-6


In [93]:
# Create mapping of DOI to concatenated GSE accession codes
doi_to_gse = (
    df_FPs.groupby("dois")["accession"]
    .apply(lambda x: ",".join(set(x.astype(str))))  # Ensures unique values
    .to_dict()
)

doi_to_gse.pop("nan")  # Remove any NaN keys
doi_to_gse

{'10.1016/j.cell.2014.08.012': 'gse53743',
 '10.1016/j.cell.2020.04.034': 'gse132730',
 '10.1016/j.celrep.2017.03.004': 'gse87544',
 '10.1016/j.celrep.2021.110251': 'gse132355',
 '10.1016/j.cels.2015.08.012': 'gse66715',
 '10.1038/nature11083': 'gse36892',
 '10.1038/ng.2330': 'gse38121',
 '10.1038/nsmb.2660': 'gse36552',
 '10.1038/s41467-017-01981-8': 'gse94460',
 '10.1038/s41467-020-16444-w': 'gse138760',
 '10.1038/s41467-020-18231-z': 'gse132355',
 '10.1038/s41467-022-33584-3': 'gse132730',
 '10.1038/s41467-022-34427-x': 'gse169632',
 '10.1038/s41467-024-51831-7': 'gse269782',
 '10.1038/s41467-024-52762-z': 'gse132730',
 '10.1038/s41556-022-00928-6': 'gse165782',
 '10.1038/s41586-020-2266-0': 'gse132730',
 '10.1038/s41586-020-2759-x': 'gse36552',
 '10.1038/s41590-023-01522-0': 'gse229376',
 '10.1038/s41598-018-33190-8': 'gse106765',
 '10.1093/nar/gkx469': 'gse87328',
 '10.1126/sciadv.aba1972': 'gse135893',
 '10.1126/sciadv.abj3967': 'gse169632',
 '10.1126/science.aac7368': 'gse72064'

In [94]:
for i,row in df1.iterrows():
    doi = row["doi"].lower()
    if doi == "nan" or doi is None:
        continue
    elif doi in doi_to_gse.keys():
        print(f"DOI: {doi} -- GSE: {doi_to_gse[doi]}")
        # update value at dataset_uid by appending GSE
        dataset_uid = df1.loc[i, "dataset_uid"] + "," + doi_to_gse[doi]
        df1.at[i, "dataset_uid"] = dataset_uid
    else:
        continue

DOI: 10.1038/s41467-024-51831-7 -- GSE: gse269782


In [95]:
df1[["publication","dataset_uid","doi"]]

Unnamed: 0,publication,dataset_uid,doi
0,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,10.1001/jamaneurol.2024.4763
1,https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,10.1002/cbic.202400831
2,https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,10.1002/cbic.202400882
3,https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,10.1002/prca.202300107
4,https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",10.1002/anie.202420149
...,...,...,...
234,https://www.ncbi.nlm.nih.gov/pubmed/39884247,PXD054970,
235,https://www.ncbi.nlm.nih.gov/pubmed/39900909,PXD058955,
236,https://www.ncbi.nlm.nih.gov/pubmed/39910101,PXD060051,
237,https://www.ncbi.nlm.nih.gov/pubmed/39910614,PXD060193,


In [96]:
df1.to_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")

In [32]:
# df1["doi"]

In [None]:
# # Function to find matching GSE codes for a given publication
# def find_matching_gse(publication, doi_dict):
#     matched_gse_codes = set()  # Use a set to avoid duplicates
#     
#     if pd.notna(publication):  # Ensure publication is not NaN
#         pub_lower = publication.lower().strip()  # Normalize publication case
#         
#         for doi_group, gse_codes in doi_dict.items():
#             # Split the multiple DOIs stored in the key
#             individual_dois = [doi.strip().lower() for doi in doi_group.split(",")]
#             
#             # Check if any of the individual DOIs is a substring of the publication
#             if any(doi in pub_lower for doi in individual_dois):
#                 matched_gse_codes.update(gse_codes.split(","))  # Add each GSE ID separately
# 
#     return ",".join(sorted(matched_gse_codes)) if matched_gse_codes else None  # Ensure consistent ordering
# 
# # Apply the function to find matches and update dataset_uid
# df1["new_dataset_uids"] = df1["doi"].apply(lambda pub: find_matching_gse(pub, doi_to_gse))
# 
# # Concatenate new dataset IDs with existing ones (if any)
# df1["dataset_uid"] = df1.apply(
#     lambda row: f"{row['dataset_uid']},{row['new_dataset_uids']}" if pd.notna(row["new_dataset_uids"]) else row["dataset_uid"], axis=1
# )
# 
# # Remove unnecessary column and strip redundant commas
# df1.drop(columns=["new_dataset_uids"], inplace=True)
# df1["dataset_uid"] = df1["dataset_uid"].str.strip(",")

In [None]:
# df1[["publication","dataset_uid"]]

In [None]:
# df1.to_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")

In [None]:
# we have dataframe df1 with all our scraped Ground Truth data (raw_html, publication, dataset_uid, ...)
# we have dataframe df_FPs with all our (supposedly) false positives on Ground Truth (accession, pmcid, doi)
# with the previous steps in the GEO dataset creation notebook, we now have ground truth for these "False Positives"
# look for matches, then update ground truth with the new data

# for i, row in df1.iterrows():
#     publication = row["publication"].lower()
#     for fp in df_FPs["doi"]:
#         fp = fp.lower()
#         if fp is not None and (fp in publication or fp == publication):
#             print(f"Publication: {publication}")
#             print(f"DOI of ex-false positive: {fp}")
#             vals = df_FPs[df_FPs["doi"] == fp]
#             print(f"ex-False Positive id: {vals['accession'].values}")
#             addenda = ',' + df_FPs[df_FPs["doi"] == fp]["accession"].values
#             #df1.at[i, "dataset_uid"] += addenda
#             #print(df1.at[i, "dataset_uid"])
#             #print(addenda)
#         elif fp is None:
#             print(None)