In [1]:
import re
import requests
from xml.etree import ElementTree
import json
import pandas as pd
import time
from data_gatherer.data_fetcher import *
from data_gatherer.parser import *
from data_gatherer.orchestrator import *
from dotenv import load_dotenv
import ast  # To safely evaluate string representation of a list
import bdikit as bdi
import requests

In [2]:
output_file = "exp_input/PRIDE_GSE_id_HTML_data.csv"
input_file = "exp_input/PX_id_HTML_v4.parquet"

In [3]:
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "gds",  # Search in GEO database
    "term": '"gse"[Entry Type]',  # Only fetch GEO Series (GSE)
    "retmax": "1000000",  # Maximum records to retrieve (adjust as needed)
    "retmode": "xml"
}

# Request dataset IDs from NCBI
response = requests.get(esearch_url, params=params)
root = ElementTree.fromstring(response.content)

# Extract dataset IDs
gse_ids = [id_elem.text for id_elem in root.findall(".//Id")]

print(f"Total datasets found: {len(gse_ids)}")
print("Sample GSE IDs:", gse_ids[:10])  # Show first 10 results

Total datasets found: 254549
Sample GSE IDs: ['200298457', '200298456', '200298343', '200298187', '200298184', '200298155', '200298096', '200298092', '200295914', '200295194']


In [None]:
def fetch_GEO_data(IDs,request_url,start,stop):
    params = {
        "db": "gds",
        "id": ",".join(IDs[start:stop]),  # Query for first 10 datasets
        "retmode": "json"
    }

    response = requests.get(request_url, params=params)
    
    try:
        data = response.json()
    except:
        raise ValueError("Failed to parse JSON response! Please check the response content.")
    
    return data

In [None]:
pd.DataFrame(fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",0,2))

In [None]:
# load the data from local file
print("Loading local data...")
try:
    df0 = pd.read_csv("exp_input/GEO_data_v1.csv")
    df0 = df0.drop(columns=["Unnamed: 0"])
    # drop null columns
    df0 = df0.dropna(axis=1, how='all')
    print(f"Shape: {df0.shape}")
    print(f"Columns: {df0.columns}")
except:
    df0 = None
    print("No local data found.")

In [None]:
# update dataset with new data whose id is not in the local dataset
if df0 is not None:
    all_ids = set(gse_ids)
    old_ids = set(list(df0["uid"].apply(str)))
    missing_ids = all_ids - old_ids
    
    print(f"Total missing datasets found: {len(missing_ids)}")
    
    data = {}
    step = 1
    
    for i, id in enumerate(missing_ids):
        if i % (len(missing_ids)//10) == 0 and i > 0:
            print(f"Iter {i}")
        try:
            new_data = fetch_GEO_data(list(missing_ids),"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi", i, i+step)
            # print(f"Data: {type(new_data)}")
            # print(f"Data: {new_data.keys()}")
            if "result" not in new_data:
                print(f"new_data: {new_data}")
            for uid, details in new_data['result'].items():
                if uid == "uids":  # Ignore metadata key
                    continue
                data[uid] = details
        except Exception as e:
            raise ValueError(f"Failed to fetch data for {id}! Error: {e}")
        
        time.sleep(0.33)  # Be nice to the server

else:
    data = {}
    i = 0
    mxm = 300
    while True:
        print(i)
        if i > len(gse_ids):
            break
        try:
            new_data = fetch_GEO_data(gse_ids,"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",i,mxm)
            #print(f"Data: {new_data}")
            for uid, details in new_data["result"].items():
                if uid == "uids":  # Ignore metadata key
                    continue
                data[uid] = details
        except:
            print(f"Error at {i}, {mxm}")
            
        i += 300
        mxm += 300
        time.sleep(0.1) 

In [None]:
df = pd.DataFrame(data).T
df = df.dropna(axis=1, how='all')
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")

In [None]:
# union of both dataframes
df = pd.concat([df0,df])
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")

In [None]:
df.to_csv("exp_input/GEO_data.csv")

In [31]:
df = pd.read_csv("exp_input/GEO_data.csv")
df.columns

  df = pd.read_csv("exp_input/GEO_data.csv")


Index(['Unnamed: 0', 'uid', 'accession', 'gds', 'title', 'summary', 'gpl',
       'gse', 'taxon', 'entrytype', 'gdstype', 'pdat', 'suppfile', 'samples',
       'relations', 'extrelations', 'n_samples', 'pubmedids', 'projects',
       'ftplink', 'geo2r', 'bioproject', 'ptechtype', 'valtype', 'ssinfo',
       'subsetinfo', 'seriestitle', 'platformtitle', 'platformtaxa',
       'samplestaxa'],
      dtype='object')

In [16]:
pmids = []
for i, row in df.iterrows():
    lst = ast.literal_eval(row['pubmedids'])
    if len(lst) > 0:
        pmids.extend(lst)
    else:
        continue

pmids = list(set(pmids))

print(f"Total PMIDs found: {len(pmids)}")

Total PMIDs found: 127239


In [21]:
def PMID_to_url(pmid):
    base_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    return base_url + str(pmid)

def PMID_to_doi(pmid):
    base = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    params = {"tool": "mytool", "email": "myemail@example.com", "ids": pmid, "format": "json"}    
    
    response = requests.get(base, params=params)
    
    if response.status_code == 200:
        data = response.json()
        records = data.get("records", [])
        if records and "doi" in records[0]:
            return records[0]["pmid"]
        else:
            return None  # No PMID found
    else:
        return None  # Request failed
    
def batch_PMID_to_doi(pmids, batch_size=100):
    base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    results = {}

    for i in range(0, len(pmids), batch_size):
        progress = i / len(pmids) * 100
        print(f"Processing batch {i}-{i+batch_size} ({progress:.2f}%)")
        batch = pmids[i:i+batch_size]  # Get a batch of PMIDs
        params = {"tool": "mytool", "email": "myemail@example.com", "ids": ",".join(batch), "format": "json"}
        
        response = requests.get(base_url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            records = data.get("records", [])

            for record in records:
                pmid = record.get("pmid")
                doi = record.get("doi", None)  # Get DOI if available
                
                if pmid and pmid not in results:
                    results[pmid] = doi  # Store in dictionary
        
        else:
            print(f"API request failed for batch {i}-{i+batch_size}: {response.status_code}")
        
        time.sleep(0.5)  # Prevent hitting API rate limits (adjust as needed)

    return results

# PMCID_to_PMID("pmc9945972")

In [22]:
pmid_doi_mapping = batch_PMID_to_doi(pmids)

Processing batch 0-100 (0.00%)
Processing batch 100-200 (0.08%)
Processing batch 200-300 (0.16%)
Processing batch 300-400 (0.24%)
Processing batch 400-500 (0.31%)
Processing batch 500-600 (0.39%)
Processing batch 600-700 (0.47%)
Processing batch 700-800 (0.55%)
Processing batch 800-900 (0.63%)
Processing batch 900-1000 (0.71%)
Processing batch 1000-1100 (0.79%)
Processing batch 1100-1200 (0.86%)
Processing batch 1200-1300 (0.94%)
Processing batch 1300-1400 (1.02%)
Processing batch 1400-1500 (1.10%)
Processing batch 1500-1600 (1.18%)
Processing batch 1600-1700 (1.26%)
Processing batch 1700-1800 (1.34%)
Processing batch 1800-1900 (1.41%)
Processing batch 1900-2000 (1.49%)
Processing batch 2000-2100 (1.57%)
Processing batch 2100-2200 (1.65%)
Processing batch 2200-2300 (1.73%)
Processing batch 2300-2400 (1.81%)
Processing batch 2400-2500 (1.89%)
Processing batch 2500-2600 (1.96%)
Processing batch 2600-2700 (2.04%)
Processing batch 2700-2800 (2.12%)
Processing batch 2800-2900 (2.20%)
Proces

In [23]:
# Save the mapping to a JSON file
with open("exp_output/pmid_doi_mapping.json", "w") as file:
    json.dump(pmid_doi_mapping, file)

In [24]:
pmid_doi_mapping

{'17098860': '10.1104/pp.106.086306',
 '17921257': '10.1073/pnas.0701538104',
 '19119024': '10.1016/j.immuni.2008.11.005',
 '19386094': '10.1186/1471-2180-9-76',
 '20107517': '10.1371/journal.pgen.1000818',
 '20582282': '10.3389/neuro.15.003.2009',
 '20661288': '10.1371/journal.pone.0011637',
 '21775533': '10.1158/1078-0432.CCR-11-1133',
 '22359577': '10.1371/journal.pone.0031226',
 '22619330': '10.1073/pnas.1120461109',
 '22952644': '10.1371/journal.pone.0043189',
 '23159735': '10.1016/j.molcel.2012.10.013',
 '23644596': '10.1038/nsmb.2562',
 '24659297': '10.1007/s10048-014-0397-x',
 '25804332': '10.1016/j.ijmm.2015.03.002',
 '26124181': '10.18632/oncotarget.4372',
 '26755704': '10.1084/jem.20150894',
 '26916032': '10.1111/gbb.12288',
 '27332732': '10.1016/j.immuni.2016.06.006',
 '27694846': '10.1038/ncomms13018',
 '27760051': '10.1172/JCI87927',
 '28469092': '10.1172/jci.insight.92102',
 '28949290': '10.7554/eLife.28652',
 '29281816': '10.1016/j.celrep.2017.12.011',
 '29436476': '10.

In [25]:
# Load the mapping from a JSON file
with open("exp_output/pmid_doi_mapping.json", "r") as file:
    pmid_doi_mapping_v1 = json.load(file)

In [59]:
# count the number of None values in the mapping
count_none = sum(1 for doi in pmid_doi_mapping_v1.values() if doi is None)
print(f"Total None values in mapping: {count_none}")
print(f"Total valid mappings: {len(pmid_doi_mapping_v1) - count_none}")

Total None values in mapping: 22530


In [54]:
doi_to_pmid_mapping = {v: k for k, v in pmid_doi_mapping_v1.items()}

In [56]:
len(doi_to_pmid_mapping)

104710

In [28]:
# let's update the dataframe with the new mapping
df_groud_truth = pd.read_parquet("exp_input/PX_id_HTML_v4.parquet")

In [30]:
df_groud_truth.columns

Index(['publication', 'fetch_from', 'dataset_uid', 'repo_name', 'doi',
       'raw_html', 'publisher', 'smallest_elements', 'title', 'keywords'],
      dtype='object')

In [64]:
df_groud_truth.head()

Unnamed: 0_level_0,publication,fetch_from,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements,title,keywords
fetch_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,https://dx.doi.org/10.1001/jamaneurol.2024.4763,pxd056570,PRIDE,10.1001/jamaneurol.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,,Characterization of A Novel Mengingoencephalom...,Mengingoencephalomyelitis Autoantibodies
https://dx.doi.org/10.1002/1873-3468.15092,https://dx.doi.org/10.1002/1873-3468.15092,https://dx.doi.org/10.1002/1873-3468.15092,pxd057199,PRIDE,10.1002/1873-3468.15092,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The COPASI files (.cps) for the model an...",Homeoviscous adaptation to exogenous fatty aci...,"Escherichia coli, acyl carrier protein, exogen..."
https://dx.doi.org/10.1002/1878-0261.13654,https://dx.doi.org/10.1002/1878-0261.13654,https://dx.doi.org/10.1002/1878-0261.13654,pxd048538,PRIDE,10.1002/1878-0261.13654,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The MS proteomics data are available at ...",LC-MSMS based (phospho)proteomics on gastric c...,"gastric cancer, phosphoproteomics, tyrosine ph..."
https://dx.doi.org/10.1002/1878-0261.13733,https://dx.doi.org/10.1002/1878-0261.13733,https://dx.doi.org/10.1002/1878-0261.13733,pxd054727,PRIDE,10.1002/1878-0261.13733,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>Proteomics data generated during this st...",E-selectin Affinity Glycoproteomics Reveals Ne...,"E-selectin, cancer glycoproteome, colorectal c..."
https://dx.doi.org/10.1002/ADHM.202404465,https://dx.doi.org/10.1002/ADHM.202404465,https://dx.doi.org/10.1002/adhm.202404465,pxd052728,PRIDE,10.1002/adhm.202404465,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The mass spectrometry proteomics data ha...",in vivo-like scaffold-free 3D in vitro Models ...,"Cell sheet engineering, Drug treatment, Dystro..."


In [39]:
# count how many times dx.doi.org is in df_groud_truth['publication']
count,pmcnt = 0,0
for pub in df_groud_truth['publication']:
    if pub is not None and "dx.doi.org" in pub:
        count += 1
    elif pub is not None and "pubmed" in pub:
        pmcnt += 1
print(f"Total dx.doi.org found: {count}")
print(f"Total pubmed found: {pmcnt}")
print(f"Total publications: {len(df_groud_truth['publication'])}")
print(f"Total matches: {count+pmcnt}")

Total dx.doi.org found: 1356
Total pubmed found: 698
Total publications: 2061
Total matches: 2054


In [48]:
pmid_gse_mapping = df[['pubmedids','accession']].T.to_dict()

In [52]:
# for each pubmedids, eval string to list and then iterate over the list to get the pmid, then save mapping[pmid] = accession
pmid_gse_mapping = {}
for i, row in df.iterrows():
    lst = ast.literal_eval(row['pubmedids'])
    for pmid in lst:
        pmid_gse_mapping[pmid] = row['accession']

In [53]:
print(f"len: {len(pmid_gse_mapping)}")

len: 127239


In [62]:
# for each entry in ground truth, we will create a column with GSE ids. We can use the publication_col to find the DOI or the PM id
# then we can use the mapping {pmid -> doi} to find the PM id for doi
# then we can use the mapping {pmid -> gse} to find the GSE id for the PM id
count, pmcnt = 0, 0
gse_ids = {}
cnt_tot = 0

for i, row in df_groud_truth.iterrows():
    publication = row["publication"]
    
    if publication and "dx.doi.org" in publication:
        match = re.search(r'dx.doi.org/([a-zA-Z0-9./-]+)', publication)
        if match:
            doi = match.group(1)
            count += 1
            if doi in doi_to_pmid_mapping:
                pmid = doi_to_pmid_mapping[doi]  # Convert DOI to PMID
                if pmid in pmid_gse_mapping:
                    cnt_tot += 1
                    if i not in gse_ids:
                        gse_ids[i] = pmid_gse_mapping[pmid]
                    else:
                        gse_ids[i] += "," + pmid_gse_mapping[pmid]

    elif publication and "pubmed" in publication:
        match = re.search(r'pubmed/([0-9]+)', publication)
        if match:
            pmid = match.group(1)
            pmcnt += 1
            if pmid in pmid_gse_mapping:
                cnt_tot += 1
                if i not in gse_ids:
                    gse_ids[i] = pmid_gse_mapping[pmid]
                else:
                    gse_ids[i] += "," + pmid_gse_mapping[pmid]
        
print(f"Total dx.doi.org found: {count}")
print(f"Total pubmed found: {pmcnt}")
print(f"Total matches in gse: {cnt_tot}")
print(f"GSE IDs found: {len(gse_ids)}")

Total dx.doi.org found: 1356
Total pubmed found: 698
Total matches in gse: 141
GSE IDs found: 141


In [63]:
# update the dataframe with the new GSE ids
for i, gse_id in gse_ids.items():
    if "gse" in gse_id.lower():
        if "gse" in str(df_groud_truth.at[i, "dataset_uid"]):
            df_groud_truth.at[i, "dataset_uid"] += "," + gse_id
        else:
            df_groud_truth.at[i, "dataset_uid"] = gse_id

Updating row https://dx.doi.org/10.1002/advs.202300043 with GSE ID: GSE204773


ZeroDivisionError: division by zero

In [None]:
# # use False Positives from a data-gatherer model run
# gse_FPs = []
# # Sample false positives load from txt file where separated by newline
# with open("exp_output/model_FPs.txt", "r") as file:
#     false_positives = file.read().splitlines()
# 
# for fp in false_positives:
#     if 'gse' in fp and 'https' in fp:
#         fp = fp.split("=")[-1]
#         gse_FPs.append(fp)
#     elif 'gse' in fp:
#         gse_FPs.append(fp)
# 
# print(f"Total false positives found: {len(gse_FPs)}")

In [None]:
# # no capital case in accession
# df["accession"] = df["accession"].astype(str).str.lower()
# # iterate
# i = 0
# gse_uids = []
# for fp in gse_FPs:
#     # check if false positive is in the dataset
#     if fp in df["accession"].values:
#         i+=1
#         # drop false positive from dataset
#         #print(f"False positive {fp} FOUND in dataset.")
#         gse_uids.append(fp)
#     else:
#         print(f"False positive {fp} NOT FOUND in dataset.")
# 
# print(f"Total false positives found also in local GEO data: {i}")
# #print(f"Unique false positives found: {len(set(gse_uids))}")

In [None]:
# # get all false positives from dataframe
# df_FPs = df[df["accession"].isin(gse_uids)]
# df_FPs

In [None]:
# function to get PMCid and doi from pubmed id
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)
orchestrator.setup_data_fetcher()

In [None]:
for i, row in df_FPs.iterrows():
    pubmed_id = row["pubmedids"]
    orchestrator.logger.info(f"PubMed id: {pubmed_id} --item {i+1}")  
    
    # Convert string representation of list to an actual list
    if isinstance(pubmed_id, str) and pubmed_id.startswith("["):
        pubmed_id = ast.literal_eval(pubmed_id)  # Converts to a real list
        pubmed_ids = pubmed_id if pubmed_id else None  # Extract the first element
        
    if pubmed_ids is None:    
        orchestrator.logger.info(f"pubmed_ids is None")  
        continue

    orchestrator.logger.info(f"Type of elements in pubmed_ids <class 'list'>: {[type(item) for item in pubmed_ids]}")  # Should now be <class 'str'>
    
    pmc_ids, dois = [], []
    
    for pmid in pubmed_ids:    
        pmc_id, doi = orchestrator.data_fetcher.get_opendata_from_pubmed_id(pmid)
        pmc_ids.append(pmc_id)
        dois.append(doi)
        
    orchestrator.logger.info(f"PMC ID: {pmc_ids}")
    orchestrator.logger.info(f"DOI: {dois}\n")
    
    #Store results
    df_FPs.loc[i, "pmcids"] = ",".join([str(x) for x in pmc_ids if x]) if any(pmc_ids) else None
    df_FPs.loc[i, "dois"] = ",".join([str(x) for x in dois if x]) if any(dois) else None

orchestrator.data_fetcher.quit()

In [None]:
print(f"Shape: {df_FPs.shape}")
print(f"Columns: {df_FPs.columns}")

In [None]:
df_FPs_copy = df_FPs.copy()

In [None]:
df1 = pd.read_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")
df1 = df1.drop(columns=["Unnamed: 0"]) if "Unnamed: 0" in df1.columns else df1
print(f"Shape: {df1.shape}")
print(f"Columns: {df1.columns}")

In [None]:
df1[['publication', 'dataset_uid', 'repo_name', 'doi']]

In [None]:
df_FPs.loc[:, "dois"] = df_FPs["dois"].astype(str).str.lower()
df1.loc[:, "doi"] = df1['doi'].astype(str).str.lower()

print(f'FP DOI: {df_FPs["dois"]}')

for dois in df_FPs["dois"]:
    for doi in dois.split(","):
        if doi == "nan":
            continue
        if doi in df1["doi"].values:
            print(f"DOI {doi} found in df1")
        else:
            continue

In [None]:
# explode the dois column which is a string of multiple dois separated by commas
df_FPs = df_FPs.assign(dois=df_FPs["dois"].str.split(",")).explode("dois")
df_FPs

In [None]:
# Create mapping of DOI to concatenated GSE accession codes
doi_to_gse = (
    df_FPs.groupby("dois")["accession"]
    .apply(lambda x: ",".join(set(x.astype(str))))  # Ensures unique values
    .to_dict()
)

doi_to_gse.pop("nan")  # Remove any NaN keys
doi_to_gse

In [None]:
for i,row in df1.iterrows():
    doi = row["doi"].lower()
    if doi == "nan" or doi is None:
        continue
    elif doi in doi_to_gse.keys():
        print(f"DOI: {doi} -- GSE: {doi_to_gse[doi]}")
        # update value at dataset_uid by appending GSE
        dataset_uid = df1.loc[i, "dataset_uid"] + "," + doi_to_gse[doi]
        df1.at[i, "dataset_uid"] = dataset_uid
    else:
        continue

In [None]:
df1[["publication","dataset_uid","doi"]]

In [None]:
df1.to_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")

In [None]:
# df1["doi"]

In [None]:
# # Function to find matching GSE codes for a given publication
# def find_matching_gse(publication, doi_dict):
#     matched_gse_codes = set()  # Use a set to avoid duplicates
#     
#     if pd.notna(publication):  # Ensure publication is not NaN
#         pub_lower = publication.lower().strip()  # Normalize publication case
#         
#         for doi_group, gse_codes in doi_dict.items():
#             # Split the multiple DOIs stored in the key
#             individual_dois = [doi.strip().lower() for doi in doi_group.split(",")]
#             
#             # Check if any of the individual DOIs is a substring of the publication
#             if any(doi in pub_lower for doi in individual_dois):
#                 matched_gse_codes.update(gse_codes.split(","))  # Add each GSE ID separately
# 
#     return ",".join(sorted(matched_gse_codes)) if matched_gse_codes else None  # Ensure consistent ordering
# 
# # Apply the function to find matches and update dataset_uid
# df1["new_dataset_uids"] = df1["doi"].apply(lambda pub: find_matching_gse(pub, doi_to_gse))
# 
# # Concatenate new dataset IDs with existing ones (if any)
# df1["dataset_uid"] = df1.apply(
#     lambda row: f"{row['dataset_uid']},{row['new_dataset_uids']}" if pd.notna(row["new_dataset_uids"]) else row["dataset_uid"], axis=1
# )
# 
# # Remove unnecessary column and strip redundant commas
# df1.drop(columns=["new_dataset_uids"], inplace=True)
# df1["dataset_uid"] = df1["dataset_uid"].str.strip(",")

In [None]:
# df1[["publication","dataset_uid"]]

In [None]:
# df1.to_csv("exp_input/PRIDE_GSE_id_HTML_data.csv")

In [None]:
# we have dataframe df1 with all our scraped Ground Truth data (raw_html, publication, dataset_uid, ...)
# we have dataframe df_FPs with all our (supposedly) false positives on Ground Truth (accession, pmcid, doi)
# with the previous steps in the GEO dataset creation notebook, we now have ground truth for these "False Positives"
# look for matches, then update ground truth with the new data

# for i, row in df1.iterrows():
#     publication = row["publication"].lower()
#     for fp in df_FPs["doi"]:
#         fp = fp.lower()
#         if fp is not None and (fp in publication or fp == publication):
#             print(f"Publication: {publication}")
#             print(f"DOI of ex-false positive: {fp}")
#             vals = df_FPs[df_FPs["doi"] == fp]
#             print(f"ex-False Positive id: {vals['accession'].values}")
#             addenda = ',' + df_FPs[df_FPs["doi"] == fp]["accession"].values
#             #df1.at[i, "dataset_uid"] += addenda
#             #print(df1.at[i, "dataset_uid"])
#             #print(addenda)
#         elif fp is None:
#             print(None)

In [65]:
df_10 = pd.read_parquet("exp_input/PX_id_HTML_v4.parquet")

In [66]:
df_10['publisher'].value_counts()

publisher
pubmed              698
sciencedirect       259
nature              208
cytoscape           182
mdpi                 83
                   ... 
icm-experimental      1
cshlp                 1
iucr                  1
karger                1
jamanetwork           1
Name: count, Length: 63, dtype: int64

In [67]:
df_10[df_10['publisher']=='proteomexchange']

Unnamed: 0_level_0,publication,fetch_from,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements,title,keywords
fetch_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://dx.doi.org/10.6019/PXD051338,https://dx.doi.org/10.6019/PXD051338,https://dx.doi.org/10.6019/pxd051338,pxd051338,PRIDE,10.6019/pxd051338,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",Peptide mass fingerprinting to investigate if ...,"DHFR variants, PMF"
https://dx.doi.org/10.6019/PXD051588,https://dx.doi.org/10.6019/PXD051588,https://dx.doi.org/10.6019/pxd051588,pxd051588,PRIDE,10.6019/pxd051588,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",STRUCTURE OF APOLIPOPROTEIN B100 BOUND TO LOW-...,"apolipoprotein B100, cardiovascular disease, c..."
https://dx.doi.org/10.6019/PXD051745,https://dx.doi.org/10.6019/PXD051745,https://dx.doi.org/10.6019/pxd051745,pxd051745,PRIDE,10.6019/pxd051745,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",The Protein Composition of Human Adenovirus Re...,"HMGB1, Human adenovirus (HAdV), biomolecular ..."
https://dx.doi.org/10.6019/PXD051771,https://dx.doi.org/10.6019/PXD051771,https://dx.doi.org/10.6019/pxd051771,pxd051771,PRIDE,10.6019/pxd051771,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",N-Glycosylation-Defective IL6 Alternatively Ac...,"EGFR-TKI resistance, EMT, IL6, SRC, YAP, glyco..."
https://dx.doi.org/10.6019/PXD052143,https://dx.doi.org/10.6019/PXD052143,https://dx.doi.org/10.6019/pxd052143,pxd052143,PRIDE,10.6019/pxd052143,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",Cardio-metabolic and cytoskeletal proteomic si...,"Duchenne muscular dystrophy, cytoskeletal orga..."
...,...,...,...,...,...,...,...,...,...,...
https://dx.doi.org/10.6019/PXD059791,https://dx.doi.org/10.6019/PXD059791,https://dx.doi.org/10.6019/pxd059791,pxd059791,PRIDE,10.6019/pxd059791,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",Forced Intracellular degradation of Xenoantige...,"cell-based cancer immunotherapy, immunopeptido..."
https://dx.doi.org/10.6019/PXD059803,https://dx.doi.org/10.6019/PXD059803,https://dx.doi.org/10.6019/pxd059803,pxd059803,PRIDE,10.6019/pxd059803,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",Genetic analysis of arabidopsis autophagy-rela...,"AIM, ATG8, Autophagy, LDS, Nutrient recycling,..."
https://dx.doi.org/10.6019/PXD059817,https://dx.doi.org/10.6019/PXD059817,https://dx.doi.org/10.6019/pxd059817,pxd059817,PRIDE,10.6019/pxd059817,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",Composition and liquid-to-solid maturation of ...,Bacterial Dormancy/ Protein Aggregation/ Energ...
https://dx.doi.org/10.6019/PXD059868,https://dx.doi.org/10.6019/PXD059868,https://dx.doi.org/10.6019/pxd059868,pxd059868,PRIDE,10.6019/pxd059868,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...",proteomexchange,"[[""<table class=\""dataset-summary\""><tbody><tr...",Composition and liquid-to-solid maturation of ...,Bacterial Dormancy/ Protein Aggregation/ Energ...


In [68]:
df_10.columns

Index(['publication', 'fetch_from', 'dataset_uid', 'repo_name', 'doi',
       'raw_html', 'publisher', 'smallest_elements', 'title', 'keywords'],
      dtype='object')

In [69]:
df_10.head()

Unnamed: 0_level_0,publication,fetch_from,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements,title,keywords
fetch_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,https://dx.doi.org/10.1001/jamaneurol.2024.4763,pxd056570,PRIDE,10.1001/jamaneurol.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,,Characterization of A Novel Mengingoencephalom...,Mengingoencephalomyelitis Autoantibodies
https://dx.doi.org/10.1002/1873-3468.15092,https://dx.doi.org/10.1002/1873-3468.15092,https://dx.doi.org/10.1002/1873-3468.15092,pxd057199,PRIDE,10.1002/1873-3468.15092,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The COPASI files (.cps) for the model an...",Homeoviscous adaptation to exogenous fatty aci...,"Escherichia coli, acyl carrier protein, exogen..."
https://dx.doi.org/10.1002/1878-0261.13654,https://dx.doi.org/10.1002/1878-0261.13654,https://dx.doi.org/10.1002/1878-0261.13654,pxd048538,PRIDE,10.1002/1878-0261.13654,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The MS proteomics data are available at ...",LC-MSMS based (phospho)proteomics on gastric c...,"gastric cancer, phosphoproteomics, tyrosine ph..."
https://dx.doi.org/10.1002/1878-0261.13733,https://dx.doi.org/10.1002/1878-0261.13733,https://dx.doi.org/10.1002/1878-0261.13733,pxd054727,PRIDE,10.1002/1878-0261.13733,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>Proteomics data generated during this st...",E-selectin Affinity Glycoproteomics Reveals Ne...,"E-selectin, cancer glycoproteome, colorectal c..."
https://dx.doi.org/10.1002/ADHM.202404465,https://dx.doi.org/10.1002/ADHM.202404465,https://dx.doi.org/10.1002/adhm.202404465,pxd052728,PRIDE,10.1002/adhm.202404465,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The mass spectrometry proteomics data ha...",in vivo-like scaffold-free 3D in vitro Models ...,"Cell sheet engineering, Drug treatment, Dystro..."


In [None]:
fetched_data_path = "exp_input/fetched_data.parquet"

fetched_df = pd.read_parquet(fetched_data_path)