In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from orchestrator import *
import os
import pandas as pd
import re
import time
import json

In [3]:
#os.remove("exp_input/PX_id_HTML.parquet")

In [4]:
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)
output_file = "exp_input/PX_id_HTML.parquet"
input_file = "exp_input/PX_id_HTML.parquet"
orchestrator.logger.info(f"Input/Output file exists: {os.path.exists(input_file),os.path.exists(output_file)}")

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp
3313608735.py - line 6 - INFO - Input/Output file exists: (False, False)


In [5]:
urls = None
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
publication_datasets = df[['publication','identifier','repository','title','keywords']]

In [6]:
# get an idea of values in df publication
publication_datasets['publication'].value_counts()

publication
Dataset with its publication pending                                                                                                                                                                      11719
no publication                                                                                                                                                                                             2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>                                                                                                               56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>                                                                                                            28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>                                                              

In [7]:
#  clean the `publication` column by filtering out unwanted values like `"Dataset with its publication pending"`, `"no publication"`, and any HTML tags. 
# Remove rows with unwanted values
filtered_df = publication_datasets[~publication_datasets['publication'].isin(["Dataset with its publication pending", "no publication"])]

In [8]:
# Assuming filtered_df is already defined
filtered_df['publication_link'] = None  # Create a new column for the links

for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Ensure string type
    if "href" in pub:
        match = re.findall(r'href=[\'"]([^\'"]+)[\'"]', pub) ########### "extract" "pandas function
        if match:
            filtered_df.at[i, 'publication_link'] = match  # Assign all the matched URLs
        else:
            filtered_df.at[i, 'publication_link'] = None
    else:
        filtered_df.at[i, 'publication_link'] = None

# Drop rows with missing links (optional)
filtered_df.dropna(subset=['publication_link'], inplace=True)
filtered_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['publication_link'] = None  # Create a new column for the links
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.dropna(subset=['publication_link'], inplace=True)


In [9]:
# filtered_df['publication_link'].value_counts()
# # starting from publication, make some string substitution and create a new column: doi
# filtered_df["doi"] = filtered_df["publication"].copy()
# filtered_df["doi"] = filtered_df["doi"].apply(orchestrator.data_fetcher.convert_url_to_doi)

In [10]:
m = 0
for i,row in filtered_df.iterrows():
    id = row['identifier']
    m+=len(row['publication_link'])
orchestrator.logger.info(f"Average number of publications per dataset: {m/i+1}")

1793603813.py - line 5 - INFO - Average number of publications per dataset: 2.5313095933060907


In [11]:
# print(filtered_df['publication'][1])
# print(filtered_df['publication_link'][1])
# # print publication and publication link fields

In [12]:
# raw_data_v1.csv \ PRIDEid_HTML_data.csv is the file containing the old data
try:
    # check if the file exists at path input_file
    df_old = pd.read_parquet(input_file)
    if "publication" in df_old.columns:
        df_old.set_index("publication",inplace=True) 
    # check the uid of the last value of df_old and find its index in filtered_df
    dataset_uids = list(df_old['dataset_uid'].values)
    # change this: do not set start point. Just skip link that are already in df
    all_ids = []
    for element in [item.split(',') for item in df_old['dataset_uid'].values]:
        all_ids.extend(element)
    orchestrator.logger.info(f"File found:\n{df_old.head()}")
except FileNotFoundError:
    df_old = pd.DataFrame(columns=['publication','dataset_uid','repo_name','raw_html','smallest_elements'])
    dataset_uids = []
    all_ids = []
    orchestrator.logger.info("File not found")

3762078171.py - line 18 - INFO - File not found


In [13]:
# count distinct values in dataset_uids
n_datasets = len(set(all_ids))
print(f"# of Datasets already added: {n_datasets}")
print(f"# of Publications already added: {len(df_old)}")

# of Datasets already added: 0
# of Publications already added: 0


In [14]:
def extract_all_elements_with_UID(source_html, uid):
    soup = BeautifulSoup(source_html, "html.parser")
    
    matching_elements = []
    
    for p in soup.find_all(["table","p"]):  # Find only <p> elements
        text = p.get_text(strip=True)
        
        if uid in text:  # Check if UID is in the text
            matching_elements.append((str(p), len(text)))  # Store element and length

    # If multiple matches, return the **smallest** one
    if matching_elements:
        # smallest_p, _ = min(matching_elements, key=lambda x: x[1])  # Find smallest
        return matching_elements  # Pretty-print the raw HTML for debugging # smallest_p
    
    return [None]  # No match found

In [15]:
def add_example_to_merged_df(row, raw_html):
    # handle uid also when comma-separated, then split and extract smallest element
    if 'identifier' in row:
        uid = row['identifier']
    elif 'dataset_uid' in row:
        uid = row['dataset_uid']
    if ',' in uid:
        uids = uid.split(',')
        elements = []
        for uid in uids:
            elm_i = extract_all_elements_with_UID(raw_html, uid)
            if elm_i in elements: # no dupes
                continue
            else:
                elements.append(elm_i)
        return elements
    else:
        return extract_all_elements_with_UID(raw_html, uid)


In [16]:
data = []

iter_max = 10

t0 = time.time()

for i,row in filtered_df.iterrows():
        
    id = row['identifier']
    
    if id in dataset_uids:
        orchestrator.logger.info(f"Skipping dataset {id} as it is already in the data")
        continue
    
    if i == iter_max:
        break
        
    if i%100 == 0 and i>0:
        orchestrator.logger.info(f"Progress {i+1}/{len(filtered_df)}. ETA {((time.time()-t0)/(i+1))*(len(filtered_df)-i-1)}")
        
    # get time in a nice format like hh:mm:ss
    time_elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time()-t0))
        
    orchestrator.logger.info(f"Processing URL {i+1}.\nTime elapsed: {time_elapsed}") if i>0 else None
    orchestrator.logger.info(f"{len(row['publication_link'])} links found for dataset {id}")
    
    j = 0
    for url in row['publication_link']:
        j+=1
        
        driver = orchestrator.setup_data_fetcher()
        
        orchestrator.logger.info(f"Processing URL: {url}")
        adjusted_url_for_fetch = url
    
        orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(url, orchestrator.full_DOM, orchestrator.logger)
        
        try:
            #orchestrator.logger.info("Fetching Raw content")
            raw_data = orchestrator.data_fetcher.fetch_data(url)
            doi = orchestrator.data_fetcher.convert_url_to_doi(url)
            url = url.lower()
            
            orchestrator.logger.info(f"Publication #: {j}, current URL: {orchestrator.data_fetcher.scraper_tool.current_url}")
            orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(orchestrator.data_fetcher.scraper_tool.current_url)
            
            if orchestrator.publisher == "biorxiv":
                adjusted_url_for_fetch = orchestrator.data_fetcher.scraper_tool.current_url + ".full"
                raw_data = orchestrator.data_fetcher.fetch_data(adjusted_url_for_fetch)
                
            if orchestrator.publisher == "pubmed":
                PMC_ID = orchestrator.data_fetcher.get_PMCID_from_pubmed_html(raw_data)
                if PMC_ID is not None:
                    adjusted_url_for_fetch = orchestrator.data_fetcher.reconstruct_PMC_link(PMC_ID)
                    raw_data = orchestrator.data_fetcher.fetch_data(adjusted_url_for_fetch)

            if id in raw_data:
                example = add_example_to_merged_df(row, raw_data)
                orchestrator.logger.info(f"Example: {example}")
                
                data.append({"publication": url, "fetch_from" : adjusted_url_for_fetch, "doi": doi, "publisher":orchestrator.publisher, "dataset_uid": id, "repo_name": row['repository'],
                             "raw_html": raw_data, "smallest_elements": example, "title" : row['title'], "keywords" : row['keywords']})
                orchestrator.logger.info(f"Data appended for {url}")
                driver.quit()
                continue
            else:
                orchestrator.logger.info("id not found in raw data")
                data.append({"publication": url, "fetch_from" : adjusted_url_for_fetch, "doi": doi, "publisher":orchestrator.publisher, "dataset_uid": id, "repo_name": row['repository'],
                             "raw_html": raw_data, "smallest_elements": "n/a", "title" : row['title'], "keywords" : row['keywords']})
                driver.quit()
                continue
        except Exception as e:
            orchestrator.logger.error(f"Error processing URL {url}: {e}", exc_info=True)

print(f"Time elapsed for {i-n_datasets} iterations: {time.strftime('%H:%M:%S', time.gmtime(time.time() - t0))}")

1323953949.py - line 25 - INFO - 1 links found for dataset PXD059466
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
1323953949.py - line 33 - INFO - Processing URL: https://dx.doi.org/10.1038/S41467-025-56720-1
data_fetcher.py - line 70 - INFO - Non-API URL detected, or API unsupported. Webscraper update
data_fetcher.py - line 266 - INFO - DOI: 10.1038/s41467-025-56720-1
1323953949.py - line 44 - INFO - Publication #: 1, current URL: https://www.nature.com/articles/s41467-025-56720-1
data_fetcher.py - line 35 - INFO - Publisher: nature
1323953949.py - line 59 - INFO - Example: [('<p>The protein mass spectrometry (LC-MS/MS) data produced in this study are available in the ProteomeXchange under accession code PXD059466. All data generated or analyzed in this study are included in this published article and its supplementary information files.\xa0<a data-track="click" data-track-action="section anchor" data-track-label="link" href="/articles/s41467-025-56720-1#Sec33">Sou

Time elapsed for 10 iterations: 00:03:16


In [17]:
df = pd.DataFrame(data)
df.set_index("publication",inplace=True)
 
if "smallest_elements" in df.columns:    
    df["smallest_elements"] = df["smallest_elements"].apply(
        lambda x: json.dumps(x) if isinstance(x, list) else (x if isinstance(x, str) else json.dumps([]))
    )
df

Unnamed: 0_level_0,fetch_from,doi,publisher,dataset_uid,repo_name,raw_html,smallest_elements,title,keywords
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://dx.doi.org/10.1038/s41467-025-56720-1,https://dx.doi.org/10.1038/S41467-025-56720-1,10.1038/s41467-025-56720-1,nature,PXD059466,PRIDE,"<html lang=""en"" class=""js""><head><link rel=""pr...","[[""<p>The protein mass spectrometry (LC-MS/MS)...",Endothelial SHANK3 Regulates Tight Junctions i...,"BioID2, SHANK3, bEnd3. Cells"
https://dx.doi.org/10.6019/pxd051312,https://dx.doi.org/10.6019/PXD051312,10.6019/pxd051312,proteomexchange,PXD051312,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan...","[[""<table class=\""dataset-summary\""><tbody><tr...",Systemic changes in early pregnancy in the mar...,"MRP, TTR, biomarker, early pregnancy loss, sec..."
https://www.ncbi.nlm.nih.gov/pubmed/39912552,https://www.ncbi.nlm.nih.gov/pubmed/39912552,,pubmed,PXD051312,PRIDE,"<html lang=""en""><head itemscope="""" itemtype=""h...",,Systemic changes in early pregnancy in the mar...,"MRP, TTR, biomarker, early pregnancy loss, sec..."
https://dx.doi.org/10.1002/prca.202400095,https://dx.doi.org/10.1002/prca.202400095,10.1002/prca.202400095,wiley,PXD051312,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i...","[[""<p>Mass spectrometry proteomics data were d...",Systemic changes in early pregnancy in the mar...,"MRP, TTR, biomarker, early pregnancy loss, sec..."
https://dx.doi.org/10.17159/sajs.2025/18571,https://dx.doi.org/10.17159/SAJS.2025/18571,10.17159/sajs.2025/18571,co,PXD054431,PRIDE,"<html lang=""en"" xml:lang=""en""><head>\n\t<meta ...","[[""<p><strong>Open data set :</strong> <a href...",Results from an Australopithecus africanus den...,"Australopithecus africanus, Enamel, Paleaoprot..."
https://dx.doi.org/10.1101/2024.04.03.587901,https://www.biorxiv.org/content/10.1101/2024.0...,10.1101/2024.04.03.587901,biorxiv,PXD050443,PRIDE,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3...","[[""<p id=\""p-81\"">All the sequencing data gene...",Arabidopsis and Spirodela Histone PTMs (In cas...,"Histone PTMs, Plants, Transposons, epigenetics"
https://dx.doi.org/10.1101/2025.02.05.636703,https://www.biorxiv.org/content/10.1101/2025.0...,10.1101/2025.02.05.636703,biorxiv,PXD051704,PRIDE,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3...","[[""<p id=\""p-66\"">The mass spectrometry proteo...",Time-series cotton fiber development profiling...,"Gossypium hirsutum, PCP, cotton, fiber develop..."
https://dx.doi.org/10.1186/s13100-024-00339-4,https://dx.doi.org/10.1186/S13100-024-00339-4,10.1186/s13100-024-00339-4,biomedcentral,PXD057269,PRIDE,"<html lang=""en"" class=""js webfonts-loaded""><he...","[[""<p>All the MS data files have been deposite...",Targeted mass spectrometry of endogenous LINE-...,"LINE-1, PRM, SRM, Targeted proteomics, cancer,..."
https://dx.doi.org/10.1186/s12964-025-02046-w,https://dx.doi.org/10.1186/s12964-025-02046-w,10.1186/s12964-025-02046-w,biomedcentral,PXD026293,PRIDE,"<html lang=""en"" class=""js webfonts-loaded""><he...","[[""<p>The mass spectrometry proteomics data ha...",Inhibition of NLRP3 enhances pro-apoptotic eff...,"Acute Myeloid Leukemia, EiF2Î±, NLRP3"
https://www.ncbi.nlm.nih.gov/pubmed/39875995,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,,pubmed,PXD026293,PRIDE,"<html lang=""en"" class=""""><head>\n\n <me...","[[""<p>The mass spectrometry proteomics data ha...",Inhibition of NLRP3 enhances pro-apoptotic eff...,"Acute Myeloid Leukemia, EiF2Î±, NLRP3"


In [18]:
if os.path.exists(input_file):
    df_old = pd.read_parquet(input_file)
    if "publication" in df_old.columns:
        df_old.set_index("publication",inplace=True)
    orchestrator.logger.info(df_old.shape)
    # append rows to the existing file
    df_new = pd.concat([df_old,df])
    df_new.to_parquet(output_file)
    orchestrator.logger.info(f"Data appended to {output_file}")
else:
    df.to_parquet(output_file)
    orchestrator.logger.info(f"Data written to new file {output_file}")
    df_new = df

1422399759.py - line 12 - INFO - Data written to new file exp_input/PX_id_HTML.parquet


In [19]:
df_merged = (
    df_new.reset_index()
    .groupby('publication')
    .agg({
        'fetch_from': 'first',  # Keep the first fetch_from,
        'dataset_uid': lambda x: ','.join(sorted(set(x))),  # Concatenate unique dataset_uids
        'repo_name': lambda x: ','.join(sorted(set(x))),   # Concatenate unique repo_names
        'doi': 'first',  # Keep the first raw_html,
        'raw_html': 'first',  # Keep the first raw_html,
        'publisher' : 'first',
        'smallest_elements': lambda x: ' | '.join(filter(None, x)),  # Remove None values and join strings
        'title' : lambda x: ','.join(sorted(set(x))),
        'keywords' : lambda x: ','.join(sorted(set(x)))
    })
)

In [20]:
df_merged

Unnamed: 0_level_0,fetch_from,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements,title,keywords
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://dx.doi.org/10.1002/prca.202400095,https://dx.doi.org/10.1002/prca.202400095,PXD051312,PRIDE,10.1002/prca.202400095,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,"[[""<p>Mass spectrometry proteomics data were d...",Systemic changes in early pregnancy in the mar...,"MRP, TTR, biomarker, early pregnancy loss, sec..."
https://dx.doi.org/10.1016/j.biortech.2024.132023,https://dx.doi.org/10.1016/J.BIORTECH.2024.132023,PXD057021,PRIDE,10.1016/j.biortech.2024.132023,"<html lang=""en-us"" data-astro-cid-ns72h3ro="""" ...",sciencedirect,,Proteomic analysis of natural photoheterotroph...,"Clostridium sp., Proteins expression, Rhodopse..."
https://dx.doi.org/10.1016/j.mcpro.2024.100892,https://dx.doi.org/10.1016/J.MCPRO.2024.100892,PXD053023,PRIDE,10.1016/j.mcpro.2024.100892,"<html lang=""en-us"" data-astro-cid-ns72h3ro="""" ...",sciencedirect,,Electrophoresis-Correlative Framing of Ion Mob...,"Single cell, Xenopus laevis, capillary electro..."
https://dx.doi.org/10.1038/s41467-025-56720-1,https://dx.doi.org/10.1038/S41467-025-56720-1,PXD059466,PRIDE,10.1038/s41467-025-56720-1,"<html lang=""en"" class=""js""><head><link rel=""pr...",nature,"[[""<p>The protein mass spectrometry (LC-MS/MS)...",Endothelial SHANK3 Regulates Tight Junctions i...,"BioID2, SHANK3, bEnd3. Cells"
https://dx.doi.org/10.1101/2024.04.03.587901,https://www.biorxiv.org/content/10.1101/2024.0...,PXD050443,PRIDE,10.1101/2024.04.03.587901,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3...",biorxiv,"[[""<p id=\""p-81\"">All the sequencing data gene...",Arabidopsis and Spirodela Histone PTMs (In cas...,"Histone PTMs, Plants, Transposons, epigenetics"
https://dx.doi.org/10.1101/2024.09.07.611787,https://www.biorxiv.org/content/10.1101/2024.0...,PXD044584,PRIDE,10.1101/2024.09.07.611787,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3...",biorxiv,"[[""<p id=\""p-100\"">Cryo-EM maps have been depo...",First-in-class DUB Molecular Glues and Inhibit...,"deubiquitylase, hdx, hydrogen deuterium exchan..."
https://dx.doi.org/10.1101/2025.02.05.636703,https://www.biorxiv.org/content/10.1101/2025.0...,PXD051704,PRIDE,10.1101/2025.02.05.636703,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3...",biorxiv,"[[""<p id=\""p-66\"">The mass spectrometry proteo...",Time-series cotton fiber development profiling...,"Gossypium hirsutum, PCP, cotton, fiber develop..."
https://dx.doi.org/10.1186/s12964-025-02046-w,https://dx.doi.org/10.1186/s12964-025-02046-w,PXD026293,PRIDE,10.1186/s12964-025-02046-w,"<html lang=""en"" class=""js webfonts-loaded""><he...",biomedcentral,"[[""<p>The mass spectrometry proteomics data ha...",Inhibition of NLRP3 enhances pro-apoptotic eff...,"Acute Myeloid Leukemia, EiF2Î±, NLRP3"
https://dx.doi.org/10.1186/s13100-024-00339-4,https://dx.doi.org/10.1186/S13100-024-00339-4,PXD057269,PRIDE,10.1186/s13100-024-00339-4,"<html lang=""en"" class=""js webfonts-loaded""><he...",biomedcentral,"[[""<p>All the MS data files have been deposite...",Targeted mass spectrometry of endogenous LINE-...,"LINE-1, PRM, SRM, Targeted proteomics, cancer,..."
https://dx.doi.org/10.17159/sajs.2025/18571,https://dx.doi.org/10.17159/SAJS.2025/18571,PXD054431,PRIDE,10.17159/sajs.2025/18571,"<html lang=""en"" xml:lang=""en""><head>\n\t<meta ...",co,"[[""<p><strong>Open data set :</strong> <a href...",Results from an Australopithecus africanus den...,"Australopithecus africanus, Enamel, Paleaoprot..."


In [21]:
df_merged.to_parquet(output_file)

In [22]:
orchestrator.logger.info(len(filtered_df))
filtered_df.head()

1264854841.py - line 1 - INFO - 25696


Unnamed: 0,publication,identifier,repository,title,keywords,publication_link
0,"<a href=""https://dx.doi.org/10.1038/S41467-025...",PXD059466,PRIDE,Endothelial SHANK3 Regulates Tight Junctions i...,"BioID2, SHANK3, bEnd3. Cells",[https://dx.doi.org/10.1038/S41467-025-56720-1]
1,"<a href=""https://dx.doi.org/10.6019/PXD051312""...",PXD051312,PRIDE,Systemic changes in early pregnancy in the mar...,"MRP, TTR, biomarker, early pregnancy loss, sec...","[https://dx.doi.org/10.6019/PXD051312, https:/..."
2,"<a href=""https://dx.doi.org/10.17159/SAJS.2025...",PXD054431,PRIDE,Results from an Australopithecus africanus den...,"Australopithecus africanus, Enamel, Paleaoprot...",[https://dx.doi.org/10.17159/SAJS.2025/18571]
3,"<a href=""https://dx.doi.org/10.1101/2024.04.03...",PXD050443,PRIDE,Arabidopsis and Spirodela Histone PTMs (In cas...,"Histone PTMs, Plants, Transposons, epigenetics",[https://dx.doi.org/10.1101/2024.04.03.587901]
4,"<a href=""https://dx.doi.org/10.1101/2025.02.05...",PXD051704,PRIDE,Time-series cotton fiber development profiling...,"Gossypium hirsutum, PCP, cotton, fiber develop...",[https://dx.doi.org/10.1101/2025.02.05.636703]


In [23]:
def count_new_ids_in_publications():
    cnt = 0
    # new ids to add to publication with example
    new_ids = {}
    for publications,identifier in zip(filtered_df['publication_link'],filtered_df['identifier']): ##################### CASE INSENSITIVE CHECK
        for pub in publications:
            for publication in df_merged.index:
                if pub.lower() == publication.lower() and identifier not in df_merged.loc[publication]['dataset_uid']:
                    cnt+=1
                    new_ids[publication] = identifier
                    orchestrator.logger.info(f"Publication from proteomexchange_search.tsv: {pub,identifier}")
                    #orchestrator.logger.info(f"Publication data in df_merged: {df_merged.loc[publication]}")
                    orchestrator.logger.info("\n")
    orchestrator.logger.info(f"Count of new ids with publications in proteomexchange_search.tsv: {cnt}")
    return new_ids

In [24]:
new_ids = count_new_ids_in_publications()

1095542657.py - line 14 - INFO - Count of new ids with publications in proteomexchange_search.tsv: 0


In [25]:
new_ids

{}

In [26]:
df_merged_copy = df_merged.copy()

In [27]:
# add new dataset ids to df_merged:
# - the ids should be added to the dataset_uid column of the publication 
# - the raw_html will remain the same
# - smallest elements will be extended with the new ids matching elements

if new_ids is not None:
    for new_publication in new_ids:
        orchestrator.logger.info(f"new_publication: {new_publication}")
        orchestrator.logger.info(f"old_ids: {df_merged_copy.loc[new_publication]['dataset_uid']} {type(df_merged_copy.loc[new_publication]['dataset_uid'])}, new_id: {new_ids[new_publication]}")
        
        df_merged.at[new_publication,'dataset_uid'] = df_merged_copy.loc[new_publication]['dataset_uid'] + ',' + new_ids[new_publication]
        orchestrator.logger.info(f"updated:{df_merged_copy.loc[new_publication]['dataset_uid'] + ',' + new_ids[new_publication]}")
        
        orchestrator.logger.info(f"old_smallest_elements: {df_merged_copy.loc[new_publication]['smallest_elements']}")
        orchestrator.logger.info(f"function args{df_merged_copy.loc[new_publication], 'df_merged_copy.loc[new_PXD][raw_html]'}")
        
        if df_merged.loc[new_publication]['smallest_elements'] == 'n/a' or df_merged.loc[new_publication]['smallest_elements'] is None:
            raw_html = df_merged_copy.loc[new_publication]['raw_html']
            df_merged.at[new_publication,'smallest_elements'] = extract_all_elements_with_UID(raw_html, new_ids[new_publication])
        elif ',' in df_merged.loc[new_publication]['smallest_elements']:
            orchestrator.logger.info('extending')
            df_merged.at[new_publication,'smallest_elements'] = df_merged.at[new_publication,'smallest_elements'].extend(add_example_to_merged_df(df_merged.loc[new_publication], new_ids[new_publication]))
        else:
            orchestrator.logger.error(f"SOMETHING WENT WRONG")
        
        orchestrator.logger.info(f"df_merged.loc[new_PXD]['dataset_uid']: {df_merged.loc[new_publication]['dataset_uid']}")
        orchestrator.logger.info(f"new smallest elements with {new_publication}: {df_merged.loc[new_publication]['smallest_elements']}\n\n")

In [28]:
df_merged[df_merged['smallest_elements'] == 'n/a'][['publisher']]

Unnamed: 0_level_0,publisher
publication,Unnamed: 1_level_1
https://dx.doi.org/10.1016/j.biortech.2024.132023,sciencedirect
https://dx.doi.org/10.1016/j.mcpro.2024.100892,sciencedirect
https://www.ncbi.nlm.nih.gov/pubmed/39912552,pubmed


In [29]:
df_merged.to_parquet(output_file)
df_merged.head()

Unnamed: 0_level_0,fetch_from,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements,title,keywords
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://dx.doi.org/10.1002/prca.202400095,https://dx.doi.org/10.1002/prca.202400095,PXD051312,PRIDE,10.1002/prca.202400095,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,"[[""<p>Mass spectrometry proteomics data were d...",Systemic changes in early pregnancy in the mar...,"MRP, TTR, biomarker, early pregnancy loss, sec..."
https://dx.doi.org/10.1016/j.biortech.2024.132023,https://dx.doi.org/10.1016/J.BIORTECH.2024.132023,PXD057021,PRIDE,10.1016/j.biortech.2024.132023,"<html lang=""en-us"" data-astro-cid-ns72h3ro="""" ...",sciencedirect,,Proteomic analysis of natural photoheterotroph...,"Clostridium sp., Proteins expression, Rhodopse..."
https://dx.doi.org/10.1016/j.mcpro.2024.100892,https://dx.doi.org/10.1016/J.MCPRO.2024.100892,PXD053023,PRIDE,10.1016/j.mcpro.2024.100892,"<html lang=""en-us"" data-astro-cid-ns72h3ro="""" ...",sciencedirect,,Electrophoresis-Correlative Framing of Ion Mob...,"Single cell, Xenopus laevis, capillary electro..."
https://dx.doi.org/10.1038/s41467-025-56720-1,https://dx.doi.org/10.1038/S41467-025-56720-1,PXD059466,PRIDE,10.1038/s41467-025-56720-1,"<html lang=""en"" class=""js""><head><link rel=""pr...",nature,"[[""<p>The protein mass spectrometry (LC-MS/MS)...",Endothelial SHANK3 Regulates Tight Junctions i...,"BioID2, SHANK3, bEnd3. Cells"
https://dx.doi.org/10.1101/2024.04.03.587901,https://www.biorxiv.org/content/10.1101/2024.0...,PXD050443,PRIDE,10.1101/2024.04.03.587901,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3...",biorxiv,"[[""<p id=\""p-81\"">All the sequencing data gene...",Arabidopsis and Spirodela Histone PTMs (In cas...,"Histone PTMs, Plants, Transposons, epigenetics"


In [30]:
for i,row in df_merged.iterrows():
    if row['publisher'] == 'Unknown Publisher':
        print(i)

In [31]:
# let's run a quick check
for pub,row in df_merged.iterrows():
    print(f"Publication URL: {pub}")
    cnt = 0
    for i,record in filtered_df.iterrows():
        for publication in record['publication_link']:
            if pub.lower() == publication.lower():
                # print the dataset id
                print(f"Match found in source data: {record['identifier']}")
                cnt+=1
                break
    orchestrator.logger.info(f"Number of matches in source data: {cnt}")
    orchestrator.logger.info(f"Value found in merged data: {row['dataset_uid']}")
    orchestrator.logger.info(f"Number of items in processed data: {len(row['dataset_uid'].split(','))}")
    
    if cnt != len(row['dataset_uid'].split(',')):
        raise ValueError(f"ERROR: Count mismatch for {pub}")

Publication URL: https://dx.doi.org/10.1002/prca.202400095
Match found in source data: PXD051312
Number of matches in source data: 1
Value found in merged data: PXD051312
Number of items in processed data: 1
Publication URL: https://dx.doi.org/10.1016/j.biortech.2024.132023
Match found in source data: PXD057021
Number of matches in source data: 1
Value found in merged data: PXD057021
Number of items in processed data: 1
Publication URL: https://dx.doi.org/10.1016/j.mcpro.2024.100892
Match found in source data: PXD053023
Number of matches in source data: 1
Value found in merged data: PXD053023
Number of items in processed data: 1
Publication URL: https://dx.doi.org/10.1038/s41467-025-56720-1
Match found in source data: PXD059466
Number of matches in source data: 1
Value found in merged data: PXD059466
Number of items in processed data: 1
Publication URL: https://dx.doi.org/10.1101/2024.04.03.587901
Match found in source data: PXD050443
Number of matches in source data: 1
Value found in 