In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from classifier import *
from data_fetcher import *
from parser import *
from orchestrator import *
from logger_setup import *
import os
import json
import pandas as pd
import numpy as np
import re
import time

In [2]:
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp


In [3]:
urls = None
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
publication_datasets = df[['publication','identifier','repository']]

In [4]:
# get an idea of values in df publication
publication_datasets['publication'].value_counts()

publication
Dataset with its publication pending                                                                                                                                                                      11719
no publication                                                                                                                                                                                             2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>                                                                                                               56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>                                                                                                            28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>                                                              

In [5]:
#  clean the `publication` column by filtering out unwanted values like `"Dataset with its publication pending"`, `"no publication"`, and any HTML tags. 
# Remove rows with unwanted values
filtered_df = publication_datasets[~publication_datasets['publication'].isin(["Dataset with its publication pending", "no publication"])]

In [6]:
# Assuming filtered_df is already defined
filtered_df['publication_link'] = None  # Create a new column for the links

for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Ensure string type
    if "href" in pub:
        match = re.findall(r'href=[\'"]([^\'"]+)[\'"]', pub)
        if match:
            filtered_df.at[i, 'publication_link'] = match  # Assign all the matched URLs
        else:
            filtered_df.at[i, 'publication_link'] = None
    else:
        filtered_df.at[i, 'publication_link'] = None

# Drop rows with missing links (optional)
filtered_df.dropna(subset=['publication_link'], inplace=True)
filtered_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['publication_link'] = None  # Create a new column for the links
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.dropna(subset=['publication_link'], inplace=True)


In [7]:
m = 0
for i,row in filtered_df.iterrows():
    id = row['identifier']
    m+=len(row['publication_link'])
print(m/i+1)
print(filtered_df)

2.5313095933060907
                                             publication identifier  \
0      <a href="https://dx.doi.org/10.1038/S41467-025...  PXD059466   
1      <a href="https://dx.doi.org/10.6019/PXD051312"...  PXD051312   
2      <a href="https://dx.doi.org/10.17159/SAJS.2025...  PXD054431   
3      <a href="https://dx.doi.org/10.1101/2024.04.03...  PXD050443   
4      <a href="https://dx.doi.org/10.1101/2025.02.05...  PXD051704   
...                                                  ...        ...   
25691  <a href="http://www.ncbi.nlm.nih.gov/pubmed/23...  PXD000056   
25692  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000009   
25693  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000012   
25694  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000013   
25695  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000003   

         repository                                   publication_link  
0             PRIDE    [https://dx.doi.org/10.1038/S414

In [8]:
# raw_data_v1.csv \ PRIDEid_HTML_data.csv is the file containing the old data
df_old = pd.read_csv("exp_input/raw_data_v1.csv", index_col="publication")
# check the uid of the last value of df_old and find its index in filtered_df
dataset_uids = list(df_old['dataset_uid'].values)

In [9]:
df_old.head()

Unnamed: 0_level_0,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,PRIDE,10.1001/JAMANEUROL.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,na
https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,10.1002/CBIC.202400831,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The mass spectrometry proteomics data ha...
https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,PRIDE,10.1002/CBIC.202400882,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The data that support the findings of th...
https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,10.1002/PRCA.202300107,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>Generation of the protein library and SW...
https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,10.1002/anie.202420149,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>The mass spectrometry proteomics data ha...


In [10]:
all_ids = []
for element in [item.split(',') for item in df_old['dataset_uid'].values]:
    all_ids.extend(element)

In [11]:
# count distinct values in dataset_uids
latest_index = len(set(all_ids))
latest_index

181

In [12]:
def extract_all_elements_with_UID(source_html, uid):
    soup = BeautifulSoup(source_html, "html.parser")
    
    matching_elements = []
    
    for p in soup.find_all(["table","p"]):  # Find only <p> elements
        text = p.get_text(strip=True)
        
        if uid in text:  # Check if UID is in the text
            matching_elements.append((str(p), len(text)))  # Store element and length

    # If multiple matches, return the **smallest** one
    if matching_elements:
        # smallest_p, _ = min(matching_elements, key=lambda x: x[1])  # Find smallest
        return matching_elements  # Pretty-print the raw HTML for debugging # smallest_p
    
    return None  # No match found

In [13]:
def add_example_to_merged_df(row, raw_html):
    # handle uid also when comma-separated, then split and extract smallest element
    if 'identifier' in row:
        uid = row['identifier']
    elif 'dataset_uid' in row:
        uid = row['dataset_uid']
    if ',' in uid:
        uids = uid.split(',')
        elements = []
        for uid in uids:
            elm_i = extract_all_elements_with_UID(raw_html, uid)
            if elm_i in elements: # no dupes
                continue
            else:
                elements.append(elm_i)
        return elements
    else:
        return extract_all_elements_with_UID(raw_html, uid)


In [14]:
data = []

start = latest_index + 1 

iter_max = 1

t0 = time.time()

for i,row in filtered_df[start:].iterrows():
        
    id = row['identifier']
    
    if id in dataset_uids:
        continue
    
    if i == start + iter_max:
        break
        
    if i%100 == 0 and i>0:
        orchestrator.logger.info(f"Progress {i+1}/{len(filtered_df)}. ETA {((time.time()-t0)/(i+1))*(len(filtered_df)-i-1)}")
        
    orchestrator.logger.info(f"Processing URL {i+1}.\nTime elapsed: {time.time()-t0}") if i>0 else None
    orchestrator.logger.info(f"{len(row['publication_link'])} links found for dataset {id}")
    
    j = 0
    for url in row['publication_link']:
        j+=1
        
        driver = orchestrator.setup_data_fetcher()
        
        orchestrator.logger.info(f"Processing URL: {url}")
    
        orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(url, orchestrator.full_DOM, orchestrator.logger)
        
        try:
            #orchestrator.logger.info("Fetching Raw content")
            raw_data = orchestrator.data_fetcher.fetch_data(url)
            
            orchestrator.logger.info(f"Publication #: {j}, current URL: {orchestrator.data_fetcher.scraper_tool.current_url}")
            orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(orchestrator.data_fetcher.scraper_tool.current_url)
            
            if orchestrator.publisher == "biorxiv":
                fixed_url = orchestrator.data_fetcher.scraper_tool.current_url + ".full"
                raw_data = orchestrator.data_fetcher.fetch_data(fixed_url)

            if id in raw_data:
                example = add_example_to_merged_df(row, raw_data)
                orchestrator.logger.info(f"Example: {example}")
                
                data.append({"publication": url, "publisher":orchestrator.publisher, "dataset_uid": id, "repo_name": row['repository'],
                             "raw_html": raw_data, "smallest_elements": example})
                orchestrator.logger.info(f"Data appended for {url}")
                driver.quit()
                continue
            else:
                orchestrator.logger.info("id not found in raw data")
                data.append({"publication": url, "publisher":orchestrator.publisher, "dataset_uid": id, "repo_name": row['repository'],
                             "raw_html": raw_data, "smallest_elements": 'na'})
                driver.quit()
                continue
        except Exception as e:
            orchestrator.logger.error(f"Error processing URL {url}: {e}", exc_info=True)

1225212621.py - line 22 - INFO - Processing URL 183.
Time elapsed: 0.005068063735961914
1225212621.py - line 23 - INFO - 1 links found for dataset PXD045956
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
1225212621.py - line 31 - INFO - Processing URL: https://dx.doi.org/10.1021/ACS.ANALCHEM.4C02924
data_fetcher.py - line 69 - INFO - Non-API URL detected, or API unsupported. Webscraper update
1225212621.py - line 39 - INFO - Publication #: 1, current URL: https://pubs.acs.org/doi/10.1021/acs.analchem.4c02924
data_fetcher.py - line 34 - INFO - Publisher: acs
1225212621.py - line 56 - INFO - id not found in raw data


In [15]:
df = pd.DataFrame(data)
df.set_index("publication",inplace=True)
df

Unnamed: 0_level_0,publisher,dataset_uid,repo_name,raw_html,smallest_elements
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://dx.doi.org/10.1021/ACS.ANALCHEM.4C02924,acs,PXD045956,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i...",na


In [16]:
df_old = pd.read_csv("exp_input/raw_data_v1.csv", index_col="publication")
print(df_old.shape)
# append rows to the existing CSV
df_new = pd.concat([df_old,df])
df_new.to_csv("exp_input/raw_data_v1.csv")

(238, 6)


In [17]:
df_new.shape

(239, 6)

In [18]:
print(len(df_new.index))
print(len(df_new.index.unique()))

239
239


In [20]:
# starting from publication, make some string substitution and create a new column: doi
df_new["doi"] = df_new.index
df_new["doi"] = df_new["doi"].apply(orchestrator.data_fetcher.convert_url_to_doi)

data_fetcher.py - line 256 - INFO - DOI: 10.1001/JAMANEUROL.2024.4763
data_fetcher.py - line 256 - INFO - DOI: 10.1002/CBIC.202400831
data_fetcher.py - line 256 - INFO - DOI: 10.1002/CBIC.202400882
data_fetcher.py - line 256 - INFO - DOI: 10.1002/PRCA.202300107
data_fetcher.py - line 256 - INFO - DOI: 10.1002/anie.202420149
data_fetcher.py - line 256 - INFO - DOI: 10.1002/cac2.12663
data_fetcher.py - line 256 - INFO - DOI: 10.1002/cpt.3409
data_fetcher.py - line 256 - INFO - DOI: 10.1002/ctm2.70030
data_fetcher.py - line 256 - INFO - DOI: 10.1002/pmic.202400310
data_fetcher.py - line 256 - INFO - DOI: 10.1002/prca.202400095
data_fetcher.py - line 256 - INFO - DOI: 10.1007/s00395-024-01041-5
data_fetcher.py - line 256 - INFO - DOI: 10.1016/J.BBADVA.2025.100140
data_fetcher.py - line 256 - INFO - DOI: 10.1016/J.BIORTECH.2024.132023
data_fetcher.py - line 256 - INFO - DOI: 10.1016/J.CELREP.2024.114793
data_fetcher.py - line 256 - INFO - DOI: 10.1016/J.CELREP.2024.115224
data_fetcher.py - 

In [21]:
df_merged = (
    df_new.reset_index()
    .groupby('publication')
    .agg({
        'dataset_uid': lambda x: ','.join(sorted(set(x))),  # Concatenate unique dataset_uids
        'repo_name': lambda x: ','.join(sorted(set(x))),   # Concatenate unique repo_names
        'doi': 'first',  # Keep the first raw_html,
        'raw_html': 'first',  # Keep the first raw_html,
        'publisher' : 'first',
        'smallest_elements': 'first'
    })
)

In [22]:
df_merged

Unnamed: 0_level_0,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,PRIDE,10.1001/JAMANEUROL.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,na
https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,10.1002/CBIC.202400831,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The mass spectrometry proteomics data ha...
https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,PRIDE,10.1002/CBIC.202400882,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The data that support the findings of th...
https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,10.1002/PRCA.202300107,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>Generation of the protein library and SW...
https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,10.1002/anie.202420149,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>The mass spectrometry proteomics data ha...
...,...,...,...,...,...,...
https://www.ncbi.nlm.nih.gov/pubmed/39884247,PXD054970,PRIDE,,"<html lang=""en""><head itemscope="""" itemtype=""h...",pubmed,na
https://www.ncbi.nlm.nih.gov/pubmed/39900909,PXD058955,PRIDE,,"<html lang=""en""><head itemscope="""" itemtype=""h...",pubmed,na
https://www.ncbi.nlm.nih.gov/pubmed/39910101,PXD060051,iProX,,"<html lang=""en""><head itemscope="""" itemtype=""h...",pubmed,na
https://www.ncbi.nlm.nih.gov/pubmed/39910614,PXD060193,iProX,,"<html lang=""en""><head itemscope="""" itemtype=""h...",pubmed,na


In [23]:
df_merged.to_csv("exp_input/raw_data_v1.csv")

In [24]:
print(len(filtered_df))
filtered_df.head()

25696


Unnamed: 0,publication,identifier,repository,publication_link
0,"<a href=""https://dx.doi.org/10.1038/S41467-025...",PXD059466,PRIDE,[https://dx.doi.org/10.1038/S41467-025-56720-1]
1,"<a href=""https://dx.doi.org/10.6019/PXD051312""...",PXD051312,PRIDE,"[https://dx.doi.org/10.6019/PXD051312, https:/..."
2,"<a href=""https://dx.doi.org/10.17159/SAJS.2025...",PXD054431,PRIDE,[https://dx.doi.org/10.17159/SAJS.2025/18571]
3,"<a href=""https://dx.doi.org/10.1101/2024.04.03...",PXD050443,PRIDE,[https://dx.doi.org/10.1101/2024.04.03.587901]
4,"<a href=""https://dx.doi.org/10.1101/2025.02.05...",PXD051704,PRIDE,[https://dx.doi.org/10.1101/2025.02.05.636703]


In [25]:
cnt = 0
#new ids to add to publication with example
new_ids = {}
for publications,identifier in zip(filtered_df['publication_link'],filtered_df['identifier']):
    for pub in publications:
        for publication in df_merged.index:
            if pub == publication and identifier not in df_merged.loc[publication]['dataset_uid']:
                cnt+=1
                new_ids[publication] = identifier
                print(f"Publication from proteomexchange_search.tsv: {pub,identifier}")
                print(f"Publication data in df_merged: \n{df_merged.loc[publication]}")
                print("\n")
print(cnt)

0


In [26]:
new_ids

{}

In [27]:
df_merged_copy = df_merged.copy()

In [28]:
# add new ids to df_merged:
# - the ids should be added to the dataset_uid column of the publication 
# - the raw_html will remain the same
# - smallest elements will be extended with the new ids matching elements
for new_PXD in new_ids:
    print(f"new_PXD: {new_PXD}")
    print(f"old_ids: {df_merged_copy.loc[new_PXD]['dataset_uid']} {type(df_merged_copy.loc[new_PXD]['dataset_uid'])}, \nnew_id: {new_ids[new_PXD]}")
    
    df_merged.at[new_PXD,'dataset_uid'] = df_merged_copy.loc[new_PXD]['dataset_uid'] + ',' + new_ids[new_PXD]
    print(f"updated:{df_merged_copy.loc[new_PXD]['dataset_uid'] + ',' + new_ids[new_PXD]}")
    
    print(f"old_smallest_elements: {df_merged_copy.loc[new_PXD]['smallest_elements']}\n")
    #print(f"function args{df_merged_copy.loc[new_PXD], 'df_merged_copy.loc[new_PXD][raw_html]'}")
    
    if df_merged.loc[new_PXD]['smallest_elements'] == 'na' or df_merged.loc[new_PXD]['smallest_elements'] is None:
        df_merged.at[new_PXD,'smallest_elements'] = extract_all_elements_with_UID(df_merged_copy.loc[new_PXD]['raw_html'], new_ids[new_PXD])
    elif type(df_merged.loc[new_PXD]['smallest_elements']) == list:
        print('extending')
        df_merged.at[new_PXD,'smallest_elements'] = df_merged.at[new_PXD,'smallest_elements'].extend(add_example_to_merged_df(df_merged.loc[new_PXD], new_ids[new_PXD]))
    else:
        print(f"SOMETHING WENT WRONG")
    
    print(f"df_merged.loc[new_PXD]['dataset_uid']: {df_merged.loc[new_PXD]['dataset_uid']}")
    print(f"new smallest elements with {new_PXD}: {df_merged.loc[new_PXD]['smallest_elements']}\n\n")

In [29]:
df_merged.loc[['https://dx.doi.org/10.1038/s41467-024-51174-3',]]

Unnamed: 0_level_0,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
https://dx.doi.org/10.1038/s41467-024-51174-3,"PXD052529,PXD054049,PXD054202",PRIDE,10.1038/s41467-024-51174-3,"<html lang=""en"" class=""js""><head><link rel=""pr...",nature,


In [30]:
len(df_merged)

239

In [31]:
df_merged.to_csv("exp_input/PRIDEid_HTML_data.csv")

In [32]:
df_merged.head()

Unnamed: 0_level_0,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,PRIDE,10.1001/JAMANEUROL.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,na
https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,10.1002/CBIC.202400831,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The mass spectrometry proteomics data ha...
https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,PRIDE,10.1002/CBIC.202400882,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The data that support the findings of th...
https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,10.1002/PRCA.202300107,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>Generation of the protein library and SW...
https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,10.1002/anie.202420149,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>The mass spectrometry proteomics data ha...


In [33]:
for i,row in df_merged.iterrows():
    if row['publisher'] == 'Unknown Publisher':
        print(i)

https://dx.doi.org/10.1002/CBIC.202400831
https://dx.doi.org/10.1002/CBIC.202400882
https://dx.doi.org/10.1002/cpt.3409
https://dx.doi.org/10.1002/pmic.202400310
https://dx.doi.org/10.1096/fba.2022-00063
https://dx.doi.org/10.1111/1751-7915.70106
