In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from classifier import *
from data_fetcher import *
from parser import *
from orchestrator import *
from logger_setup import *
import os
import json
import pandas as pd
import numpy as np
import re
import time

In [2]:
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp


In [3]:
urls = None
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
publication_datasets = df[['publication','identifier','repository']]

In [4]:
# get an idea of values in df publication
publication_datasets['publication'].value_counts()

publication
Dataset with its publication pending                                                                                                                                                                      11719
no publication                                                                                                                                                                                             2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>                                                                                                               56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>                                                                                                            28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>                                                              

In [5]:
#  clean the `publication` column by filtering out unwanted values like `"Dataset with its publication pending"`, `"no publication"`, and any HTML tags. 
# Remove rows with unwanted values
filtered_df = publication_datasets[~publication_datasets['publication'].isin(["Dataset with its publication pending", "no publication"])]
filtered_df[381:]

Unnamed: 0,publication,identifier,repository
899,"<a href=""https://dx.doi.org/10.1101/2024.09.18...",PXD054415,PRIDE
901,"<a href=""https://www.ncbi.nlm.nih.gov/pubmed/3...",PXD029492,PRIDE
902,"<a href=""https://www.ncbi.nlm.nih.gov/pubmed/3...",PXD054117,PRIDE
906,"<a href=""https://dx.doi.org/10.1186/S12964-024...",PXD051580,PRIDE
907,"<a href=""https://dx.doi.org/10.1038/S42004-024...",PXD055488,PRIDE
...,...,...,...
40130,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/23...",PXD000056,PRIDE
40131,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/22...",PXD000009,PRIDE
40133,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/22...",PXD000012,PRIDE
40134,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/22...",PXD000013,PRIDE


In [6]:
# Assuming filtered_df is already defined
filtered_df['publication_link'] = None  # Create a new column for the links

for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Ensure string type
    if "href" in pub:
        match = re.findall(r'href=[\'"]([^\'"]+)[\'"]', pub)
        if match:
            filtered_df.at[i, 'publication_link'] = match  # Assign all the matched URLs
        else:
            filtered_df.at[i, 'publication_link'] = None
    else:
        filtered_df.at[i, 'publication_link'] = None

# Drop rows with missing links (optional)
filtered_df.dropna(subset=['publication_link'], inplace=True)
filtered_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['publication_link'] = None  # Create a new column for the links
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.dropna(subset=['publication_link'], inplace=True)


In [7]:
m = 0
for i,row in filtered_df.iterrows():
    id = row['identifier']
    m+=len(row['publication_link'])
print(m/i+1)
print(filtered_df)

2.5313095933060907
                                             publication identifier  \
0      <a href="https://dx.doi.org/10.1038/S41467-025...  PXD059466   
1      <a href="https://dx.doi.org/10.6019/PXD051312"...  PXD051312   
2      <a href="https://dx.doi.org/10.17159/SAJS.2025...  PXD054431   
3      <a href="https://dx.doi.org/10.1101/2024.04.03...  PXD050443   
4      <a href="https://dx.doi.org/10.1101/2025.02.05...  PXD051704   
...                                                  ...        ...   
25691  <a href="http://www.ncbi.nlm.nih.gov/pubmed/23...  PXD000056   
25692  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000009   
25693  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000012   
25694  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000013   
25695  <a href="http://www.ncbi.nlm.nih.gov/pubmed/22...  PXD000003   

         repository                                   publication_link  
0             PRIDE    [https://dx.doi.org/10.1038/S414

In [8]:
data = []

start = 441

iter_max = 50

t0 = time.time()

for i,row in filtered_df[start:].iterrows():
    
    driver = orchestrator.setup_data_fetcher()
        
    id = row['identifier']
    
    if i == 441 + iter_max:
        break
        
    if i%100 == 0 and i>0:
        print(f"Progress {i+1}/{len(filtered_df)}. ETA {((time.time()-t0)/(i+1))*(len(filtered_df)-i-1)}")
        
    print(f"Processing URL {i+1}.\nTime elapsed: {time.time()-t0}") if i>0 else None
    print(f"{len(row['publication_link'])} links found for dataset {id}")
    

    for url in row['publication_link']:
        orchestrator.logger.info(f"Processing URL: {url}")
    
        orchestrator.current_url = url
        orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(url)
        
        orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(url, orchestrator.full_DOM, orchestrator.logger)
        
        try:
            orchestrator.logger.info("Fetching Raw content")
            raw_data = orchestrator.data_fetcher.fetch_data(url)
            if id in raw_data:
                data.append({"publication": url,"dataset_uid": row['identifier'], "repo_name": row['repository'], "raw_html": raw_data})
                break
            else:
                print("id not found in raw data")
                continue
        except Exception as e:
            orchestrator.logger.error(f"Error processing URL {url}: {e}", exc_info=True)
    
    driver.quit()

    
df = pd.DataFrame(data)
df.set_index("publication",inplace=True)
df

orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39486496
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 389.
Time elapsed: 2.1852731704711914
3 links found for dataset PXD056511


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD056511
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/38617292
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 390.
Time elapsed: 21.39770793914795
2 links found for dataset PXD050296


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1101/2024.04.04.588169
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1101/2023.12.07.570621
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 391.
Time elapsed: 41.14006805419922
1 links found for dataset PXD040649


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/32883804
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 392.
Time elapsed: 60.57174015045166
2 links found for dataset PXD016592


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/35689013
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/35689013
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 393.
Time elapsed: 78.63879919052124
1 links found for dataset PXD023307


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/36009432
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 394.
Time elapsed: 89.4761369228363
1 links found for dataset PXD030894


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.2139/SSRN.4967745
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 395.
Time elapsed: 100.35900902748108
1 links found for dataset PXD055809


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39513657
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 396.
Time elapsed: 117.53144311904907
1 links found for dataset PXD054847


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39120973
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 397.
Time elapsed: 130.03853797912598
1 links found for dataset PXD054692


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39219796
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 398.
Time elapsed: 145.6634020805359
1 links found for dataset PXD054620


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39421202
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 399.
Time elapsed: 158.55678415298462
1 links found for dataset PXD054231


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD054686
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 400.
Time elapsed: 171.16219019889832
2 links found for dataset PXD054686


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.26508/LSA.202403007
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Progress 401/25696. ETA 11630.73322670418
Processing URL 401.
Time elapsed: 184.3813979625702
1 links found for dataset PXD058295


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.3389/FMED.2024.1302637
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 402.
Time elapsed: 203.41849327087402
1 links found for dataset PXD045874


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD054390
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 403.
Time elapsed: 217.17926025390625
1 links found for dataset PXD054390


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/32932765
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 404.
Time elapsed: 230.3014452457428
1 links found for dataset PXD020106


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD054428
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 405.
Time elapsed: 242.8087601661682
1 links found for dataset PXD054428


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1021/acs.jproteome.4c00653
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 406.
Time elapsed: 255.76814723014832
2 links found for dataset PXD053293


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39533701
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39290339
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 407.
Time elapsed: 283.93104314804077
2 links found for dataset PXD050584


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1016/j.bioactmat.2024.08.037
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/38556435
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 408.
Time elapsed: 310.3474769592285
2 links found for dataset PXD042713


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1016/j.jdermsci.2024.03.006
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 43 - ERROR - Error processing URL https://dx.doi.org/10.1016/j.jdermsci.2024.03.006: Message: TypeError: document.body is null
Stacktrace:
@https://www.jdsjournal.com/article/S0923-1811(24)00052-5/abstract:2:7
@https://www.jdsjournal.com/article/S0923-1811(24)00052-5/abstract:3:8
Traceback (most recent call last):
  File "/var/folders/br/873byd752sb72sf4n7yfykxw0000gn/T/ipykernel_28083/2951215865.py", line 35, in <module>
    raw_data = orchestrator.data_fetcher.fetch_data(url)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/data_fetcher.py", line 94, in fetch_data
    self.simulate_user_scroll()
  File "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/data_fetcher.py", line 122, in simulate_user_scroll
    new_height = self.scraper_tool.execute_script("return document.body.scrollHeight")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Processing URL 409.
Time elapsed: 332.47547698020935
1 links found for dataset PXD053895


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39368483
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 410.
Time elapsed: 347.82084012031555
1 links found for dataset PXD053631


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39600157
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 411.
Time elapsed: 362.70966815948486
1 links found for dataset PXD053626


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/38627210
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 412.
Time elapsed: 374.9282422065735
2 links found for dataset PXD048538


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1002/1878-0261.13654
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1016/J.MCPRO.2024.100888
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 413.
Time elapsed: 400.792160987854
1 links found for dataset PXD055123


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD048514
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 414.
Time elapsed: 412.6132411956787
2 links found for dataset PXD048514


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1084/JEM.20240625
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 415.
Time elapsed: 425.1045560836792
1 links found for dataset PXD046122


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1084/JEM.20240625
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 416.
Time elapsed: 443.3894441127777
1 links found for dataset PXD046124


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1016/J.EBIOM.2024.105508
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 417.
Time elapsed: 460.476126909256
1 links found for dataset PXD055932


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD053798
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 418.
Time elapsed: 478.46993613243103
2 links found for dataset PXD053798


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39491529
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 419.
Time elapsed: 491.2611770629883
2 links found for dataset PXD053678


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1002/pmic.202400251
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


id not found in raw data


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1016/J.ACA.2024.343550
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 420.
Time elapsed: 515.5170531272888
2 links found for dataset PXD048982


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD052289
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 421.
Time elapsed: 530.4811911582947
2 links found for dataset PXD052289


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1038/S41564-024-01901-9
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 422.
Time elapsed: 543.7403881549835
1 links found for dataset PXD057832


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1038/S41564-024-01901-9
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 423.
Time elapsed: 557.8687541484833
1 links found for dataset PXD057885


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1021/JASMS.4C00366
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 424.
Time elapsed: 570.970547914505
1 links found for dataset PXD053474


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1021/ACS.JPROTEOME.4C00525
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 425.
Time elapsed: 585.3000240325928
1 links found for dataset PXD052776


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1016/J.MOLCEL.2024.11.026
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 426.
Time elapsed: 602.24649310112
1 links found for dataset PXD057694


2951215865.py - line 34 - INFO - Fetching Raw content
2951215865.py - line 43 - ERROR - Error processing URL https://dx.doi.org/10.1016/J.MOLCEL.2024.11.026: Message: TypeError: document.body is null
Stacktrace:
@https://www.cell.com/molecular-cell/abstract/S1097-2765(24)00951-1?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS1097276524009511%3Fshowall%3Dtrue:2:26
@https://www.cell.com/molecular-cell/abstract/S1097-2765(24)00951-1?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS1097276524009511%3Fshowall%3Dtrue:3:8
Traceback (most recent call last):
  File "/var/folders/br/873byd752sb72sf4n7yfykxw0000gn/T/ipykernel_28083/2951215865.py", line 35, in <module>
    raw_data = orchestrator.data_fetcher.fetch_data(url)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/data_fetcher.py", line 94, in fetch_data
    self.simulate_user_scroll()
  File "/Users/pietro/Desktop/VIDA-NYU/data-gatherer

Processing URL 427.
Time elapsed: 614.2634670734406
3 links found for dataset PXD047724


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1038/S41557-024-01711-W
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 428.
Time elapsed: 627.4505279064178
1 links found for dataset PXD057262


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1042/BCJ20240204
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 429.
Time elapsed: 640.4676342010498
1 links found for dataset PXD058694


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39019927
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 430.
Time elapsed: 658.2527000904083
1 links found for dataset PXD053342


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39218923
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 431.
Time elapsed: 671.4438090324402
1 links found for dataset PXD053334


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39491647
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 432.
Time elapsed: 686.3920249938965
1 links found for dataset PXD053321


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/35742945
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 433.
Time elapsed: 699.1738150119781
1 links found for dataset PXD053222


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39146936
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 434.
Time elapsed: 711.5821261405945
1 links found for dataset PXD053163


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39295558
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 435.
Time elapsed: 724.224179983139
1 links found for dataset PXD053058


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39191251
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 436.
Time elapsed: 736.677766084671
1 links found for dataset PXD051352


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1038/S41419-024-07267-4
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 437.
Time elapsed: 751.2641332149506
1 links found for dataset PXD058491


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1093/FEMSML/UQAE026
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 438.
Time elapsed: 764.3570940494537
1 links found for dataset PXD048449


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/39543366
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 439.
Time elapsed: 786.0519909858704
1 links found for dataset PXD052844


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://dx.doi.org/10.1039/D4MO00147H
data_fetcher.py - line 30 - INFO - Publisher: doi
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 440.
Time elapsed: 801.0054461956024
1 links found for dataset PXD055164


2951215865.py - line 34 - INFO - Fetching Raw content
orchestrator.py - line 30 - INFO - Previous driver quit.
orchestrator.py - line 45 - INFO - Data fetcher setup completed.
2951215865.py - line 26 - INFO - Processing URL: https://www.ncbi.nlm.nih.gov/pubmed/38956074
data_fetcher.py - line 65 - INFO - Non-API URL detected, or API unsupported. Webscraper update


Processing URL 441.
Time elapsed: 820.1276791095734
1 links found for dataset PXD052807


2951215865.py - line 34 - INFO - Fetching Raw content


id not found in raw data


orchestrator.py - line 30 - INFO - Previous driver quit.


KeyboardInterrupt: 

In [9]:
df = pd.DataFrame(data)
df.set_index("publication",inplace=True)
df

Unnamed: 0_level_0,dataset_uid,repo_name,raw_html
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://dx.doi.org/10.6019/PXD056511,PXD056511,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.6019/PXD054686,PXD054686,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.26508/LSA.202403007,PXD058295,PRIDE,"<html lang=""en"" dir=""ltr"" xmlns=""http://www.w3..."
https://dx.doi.org/10.3389/FMED.2024.1302637,PXD045874,PRIDE,"<html lang=""en"" data-n-head=""%7B%22lang%22:%7B..."
https://dx.doi.org/10.6019/PXD054390,PXD054390,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.6019/PXD054428,PXD054428,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.1016/j.bioactmat.2024.08.037,PXD050584,PRIDE,"<html lang=""en-US"" class=""toolbar-stuck""><head..."
https://dx.doi.org/10.1002/1878-0261.13654,PXD048538,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i..."
https://dx.doi.org/10.6019/PXD048514,PXD048514,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.1016/J.EBIOM.2024.105508,PXD055932,PRIDE,"<html lang=""en"" class=""pb-page wf-sourcesanspr..."


In [10]:
df_old = pd.read_csv("exp_input/raw_data.csv", index_col="publication")
print(df_old.shape)
# append rows to the existing CSV
df_new = pd.concat([df_old,df])
df_new.to_csv("exp_input/PRIDEid_HTML_data.csv")

(220, 3)


In [12]:
df_new.shape

(242, 3)

In [17]:
print(len(df_new.index))
print(len(df_new.index.unique()))

242
221


In [19]:
df_merged = (
    df_new.reset_index()
    .groupby('publication')
    .agg({
        'dataset_uid': lambda x: ','.join(sorted(set(x))),  # Concatenate unique dataset_uids
        'repo_name': lambda x: ','.join(sorted(set(x))),   # Concatenate unique repo_names
        'raw_html': 'first'  # Keep the first raw_html
    })
)

In [None]:
df_merged.to_csv("exp_input/raw_data.csv")

In [24]:
df_merged

Unnamed: 0_level_0,dataset_uid,repo_name,raw_html
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://dx.doi.org/10.1002/1878-0261.13654,PXD048538,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i..."
https://dx.doi.org/10.1002/ADHM.202404465,PXD052728,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i..."
https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i..."
https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i..."
https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,"<html lang=""en"" class=""pb-page"" data-request-i..."
...,...,...,...
https://dx.doi.org/10.6019/PXD059932,PXD059932,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.7554/ELIFE.86194.2,PXD054648,PRIDE,"<html lang=""en-US""><head><meta charset=""utf-8""..."
https://dx.doi.org/10.7554/ELIFE.86931.2,PXD037489,PRIDE,"<html lang=""en-US""><head><meta charset=""utf-8""..."
https://dx.doi.org/10.7554/ELIFE.98523.2,PXD045157,PRIDE,"<html lang=""en-US""><head><meta charset=""utf-8""..."
