In [3]:
# import from the files in this directory
from dotenv import load_dotenv
from classifier import *
from data_fetcher import *
from parser import *
from orchestrator import *
from logger_setup import *
import os
import json
import pandas as pd
import numpy as np
import re

In [4]:
config_path = 'config_experiment.json'  # Config with input file details
load_dotenv()
orchestrator = Orchestrator(config_path)

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized.


In [5]:
orchestrator.setup_data_fetcher()

orchestrator.py - line 37 - INFO - Data fetcher setup completed.


In [6]:
urls = None
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
publication_datasets = df[['publication','identifier','repository']]

In [7]:
# get an idea of values in df publication
publication_datasets['publication'].value_counts()

publication
Dataset with its publication pending                                                                                                                                                                      11719
no publication                                                                                                                                                                                             2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>                                                                                                               56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>                                                                                                            28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>                                                              

In [8]:
#  clean the `publication` column by filtering out unwanted values like `"Dataset with its publication pending"`, `"no publication"`, and any HTML tags. 
# Remove rows with unwanted values
filtered_df = publication_datasets[~publication_datasets['publication'].isin(["Dataset with its publication pending", "no publication"])]

In [9]:
publication_links = []
for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Convert to string to avoid TypeError
    if "href" in pub:  # Process only if it contains an <a href> tag
        match = re.search(r'href=[\'"]([^\'"]+)[\'"]', pub)
        if match:
            publication_links.append(match.group(1))  # Extract the URL
        else:
            publication_links.append(None)  # No valid URL found
    else:
        publication_links.append(None)  # Non-HTML entries get None

# Assign the cleaned list as a new column
filtered_df['publication_link'] = publication_links
# Drop rows where no valid publication link was found (optional)
filtered_df.dropna(subset=['publication_link'], inplace=True)
# Reset index
filtered_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['publication_link'] = publication_links
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.dropna(subset=['publication_link'], inplace=True)


In [10]:
filtered_df

Unnamed: 0,publication,identifier,repository,publication_link
0,"<a href=""https://dx.doi.org/10.1038/S41467-025...",PXD059466,PRIDE,https://dx.doi.org/10.1038/S41467-025-56720-1
1,"<a href=""https://dx.doi.org/10.6019/PXD051312""...",PXD051312,PRIDE,https://dx.doi.org/10.6019/PXD051312
2,"<a href=""https://dx.doi.org/10.17159/SAJS.2025...",PXD054431,PRIDE,https://dx.doi.org/10.17159/SAJS.2025/18571
3,"<a href=""https://dx.doi.org/10.1101/2024.04.03...",PXD050443,PRIDE,https://dx.doi.org/10.1101/2024.04.03.587901
4,"<a href=""https://dx.doi.org/10.1101/2025.02.05...",PXD051704,PRIDE,https://dx.doi.org/10.1101/2025.02.05.636703
...,...,...,...,...
25691,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/23...",PXD000056,PRIDE,http://www.ncbi.nlm.nih.gov/pubmed/23212214
25692,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/22...",PXD000009,PRIDE,http://www.ncbi.nlm.nih.gov/pubmed/22994238
25693,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/22...",PXD000012,PRIDE,http://www.ncbi.nlm.nih.gov/pubmed/22905912
25694,"<a href=""http://www.ncbi.nlm.nih.gov/pubmed/22...",PXD000013,PRIDE,http://www.ncbi.nlm.nih.gov/pubmed/22874012


In [11]:
data = []

for i,row in filtered_df.iterrows():
    
    if i == 3:
        break
    
    url = row['publication_link']
    orchestrator.logger.info(f"Processing URL: {url}")
    
    orchestrator.current_url = url
    orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(url)
    
    orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(url, orchestrator.full_DOM, orchestrator.logger)
    
    try:
        orchestrator.logger.info("Fetching Raw content")
        raw_data = orchestrator.data_fetcher.fetch_data(url)
        data.append({"publication": url,"dataset_uid": row['identifier'], "repo_name": row['repository'], "raw_html": raw_data})
    
    except Exception as e:
        orchestrator.logger.error(f"Error processing URL {url}: {e}", exc_info=True)
    
df = pd.DataFrame(data)
df.set_index("publication",inplace=True)
df

2617608109.py - line 9 - INFO - Processing URL: https://dx.doi.org/10.1038/S41467-025-56720-1
data_fetcher.py - line 29 - INFO - Publisher: doi
data_fetcher.py - line 64 - INFO - Non-API URL detected, or API unsupported. Webscraper update
2617608109.py - line 17 - INFO - Fetching Raw content
2617608109.py - line 9 - INFO - Processing URL: https://dx.doi.org/10.6019/PXD051312
data_fetcher.py - line 29 - INFO - Publisher: doi
data_fetcher.py - line 64 - INFO - Non-API URL detected, or API unsupported. Webscraper update
2617608109.py - line 17 - INFO - Fetching Raw content
2617608109.py - line 9 - INFO - Processing URL: https://dx.doi.org/10.17159/SAJS.2025/18571
data_fetcher.py - line 29 - INFO - Publisher: doi
data_fetcher.py - line 64 - INFO - Non-API URL detected, or API unsupported. Webscraper update
2617608109.py - line 17 - INFO - Fetching Raw content


Unnamed: 0_level_0,dataset_uid,repo_name,raw_html
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://dx.doi.org/10.1038/S41467-025-56720-1,PXD059466,PRIDE,"<html lang=""en"" class=""js""><head><link rel=""pr..."
https://dx.doi.org/10.6019/PXD051312,PXD051312,PRIDE,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."
https://dx.doi.org/10.17159/SAJS.2025/18571,PXD054431,PRIDE,"<html lang=""en"" xml:lang=""en""><head>\n\t<meta ..."


In [12]:
df.to_csv("exp_input/PRIDEid_HTML_data.csv")