In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from orchestrator import *
import os
import pandas as pd
import re
import time
import json

In [2]:
#os.remove("exp_input/PX_id_HTML.parquet")

In [3]:
# load_dotenv() runs first so the environment is populated before the orchestrator is built.
load_dotenv()
config_path = 'config_experiment.json'
orchestrator = Orchestrator(config_path)

# Ground-truth CSV, the parquet snapshot that is read (v3) and the one that is written (v4).
ground_truth_src = "exp_input/publication_data_citations_PXD.csv"
input_file = "exp_input/PX_id_HTML_v3.parquet"
output_file = "exp_input/PX_id_HTML_v4.parquet"

# Report which of the three files already exists (same order and messages as one call each).
for _label, _path in (
    ("Ground Truth", ground_truth_src),
    ("Input", input_file),
    ("Output", output_file),
):
    orchestrator.logger.info(f"{_label} file exists: {os.path.exists(_path)}")


orchestrator.py - line 21 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp
3745124042.py - line 9 - INFO - Ground Truth file exists: True
3745124042.py - line 10 - INFO - Input file exists: True
3745124042.py - line 11 - INFO - Output file exists: True


In [4]:
# Read the ground-truth table and keep its publication links as an array.
df_ground_truth_src = pd.read_csv(ground_truth_src)
all_urls = df_ground_truth_src.loc[:, 'publication_link'].values
df_ground_truth_src.head()

Unnamed: 0,publication_link,identifier,repository,title,keywords,identifier_count
0,http://dx.doi.org/10.1002/cam4.3825,pxd023689,PRIDE,Proteomics reveals the function reverse of MPS...,"prostate CAFs, prostate cancer cells, secretomics",1
1,http://dx.doi.org/10.1002/cbic.202000870,"pxd023060,pxd023059,pxd023056,pxd023057,pxd023058","PRIDE,PRIDE,PRIDE,PRIDE,PRIDE",Rapid and high coverage profile of human cyste...,"Cysteine, FAIMS, SP3,Cysteine, FAIMS, SP3,Cyst...",5
2,http://dx.doi.org/10.1002/pmic.202000214,pxd020638,PRIDE,Combining label-free and label-based accurate ...,"Beef meat quality, Data-Independent Acquisitio...",1
3,http://dx.doi.org/10.1002/pmic.202000240,pxd023907,PRIDE,Fusarium oxysporum f. sp. vasinfectum (UP00003...,"cotton, fungi, fusarium oxysporum",1
4,http://dx.doi.org/10.1002/pmic.202100036,pxd022191,PRIDE,Spectral library for SARS-COVID proteome and i...,"HLA, ProteomeXchange project tag: Covid-19, SA...",1


In [5]:
# Load the rows collected so far from input_file; fall back to an empty frame if it is missing.
try:
    df_old = pd.read_parquet(input_file)
except FileNotFoundError:
    df_old = pd.DataFrame(columns=['publication','dataset_uid','repo_name','raw_html','smallest_elements'])
    orchestrator.logger.info("File not found")
else:
    # Index by publication while keeping 'publication' available as a column.
    if "publication" in df_old.columns:
        df_old = df_old.set_index("publication", drop=False)
    orchestrator.logger.info(f"File found: {df_old.columns}")

3871908856.py - line 8 - INFO - File found: Index(['fetch_from', 'dataset_uid', 'repo_name', 'doi', 'raw_html',
       'publisher', 'smallest_elements', 'title', 'keywords', 'publication'],
      dtype='object')


In [6]:
# Tally the distinct dataset ids (comma-separated per row) and the publications already stored.
flat_list = []
for uid_group in df_old['dataset_uid'].dropna().str.split(','):
    flat_list.extend(uid_group)
n_datasets = len(set(flat_list))
print(f"# of Datasets already added: {n_datasets}")
print(f"# of Publications already added: {len(df_old)}")

# of Datasets already added: 1505
# of Publications already added: 2154


In [7]:
def extract_all_elements_with_UID(source_html, uid):
    """Collect every <table> or <p> element whose text mentions a dataset identifier.

    Args:
        source_html: HTML markup, parsed with BeautifulSoup's "html.parser".
        uid: Dataset identifier to look for. It is matched literally (not as a
            regular expression) and case-insensitively; surrounding whitespace
            is ignored.

    Returns:
        A list of ``(element_html, text_length)`` tuples, one per matching
        element in the order ``find_all`` yields them, or ``[None]`` when
        nothing matches or the identifier is blank.
    """
    orchestrator.logger.info(f"Extracting elements with UID: {uid}")

    needle = uid.strip()
    if not needle:
        # An empty pattern matches every element, which is never what the caller wants.
        return [None]

    # Escape the identifier: unescaped, characters such as '.' or '(' would be
    # interpreted as regex syntax and could over-match or raise re.error.
    uid_pattern = re.compile(re.escape(needle), re.IGNORECASE)

    soup = BeautifulSoup(source_html, "html.parser")

    matching_elements = []
    for element in soup.find_all(["table", "p"]):  # both tables and paragraphs are candidates
        text = element.get_text(strip=True)
        if uid_pattern.search(text):
            # Keep the text length so a caller can pick the smallest match if it wants to.
            matching_elements.append((str(element), len(text)))

    if matching_elements:
        return matching_elements

    return [None]  # No match found

In [8]:
def add_example_to_merged_df(row, raw_html):
    """Extract the HTML elements that mention the dataset id(s) of a record.

    Args:
        row: Mapping-like record holding the id(s) under 'identifier' or,
            failing that, 'dataset_uid'. Several ids may be given as one
            comma-separated string.
        raw_html: HTML of the publication.

    Returns:
        For a single id, whatever extract_all_elements_with_UID returns for it.
        For a comma-separated value, a list holding one such result per id,
        with identical results kept only once.

    Raises:
        KeyError: if the record has neither 'identifier' nor 'dataset_uid'.
    """
    if 'identifier' in row:
        uid = row['identifier']
    elif 'dataset_uid' in row:
        uid = row['dataset_uid']
    else:
        # Without this branch `uid` stayed unbound and the next line failed
        # with an UnboundLocalError that hid the real cause.
        raise KeyError("row has neither 'identifier' nor 'dataset_uid'")

    if ',' not in uid:
        return extract_all_elements_with_UID(raw_html, uid)

    elements = []
    for single_uid in uid.split(','):
        single_uid = single_uid.strip()
        if not single_uid:
            continue  # ignore empty pieces such as a trailing comma
        found = extract_all_elements_with_UID(raw_html, single_uid)
        if found not in elements:  # no dupes
            elements.append(found)
    return elements


In [9]:
# Fetch the ground-truth publications that are not stored yet and record, for each one,
# the raw HTML together with the elements that mention its dataset id(s).
data = []

start_row = 2154  # first ground-truth row handled by this run
iter_max = 2155   # stop as soon as the ground-truth index reaches this value

t0 = time.time()
n_processed = 0   # URLs actually attempted in this run
n_total = len(df_ground_truth_src)

# Membership must be tested against the stored VALUES (`x in series` looks at the index),
# and case-insensitively because the URLs are lower-cased before they are stored.
known_publications = set(df_old['publication'].dropna().astype(str).str.lower())

for i, row in df_ground_truth_src[start_row:].iterrows():

    if i >= iter_max:
        break

    dataset_uid = row['identifier']
    url = row['publication_link'].lower()

    if url in known_publications:
        orchestrator.logger.info(f"Skipping publication {url} as it is already in the data")
        continue

    if i%100 == 0 and i>0:
        # Average over the URLs handled in this run, not over the absolute row index.
        seconds_per_url = (time.time()-t0) / max(n_processed, 1)
        orchestrator.logger.info(f"Progress {i+1}/{n_total}. ETA {seconds_per_url*(n_total-i-1)}")

    driver = orchestrator.setup_data_fetcher()
    orchestrator.logger.info(f"Processing URL: {url}")

    adjusted_url_for_fetch = url

    try:
        orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(url, orchestrator.full_DOM, orchestrator.logger)

        raw_data = orchestrator.data_fetcher.fetch_data(url)
        doi = orchestrator.data_fetcher.convert_url_to_doi(url)

        orchestrator.logger.info(f"Data fetching from actual URL: {orchestrator.data_fetcher.scraper_tool.current_url}")
        orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(orchestrator.data_fetcher.scraper_tool.current_url)

        if orchestrator.publisher == "biorxiv":
            # Re-fetch from the URL with ".full" appended.
            adjusted_url_for_fetch = orchestrator.data_fetcher.scraper_tool.current_url + ".full"
            raw_data = orchestrator.data_fetcher.fetch_data(adjusted_url_for_fetch)

        elif orchestrator.publisher == "pubmed":
            # If the PubMed page yields a PMC id, re-fetch from the reconstructed PMC link.
            PMC_ID = orchestrator.data_fetcher.get_PMCID_from_pubmed_html(raw_data)
            if PMC_ID is not None:
                adjusted_url_for_fetch = orchestrator.data_fetcher.reconstruct_PMC_link(PMC_ID)
                raw_data = orchestrator.data_fetcher.fetch_data(adjusted_url_for_fetch)

        # The identifier may be a comma-separated list: look for each id on its own
        # (literally, not as a regex) instead of for the whole joined string.
        single_uids = [u.strip() for u in str(dataset_uid).split(',') if u.strip()]
        id_found = any(re.search(re.escape(u), raw_data, re.IGNORECASE) for u in single_uids)

        if id_found:
            example = add_example_to_merged_df(row, raw_data)
            orchestrator.logger.info(f"Example: {example}")
        else:
            orchestrator.logger.info("id not found in raw data")
            example = "n/a"

        data.append({"publication": url, "fetch_from" : adjusted_url_for_fetch.lower(), "doi": doi, "publisher":orchestrator.publisher,
                     "dataset_uid": dataset_uid, "repo_name": row['repository'], "raw_html": raw_data, "smallest_elements": example, "title" : row['title'],
                     "keywords" : row['keywords']})
        if id_found:
            orchestrator.logger.info(f"Data appended for {url}")

    except Exception as e:
        # Best effort per URL: log the failure and move on to the next publication.
        orchestrator.logger.error(f"Error processing URL {url}: {e}", exc_info=True)

    finally:
        # Release the driver on every path, including failures.
        if orchestrator.config['search_method'] != 'cloudscraper':
            driver.quit()
        n_processed += 1

    # get time in a nice format like hh:mm:ss
    time_elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time()-t0))
    orchestrator.logger.info(f"Finished processing URL {i+1}.\nTime elapsed: {time_elapsed}")

print(f"Time elapsed for {n_processed} iterations: {time.strftime('%H:%M:%S', time.gmtime(time.time() - t0))}")

orchestrator.py - line 50 - INFO - Data fetcher setup completed.
2694105563.py - line 24 - INFO - Processing URL: http://www.ncbi.nlm.nih.gov/pubmed/31521194
data_fetcher.py - line 71 - INFO - Non-API URL detected, or API unsupported. Webscraper update
2694105563.py - line 35 - INFO - Data fetching from actual URL: https://pubmed.ncbi.nlm.nih.gov/31521194/
data_fetcher.py - line 31 - INFO - Publisher: pubmed
data_fetcher.py - line 208 - INFO - PMCID: PMC6745076
385839148.py - line 2 - INFO - Extracting elements with UID: pxd013451
2694105563.py - line 50 - INFO - Example: [('<p>The data that support the findings of this paper are available in the published article and its additional files as well as in publically available data repositories. The explorative mass spectrometry proteomics data have been deposited to the ProteomeXchange Consortium via the PRIDE [<a aria-describedby="CR126" aria-expanded="false" class="usa-link" href="#CR126">126</a>] partner repository with the dataset ide

Time elapsed for 1 iterations: 00:01:25


In [10]:
# Turn the collected records into a frame indexed by publication (column kept as well).
df = pd.DataFrame(data)
if 'publication' in df.columns:
    df = df.set_index("publication", drop=False)


def _elements_to_text(value):
    """Serialize a list as JSON, keep a string unchanged, map anything else to '[]'."""
    if isinstance(value, list):
        return json.dumps(value)
    if isinstance(value, str):
        return value
    return json.dumps([])


if "smallest_elements" in df.columns:
    df["smallest_elements"] = df["smallest_elements"].apply(_elements_to_text)

# Inspect one record by its (hard-coded) publication URL.
df.loc['http://www.ncbi.nlm.nih.gov/pubmed/31521194']

publication                http://www.ncbi.nlm.nih.gov/pubmed/31521194
fetch_from           https://www.ncbi.nlm.nih.gov/pmc/articles/pmc6...
doi                                                               None
publisher                                                       pubmed
dataset_uid                                                  pxd013451
repo_name                                               PanoramaPublic
raw_html             <html lang="en" class=""><head>\n\n        <me...
smallest_elements    [["<p>The data that support the findings of th...
title                Endo-Lysosomal Proteins and Ubiquitin CSF Conc...
keywords             Alzheimerâ€™s disease, Biomarker, CSF, Parkins...
Name: http://www.ncbi.nlm.nih.gov/pubmed/31521194, dtype: object

In [11]:
# Append the freshly fetched rows (df) to the stored rows and write the result to output_file.
if os.path.exists(input_file):
    df_old = pd.read_parquet(input_file)
    if "publication" in df_old.columns:
        orchestrator.logger.debug(f"Old data: {df_old.head()}")
        df_old = df_old.set_index("publication", drop=False)
    orchestrator.logger.info(df_old.shape)

    # Fill missing keywords BEFORE casting: astype(str) turns NaN/None into the literal
    # strings 'nan'/'None', after which fillna('') has nothing left to replace.
    # The column check keeps an empty `df` (nothing fetched) from raising KeyError.
    for _frame in (df, df_old):
        if 'keywords' in _frame.columns:
            _frame['keywords'] = _frame['keywords'].fillna('').astype(str)

    # append rows to the existing data
    df_new = pd.concat([df_old, df], ignore_index=True)
    df_new.to_parquet(output_file)
    orchestrator.logger.debug(f"Data appended to {output_file}")
else:
    df.to_parquet(output_file)
    orchestrator.logger.info(f"Data written to new file {output_file}")
    df_new = df

1306007366.py - line 7 - INFO - (2154, 10)


In [12]:
def _join_unique_uids(values):
    """Join the distinct ids of a group; each value may itself be a comma-separated list."""
    tokens = {
        token.strip()
        for value in values
        if pd.notna(value)
        for token in str(value).split(',')
        if token.strip()
    }
    return ','.join(sorted(tokens))


# Collapse all rows that share a 'fetch_from' value into a single record.
df_merged = (
    # drop=True: the old index is not aggregated below, and when df_new is indexed by
    # 'publication' (which is also a column) a plain reset_index() raises ValueError.
    df_new.reset_index(drop=True)
    .groupby('fetch_from')
    .agg({
        'publication': 'first',  # Keep the first publication,
        'fetch_from': 'first',  # Keep the first fetch_from,
        'dataset_uid': _join_unique_uids,  # Unique ids, de-duplicated per id rather than per joined string
        'repo_name': lambda x: ','.join(sorted(set(x))),   # Concatenate unique repo_names
        'doi': 'first',  # Keep the first doi,
        'raw_html': 'first',  # Keep the first raw_html,
        'publisher': 'first',
        'smallest_elements': lambda x: ' | '.join(filter(None, x)),  # Remove None values and join strings
        'title': lambda x: ','.join(sorted(set(str(val) for val in x if pd.notna(val)))),  # Convert all to str
        'keywords': lambda x: ','.join(sorted(set(str(val) for val in x if pd.notna(val))))  # Convert all to str
    })
)

In [13]:
# Write the merged frame to output_file.
df_merged.to_parquet(output_file)

In [14]:
# Log the number of ground-truth rows next to the number of merged records, then preview the ground truth.
orchestrator.logger.info(len(df_ground_truth_src))
orchestrator.logger.info(f"Length of final df {len(df_merged)}")
df_ground_truth_src.head()

2497774158.py - line 1 - INFO - 34879
2497774158.py - line 2 - INFO - Length of final df 2054


Unnamed: 0,publication_link,identifier,repository,title,keywords,identifier_count
0,http://dx.doi.org/10.1002/cam4.3825,pxd023689,PRIDE,Proteomics reveals the function reverse of MPS...,"prostate CAFs, prostate cancer cells, secretomics",1
1,http://dx.doi.org/10.1002/cbic.202000870,"pxd023060,pxd023059,pxd023056,pxd023057,pxd023058","PRIDE,PRIDE,PRIDE,PRIDE,PRIDE",Rapid and high coverage profile of human cyste...,"Cysteine, FAIMS, SP3,Cysteine, FAIMS, SP3,Cyst...",5
2,http://dx.doi.org/10.1002/pmic.202000214,pxd020638,PRIDE,Combining label-free and label-based accurate ...,"Beef meat quality, Data-Independent Acquisitio...",1
3,http://dx.doi.org/10.1002/pmic.202000240,pxd023907,PRIDE,Fusarium oxysporum f. sp. vasinfectum (UP00003...,"cotton, fungi, fusarium oxysporum",1
4,http://dx.doi.org/10.1002/pmic.202100036,pxd022191,PRIDE,Spectral library for SARS-COVID proteome and i...,"HLA, ProteomeXchange project tag: Covid-19, SA...",1


In [15]:
# Publisher of every merged record whose smallest_elements value is exactly 'n/a'.
df_merged.loc[df_merged['smallest_elements'] == 'n/a', ['publisher']]

Unnamed: 0_level_0,publisher
fetch_from,Unnamed: 1_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,jamanetwork
https://dx.doi.org/10.1002/ADVS.202407572,onlinelibrary
https://dx.doi.org/10.1002/cpt.3409,onlinelibrary
https://dx.doi.org/10.1002/ejp.1909,wiley
https://dx.doi.org/10.1007/S00586-021-06826-Z,springer
...,...
https://www.ncbi.nlm.nih.gov/pubmed/39841590,pubmed
https://www.ncbi.nlm.nih.gov/pubmed/39855199,pubmed
https://www.ncbi.nlm.nih.gov/pubmed/39870302,pubmed
https://www.ncbi.nlm.nih.gov/pubmed/39884247,pubmed


In [16]:
# df_merged.to_parquet(output_file)
# df_merged.head()

def safe_convert(x):
    """Return a string form of ``x``.

    ``None`` becomes an empty string, a list becomes its non-None items
    stringified and joined with '|', and any other value is passed to ``str``.
    """
    if x is None:
        return ''
    if not isinstance(x, list):
        return str(x)
    kept = [str(entry) for entry in x if entry is not None]
    return '|'.join(kept)

# Normalize every smallest_elements value to a plain string via safe_convert.
df_merged['smallest_elements'] = df_merged['smallest_elements'].apply(safe_convert)

# Save to Parquet
df_merged.to_parquet(output_file)

In [17]:
# Reload the merged frame from output_file.
df_merged = pd.read_parquet(output_file)

In [18]:
# Lower-case the fetch URL and dataset id columns (the index itself is left untouched).
for _column in ('fetch_from', 'dataset_uid'):
    df_merged[_column] = df_merged[_column].str.lower()
df_merged.head()

Unnamed: 0_level_0,publication,fetch_from,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements,title,keywords
fetch_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,https://dx.doi.org/10.1001/jamaneurol.2024.4763,pxd056570,PRIDE,10.1001/jamaneurol.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,,Characterization of A Novel Mengingoencephalom...,Mengingoencephalomyelitis Autoantibodies
https://dx.doi.org/10.1002/1873-3468.15092,https://dx.doi.org/10.1002/1873-3468.15092,https://dx.doi.org/10.1002/1873-3468.15092,pxd057199,PRIDE,10.1002/1873-3468.15092,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The COPASI files (.cps) for the model an...",Homeoviscous adaptation to exogenous fatty aci...,"Escherichia coli, acyl carrier protein, exogen..."
https://dx.doi.org/10.1002/1878-0261.13654,https://dx.doi.org/10.1002/1878-0261.13654,https://dx.doi.org/10.1002/1878-0261.13654,pxd048538,PRIDE,10.1002/1878-0261.13654,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The MS proteomics data are available at ...",LC-MSMS based (phospho)proteomics on gastric c...,"gastric cancer, phosphoproteomics, tyrosine ph..."
https://dx.doi.org/10.1002/1878-0261.13733,https://dx.doi.org/10.1002/1878-0261.13733,https://dx.doi.org/10.1002/1878-0261.13733,pxd054727,PRIDE,10.1002/1878-0261.13733,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>Proteomics data generated during this st...",E-selectin Affinity Glycoproteomics Reveals Ne...,"E-selectin, cancer glycoproteome, colorectal c..."
https://dx.doi.org/10.1002/ADHM.202404465,https://dx.doi.org/10.1002/ADHM.202404465,https://dx.doi.org/10.1002/adhm.202404465,pxd052728,PRIDE,10.1002/adhm.202404465,"<html lang=""en"" class=""pb-page"" data-request-i...",onlinelibrary,"[[""<p>The mass spectrometry proteomics data ha...",in vivo-like scaffold-free 3D in vitro Models ...,"Cell sheet engineering, Drug treatment, Dystro..."


In [19]:
# Quick check: the dataset ids stored for each merged record must equal the ids the
# ground-truth table lists for the same publication. Mismatching records are corrected
# in place; records without any ground-truth match are reported and left untouched.
ok, not_ok = 0,0
iteration = 0

# Log progress about 20 times; max() avoids a modulo by zero when there are fewer than 20 rows.
progress_step = max(1, len(df_merged)//20)

# Lower-cased links, computed once, so the lookup does not depend on URL casing.
ground_truth_links = df_ground_truth_src['publication_link'].astype(str).str.lower()

for label,row in df_merged.iterrows():
    iteration+=1
    # `label` is the row's real index label and the only key used for writing back;
    # `pub` is merely the key used to look the publication up in the ground truth.
    pub = str(label).lower()
    ids = {u.strip().lower() for u in str(row['dataset_uid']).split(',') if u.strip()}
    orchestrator.logger.debug(f"Publication URL: {pub}, uids: {ids}")

    if iteration%progress_step==0:
        orchestrator.logger.debug(f"Progress {iteration}/{len(df_merged)}")

    # For these publishers the lookup uses the 'publication' column instead of the index label.
    if row['publisher'] in ('pubmed', 'biorxiv'):
        pub = row['publication']

    is_match = ground_truth_links == str(pub).lower()
    matching_ids = df_ground_truth_src.loc[is_match, 'identifier'].dropna().values
    orchestrator.logger.debug(f"Matching row: {matching_ids}")

    if len(matching_ids) == 0:
        # Nothing to compare against: overwriting here would blank out dataset_uid.
        not_ok+=1
        orchestrator.logger.warning(f"No ground truth found for {pub}; dataset_uid left unchanged")
        continue

    ground_truth = {
        u.strip().lower()
        for joined in matching_ids
        for u in str(joined).split(',')
        if u.strip()
    }

    # set comparison
    if ground_truth == ids:
        ok+=1

    else:
        not_ok+=1
        orchestrator.logger.info(f"Publication URL: {pub}")
        orchestrator.logger.info(f"Value found in source data: {ground_truth}")
        orchestrator.logger.info(f"Value found in merged data: {ids}")

        update_value=','.join(sorted(ground_truth))  # sorted for a deterministic result
        orchestrator.logger.info(f"matching_ids: {update_value}")
        # Write under the existing index label; any other key would append a new row.
        df_merged.at[label,'dataset_uid'] = update_value


orchestrator.logger.info(f"Check completed. {ok} publications good. {not_ok} errors found.")

2304522645.py - line 31 - INFO - Publication URL: https://dx.doi.org/10.1016/j.cub.2023.07.068
2304522645.py - line 32 - INFO - Value found in source data: {'pxd038903', 'pxd038907', 'pxd042084'}
2304522645.py - line 33 - INFO - Value found in merged data: {'pxd038903', 'pxd042084'}
2304522645.py - line 37 - INFO - matching_ids: pxd038903,pxd038907,pxd042084
2304522645.py - line 31 - INFO - Publication URL: https://dx.doi.org/10.1016/j.molcel.2024.08.023
2304522645.py - line 32 - INFO - Value found in source data: {'pxd054427', 'pxd055155', 'pxd054304', 'pxd054298', 'pxd054300'}
2304522645.py - line 33 - INFO - Value found in merged data: {'pxd054427', 'pxd054304', 'pxd055155'}
2304522645.py - line 37 - INFO - matching_ids: pxd054427,pxd055155,pxd054304,pxd054298,pxd054300
2304522645.py - line 31 - INFO - Publication URL: https://dx.doi.org/10.1016/j.mcpro.2023.100584
2304522645.py - line 32 - INFO - Value found in source data: {'pxd036648', 'pxd036593', 'pxd040732'}
2304522645.py - li

In [20]:
# Write the current state of df_merged to output_file.
df_merged.to_parquet(output_file)