In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from classifier import *
from data_fetcher import *
from parser import *
from orchestrator import *
from logger_setup import *
import os
import json
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Load variables from a local .env file into the process environment first,
# so they are available when the orchestrator is constructed.
load_dotenv()

# JSON config with the input file details for this experiment.
config_path = 'config_experiment.json'
orchestrator = Orchestrator(config_path)

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp


In [4]:
# Not used by the cells shown in this notebook; kept as a placeholder.
urls = None

# Tab-separated ProteomeXchange search export.
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')

# Only these three columns are needed by the following cells.
wanted_columns = ['publication', 'identifier', 'repository']
publication_datasets = df[wanted_columns]

In [4]:
# Frequency of each distinct value in the `publication` column. Useful for
# spotting placeholder entries (pending / missing publications) that the next
# cell filters out.
publication_datasets['publication'].value_counts()

In [5]:
# Remove rows whose `publication` is one of the known placeholder values.
# HTML anchors inside `publication` are NOT stripped here; the link-extraction
# cell further down parses them.
PLACEHOLDER_PUBLICATIONS = ["Dataset with its publication pending", "no publication"]
is_placeholder = publication_datasets['publication'].isin(PLACEHOLDER_PUBLICATIONS)

# .copy() makes filtered_df an independent frame. Later cells add a column to
# it, and writing into a derived slice triggers SettingWithCopyWarning and is
# not guaranteed to behave consistently across pandas versions.
filtered_df = publication_datasets[~is_placeholder].copy()

# Peek at the rows from position 381 onward.
filtered_df[381:]

In [6]:
# Build a `publication_link` column holding the list of URLs found in the
# href="..." attributes of each `publication` cell, then keep only the rows
# that have at least one link and renumber the index 0..n-1.
HREF_PATTERN = re.compile(r'href=[\'"]([^\'"]+)[\'"]')


def _extract_links(publication):
    """Return the list of href targets found in `publication`, or None if there are none."""
    # str() lets non-string cells (e.g. NaN) go through the regex safely.
    links = HREF_PATTERN.findall(str(publication))
    return links or None


# Rebind instead of mutating in place: this avoids writing into a slice of
# another frame and keeps the cell safe to re-run.
filtered_df = (
    filtered_df
    .assign(publication_link=filtered_df['publication'].map(_extract_links))
    .dropna(subset=['publication_link'])
    .reset_index(drop=True)
)

In [7]:
# Average number of publication links per dataset.
# Each `publication_link` cell is a list of URLs, so its length is the
# number of links for that dataset.
link_counts = filtered_df['publication_link'].map(len)
total_links = int(link_counts.sum())
n_datasets = len(filtered_df)

# Divide by the number of rows (not by the last index label) and guard
# against an empty frame.
print(total_links / n_datasets if n_datasets else 0.0)
print(filtered_df)

In [8]:
# Fetch the raw page content for one batch of datasets.
# For each dataset, its publication links are tried in order; the first page
# whose content contains the dataset identifier is recorded and the remaining
# links for that dataset are skipped.
data = []

start = 441      # position in filtered_df of the first dataset in this batch
iter_max = 50    # maximum number of datasets processed in this batch

t0 = time.time()

# filtered_df carries a 0..n-1 index, so the index label `i` is also the row
# position. Bounding the slice up front (instead of breaking inside the loop)
# means no data fetcher is set up for a row that is never processed.
for i, row in filtered_df.iloc[start:start + iter_max].iterrows():

    driver = orchestrator.setup_data_fetcher()

    # try/finally guarantees the driver is released even if something
    # unexpected is raised while handling this dataset.
    try:
        dataset_id = row['identifier']

        if i % 100 == 0 and i > 0:
            # Per-row rate is based on the rows handled since `start`,
            # not on the absolute row number.
            elapsed = time.time() - t0
            processed = i - start + 1
            print(f"Progress {i+1}/{len(filtered_df)}. ETA {(elapsed/processed)*(len(filtered_df)-i-1)}")

        if i > 0:
            print(f"Processing URL {i+1}.\nTime elapsed: {time.time()-t0}")
        print(f"{len(row['publication_link'])} links found for dataset {dataset_id}")

        for url in row['publication_link']:
            orchestrator.logger.info(f"Processing URL: {url}")

            orchestrator.current_url = url
            orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(url)

            orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(url, orchestrator.full_DOM, orchestrator.logger)

            try:
                orchestrator.logger.info("Fetching Raw content")
                raw_data = orchestrator.data_fetcher.fetch_data(url)
                if dataset_id in raw_data:
                    data.append({"publication": url, "dataset_uid": row['identifier'], "repo_name": row['repository'], "raw_html": raw_data})
                    # One matching page per dataset is enough.
                    break
                else:
                    print("id not found in raw data")
            except Exception as e:
                # Best effort: log the failure and move on to the next link.
                orchestrator.logger.error(f"Error processing URL {url}: {e}", exc_info=True)
    finally:
        driver.quit()


# Naming the columns explicitly keeps set_index from raising KeyError when no
# page matched and `data` is empty.
# NOTE: `df` is rebound here; from this point on it is the fetched-pages table,
# not the TSV loaded at the top of the notebook.
df = pd.DataFrame(
    data, columns=["publication", "dataset_uid", "repo_name", "raw_html"]
).set_index("publication")
df

In [9]:
# Rebuild the fetched-pages table from `data`, indexed by publication URL.
# The explicit column list keeps set_index from raising KeyError when `data`
# is empty (no page contained its dataset identifier).
df = pd.DataFrame(
    data, columns=["publication", "dataset_uid", "repo_name", "raw_html"]
).set_index("publication")
df

In [10]:
# Previously collected rows, indexed by the `publication` column.
previous_path = "exp_input/raw_data.csv"
df_old = pd.read_csv(previous_path, index_col="publication")
print(df_old.shape)

# Stack the newly fetched rows under the existing ones and write the combined
# table to a separate file; the CSV read above is not modified by this cell.
combined_path = "exp_input/PRIDEid_HTML_data.csv"
df_new = pd.concat([df_old, df])
df_new.to_csv(combined_path)

In [12]:
# (rows, columns) of the combined table produced by the concatenation.
df_new.shape

In [17]:
# Compare the total number of rows with the number of distinct publications;
# a difference means some publications appear more than once.
publication_index = df_new.index
total_rows = len(publication_index)
distinct_publications = len(publication_index.unique())
print(total_rows)
print(distinct_publications)

In [19]:
def _join_unique(values):
    """Return the distinct tokens in `values` as a sorted, comma-joined string.

    A value may itself be a comma-joined string from an earlier merge that was
    saved to CSV and read back, so each value is split on commas before
    de-duplicating. Missing values are skipped.
    """
    tokens = set()
    for value in values:
        if pd.isna(value):
            continue
        tokens.update(part.strip() for part in str(value).split(',') if part.strip())
    return ','.join(sorted(tokens))


# Collapse the combined table to one row per publication.
df_merged = (
    df_new.reset_index()
    .groupby('publication')
    .agg({
        'dataset_uid': _join_unique,  # all distinct dataset ids for the publication
        'repo_name': _join_unique,    # all distinct repository names
        'raw_html': 'first',          # keep the first raw_html
    })
)

In [None]:
# Persist the merged table. This overwrites the same path that an earlier cell
# reads back as `df_old`.
df_merged.to_csv("exp_input/raw_data.csv")

In [24]:
# Display the merged table (one row per publication).
df_merged