In [None]:
# import from the files in this directory
from dotenv import load_dotenv
from data_gatherer import *
import os
import pandas as pd
import re
import time
import json
import requests

In [None]:
# os.remove("exp_input/fetched_data.parquet")

In [None]:
# Input/output locations used by the rest of the notebook.
config_path = 'config_experiment.json'
ground_truth_src = "exp_input/dataset_citation_records_Table.parquet"
fetched_data_path = "exp_input/fetched_data.parquet"
dataset_table = "exp_input/Table_datasets.parquet"

# Load variables from a .env file before the orchestrator is constructed.
load_dotenv()
orchestrator = DataGatherer(config_path)

# Report which of the input files are present on disk.
ground_truth_found = os.path.exists(ground_truth_src)
fetched_data_found = os.path.exists(fetched_data_path)
orchestrator.logger.info(f"Ground Truth file exists: {ground_truth_found}")
orchestrator.logger.info(f"ground_truth_X_y file exists: {fetched_data_found}")


In [None]:
# Read the ground-truth citation records and log how many rows were loaded.
df_ground_truth_src = pd.read_parquet(ground_truth_src)
n_ground_truth_rows = len(df_ground_truth_src)
orchestrator.logger.info(f"len ground_truth: {n_ground_truth_rows}")

# Preview ten random rows (unseeded, so the preview differs between runs).
df_ground_truth_src.sample(10)

In [None]:
# Load the publications fetched by previous runs from fetched_data_path.
# When that file does not exist yet, start from an empty frame instead.
try:
    df_old = pd.read_parquet(fetched_data_path)

    # Index by publication URL while keeping the URL available as a column.
    if "publication" in df_old.columns:
        df_old.set_index("publication", inplace=True, drop=False)

    orchestrator.logger.info(f"File found: {df_old.columns}")

except FileNotFoundError:
    # Use the same columns the fetch loop writes, so that the cells reading
    # df_old['dataset_uid'] and df_old['publication'] also work on a first run.
    df_old = pd.DataFrame(
        columns=['publication', 'src_website', 'dataset_uid', 'repo_name', 'raw_html']
    )

    orchestrator.logger.info("File not found")

In [None]:
# Count the distinct dataset identifiers and the publications already cached in df_old.
# dataset_uid holds comma-separated identifiers, so each cell is split before counting.
if 'dataset_uid' in df_old.columns:
    uid_lists = df_old['dataset_uid'].dropna().astype(str).str.split(',')
    flat_list = [item for sublist in uid_lists for item in sublist]
else:
    # df_old without a dataset_uid column (e.g. nothing cached yet): nothing to count.
    flat_list = []

n_datasets = len(set(flat_list))
print(f"# of Datasets already added: {n_datasets}")
print(f"# of Publications already added: {len(df_old)}")

In [None]:
def _join_unique(values):
    """Return the distinct values of a group as a sorted, comma-separated string.

    Sorting makes the result independent of set iteration order, so the same
    group always yields the same string (this matters because
    'citation_record_source' is later used as a grouping key).
    """
    return ','.join(sorted(set(values)))


# Keep only the records whose citation_record_from_doi flag is 0, then drop the flag column.
df_ground_truth_src = df_ground_truth_src[df_ground_truth_src['citation_record_from_doi']==0].drop(['citation_record_from_doi'], axis=1)

# Collapse to one row per citing publication.
df_ground_truth_src = df_ground_truth_src.groupby('citing_publication_link').agg({
        'citing_publication_link': 'first',       # The publication link itself
        'identifier': _join_unique,               # Unique dataset identifiers, comma-separated
        'repository': _join_unique,               # Unique repository names, comma-separated
        'citation_record_source': _join_unique,   # Unique record sources, comma-separated
        # 'title': _join_unique,
    })

print(len(df_ground_truth_src))
print(df_ground_truth_src.columns)

In [None]:
# Show the ten most frequent citation_record_source values.
source_counts = df_ground_truth_src['citation_record_source'].value_counts()
source_counts.head(10)

In [None]:
sample_size = 25  # Maximum number of publications drawn per citation_record_source
sample_seed = 142  # Fixed seed so the same rows are drawn on every run

# Draw up to sample_size rows from every citation_record_source group.
# Iterating over the groups (instead of GroupBy.apply) keeps the grouping column in
# the result regardless of how apply treats grouping columns, which the fetch loop
# relies on when it unpacks four fields per row.
per_source_samples = [
    group.sample(n=min(len(group), sample_size), random_state=sample_seed)  # Never exceed available rows
    for _, group in df_ground_truth_src.groupby('citation_record_source')
]

if per_source_samples:
    stratified_sample = pd.concat(per_source_samples).reset_index(drop=True)
else:
    # No groups at all: keep an empty frame with the original columns.
    stratified_sample = df_ground_truth_src.iloc[0:0].reset_index(drop=True)

In [None]:
# Fetch the raw HTML of every sampled publication that is not cached yet.
# `data` collects one record per successfully fetched page; `i` counts fetch attempts.
data, i = [], 0
t0 = time.time()
iter_max = 15000  # Maximum number of fetch attempts in this run

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Publications cached by a previous run. Building the set once avoids scanning the
# column on every iteration, and the column guard keeps the cell working when df_old
# has no 'publication' column.
already_fetched = set(df_old['publication']) if 'publication' in df_old.columns else set()
n_candidates = len(stratified_sample)

for position, (link, dataset_uid, repo, record_src) in enumerate(
    stratified_sample.itertuples(index=False), start=1
):
    # Cached publications are skipped and do not count towards iter_max.
    if link in already_fetched:
        orchestrator.logger.info(f"Skipping publication {link} (already in data)")
        continue

    if i >= iter_max:
        break
    i += 1

    orchestrator.logger.info(f"Processing URL: {link}")

    try:
        response = requests.get(link, headers=headers, timeout=10)
        response.raise_for_status()  # Raise error for HTTP failures (4xx, 5xx)

        raw_data = response.text  # Extract HTML content

        data.append({
            "publication": link,
            "src_website": 'ncbi',
            "dataset_uid": dataset_uid,
            "repo_name": repo,
            "raw_html": raw_data
        })

    except requests.exceptions.RequestException as e:
        # Best effort: log the failure and move on to the next publication.
        orchestrator.logger.error(f"Error fetching URL {link}: {e}", exc_info=True)

    # Log progress every 100 fetch attempts
    if i % 100 == 0:
        elapsed = time.time() - t0
        # Upper bound: assumes every remaining row needs a fetch (some may be skipped).
        eta = (elapsed / i) * (n_candidates - position)
        orchestrator.logger.info(f"\nProgress {position}/{n_candidates}. ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}\n")

    time.sleep(0.5)  # Delay between requests to avoid overwhelming the server

print(f"Time elapsed for {len(data)} iterations: {time.strftime('%H:%M:%S', time.gmtime(time.time() - t0))}")

In [None]:
def _smallest_elements_to_json(value):
    """Serialise a 'smallest_elements' cell: lists become JSON, strings pass through, anything else becomes '[]'."""
    if isinstance(value, list):
        return json.dumps(value)
    if isinstance(value, str):
        return value
    return json.dumps([])


# Turn the freshly fetched records into a frame indexed by publication URL
# (the URL is also kept as a regular column).
df = pd.DataFrame(data)
if 'publication' in df.columns:
    df = df.set_index("publication", drop=False)

# Only present when the records carry a 'smallest_elements' field.
if "smallest_elements" in df.columns:
    df["smallest_elements"] = df["smallest_elements"].apply(_smallest_elements_to_json)


In [None]:
# Stack the cached rows and the newly fetched rows into one frame with a fresh
# 0..n-1 index, printing the size of each input and of the result.
n_old_rows = len(df_old)
n_new_rows = len(df)
print(n_old_rows)
print(n_new_rows)

df_merged = pd.concat([df_old, df], ignore_index=True)

n_merged_rows = len(df_merged)
print(n_merged_rows)

In [None]:
# For every merged publication, record which of its dataset identifiers occur verbatim
# in the fetched HTML, and build a keep-mask that is True only when all of them do.
#   cnt     -> number of identifiers found in their publication's HTML
#   not_cnt -> number of identifiers not found
#   mask    -> one boolean per df_merged row, in row order
cnt, not_cnt, mask = 0, 0, []
identifiers_found = []

for _, row in df_merged.iterrows():
    ids = row['dataset_uid'].split(',')
    found = [uid for uid in ids if uid in row['raw_html']]
    n_missing = len(ids) - len(found)

    cnt += len(found)
    not_cnt += n_missing

    identifiers_found.append(','.join(found))
    # A single missing identifier marks the row for removal.
    mask.append(n_missing == 0)

# Write the column on df_merged (the frame being checked) and overwrite rather than
# append, so re-running the cell does not duplicate identifiers for cached rows.
df_merged['identifiers_in_HTML'] = identifiers_found

print(cnt, not_cnt)
print(len(df_merged))
print(len(mask))
print(sum(mask))

In [None]:
# Cast every column of df_merged to str in one call.
# NOTE(review): astype(str) presumably turns missing values into literal strings
# such as 'nan' — confirm this is acceptable for the stored data.
df_merged = df_merged.astype(str)

In [None]:
# Show the dtype of each df_merged column.
column_dtypes = df_merged.dtypes
print(column_dtypes)

In [None]:
# The two lengths must agree before the mask is used to filter df_merged.
n_merged_rows = len(df_merged)
n_mask_entries = len(mask)
print(f"df_merged length: {n_merged_rows}")
print(f"mask length: {n_mask_entries}")

In [None]:
# Keep only the rows flagged True in `mask` (one boolean per df_merged row).
if len(mask) != len(df_merged):
    # Typically means this cell was re-run after df_merged had already been filtered.
    raise ValueError(
        f"mask has {len(mask)} entries but df_merged has {len(df_merged)} rows; "
        "re-run the identifier check before filtering."
    )

# An index-aligned boolean Series always selects rows. A plain empty list would be
# interpreted as a column selection and return a frame without any columns.
keep_rows = pd.Series(mask, index=df_merged.index, dtype=bool)
df_merged = df_merged.loc[keep_rows].copy()
print(len(df_merged))

In [None]:
# Persist the filtered frame to fetched_data_path, the same file the notebook reads
# into df_old at the start.
df_merged.to_parquet(path=fetched_data_path)

In [None]:
# # let's run a quick check
# ok, not_ok = 0,0 
# iteration = 0
# dataset_table = pd.read_parquet(dataset_table)
# 
# for i,row in df_merged.iterrows():
#     iteration+=1
#     pub = row['publication'].lower()
#     ids = set(sorted(row['dataset_uid'].split(',')))
#     orchestrator.logger.debug(f"Publication URL: {pub}, uids: {ids}")
#     
#     if iteration%(len(df_merged)//20)==0:
#         orchestrator.logger.debug(f"Progress {iteration}/{len(df_merged)}")
#     
#     matching_ids = dataset_table[dataset_table['citing_publications_links'] == pub]['identifier'].values
#     orchestrator.logger.debug(f"Matching row: {matching_ids}")
#     
#     ground_truth = set(','.join(sorted(matching_ids)).split(','))
#                     
#     # set comparison
#     if ground_truth == ids:
#         ok+=1
#         
#     else:
#         not_ok+=1
#         orchestrator.logger.info(f"Publication URL: {pub}")
#         orchestrator.logger.info(f"Value found in source data: {ground_truth}")
#         orchestrator.logger.info(f"Value found in merged data: {ids}")
#         #raise ValueError(f"ERROR: Count mismatch for {pub}")
#         
#         update_value=','.join(ground_truth)
#         orchestrator.logger.info(f"matching_ids: {update_value}")
#         df_merged.at[pub,'dataset_uid'] = update_value
#     
#     
# orchestrator.logger.info(f"Check completed. {ok} publications good. {not_ok} errors found.")

In [None]:
# data, i = [], 0
# 
# t0 = time.time()
# iter_max = 1500  # Limit iterations
# 
# for link, id, repo, record_src in stratified_sample.itertuples(index=False):
#     i+=1
#     if i >= iter_max:
#         break
# 
#     if link in df_old['citing_publication_link'].values:
#         orchestrator.logger.info(f"Skipping publication {link} (already in data)")
#         continue
#     
#     orchestrator.logger.info(f"Processing URL: {link}")
# 
#     adjusted_url_for_fetch = link
#     orchestrator.data_fetcher = orchestrator.data_fetcher.update_DataFetcher_settings(
#         link, orchestrator.full_DOM, orchestrator.logger
#     )
# 
#     try:
#         raw_data = orchestrator.data_fetcher.fetch_data(link)
#         doi = orchestrator.data_fetcher.url_to_doi(link)
# 
#         orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(
#             orchestrator.data_fetcher.scraper_tool.current_url
#         )
# 
#         if orchestrator.publisher == "biorxiv":
#             adjusted_url_for_fetch = orchestrator.data_fetcher.scraper_tool.current_url + ".full"
#             raw_data = orchestrator.data_fetcher.fetch_data(adjusted_url_for_fetch)
# 
#         elif orchestrator.publisher == "pubmed":
#             PMC_ID = orchestrator.data_fetcher.get_PMCID_from_pubmed_html(raw_data)
#             if PMC_ID:
#                 adjusted_url_for_fetch = orchestrator.data_fetcher.reconstruct_PMC_link(PMC_ID)
#                 raw_data = orchestrator.data_fetcher.fetch_data(adjusted_url_for_fetch)
# 
#         # smallest_elements = (
#         #     add_example_to_merged_df(row, raw_data) if re.search(id, raw_data, re.IGNORECASE) else "n/a"
#         # )
# 
#         data.append({
#             "publication": link, 
#             "fetch_from": adjusted_url_for_fetch.lower(), 
#             "doi": doi, 
#             "publisher": orchestrator.publisher,
#             "dataset_uid": id, 
#             "repo_name": repo, 
#             "raw_html": raw_data, 
#             # "smallest_elements": smallest_elements, 
#             # "title": title
#         })
# 
#     except Exception as e:
#         orchestrator.logger.error(f"Error processing URL {link}: {e}", exc_info=True)
# 
#     # Log every 100 iterations
#     if i % 100 == 0 and i > 0:
#         elapsed = time.time() - t0
#         eta = (elapsed / (i + 1)) * (len(df_ground_truth_src) - i - 1)
#         orchestrator.logger.info(f"\nProgress {i+1}/{len(df_ground_truth_src)}. ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}\n")
# 
# # Quit WebDriver after all iterations
# driver.quit()
# print(f"Time elapsed for {len(data)} iterations: {time.strftime('%H:%M:%S', time.gmtime(time.time() - t0))}")