In [1]:
import pandas as pd
from data_gatherer.data_gatherer import DataGatherer
from scripts.experiment_utils import *
import numpy as np

In [2]:
# Load the dataset citation records table from its parquet file.
df_citations = pd.read_parquet(
    path="scripts/exp_input/dataset_citation_records_Table.parquet",
)

In [3]:
# Summarize the citation table: column names, dtypes, non-null counts, memory use.
df_citations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401327 entries, 0 to 401326
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   identifier                401327 non-null  object
 1   repository                401327 non-null  object
 2   citing_publication_link   401327 non-null  object
 3   citation_record_source    401327 non-null  object
 4   citation_record_from_doi  401327 non-null  int64 
 5   doi                       401327 non-null  object
 6   pmcid                     401327 non-null  object
dtypes: int64(1), object(6)
memory usage: 21.4+ MB


In [4]:
# Summary statistics (count / unique / top / freq) for the dataset identifiers.
df_citations.loc[:, "identifier"].describe()

count         401327
unique        190428
top       syn2580853
freq             119
Name: identifier, dtype: object

In [5]:
# How many citation records came from each record source.
df_citations.loc[:, "citation_record_source"].value_counts()

citation_record_source
GEO_API                         352982
proteomexchange_search.tsv       44281
europe pmc synapse id mining      4064
Name: count, dtype: int64

In [6]:
# Identifier summary restricted to records whose source is NOT the GEO API.
df_citations.loc[
    df_citations["citation_record_source"].ne("GEO_API"),
    "identifier",
].describe()

count          48345
unique         25350
top       syn2580853
freq             119
Name: identifier, dtype: object

In [7]:
# Identifier summary restricted to records whose source IS the GEO API.
df_citations.loc[
    df_citations["citation_record_source"].eq("GEO_API"),
    "identifier",
].describe()

count       352982
unique      165078
top       GSE55296
freq            34
Name: identifier, dtype: object

In [8]:
# Summary statistics for the citing-publication links (shows the most-cited link).
df_citations.loc[:, "citing_publication_link"].describe()

count                                     401327
unique                                    246823
top       https://dx.doi.org/10.1038/nature11247
freq                                       16711
Name: citing_publication_link, dtype: object

In [9]:
# Number of citation records per repository, most frequent first.
# The counts span several orders of magnitude; for a log-scale bar chart, chain
#   .plot(kind='bar', title='Number of citations per repository', logy=True)
# onto the expression below.
df_citations['repository'].value_counts()

repository
GEO               352982
PRIDE              39263
Synapse             4064
iProX               3306
jPOST                828
MassIVE              436
PanoramaPublic       338
PeptideAtlas         110
Name: count, dtype: int64


In [10]:
# Load the table of locally fetched article data from parquet.
df_fetched = pd.read_parquet(
    path='scripts/exp_input/Local_fetched_data.parquet',
)

In [11]:
# Show the column index together with the number of rows.
(df_fetched.columns, df_fetched.shape[0])

(Index(['file_name', 'raw_cont', 'format', 'length', 'path', 'publication'], dtype='object'),
 1313)

In [12]:
# --- Experiment configuration ---
# Text file listing the PMCIDs to process, one per line.
input_file = "scripts/exp_input/europe_pmc_synapse_id_mining.txt"
model_name = "gemini-2.0-flash"  # "gemini-2.0-flash" or "gpt-4o-mini"
# presumably: whether fetched HTML/XML is also written to disk — confirm
# against DataGatherer.fetch_data.
write_htmls_xmls = True
articles_dir = "scripts/output/html_xml_samples"

# Read the PMCID list. The encoding is explicit so the result does not depend
# on the platform default, and surrounding whitespace / blank lines (e.g. a
# trailing empty line) are dropped so they are never passed on as IDs.
with open(input_file, 'r', encoding='utf-8') as f:
    pmcids = [entry for entry in map(str.strip, f.read().splitlines()) if entry]

print("Number of PMCIDs:", len(pmcids))

Number of PMCIDs: 2190


In [13]:
# Build the DataGatherer orchestrator with INFO-level logging; `model_name`
# (set in the configuration cell) selects the extraction LLM.
dg = DataGatherer(log_level='INFO', llm_name=model_name)

[97mdata_gatherer.py - line 302 - INFO - Setting up data fetcher...[0m
[97mdata_gatherer.py - line 332 - INFO - Data fetcher setup completed.[0m
[97mdata_gatherer.py - line 103 - INFO - DataGatherer orchestrator initialized. Extraction Model: gemini-2.0-flash[0m


In [14]:
# Fetch raw article data for every PMCID. The write-to-disk switch follows the
# `write_htmls_xmls` flag from the configuration cell instead of a duplicated
# literal. The target directory is kept as-is because the later loader cell
# reads from 'scripts/tmp/raw_files/PMC'.
# NOTE(review): at INFO level the fetcher logs each full E-utilities request
# URL, including the api_key query parameter — clear or redact this cell's
# output before sharing the notebook.
raw_data = dg.fetch_data(
    pmcids,
    write_htmls_xmls=write_htmls_xmls,
    article_file_dir='scripts/tmp/raw_files/',
)

[97mdata_gatherer.py - line 148 - INFO - length of complete fetches < urls: 0 < 2190[0m
[97mdata_fetcher.py - line 167 - INFO - raw_HTML_data_filepath: None[0m
[97mdata_fetcher.py - line 174 - INFO - Initializing EntrezFetcher(('requests', 'self.config'))[0m
[97mdata_fetcher.py - line 683 - INFO - Fetching data from request: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC10001072&retmode=xml&api_key=REDACTED_API_KEY[0m
[97mxml_retriever.py - line 446 - INFO - ----Checking for data_availability_sections section in raw data.[0m
[97mxml_retriever.py - line 69 - INFO - Loading target sections for section name: data_availability_sections[0m
[97mxml_retriever.py - line 457 - INFO - ----Found section: <notes xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schemas/ali/1.0...[0m
[97mxml_retriever.py - line 446 - INFO - ----Checking for supplementary_data_sections section in raw data.[0m
[97mxml_retriever.

In [25]:
# Collect the fetched article files under scripts/tmp/raw_files/PMC into a
# DataFrame and save it to the given parquet path (as reported by the log
# lines this call prints).
load_pmc_files_from_html_xml_dir_to_dataframe_fetch_file('scripts/tmp/raw_files/PMC','scripts/exp_input/Local_fetched_data_1.parquet')

Filename does not start with 'PMC': .DS_Store. Skipping this file.
Basename does not contain '__': .DS_Store
Skipping unsupported file format: .DS_Store
Loaded 2190 files from scripts/tmp/raw_files/PMC
No existing file found. Using loaded DataFrame with 2190 entries.
Saving DataFrame to scripts/exp_input/Local_fetched_data_1.parquet


In [26]:
# Rebind df_fetched to the freshly written fetch table.
df_fetched = pd.read_parquet(
    path='scripts/exp_input/Local_fetched_data_1.parquet',
)

In [27]:
# Inspect columns, dtypes, and non-null counts of the reloaded fetch table.
df_fetched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   pub_title    2190 non-null   object
 1   file_name    2190 non-null   object
 2   raw_cont     2190 non-null   object
 3   format       2190 non-null   object
 4   length       2190 non-null   int64 
 5   path         2190 non-null   object
 6   publication  2190 non-null   object
dtypes: int64(1), object(6)
memory usage: 119.9+ KB


In [28]:
# Rows whose 'publication' value is missing; preview the first few.
df_err = df_fetched.loc[df_fetched["publication"].isna()]
df_err.head()

Unnamed: 0,pub_title,file_name,raw_cont,format,length,path,publication


In [30]:
# Distribution summary of the 'length' column of the fetched files.
df_fetched.loc[:, "length"].describe()

count    2.190000e+03
mean     2.245000e+05
std      1.064628e+05
min      4.375400e+04
25%      1.657760e+05
50%      2.088020e+05
75%      2.572198e+05
max      1.839410e+06
Name: length, dtype: float64