# Step 1 — Initialize Project Configuration, Download PubMed Articles, and Run Entity Extraction

## 1. Project Initialization

In [None]:
from pathlib import Path
from haldxai.init.config_utils import init_project, show_config, set_config

# 1️⃣ Specify the project root directory (an empty or existing project folder)
ROOT = Path("/path/to/your/project")

# 2️⃣ Initialize the project layout in ROOT.
#    For a project that was initialized before, either pass force=True or
#    leave this call out.
init_project(ROOT, force=False)

# 3️⃣ Print the configuration and environment variables as they stand now
show_config(ROOT)

# 4️⃣ Override individual settings from the notebook (dotted key -> new value).
#    NOTE(review): confirm "deepseek-chat-1.5" against the model identifiers
#    your DeepSeek account actually exposes.
config_overrides = {
    "api.deepseek.model": "deepseek-chat-1.5",
    "batch.max_workers": 32,
    "api.bioportal.page_size": 20,
}
for dotted_key, new_value in config_overrides.items():
    set_config(dotted_key, new_value, project_root=ROOT)

# 5️⃣ Print the configuration again to verify the overrides
show_config(ROOT)

## 2. Download Aging-Related and Non-Aging PubMed Articles

### 2.1 Fetch Aging-Related Articles

In [None]:
from pathlib import Path
from haldxai.init.config_utils import show_config, set_config
from haldxai.pubmed.run_fetch_articles import run as run_fetch_articles

ROOT = Path("/path/to/your/project")
# NOTE(review): ROOT is assigned but not passed to set_config or
# run_fetch_articles below (section 1 passes project_root=ROOT to set_config);
# confirm the default project root is the one intended here.

# PubMed search expression: aging-related Title/Abstract keywords OR'ed with
# MeSH headings, restricted to journal articles, human studies and English.
# The multi-line literal is flattened to a single line before it is stored.
_aging_query_lines = """
(
("aging"[Title/Abstract] OR "ageing"[Title/Abstract]
OR "geriatric"[Title/Abstract] OR "geriatrics"[Title/Abstract]
OR "centenarian"[Title/Abstract] OR "older people"[Title/Abstract]
OR "older adult"[Title/Abstract] OR "the elderly"[Title/Abstract]
OR "the aged"[Title/Abstract] OR "old age"[Title/Abstract]
OR "old adults"[Title/Abstract] OR "longevity"[Title/Abstract]
OR "senescence"[Title/Abstract] OR "cellular senescence"[Title/Abstract]
OR "anti-aging"[Title/Abstract] OR "geroprotector"[Title/Abstract]
OR "healthspan"[Title/Abstract] OR "lifespan extension"[Title/Abstract]
OR "healthspan extension"[Title/Abstract] OR "life expectancy"[Title/Abstract])
OR ("Aging"[Mesh] OR "Geriatrics"[Mesh] OR "Longevity"[Mesh])
)
AND "Journal Article"[ptyp]
AND ("humans"[MeSH Terms])
AND (English[lang])
""".strip().split("\n")
aging_related_query = " ".join(_aging_query_lines)

# Store the query and the update year in the project configuration
for config_key, config_value in (
    ("pubmed_query.aging-related", aging_related_query),
    ("last_update_year", 2025),
):
    set_config(config_key, config_value)

# Launch the download for the "aging-related" task
run_fetch_articles(task="aging-related")

### 2.2 Fetch Non-Aging-Related Articles (Negative Samples)

In [None]:
from pathlib import Path
from haldxai.init.config_utils import show_config, set_config
from haldxai.pubmed.run_fetch_articles import run as run_fetch_articles

ROOT = Path("/path/to/your/project")
# NOTE(review): ROOT is assigned but not passed to set_config or
# run_fetch_articles below (section 1 passes project_root=ROOT to set_config);
# confirm the default project root is the one intended here.

# PubMed search expression for negative samples: English journal articles on
# humans, excluding anything that matches the aging-related keywords or MeSH
# headings. The multi-line literal is flattened to a single line.
_non_aging_query_lines = """
("Journal Article"[ptyp])
AND ("humans"[MeSH Terms])
AND (English[lang])
NOT (
"aging"[Title/Abstract] OR "ageing"[Title/Abstract] OR "geriatric"[Title/Abstract]
OR "geriatrics"[Title/Abstract] OR "centenarian"[Title/Abstract] OR "older people"[Title/Abstract]
OR "older adult"[Title/Abstract] OR "the elderly"[Title/Abstract] OR "the aged"[Title/Abstract]
OR "old age"[Title/Abstract] OR "old adults"[Title/Abstract] OR "longevity"[Title/Abstract]
OR "senescence"[Title/Abstract] OR "cellular senescence"[Title/Abstract] OR "anti-aging"[Title/Abstract]
OR "geroprotector"[Title/Abstract] OR "healthspan"[Title/Abstract] OR "lifespan extension"[Title/Abstract]
OR "life expectancy"[Title/Abstract] OR "Aging"[Mesh] OR "Geriatrics"[Mesh] OR "Longevity"[Mesh]
)
""".strip().split("\n")
not_aging_related_query = " ".join(_non_aging_query_lines)

# Store the query and the update year in the project configuration
for config_key, config_value in (
    ("pubmed_query.not-aging-related", not_aging_related_query),
    ("last_update_year", 2025),
):
    set_config(config_key, config_value)

# Negative samples only need a small, bounded download: one year, capped by retmax
negative_fetch_options = {"start_year": 2025, "end_year": 2025, "retmax": 200}
run_fetch_articles(task="not-aging-related", **negative_fetch_options)

### 2.3 Merge Impact Factor Metadata

In [None]:
from haldxai.pubmed.run_postprocess_articles import run as run_postprocess_articles

# Post-process both article sets in turn: the aging-related corpus first,
# then the negative samples. force=True is passed for each task.
for article_task in ("aging-related", "not-aging-related"):
    run_postprocess_articles(task=article_task, force=True)

## 3. Entity Recognition Pipeline

### 3.1 Run SciSpacy NER

In [None]:
from haldxai.workflow.run_spacy_batches import run as run_spacy_batches

# Model names handed to the batch runner, in the order they are listed
ner_model_names = [
    "en_ner_bionlp13cg_md",
    "en_ner_bc5cdr_md",
    "en_ner_jnlpba_md",
]
# Years passed to the batch runner
target_years = [2023, 2024]

run_spacy_batches(models=ner_model_names, years=target_years)

### 3.2 Train an Aging-Related Document Classifier

In [None]:
from pathlib import Path

from haldxai.workflow.train_aging_classifier import run as train_aging_classifier

# A leading "~" inside a plain string is not expanded by Python's path
# handling, so resolve it to the user's home directory explicitly. str() keeps
# the argument a string, as in the original call.
project_dir = str(Path("~/Projects/HALDxAI-Project").expanduser())

# Train the aging-related document classifier and store it under model_name.
# NOTE(review): the remaining keyword values are passed through unchanged;
# check their meaning against train_aging_classifier's own documentation.
train_aging_classifier(
    project_root=project_dir,
    model_name="aging_classifier_tfidf_lr_v1",
    neg_ratio=3,
    show_cv=True,
    tfidf_max_feat=5000,
    ngram="1-2",
)

### 3.3 Run DeepSeek-Based NER Batch Inference

In [None]:
from pathlib import Path
from haldxai.workflow.prepare_llm_batches import run as prep_run
from haldxai.workflow.run_llm_batches import run as infer_run
from haldxai.workflow.parse_llm_results import run as parse_run

ROOT = Path("/path/to/HALDxAI-Project")

# Task name and year selection shared by the inference and parsing steps
LLM_TASK = "AgingRelated-DeepSeekV3"
TARGET_YEARS = "2025"

# 1) Prepare the batch input files (force=False is passed)
prep_run(project_root=ROOT, force=False)

# 2) Run inference for the task
infer_run(task_name=LLM_TASK, years=TARGET_YEARS, project_root=ROOT)

# 3) Parse the model outputs
# NOTE(review): parse_run is called with task=/root= while infer_run uses
# task_name=/project_root=; confirm against each function's signature.
parse_run(task=LLM_TASK, years=TARGET_YEARS, root=ROOT)

### 3.4 Parse and Annotate NER Outputs

In [None]:
import json
from pathlib import Path
from haldxai.ner.postprocess import (
    build_deepseek_entity_dict,
    build_spacy_entity_dict,
)
from haldxai.enrich.bioportal import run_bioportal
from haldxai.postprocess.annotate import run_annotation
from haldxai.postprocess.merge_annotations import merge_ann

ROOT = Path("/path/to/HALDxAI-Project")
LLM_TASK = "AgingRelated-DeepSeekV3"

# Build the entity dictionary from the LLM task's results
build_deepseek_entity_dict(LLM_TASK)

# Run the BioPortal step for the same task
run_bioportal(tasks=[LLM_TASK])

# Load the BioPortal result file. The loaded dictionary is only used for the
# count printed below; run_annotation is given the file path.
bioportal_dict_file = ROOT / "data/ner_dict/bioPortal/final_entity_results.json"
entity_info_dict = json.loads(bioportal_dict_file.read_text(encoding="utf-8"))

print(f"BioPortal annotated entities: {len(entity_info_dict)}")

# Annotate every DeepSeek task; all of them share one output directory
deepseek_tasks = [LLM_TASK]
annotation_out_dir = f"{ROOT}/data/ner_dict"
for task_name in deepseek_tasks:
    run_annotation(
        ROOT,
        task_name,
        kind="deepseek",
        bio_dict=bioportal_dict_file,
        out_dir=annotation_out_dir,
    )

# Merge the annotation results (described as an incremental merge)
merge_ann(ROOT)