Things to update

1. Need to also extract document type for inclusion/exclusion

In [None]:
import os
import time
import json
from random import sample
from pathlib import Path

import requests
from tqdm import tqdm
from dotenv import load_dotenv
import pandas as pd


from open_extract.llm import OLLAMAExtractor
from open_extract.data_model import Screening

# Move the working directory one level above the directory the kernel started
# in, so the relative "data/..." and "runs/..." paths used below resolve.
# The target is computed only once per kernel session: a bare os.chdir("..")
# would climb one directory higher every time this cell is re-run.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = Path.cwd().parent
os.chdir(_PROJECT_ROOT)

# Load environment variables (e.g. API_POLITE_EMAIL, read via os.getenv later).
load_dotenv()

In [None]:
# Collect every markdown file of the screening batch in a stable (sorted) order.
all_md_files = sorted(Path("data/screening_250205/mds").glob("*.md"))

# Draw a small random subset for a quick trial run. The sample size is capped at
# the number of available files, because random.sample raises ValueError when
# asked for more items than the population holds.
# NOTE: the draw is unseeded, so a different subset is picked on every run.
md_files = sample(all_md_files, min(3, len(all_md_files)))
md_files

In [None]:
# Settings for the Ollama-backed extractor, gathered in one place so the model
# name and host are easy to spot and tweak.
extractor_settings = {
    "model_name": "deepseek-r1-70b-15k-ctx",
    "ollama_host": "olvi-1:11434",
    "target_model": Screening,
}

# Extractor whose output is modelled by the Screening data model.
extractor = OLLAMAExtractor(**extractor_settings)


In [None]:
# Make sure the output folder for this run exists (no error if it already does).
screening_update_dir = Path("runs/screening_update_250225")
screening_update_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Run the extractor on each sampled markdown file and store one JSON per file.
# The output folder is relative to the current working directory, like every
# other path in this notebook; the previous "../runs/..." pointed one level
# above the folder that is actually created for this run.
output_dir = Path("runs/screening_update_250225")
output_dir.mkdir(parents=True, exist_ok=True)  # harmless if it already exists

for md_file in tqdm(md_files):
    save_file_path = output_dir / md_file.with_suffix(".json").name

    # Results are cached on disk: skip files that were already processed so a
    # re-run only pays for the missing ones.
    if save_file_path.exists():
        continue

    extraction = extractor.run(md_file.read_text())
    save_file_path.write_text(extraction.model_dump_json(indent=4))

The results show that the LLM is not a reliable source for the document type: it classified every paper as a review paper, regardless of its actual type.
I think we can handle this with an external API instead, e.g., OpenAlex.

In [None]:
def query_doc_type(doi: str) -> str:
    """Query the OpenAlex API to get the document type of a given DOI.

    Args:
        doi: DOI of the work. It may be bare (``10.1234/abc``) or carry a
            ``https://doi.org/`` / ``doi:`` style prefix; surrounding
            whitespace is ignored.

    Returns:
        The ``type`` field of the OpenAlex work record. ``"not_found"`` when
        OpenAlex answers 404. ``""`` when the DOI is empty, when the request
        is still unsuccessful after one rate-limit retry, or when the response
        body carries no usable ``type`` field.

    Raises:
        requests.RequestException: on connection errors or when the request
            exceeds the timeout.
    """
    # Normalise the DOI: the URL below already adds the https://doi.org/
    # resolver prefix, so an input that carries one would be doubled up.
    doi = doi.strip()
    lowered = doi.lower()
    for prefix in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ):
        if lowered.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break
    if not doi:
        return ""

    url = f"https://api.openalex.org/works/https://doi.org/{doi}"

    # Adding polite email to the request; passing it through `params` lets
    # requests URL-encode it instead of concatenating it verbatim.
    polite_email = os.getenv("API_POLITE_EMAIL")
    params = {"mailto": polite_email} if polite_email else None

    # Always set a timeout: without one a stalled connection blocks forever.
    timeout_seconds = 30
    response = requests.get(url, params=params, timeout=timeout_seconds)

    if response.status_code == 429:  # Rate limit exceeded: wait, retry once
        time.sleep(10)
        response = requests.get(url, params=params, timeout=timeout_seconds)

    # Checked after the retry so a 404 on the second attempt is also reported.
    if response.status_code == 404:
        return "not_found"

    # Any other failure (still rate limited, server error, ...) has no record.
    if response.status_code != 200:
        return ""

    try:
        return response.json()["type"]
    except (KeyError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON; KeyError/TypeError
        # cover a JSON payload without a "type" entry.
        return ""


In [None]:
# JSON results of the earlier screening run. Sorted because glob yields entries
# in a filesystem-dependent order; a stable order keeps the later loops and the
# exported CSV reproducible between runs.
files = sorted(Path("runs/screening_250205/").glob("*.json"))

In [None]:
# Add a "doc_type" entry to every result JSON that has a DOI and write the
# file back in place.
for json_path in tqdm(files):
    data = json.loads(json_path.read_text())

    # Nothing to look up when the DOI is missing or empty.
    if not data.get("publication_doi"):
        continue

    data["doc_type"] = query_doc_type(data["publication_doi"])
    # The outer quotes are single so the nested double quotes also parse on
    # Python versions older than 3.12; the printed text is unchanged.
    print(f'{data["publication_doi"]=}: {data["doc_type"]}')
    json_path.write_text(json.dumps(data, indent=4))

    # Short pause between lookups to stay gentle on the API.
    time.sleep(0.2)

### Export to Shared GSheet manually

In [None]:
def _doc_type_row(json_path):
    """Build one export row (file stem, DOI, doc type) from a result JSON file."""
    payload = json.loads(json_path.read_text())
    return {
        "file_name": json_path.stem,
        "publication_doi": payload.get("publication_doi", ""),
        "doc_type": payload.get("doc_type", ""),
    }


# One row per result file; missing fields fall back to an empty string.
export_data = [_doc_type_row(json_path) for json_path in files]

In [None]:
# Tabulate the collected rows and write them to a CSV without the index column.
doc_types_table = pd.DataFrame(export_data)
doc_types_table.to_csv("doctypes.csv", index=False)