<a href="https://colab.research.google.com/github/SinaRampe/pubmed_downloader/blob/main/pubmed_downloader_with_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### If from an already performed PubMed search the results were saved as csv 

In [2]:
!pip install metapub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting metapub
  Downloading metapub-0.5.5.tar.gz (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.3/120.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting eutils (from metapub)
  Downloading eutils-0.6.0-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting habanero (from metapub)
  Downloading habanero-1.2.3-py2.py3-none-any.whl (30 kB)
Collecting cssselect (from metapub)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting unidecode (from metapub)
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt (from metapub)
  D

In [22]:
import pandas as pd
import numpy as np
from metapub import FindIt
from metapub import PubMedFetcher
import requests
import os

In [6]:
csv_data = "cpic_guidelines"
df = pd.read_csv(f'{csv_data}.csv')  

In [7]:
save_directory = f"pdfs_{csv_data}"

if not os.path.exists(save_directory):
    os.makedirs(save_directory)

In [8]:
df.head(1)

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI
0,36760155,Pharmacogenetic actionability and medication p...,"Anderson JD, Davis BH, Giang G, Jones A, Lee C...",Clin Transl Sci. 2023 Apr;16(4):662-672. doi: ...,Anderson JD,Clin Transl Sci,2023,2023/02/10,PMC10087076,,10.1111/cts.13479


In [9]:
pmids = df['PMID'].tolist()

In [10]:
len(pmids)

36

In [23]:
fetch = PubMedFetcher()

In [24]:
def get_article_data(pmid):
    data = {"PMID": str(pmid)}
    
    try:
        article = fetch.article_by_pmid(pmid)
        
        data.update({
            "Title": article.title,
            "Abstract": article.abstract,
            "Author": article.authors,
            "Year": article.year,
            "Volume": article.volume,
            "Issue": article.issue,
            "Journal": article.journal,
            "Citation": article.citation,
            "Link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "AnyError": 0
        })

    except Exception:
        data["AnyError"] = 1

    return data


In [25]:
collected_data = []

for pmid in pmids:
    data = get_article_data(pmid)
    collected_data.append(data)

df = pd.DataFrame(collected_data)

columns_to_fill = ["Title", "Abstract", "Author", "Year", "Volume", "Issue", "Journal", "Citation", "Link"]
df.loc[df["AnyError"] == 1, columns_to_fill] = np.nan


In [26]:
df = pd.DataFrame(collected_data)

In [27]:
df.head(1)

Unnamed: 0,PMID,Title,Abstract,Author,Year,Volume,Issue,Journal,Citation,Link,AnyError
0,36760155,Pharmacogenetic actionability and medication p...,Although major advancements have been made in ...,"[Anderson JD, Davis BH, Giang G, Jones A, Lee ...",2023,16,4,Clin Transl Sci,"Anderson JD, et al. Pharmacogenetic actionabil...",https://pubmed.ncbi.nlm.nih.gov/36760155/,0


In [28]:
pdf_url_or_reason = []
pdf_url_available = []

In [29]:
for pmid in pmids:

  src = FindIt(pmid)

  if src.url is None:
    pdf_url_or_reason.append(src.reason)
    pdf_url_available.append(0)

  else: 
    pdf_url_or_reason.append(src.url)
    pdf_url_available.append(1)
    response = requests.get(src.url, allow_redirects=True)

    with open(os.path.join(save_directory, f"{pmid}.pdf"), 'wb') as f:
      f.write(response.content)

2023-05-10 07:57:48 a161b73a84dc metapub.findit[413] INFO FindIt Cache initialized at /root/.cache/findit.db


In [30]:
len(pdf_url_available)

36

In [31]:
df["pdf-url"] = pdf_url_or_reason
df["pdf_url_available"] = pdf_url_available

In [32]:
df.to_csv(f'{csv_data}_articledetails.csv')

In [34]:
save_directory

'pdfs_cpic_guidelines'

In [35]:
!ls -l "pdfs_cpic_guidelines" | grep -v '^d' | wc -l

27


other resources for ideas how to extend my code

https://github.com/billgreenwald/Pubmed-Batch-Download

https://github.com/ddomingof/PubMed2PDF