# Create a dataset of figures extracted from PDFs

## Import libraries

In [20]:
#To download fulltext
import requests
import pandas as pd

#To filter invalid pdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

#To handle files
import glob
import os

#To extract images from pdf
import fitz


## Download papers full text

In [15]:
# Load the paper table, then narrow it in two steps: first to rows whose
# publication_year is 2023, then to those whose dataset_used is "ACDC".
df = pd.read_csv("../../results/extracted_csv/paper_openalex.csv")

published_in_2023 = df["publication_year"] == 2023
df_2023 = df.loc[published_in_2023]

uses_acdc = df_2023["dataset_used"] == "ACDC"
df_acdc = df_2023.loc[uses_acdc]

In [18]:
# Download the open-access full text of each ACDC paper.
#
# For every DOI in df_acdc, the OpenAlex work record is fetched, its
# open_access.oa_url is read and, when that URL answers with HTTP 200, the
# response body is saved as <paper_id>.pdf. The body is saved as-is; nothing
# here checks that it is really a PDF.
url_base = "https://api.openalex.org/works/https://doi.org/"
fulltext_dir = "../../results/papers_fulltext"
# Seconds to wait on a single HTTP request; without a timeout a stalled
# server would block the loop indefinitely.
request_timeout = 30

# Make sure the destination exists before the first write.
os.makedirs(fulltext_dir, exist_ok=True)

paper_id = 1
for doi in df_acdc["DOI"]:
    # NOTE(review): rows without a DOI presumably come through as NaN (float);
    # string concatenation below would fail on them, so skip them.
    if not isinstance(doi, str):
        continue

    print(doi)
    url = url_base + doi
    try:
        r_paper = requests.get(url, timeout=request_timeout)
    except requests.RequestException as err:
        # One unreachable host should not abort the whole download run.
        print(f"Request failed for {doi}: {err}")
        continue

    if r_paper.status_code != 200:
        print(f"Error {r_paper.status_code} for {doi}")
        continue

    r_paper_json = r_paper.json()
    # Tolerate a missing or null "open_access" entry instead of raising.
    fulltext_url = (r_paper_json.get("open_access") or {}).get("oa_url")
    if fulltext_url:
        try:
            r_fulltext = requests.get(
                fulltext_url, allow_redirects=True, timeout=request_timeout
            )
        except requests.RequestException as err:
            print(f"Request failed for {fulltext_url}: {err}")
            r_fulltext = None

        if r_fulltext is not None and r_fulltext.status_code == 200:
            print(fulltext_url)
            # The context manager closes the file handle deterministically.
            with open(os.path.join(fulltext_dir, f"{paper_id}.pdf"), "wb") as pdf_out:
                pdf_out.write(r_fulltext.content)
            paper_id += 1

    # paper_id starts at 1, so this stops the loop after 9 saved files.
    if paper_id == 10:
        break

10.1016/b978-0-32-385773-4.00023-x
10.1016/b978-0-32-385773-4.00025-3
10.1016/b978-0-32-385773-4.00009-5
https://doi.org/10.1016/b978-0-32-385773-4.00009-5
10.1016/j.compbiomed.2022.106439
10.1371/journal.pdig.0000159
https://journals.plos.org/digitalhealth/article/file?id=10.1371/journal.pdig.0000159&type=printable
10.1109/access.2023.3234241
10.1016/j.compmedimag.2022.102174
10.1088/1361-6560/acb19a
https://iopscience.iop.org/article/10.1088/1361-6560/acb19a/pdf
10.1007/s00740-022-00474-9
10.1016/j.patcog.2023.109318
https://doi.org/10.1016/j.patcog.2023.109318
10.1109/access.2023.3238058
10.1038/s41598-023-28348-y
https://www.nature.com/articles/s41598-023-28348-y.pdf
10.3389/fphys.2023.1027076
https://www.frontiersin.org/articles/10.3389/fphys.2023.1027076/pdf
10.1016/b978-0-12-821983-6.00008-4
10.3390/bioengineering10020166
https://www.mdpi.com/2306-5354/10/2/166/pdf?version=1674889337
10.1016/j.media.2023.102762
http://arxiv.org/pdf/2206.01136
10.1016/j.bspc.2023.104631
10.1109/w

## Remove invalid PDFs obtained from the previous step

In [19]:
# Sort every downloaded file into valid_pdf/ or removed_pdf/ depending on
# whether pypdf can open it in strict mode.
fulltext_dir = "../../results/papers_fulltext"
pdf_path = glob.glob("../../results/papers_fulltext/*.pdf")

valid_dir = os.path.join(fulltext_dir, "valid_pdf")
removed_dir = os.path.join(fulltext_dir, "removed_pdf")
os.makedirs(valid_dir, exist_ok=True)
os.makedirs(removed_dir, exist_ok=True)

for file in pdf_path:
    # Keep the try body down to the one call expected to raise PdfReadError.
    try:
        PdfReader(file, strict=True)
    except PdfReadError:
        target_dir = removed_dir
    else:
        target_dir = valid_dir
    # Build the destination from the file name rather than by substring
    # replacement on the path: if glob returns an OS-specific separator, the
    # "papers_fulltext/" pattern would not match and the file would silently
    # be renamed onto itself.
    os.rename(file, os.path.join(target_dir, os.path.basename(file)))


## Extract images from valid pdf

In [26]:
# Extract every embedded image of each valid PDF into
# images/<pdf name without extension>/<n>.<ext>, numbering from 1 in the
# order the images are listed page by page.
valid_pdf_path = glob.glob("../../results/papers_fulltext/valid_pdf/*.pdf")
images_path = "../../results/papers_fulltext/images"
os.makedirs(images_path, exist_ok=True)

for file in valid_pdf_path:
    # Open the document in a context manager so it is closed even when an
    # extraction step raises; previously the document was never closed.
    with fitz.open(file) as pdf_file:
        # Collect the image entries reported by every page.
        # NOTE(review): an image referenced from several pages presumably
        # shows up once per page here and is therefore written more than
        # once - confirm whether de-duplicating by xref is wanted.
        images_list = []
        for page_content in pdf_file:
            images_list.extend(page_content.get_images())

        # Do not create an output folder for a PDF without images.
        if not images_list:
            continue

        # Per-paper folder named after the PDF file without its extension.
        paper_name = os.path.splitext(os.path.basename(file))[0]
        paper_images_dir = os.path.join(images_path, paper_name)
        os.makedirs(paper_images_dir, exist_ok=True)

        for i, img in enumerate(images_list, start=1):
            # The first field of an image entry is its object number (xref).
            xref = img[0]
            base_image = pdf_file.extract_image(xref)
            # Name the file by its running index and the reported extension.
            image_name = f"{i}.{base_image['ext']}"
            with open(os.path.join(paper_images_dir, image_name), "wb") as image_file:
                image_file.write(base_image["image"])

