The goal of this notebook is to find approach to parse data from open alex by query 

First approach is to filter query by full text criteria 

In [15]:
import os
import requests
import logging
from typing import List


class OpenAlexReader:
    """OpenAlex Reader.
    Gets a search query, returns a list of Documents of the top corresponding scientific papers on OpenAlex.
    """

    def __init__(self) -> None:
        """Initialize with parameters."""
        super().__init__()

    def _fetch_openalex_results(self, search_query: str, max_results: int):
        """Fetch top scientific papers from OpenAlex based on search query."""
        base_url = "https://api.openalex.org/works"
        params = {
            "search": search_query,
            "filter": "has_fulltext:true",  # Add filter for full-text availability
            "per_page": max_results,
            "mailto": "your-email@example.com"  # Replace with your email
        }
        response = requests.get(base_url, params=params)

        if response.status_code != 200:
            raise Exception(f"Failed to fetch data from OpenAlex. Status code: {response.status_code}")

        return response.json().get("results", [])

    def get_papers(self, user_query: str, max_results=3) -> List[dict]:
        """
        Returns a list of dicts with document metadata extracted from OpenAlex.
        Title, authors, date, archive id and abstract. 
        """
        papers = self._fetch_openalex_results(user_query, max_results=max_results)
        results = []
        broken_results = []
        for i, paper in enumerate(papers):
            try:
                paper_metadata = {
                    ''
                    'title': paper.get('title', 'No title'),
                    'authors': "\n ".join([author.get('author', {}).get('display_name', 'Unknown') for author in
                                           paper.get('authorships', [])]),
                    'date': paper['publication_date'],
                    'archive_id': paper.get('doi', "/").split("/")[-1],
                    'doi': paper.get('doi', "N/A"),
                    'abstract': self.extract_abstract(paper.get('abstract_inverted_index', "")),
                }
                results.append(paper_metadata)
            except:
                print(i, paper.get('doi'))
                broken_results.append(paper)
        return papers

    def download_paper(self, papers_metadata: List[dict], download_dir="downloads"):
        """Download the PDF of a paper using its metadata.

        Args:
            paper_metadata (dict): Metadata of the paper containing the URL.
            download_dir (str): Directory to save downloaded papers.
        """
        # Ensure download directory exists
        os.makedirs(download_dir, exist_ok=True)
        if not isinstance(papers_metadata, list):
            papers_metadata = [papers_metadata]

        for paper_metadata in papers_metadata:

            # Try to extract a URL for the PDF
            # Typically, look for the 'doi' or other links in the paper_metadata

            if paper_metadata.get('primary_location'):
                if paper_metadata.get('primary_location').get("pdf_url"):
                    pdf_url = paper_metadata.get('primary_location').get('pdf_url')
            if not pdf_url:
                pdf_url = paper_metadata.get('doi')  # Use DOI if available

            if not pdf_url:
                # If DOI is not available, you might look for other links in the metadata
                pdf_url = paper_metadata.get('url_for_pdf')  # Replace with correct key if available

            if pdf_url:
                # You might need to replace this with the actual PDF URL
                logging.debug(f"> Downloading PDF from {pdf_url}")

                try:
                    pdf_response = requests.get(pdf_url)
                    pdf_response.raise_for_status()  # Raise error for bad responses

                    # Create a filename based on the title or DOI
                    title = paper_metadata.get('display_name', 'unknown_paper').replace('/', '_')
                    filename = f"{title}.pdf"
                    file_path = os.path.join(download_dir, filename)

                    with open(file_path, 'wb') as f:
                        f.write(pdf_response.content)

                    logging.debug(f"> Successfully downloaded {filename}")
                except requests.RequestException as e:
                    logging.error(f"Error downloading PDF: {e}")
            else:
                logging.warning("No downloadable PDF URL found for this paper.")

    def extract_abstract(self, abstract_inverted_index):
        """Convert an abstract_inverted_index into a readable text format.
    
        Args:
            abstract_inverted_index (dict): The abstract inverted index from OpenAlex.
    
        Returns:
            str: The reconstructed abstract as a plain text string.
        """
        if not abstract_inverted_index:
            return "No abstract available"
        # Initialize a list to hold the words based on their positions
        abstract_length = max(max(pos) for pos in abstract_inverted_index.values()) + 1
        abstract_words = [""] * abstract_length

        # Place each word into the correct position
        for word, positions in abstract_inverted_index.items():
            for pos in positions:
                abstract_words[pos] = word

        # Join the words to form the complete abstract text
        return ' '.join(word for word in abstract_words if word)


#USAGE EXAMPLE 
"""
r = OpenAlexReader()
metadata = r.get_papers("Transformers", max_results=1)
r.download_paper(metadata)
"""
r = OpenAlexReader()
metadata = r._fetch_openalex_results(search_query="Transformers", max_results=100)
r.download_paper(metadata)

In [31]:
from llama_index.core import SimpleDirectoryReader
reader = SimpleDirectoryReader('downloads', required_exts=[".pdf"])
documents = reader.load_data()
filenames = set([documents[i].metadata['file_name'] for i in range(len(documents))])
len(filenames), len(metadata)

In [38]:
filenames = set([documents[i].metadata['file_name'] for i in range(len(documents))])
len(filenames), len(metadata)

(30, 100)

## With this approach we have 30 over 100

In [57]:
import os
import requests
import logging
from typing import List


class OpenAlexReader:
    """OpenAlex Reader.
    Gets a search query, returns a list of Documents of the top corresponding scientific papers on OpenAlex.
    """

    def __init__(self) -> None:
        """Initialize with parameters."""
        super().__init__()

    def _fetch_openalex_results(self, search_query: str, max_results: int):
        """Fetch top scientific papers from OpenAlex based on search query."""
        base_url = "https://api.openalex.org/works"
        params = {
            "search": search_query,  # Add filter for full-text availability
            "per_page": max_results,
            "mailto": "your-email@example.com"  # Replace with your email
        }
        response = requests.get(base_url, params=params)

        if response.status_code != 200:
            raise Exception(f"Failed to fetch data from OpenAlex. Status code: {response.status_code}")

        return response.json().get("results", [])

    def get_papers(self, user_query: str, max_results=3) -> List[dict]:
        """
        Returns a list of dicts with document metadata extracted from OpenAlex.
        Title, authors, date, archive id and abstract. 
        """
        papers = self._fetch_openalex_results(user_query, max_results=max_results)
        results = []
        broken_results = []
        for i, paper in enumerate(papers):
            try:
                paper_metadata = {
                    ''
                    'title': paper.get('title', 'No title'),
                    'authors': "\n ".join([author.get('author', {}).get('display_name', 'Unknown') for author in
                                           paper.get('authorships', [])]),
                    'date': paper['publication_date'],
                    'archive_id': paper.get('doi', "/").split("/")[-1],
                    'doi': paper.get('doi', "N/A"),
                    'abstract': self.extract_abstract(paper.get('abstract_inverted_index', "")),
                }
                results.append(paper_metadata)
            except:
                print(i, paper.get('doi'))
                broken_results.append(paper)
        return papers

    def download_paper(self, papers_metadata: List[dict], download_dir="downloads"):
        """Download the PDF of a paper using its metadata.

        Args:
            paper_metadata (dict): Metadata of the paper containing the URL.
            download_dir (str): Directory to save downloaded papers.
        """
        # Ensure download directory exists
        os.makedirs(download_dir, exist_ok=True)
        if not isinstance(papers_metadata, list):
            papers_metadata = [papers_metadata]

        for paper_metadata in papers_metadata:
            pdf_url = None
            # Try to extract a URL for the PDF
            # Typically, look for the 'doi' or other links in the paper_metadata

            if paper_metadata.get('primary_location'):
                if paper_metadata.get('primary_location').get("pdf_url"):
                    pdf_url = paper_metadata.get('primary_location').get('pdf_url')
            if not pdf_url:
                pdf_url = paper_metadata.get('doi')  # Use DOI if available

            if not pdf_url:
                # If DOI is not available, you might look for other links in the metadata
                pdf_url = paper_metadata.get('url_for_pdf')  # Replace with correct key if available

            if pdf_url:
                # You might need to replace this with the actual PDF URL
                logging.debug(f"> Downloading PDF from {pdf_url}")

                try:
                    pdf_response = requests.get(pdf_url)
                    pdf_response.raise_for_status()  # Raise error for bad responses

                    # Create a filename based on the title or DOI
                    title = paper_metadata.get('display_name', 'unknown_paper').replace('/', '_')
                    filename = f"{title}.pdf"
                    file_path = os.path.join(download_dir, filename)

                    with open(file_path, 'wb') as f:
                        f.write(pdf_response.content)

                    logging.debug(f"> Successfully downloaded {filename}")
                except requests.RequestException as e:
                    logging.error(f"Error downloading PDF: {e}")
            else:
                logging.warning("No downloadable PDF URL found for this paper.")

    def extract_abstract(self, abstract_inverted_index):
        """Convert an abstract_inverted_index into a readable text format.
    
        Args:
            abstract_inverted_index (dict): The abstract inverted index from OpenAlex.
    
        Returns:
            str: The reconstructed abstract as a plain text string.
        """
        if not abstract_inverted_index:
            return "No abstract available"
        # Initialize a list to hold the words based on their positions
        abstract_length = max(max(pos) for pos in abstract_inverted_index.values()) + 1
        abstract_words = [""] * abstract_length

        # Place each word into the correct position
        for word, positions in abstract_inverted_index.items():
            for pos in positions:
                abstract_words[pos] = word

        # Join the words to form the complete abstract text
        return ' '.join(word for word in abstract_words if word)


#USAGE EXAMPLE 
"""
r = OpenAlexReader()
metadata = r.get_papers("Transformers", max_results=1)
r.download_paper(metadata)
"""
!rm -rf downloads
r = OpenAlexReader()
metadata = r._fetch_openalex_results(search_query="Transformers", max_results=100)
r.download_paper(metadata)

ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9710580/
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/2020.emnlp-demos.6.pdf
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/P19-1285.pdf
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/726791/
ERROR:root:Error downloading PDF: 403 Client Error: Forbidden for url: https://digital-library.theiet.org/content/journals/10.1049/piee.1966.0236
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9709990/
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9578646/
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9711179/
E

In [58]:
from llama_index.core import SimpleDirectoryReader
reader = SimpleDirectoryReader('downloads', required_exts=[".pdf"])
documents = reader.load_data()
filenames = set([documents[i].metadata['file_name'] for i in range(len(documents))])
len(os.listdir('downloads')), len(filenames), len(metadata)



Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/A survey of transformers.pdf with error: RetryError[<Future at 0x17936b050 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale.pdf with error: RetryError[<Future at 0x113566a10 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Attention Is All You Need.pdf with error: RetryError[<Future at 0x11335fa50 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf with error: RetryError[<Future at 0x179279f50 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/BEiT: BERT Pre-Training of Image Transformers.pdf with error: RetryError[<Future at 0x1793525d0 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Big Bird: Transformers for Longer Sequences.pdf with error: RetryError[<Future at 0x17fd17e90 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/CTRL: A Conditional Transformer Language Model for Controllable Generation.pdf with error: RetryError[<Future at 0x178ffee50 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Conformer: Convolution-augmented Transformer for Speech Recognition.pdf with error: RetryError[<Future at 0x179217410 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Deformable DETR: Deformable Transformers for End-to-End Object Detection.pdf with error: RetryError[<Future at 0x1134af950 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/End-to-End Object Detection with Transformers.pdf with error: RetryError[<Future at 0x300bda190 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Experimental thermodynamic evaluation for a single stage heat transformer prototype build with commercial PHEs.pdf with error: RetryError[<Future at 0x300bd9390 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer.pdf with error: RetryError[<Future at 0x300bdac10 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Generating Long Sequences with Sparse Transformers.pdf with error: RetryError[<Future at 0x107513350 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/HuggingFace's Transformers: State-of-the-art Natural Language Processing.pdf with error: RetryError[<Future at 0x17901af50 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Inductors and Transformers for Power Electronics.pdf with error: RetryError[<Future at 0x1793c9690 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Longformer: The Long-Document Transformer.pdf with error: RetryError[<Future at 0x179af3710 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Loss of Life Transformer Prediction Based on Stacking Ensemble Improved by Genetic Algorithm By IJISRT.pdf with error: RetryError[<Future at 0x1793f93d0 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Medical Transformer: Gated Axial-Attention for Medical Image Segmentation.pdf with error: RetryError[<Future at 0x300c9cbd0 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Reformer: The Efficient Transformer.pdf with error: RetryError[<Future at 0x10754ba90 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers.pdf with error: RetryError[<Future at 0x300c63750 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Spatial Transformer Networks.pdf with error: RetryError[<Future at 0x17916ee90 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Swin-Unet: Unet-Like Pure Transformer for Medical Image Segmentation.pdf with error: RetryError[<Future at 0x178f51b10 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/TransUNet: Transformers Make Strong Encoders for Medical Image Segmentation.pdf with error: RetryError[<Future at 0x300c61a50 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Transformer and Inductor Design Handbook.pdf with error: RetryError[<Future at 0x300c9e1d0 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Transformer in Transformer.pdf with error: RetryError[<Future at 0x300c60210 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention.pdf with error: RetryError[<Future at 0x300c63490 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Transformers in Vision: A Survey.pdf with error: RetryError[<Future at 0x1793c9a50 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Universal Transformers.pdf with error: RetryError[<Future at 0x178f51050 state=finished raised PdfStreamError>]. Skipping...


(35, 7, 100)

# 35 files in dir, but only 7 of them can be parsed 

In [60]:
# Next experiment - try to focus on archive 

import os
import requests
import logging
from typing import List


class OpenAlexReader:
    """OpenAlex Reader.
    Gets a search query, returns a list of Documents of the top corresponding scientific papers on OpenAlex.
    """

    def __init__(self) -> None:
        """Initialize with parameters."""
        super().__init__()

    def _fetch_openalex_results(self, search_query: str, max_results: int):
        """Fetch top scientific papers from OpenAlex based on search query."""
        base_url = "https://api.openalex.org/works"
        params = {
            "search": search_query,  # Add filter for full-text availability
            "per_page": max_results,
            "mailto": "your-email@example.com"  # Replace with your email
        }
        response = requests.get(base_url, params=params)

        if response.status_code != 200:
            raise Exception(f"Failed to fetch data from OpenAlex. Status code: {response.status_code}")

        return response.json().get("results", [])

    def get_papers(self, user_query: str, max_results=3) -> List[dict]:
        """
        Returns a list of dicts with document metadata extracted from OpenAlex.
        Title, authors, date, archive id and abstract. 
        """
        papers = self._fetch_openalex_results(user_query, max_results=max_results)
        results = []
        broken_results = []
        for i, paper in enumerate(papers):
            try:
                paper_metadata = {
                    ''
                    'title': paper.get('title', 'No title'),
                    'authors': "\n ".join([author.get('author', {}).get('display_name', 'Unknown') for author in
                                           paper.get('authorships', [])]),
                    'date': paper['publication_date'],
                    'archive_id': paper.get('doi', "/").split("/")[-1],
                    'doi': paper.get('doi', "N/A"),
                    'abstract': self.extract_abstract(paper.get('abstract_inverted_index', "")),
                }
                results.append(paper_metadata)
            except:
                print(i, paper.get('doi'))
                broken_results.append(paper)
        return papers

    def download_paper(self, papers_metadata: List[dict], download_dir="downloads"):
        """Download the PDF of a paper using its metadata.

        Args:
            paper_metadata (dict): Metadata of the paper containing the URL.
            download_dir (str): Directory to save downloaded papers.
        """
        # Ensure download directory exists
        os.makedirs(download_dir, exist_ok=True)
        if not isinstance(papers_metadata, list):
            papers_metadata = [papers_metadata]

        for paper_metadata in papers_metadata:
            pdf_url = None

            if 'open_access' in paper_metadata:
                if 'oa_url' in paper_metadata['open_access']:
                    if paper_metadata['open_access'].get('oa_url') is not None:
                        stats.append(True)
                        pdf_url = paper_metadata['open_access']['oa_url'].replace('abs', 'pdf')
  
            # Try to extract a URL for the PDF
            # Typically, look for the 'doi' or other links in the paper_metadata

            if not pdf_url and paper_metadata.get('primary_location'):
                if paper_metadata.get('primary_location').get("pdf_url"):
                    pdf_url = paper_metadata.get('primary_location').get('pdf_url')
            if not pdf_url:
                pdf_url = paper_metadata.get('doi')  # Use DOI if available

            if not pdf_url:
                # If DOI is not available, you might look for other links in the metadata
                pdf_url = paper_metadata.get('url_for_pdf')  # Replace with correct key if available

            if pdf_url:
                # You might need to replace this with the actual PDF URL
                logging.debug(f"> Downloading PDF from {pdf_url}")

                try:
                    pdf_response = requests.get(pdf_url)
                    pdf_response.raise_for_status()  # Raise error for bad responses

                    # Create a filename based on the title or DOI
                    title = paper_metadata.get('display_name', 'unknown_paper').replace('/', '_')
                    filename = f"{title}.pdf"
                    file_path = os.path.join(download_dir, filename)

                    with open(file_path, 'wb') as f:
                        f.write(pdf_response.content)

                    logging.debug(f"> Successfully downloaded {filename}")
                except requests.RequestException as e:
                    logging.error(f"Error downloading PDF: {e}")
            else:
                logging.warning("No downloadable PDF URL found for this paper.")

    def extract_abstract(self, abstract_inverted_index):
        """Convert an abstract_inverted_index into a readable text format.
    
        Args:
            abstract_inverted_index (dict): The abstract inverted index from OpenAlex.
    
        Returns:
            str: The reconstructed abstract as a plain text string.
        """
        if not abstract_inverted_index:
            return "No abstract available"
        # Initialize a list to hold the words based on their positions
        abstract_length = max(max(pos) for pos in abstract_inverted_index.values()) + 1
        abstract_words = [""] * abstract_length

        # Place each word into the correct position
        for word, positions in abstract_inverted_index.items():
            for pos in positions:
                abstract_words[pos] = word

        # Join the words to form the complete abstract text
        return ' '.join(word for word in abstract_words if word)


#USAGE EXAMPLE 
"""
r = OpenAlexReader()
metadata = r.get_papers("Transformers", max_results=1)
r.download_paper(metadata)
"""
!rm -rf downloads
r = OpenAlexReader()
metadata = r._fetch_openalex_results(search_query="Transformers", max_results=100)
r.download_paper(metadata)

ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/2020.emnlp-demos.6.pdf
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/P19-1285.pdf
ERROR:root:Error downloading PDF: 403 Client Error: Forbidden for url: https://digital-library.theiet.org/content/journals/10.1049/piee.1966.0236
ERROR:root:Error downloading PDF: 403 Client Error: Forbidden for url: https://ojs.aaai.org/index.php/AAAI/article/download/17325/17132
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/2020.emnlp-demos.6.pdf
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9710703/
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/868049/
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclwe

In [61]:
reader = SimpleDirectoryReader('downloads', required_exts=[".pdf"])
documents = reader.load_data()
filenames = set([documents[i].metadata['file_name'] for i in range(len(documents))])
len(os.listdir('downloads')), len(filenames), len(metadata)



Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/A survey of transformers.pdf with error: RetryError[<Future at 0x300081c10 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Experimental thermodynamic evaluation for a single stage heat transformer prototype build with commercial PHEs.pdf with error: RetryError[<Future at 0x3008c7c10 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Inductors and Transformers for Power Electronics.pdf with error: RetryError[<Future at 0x177b6e7d0 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Loss of Life Transformer Prediction Based on Stacking Ensemble Improved by Genetic Algorithm By IJISRT.pdf with error: RetryError[<Future at 0x300072310 state=finished raised PdfStreamError>]. Skipping...




Failed to load file /Users/aapoliakova/PycharmProjects/test_hybdrid_search_engines/downloads/Transformer and Inductor Design Handbook.pdf with error: RetryError[<Future at 0x179180790 state=finished raised PdfStreamError>]. Skipping...


(68, 63, 100)

## with this approach 68 files downloaded, 63 is parsable to pdf 

# Please don't pay attention at the next section, is just some code for testing 

In [46]:
stats = []
urls = []
broken = []

for i, paper in enumerate(metadata):
    if 'open_access' in paper:
        if 'oa_url' in paper['open_access']:
            if paper['open_access'].get('oa_url') is not None:
                stats.append(True)
                urls.append(paper['open_access']['oa_url'].replace('abs', 'pdf'))
            else:
                broken.append(paper)
    else:
        stats.append(False)
        broken.append(paper)

In [47]:
is_open_repo = []
for i in range(len(broken)):
    ans = broken[i]['open_access']['any_repository_has_fulltext']
    is_open_repo.append(ans)

In [48]:
from collections import Counter

Counter(stats)

Counter({True: 75})

In [93]:
len(metadata)

100

In [51]:
metadata[0]

{'id': 'https://openalex.org/W2896457183',
 'doi': 'https://doi.org/10.48550/arxiv.1810.04805',
 'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding',
 'display_name': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding',
 'relevance_score': 8320.475,
 'publication_year': 2018,
 'publication_date': '2018-01-01',
 'ids': {'openalex': 'https://openalex.org/W2896457183',
  'doi': 'https://doi.org/10.48550/arxiv.1810.04805',
  'mag': '2896457183'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://arxiv.org/abs/1810.04805',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'is_core': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['ht

In [61]:
import os
import random
import ssl
import requests
from urllib.parse import urlparse

# Constants
BASE_DOWNLOAD_PATH = ""  # Change to your path
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
]

HEADER = lambda url: {
    "User-Agent": random.choice(USER_AGENTS),
    "Referer": url,
}


def download_pdf(url, uuid):
    """
    Downloads a PDF from a given URL and saves it to the local directory with a unique name.
    
    Args:
        url (str): The URL of the PDF file.
        uuid (str): A unique identifier for the file (used for naming).
        
    Returns:
        str: The path where the file is saved, or 'error' if it failed.
    """
    try:
        response = requests.get(url, headers=HEADER(url), stream=True)
        if response.status_code == 200:
            uuid = urlparse(uuid).path.replace(r"/", "")
            filename = os.path.join(BASE_DOWNLOAD_PATH, f"{uuid}.pdf")
            file_number = 1
            while os.path.exists(filename):
                filename = os.path.join(BASE_DOWNLOAD_PATH, f"{uuid}_{file_number}.pdf")
                file_number += 1
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"Saved {filename}")
            return filename
        else:
            print(f"Failed to download {url}. Status code: {response.status_code}")
            return "error"
    except Exception as e:
        print(f"An error occurred while downloading {url}: {e}")
        return "error"


def download_from_queries(queries):
    """
    Downloads PDFs from a list of queries (URLs) and saves them locally.
    
    Args:
        queries (list of tuples): A list of (url, uuid) pairs to download PDFs.
        
    Returns:
        list: A list of file paths where the PDFs were saved.
    """
    output = []
    for uuid, url in queries:
        print(f"Downloading: {url}")
        filename = download_pdf(url, uuid)
        if filename != "error":
            output.append(filename)

    return output


# Example usage
if __name__ == "__main__":
    # Example list of queries with (url, uuid)
    # queries = [
    #     ("https://example.com/sample1.pdf", "sample1"),
    #     ("https://example.com/sample2.pdf", "sample2"),
    #    
    # ]
    queries = Counter(dois).items()

    results = download_from_queries(queries)
    print("Downloaded files:", results)


Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 7
An error occurred while downloading 7: Invalid URL '7': No scheme supplied. Perhaps you meant https://7?
Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 1
An error occurred while downloading 1: Invalid URL '1': No scheme supplied. Perhaps you meant https://1?
Downloading: 1
An error occurred while d

In [95]:
download_pdf(urls[0], "sample1")

Saved sample1_4.pdf


'sample1_4.pdf'

In [11]:
r = OpenAlexReader()
metadata = r.get_papers("Transformers", max_results=100)
# r.download_paper(metadata)

3 None
13 None
33 None
37 None
57 None
81 None
89 None


In [15]:
requests.get(metadata[0]['id'])

<Response [200]>

In [25]:
len(os.listdir('downloads'))

35

In [19]:
len(metadata)

7

In [21]:
out = r._fetch_openalex_results("Transformers", max_results=100)

In [24]:
os.listdir()

ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9710580/
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/2020.emnlp-demos.6
ERROR:root:Error downloading PDF: 406 Client Error: Not Acceptable for url: https://www.aclweb.org/anthology/P19-1285
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/726791/
ERROR:root:Error downloading PDF: 403 Client Error: Forbidden for url: https://digital-library.theiet.org/content/journals/10.1049/piee.1966.0236
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9709990/
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9578646/
ERROR:root:Error downloading PDF: 418 Client Error: Unknown Code for url: https://ieeexplore.ieee.org/document/9607618/
ERROR:roo

In [19]:
import os
import pandas as pd
from llama_index.core import SimpleDirectoryReader


def parse_pdfs_to_text_and_save(input_folder, output_type='text'):
    """
    Parse all PDF files in a given folder into text and save them either as .txt files or in a pandas DataFrame.

    Args:
        input_folder (str): The folder containing PDF files.
        output_type (str): The type of output ('text' for text files, 'df' for DataFrame). Default is 'text'.
    
    Returns:
        If output_type='df', returns a pandas DataFrame with filenames and text content.
        Otherwise, saves .txt files for each PDF in the same folder.
    """
    # Read PDFs from the directory using llamaindex's SimpleDirectoryReader
    reader = SimpleDirectoryReader(input_folder, required_exts=[".pdf"])
    documents = reader.load_data()

    data = []

    # Iterate over each document
    for doc in documents:
        text_content = doc.text
        pdf_name = doc.metadata['file_name']

        # If saving as text files
        if output_type == 'text':
            text_file_name = os.path.join(input_folder, pdf_name.replace('.pdf', '.txt'))
            with open(text_file_name, 'w', encoding='utf-8') as f:
                f.write(text_content)
            print(f"Saved {text_file_name}")

        # Collect data for DataFrame
        data.append({'filename': pdf_name, 'content': text_content})

    # If returning as DataFrame
    if output_type == 'df':
        df = pd.DataFrame(data)
        return df


# Example usage:
input_folder = 'downloads'  # Replace with your actual downloads folder path
df = parse_pdfs_to_text_and_save(input_folder, output_type='df')
df.to_csv('test_data_with_papers.csv')
print(df.head())

                                            filename  \
0  A Precise Low Temperature dc Ratio Transformer...   
1  A Precise Low Temperature dc Ratio Transformer...   
2  A Precise Low Temperature dc Ratio Transformer...   
3  A Precise Low Temperature dc Ratio Transformer...   
4  A Precise Low Temperature dc Ratio Transformer...   

                                             content  
0  remote sensing  \nArticle\nVision Transformers...  
1  Remote Sens. 2021 ,13, 516 2 of 19\neasier to ...  
2  Remote Sens. 2021 ,13, 516 3 of 19\net al. [ 3...  
3  Remote Sens. 2021 ,13, 516 4 of 19\nposition e...  
4  Remote Sens. 2021 ,13, 516 5 of 19\nis encoded...  


30


In [34]:
import pdfplumber


def extract_text_pdfplumber(pdf_path):
    """Extract text from a PDF using pdfplumber."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text()
            return text
    except Exception as e:
        print(f"Failed to extract {pdf_path}: {e}")
        return None


def parse_pdfs_to_text_and_save(input_folder, output_type='text'):
    reader = SimpleDirectoryReader(input_folder, required_exts=[".pdf"])
    data = []
    skipped_files = []

    documents = reader.load_data()
    for doc in documents:
        try:
            text_content = doc.text
            pdf_name = doc.metadata['file_name']
        except Exception as e:
            print(f"LlamaIndex failed to load {doc.metadata['file_name']} with error: {e}")
            # Fallback to pdfplumber
            pdf_path = os.path.join(input_folder, doc.metadata['file_name'])
            text_content = extract_text_pdfplumber(pdf_path)
            if text_content is None:
                skipped_files.append(doc.metadata['file_name'])
                continue

        if output_type == 'text':
            text_file_name = os.path.join(input_folder, pdf_name.replace('.pdf', '.txt'))
            with open(text_file_name, 'w', encoding='utf-8') as f:
                f.write(text_content)
            print(f"Saved {text_file_name}")

        data.append({'filename': pdf_name, 'content': text_content})

    if output_type == 'df':
        df = pd.DataFrame(data)
        return df

    print(f"Skipped files: {skipped_files}")


In [None]:
df = parse_pdfs_to_text_and_save('downloads', output_type='df')