In [None]:
from dotenv import load_dotenv
import os
import requests
import json
import pprint
import time

load_dotenv()

basic = "https://api.core.ac.uk"


session = requests.Session()

def robust_request(url, max_retries=5, timeout=60):
    retries = 0
    while retries < max_retries:
        try:
            response = session.get(url, stream=True, timeout=timeout)
            return response
        except (requests.exceptions.ChunkedEncodingError, requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
            print(f"Error: {e}. Retrying {retries + 1}/{max_retries}...")
            retries += 1
            time.sleep(5 * retries)
            continue
    print("Failed to retrieve data after multiple attempts.")
    return None


def save_pdf_for_paper(paper):
    paper_id = paper['id']

    if paper['links'][0]['type'] == 'download':
        pdf_url = paper['links'][0]['url']
        
        pdf_response = robust_request(pdf_url)
        if pdf_response:
            if pdf_response.status_code == 200:
                filename = f"./retrieved_papers/{str(paper_id).replace('/', '_')}.pdf"
                with open(filename, "wb") as pdf_file:
                    try:
                        for chunk in pdf_response.iter_content(chunk_size=8192):  # 8KB chunks
                            if chunk:  # filter out keep-alive chunks
                                pdf_file.write(chunk)
                    except requests.exceptions.ChunkedEncodingError as e:
                        print(f"ChunkedEncodingError for {paper_id}: {e}")
                    except requests.exceptions.ConnectionError as e:
                        print(f"ConnectionError for {paper_id}: {e}")
                    finally:
                        pdf_response.close()  # Explicitly close the response
                print(f"PDF Saved {paper_id}")

                # Save JSON
                save_as_json = json.dumps(paper, indent=4)
                with open(filename.replace(".pdf", ".json"), "w") as json_file:
                    json_file.write(save_as_json)
            else:
                print(f"Error downloading PDF: {pdf_response.status_code}")


def search_entities(entity_type, query, offset, limit, stats):
    endpoint = f"/v3/search/{entity_type}/"
    url = basic + endpoint
    headers = {
        "Authorization": f"Bearer {os.getenv('API_KEY')}"
    }
    params = {
        "q": query,
        "offset": offset,
        "limit": limit,
        "stats": stats
    }

    response = requests.get(url, headers=headers, params=params)

    # Check if the response is valid JSON
    if response.status_code == 200:
        try:
            return response.json()
        except json.JSONDecodeError:
            print("Error: Unable to decode JSON response")
            return {}
    else:
        print(f"Error: Received status code {response.status_code}")
        return response.status_code

def collect_data(query, pulls, runs):
    data = []
    current_run = 1
    while current_run <= runs:
        data = []
        print(f"Pulling data for run {current_run}/{runs}")
        while True:
            results = search_entities(
                entity_type="works",
                query=query,
                offset=(current_run*pulls),
                limit=pulls,
                stats=False
            )
            if isinstance(results, int) and results == 429:
                print("Error 429: Too Many Requests. Waiting 60 seconds before retrying...")
                time.sleep(60)
            else:
                break
        if isinstance(results, dict):
        
            #data.extend(results.get('results', []))
            print(f"Number of papers retrieved in this pull: {len(results.get('results', []))}/{pulls}")
            # save the pdfs associated with the papers id
            results = results.get('results', [])
            for paper in results:
                save_pdf_for_paper(paper)


        current_run += 1
    print(f"Retrieved {len(data)} papers - Offset: {current_run*pulls}")
    return data

data = collect_data(query="oceanographic OR oceanography", pulls=10, runs=10000)
pprint.pprint(data)


Pulling data for run 1/10000
Number of papers retrieved in this pull: 10/10
PDF Saved 2134507
PDF Saved 2134467
PDF Saved 164780042
PDF Saved 2134357
PDF Saved 77578576
PDF Saved 2134542
PDF Saved 77578534
PDF Saved 2134317
PDF Saved 2135094
PDF Saved 2137913
Pulling data for run 2/10000
Number of papers retrieved in this pull: 10/10
PDF Saved 24896951
PDF Saved 77580449
PDF Saved 2134397
PDF Saved 2134977
PDF Saved 2134291
PDF Saved 2134345
PDF Saved 164780051
PDF Saved 43609326
PDF Saved 2134309
PDF Saved 62534195
Pulling data for run 3/10000
Number of papers retrieved in this pull: 10/10
PDF Saved 4474318
PDF Saved 45228705
PDF Saved 2039634
PDF Saved 28318002
PDF Saved 28317997
PDF Saved 216505
PDF Saved 33902089
PDF Saved 28317952
PDF Saved 28317861
PDF Saved 63177924
Pulling data for run 4/10000
Number of papers retrieved in this pull: 10/10
PDF Saved 125175824
PDF Saved 28317958
PDF Saved 24840768
PDF Saved 28318662
PDF Saved 12804915
PDF Saved 45228600
PDF Saved 28316043
PDF Sa

In [37]:
import requests

url = "https://core.ac.uk/download/pdf/77577606.pdf"
pdf_filename = "downloaded_document.pdf"

response = requests.get(url)
if response.status_code == 200:
    with open(pdf_filename, "wb") as f:
        f.write(response.content)
    print(f"PDF downloaded successfully as {pdf_filename}")
else:
    print(f"Failed to download PDF. Status code: {response.status_code}")


PDF downloaded successfully as downloaded_document.pdf


In [None]:
import re
import string
import os
import json

def preprocess_text(text):
    if text is None:
        text = ""
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = text.strip()
    return text

def preprocess_data(data):
    for item in data:
        item['title'] = preprocess_text(item.get('title', ''))
        item['abstract'] = preprocess_text(item.get('abstract', ''))
    return data

def save_preprocessed_data(data, folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    for i, item in enumerate(data):
        file_path = os.path.join(folder_path, f"paper_{i+1}.json")
        with open(file_path, 'w') as f:
            json.dump(item, f, indent=4)

from getPaperAPI import collect_data
data = collect_data(query="oceanographic", pulls=100, runs=10)
preprocessed_data = preprocess_data(data)
save_preprocessed_data(preprocessed_data, folder_path="preprocessed_data")

Pulling data for run 1/1
Number of papers retrieved in this pull: 9/10
Retrieving DOI for paper 26260541...
Failed to retrieve Sci-Hub URL. Using default backup.
Fetching PDF from Sci-Hub: https://sci-hub.se/10.1121/1.4799118
Could not find a valid PDF link for 10.1121/1.4799118 on Sci-Hub.
Retrieving DOI for paper 62534195...
No DOI found for paper 62534195. Cannot fetch from Sci-Hub.
Retrieving DOI for paper 224858...
No DOI found for paper 224858. Cannot fetch from Sci-Hub.
Retrieving DOI for paper 24896951...
No DOI found for paper 24896951. Cannot fetch from Sci-Hub.
Retrieving DOI for paper 2632022...
No DOI found for paper 2632022. Cannot fetch from Sci-Hub.
Retrieving DOI for paper 43609326...
No DOI found for paper 43609326. Cannot fetch from Sci-Hub.
Retrieving DOI for paper 17585103...
No DOI found for paper 17585103. Cannot fetch from Sci-Hub.
Retrieving DOI for paper 33917309...
Failed to retrieve Sci-Hub URL. Using default backup.
Fetching PDF from Sci-Hub: https://sci-hu