### Elsevier Article Retrieval API documentation: https://dev.elsevier.com/documentation/ArticleRetrievalAPI.wadl

In [None]:
# https://api.elsevier.com/content/article/doi/{doi}

In [1]:
import pandas as pd
import requests
import os
import xmltodict
import json
from credentials import keys
import csv

#### Remove Duplicate Items:

In [2]:
# Input: raw search results; output: the same table with one row per DOI
file_path = 'sciencedirect_search_results.csv'
cleaned_file_path = 'elsevier_search_results_cleaned.csv'

df = pd.read_csv(file_path)

# Report the size before de-duplication so the two counts can be compared
print(f"Original number of rows: {len(df)}")

# Flag every row whose DOI was already seen further up, then keep the rest
# (i.e. the first occurrence of each DOI survives)
is_repeat = df.duplicated(subset=['DOI'], keep='first')
df_cleaned = df[~is_repeat]

print(f"Number of rows after removing duplicates: {len(df_cleaned)}")

# Persist the de-duplicated table without the pandas index column
df_cleaned.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to {cleaned_file_path}")

Original number of rows: 8700
Number of rows after removing duplicates: 8587
Cleaned data saved to elsevier_search_results_cleaned.csv


#### Download Files:

In [9]:
# Read the de-duplicated search results
df = pd.read_csv('elsevier_search_results_cleaned.csv')

# Folder that receives one JSON file per article
download_dir = 'downloaded_articles'
os.makedirs(download_dir, exist_ok=True)

# Rows without a DOI cannot be requested, so drop them (and blank strings)
# before counting; this avoids pointless requests for ".../doi/nan".
doi_series = df['DOI'].dropna().astype(str).str.strip()
dois = doi_series[doi_series != ''].tolist()

# Total number of articles to fetch
total_articles = len(dois)

# Number of articles available on disk (fetched now or by an earlier run)
downloaded_articles = 0

# (doi, reason) pairs for every article that could not be fetched
failed_downloads = []

# (connect, read) timeouts in seconds: without a timeout a stalled
# connection would block the loop forever.
request_timeout = (10, 60)

# Set to False to force re-downloading files that already exist
skip_existing = True

# A Session reuses the underlying connection and carries the headers
# for every request instead of rebuilding them per DOI.
with requests.Session() as session:
    session.headers.update({
        "X-ELS-APIKey": keys["els-apikey"],
        "X-ELS-Insttoken": keys["els-inst-token"],
        "Accept": "application/json"
    })

    for doi in dois:
        url = f"https://api.elsevier.com/content/article/doi/{doi}"
        # '/' is a path separator, so it cannot appear in the file name
        filename = doi.replace('/', '_') + '.json'
        file_path = os.path.join(download_dir, filename)

        if skip_existing and os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
            # Already fetched by a previous (possibly interrupted) run
            downloaded_articles += 1
        else:
            try:
                response = session.get(url, timeout=request_timeout)
            except requests.RequestException as exc:
                # A single network failure must not abort the whole batch
                failed_downloads.append((doi, str(exc)))
            else:
                if response.status_code == 200:
                    # Save the raw response body exactly as the API returned it
                    with open(file_path, 'w', encoding='utf-8') as file:
                        file.write(response.text)
                    # Count the article only once the file is on disk
                    downloaded_articles += 1
                else:
                    failed_downloads.append((doi, f"HTTP {response.status_code}"))

        # Use '\r' to return to the start of the line and 'end=""' to prevent new line.
        # Flush to ensure it's displayed immediately.
        print(f"\rDownloaded: {downloaded_articles}/{total_articles}", end='', flush=True)

# Adding a new line at the end of the process so following output starts on a fresh line.
print("\nAll articles processed.")

if failed_downloads:
    # The details stay available in `failed_downloads` for inspection or retry
    print(f"{len(failed_downloads)} article(s) could not be downloaded.")

Downloaded: 22/8033

KeyboardInterrupt: 

#### Delete Error Files

In [11]:
def delete_small_json_files(directory_path, min_size_kb=6):
    """Delete undersized ``.json`` files from a directory.

    Files below the size threshold are presumably error payloads rather than
    full articles (NOTE(review): the 6 KB default is an empirical cut-off
    taken from the original cell; confirm it against real error responses).

    Args:
        directory_path: Directory to scan (not recursive).
        min_size_kb: Files strictly smaller than this many kilobytes
            (1 KB = 1024 bytes) are deleted.

    Returns:
        The names of the deleted files, in alphabetical order.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist.
    """
    size_limit_bytes = min_size_kb * 1024
    deleted = []
    # Sorted so the log output and return value are deterministic
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        # isfile() guards against a directory that happens to end in '.json'
        if os.path.isfile(file_path) and os.path.getsize(file_path) < size_limit_bytes:
            os.remove(file_path)
            print(f"Deleted {filename} because it was smaller than {min_size_kb}KB.")
            deleted.append(filename)
    return deleted


# Relative folder name matching the one the download step writes to, instead
# of an absolute, machine-specific path that only exists on one computer.
directory_path = 'downloaded_articles'

if os.path.isdir(directory_path):
    deleted_files = delete_small_json_files(directory_path)
    print(f"Removed {len(deleted_files)} undersized file(s) from {directory_path}.")
else:
    print(f"Directory not found: {directory_path}")