### Documentation: https://dev.elsevier.com/documentation/ArticleRetrievalAPI.wadl
Endpoint used in this notebook: `https://api.elsevier.com/content/article/doi/{doi}`

In [1]:
pip install notebook ipykernel


Defaulting to user installation because normal site-packages is not writeable
Collecting notebook
  Downloading notebook-6.5.7-py3-none-any.whl (529 kB)
Collecting nbformat
  Downloading nbformat-5.8.0-py3-none-any.whl (77 kB)
Collecting ipython-genutils
  Downloading ipython_genutils-0.2.0-py2.py3-none-any.whl (26 kB)
Collecting argon2-cffi
  Using cached argon2_cffi-23.1.0-py3-none-any.whl (15 kB)
Collecting nbconvert>=5
  Downloading nbconvert-7.6.0-py3-none-any.whl (290 kB)
Collecting nbclassic>=0.4.7
  Downloading nbclassic-1.2.0-py3-none-any.whl (10.0 MB)
Collecting terminado>=0.8.3
  Downloading terminado-0.17.1-py3-none-any.whl (17 kB)
Collecting prometheus-client
  Downloading prometheus_client-0.17.1-py3-none-any.whl (60 kB)
Collecting Send2Trash>=1.8.0
  Using cached Send2Trash-1.8.3-py3-none-any.whl (18 kB)
Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
Collecting fastjsonschema
  Downloading fastjsonschema-2.21.2-py3-none-any.whl (24 kB)
Collecting 

You should consider upgrading via the 'c:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\python.exe -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
import requests
import os
import xmltodict
import json
from credentials import keys
import csv

In [3]:
# Load the manually checked list of papers
file_path = "csv_output/101_sorted_manual_check/by_reason/papers_KEEP_operational_EE_control.csv"
df = pd.read_csv(file_path)

# Print the original number of rows for comparison
print(f"Original number of rows: {len(df)}")

# Remove duplicates based on the 'DOI' column
df_cleaned = df.drop_duplicates(subset=['DOI'])

# Print the number of rows after removing duplicates
print(f"Number of rows after removing duplicates: {len(df_cleaned)}")

# Save the cleaned DataFrame to a new CSV file
cleaned_file_path = 'elsevier_search_results_cleaned.csv'
df_cleaned.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to {cleaned_file_path}")


df = pd.read_csv(cleaned_file_path)

# Creating a folder to save the downloaded articles
download_dir = 'downloaded_articles'
os.makedirs(download_dir, exist_ok=True)

# Rows without a DOI cannot be requested, so they are dropped before counting.
dois = [str(value).strip() for value in df['DOI'].dropna()]
dois = [doi for doi in dois if doi]

# Getting the total number of articles to download
total_articles = len(dois)

# Initializing the counter for downloaded articles
downloaded_articles = 0

# (doi, reason) pairs for every article that could not be saved
failed_dois = []

# Seconds to wait for the server before giving up on a single request;
# without a timeout one stalled connection would block the whole run.
REQUEST_TIMEOUT_SECONDS = 30

request_headers = {
    "X-ELS-APIKey": keys["els-apikey"],
    "Accept": "application/json",
}

# Iterating through the list of DOIs
for doi in dois:
    url = f"https://api.elsevier.com/content/article/doi/{doi}"
    failure_reason = None

    # Sending the request; a network error is recorded instead of aborting the loop
    try:
        response = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        failure_reason = f"request failed ({exc})"
    else:
        if response.status_code != 200:
            failure_reason = f"HTTP {response.status_code}"
        else:
            # Only keep bodies that really are JSON
            try:
                response.json()
            except ValueError:
                failure_reason = "response body is not valid JSON"

    if failure_reason is None:
        # '/' is not allowed in file names, so it is replaced in the DOI
        article_path = os.path.join(download_dir, doi.replace('/', '_') + '.json')

        # Saving the article data to a file
        with open(article_path, 'w', encoding='utf-8') as file:
            file.write(response.text)

        # Count the article only once it is safely on disk
        downloaded_articles += 1
    else:
        failed_dois.append((doi, failure_reason))

    # Use '\r' to return to the start of the line and 'end=""' to prevent new line. Flush to ensure it's displayed immediately.
    print(f"\rDownloaded: {downloaded_articles}/{total_articles}", end='', flush=True)

# Adding a new line at the end of the process to ensure the command prompt appears correctly after the script finishes.
print("\nAll articles processed.")

# Make failures visible instead of silently skipping them
if failed_dois:
    print(f"{len(failed_dois)} article(s) could not be downloaded:")
    for failed_doi, reason in failed_dois:
        print(f"  {failed_doi}: {reason}")

Original number of rows: 1137
Number of rows after removing duplicates: 1137
Cleaned data saved to elsevier_search_results_cleaned.csv
Downloaded: 1137/1137
All articles processed.


In [None]:
# Delete JSON files whose 'originalText' is not a string (files missing the key are only reported)
import os
import json

def process_json_files(directory):
    """Delete article JSON files whose 'originalText' is not a string.

    Every ``*.json`` file in ``directory`` is loaded and the value at
    ``['full-text-retrieval-response']['originalText']`` is inspected.

    * Value present but not a string: the file is deleted and its name recorded.
    * Key missing: a message is printed, the file is kept.
    * File is not valid JSON, or its structure cannot be indexed: a message is
      printed, the file is kept, and the scan continues with the next file.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        list[str]: Names of the deleted files, in alphabetical order.
    """
    error_files = []

    # Sorted so the log output and the returned list are reproducible
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # One broken file must not abort the whole scan
            print(f"Skipped {filename}: not valid JSON ({exc})")
            continue

        try:
            # Extract the original text from the JSON structure
            text = data['full-text-retrieval-response']['originalText']
        except KeyError as missing:
            # Name the key that is actually absent (outer or inner)
            print(f"KeyError: {missing.args[0]!r} not found in {filename}")
            continue
        except TypeError:
            # e.g. the JSON root is a list, or the response node is not an object
            print(f"Skipped {filename}: unexpected JSON structure")
            continue

        if isinstance(text, str):
            continue

        # Anything other than a string has no usable text
        # (calling .lower() on it would raise AttributeError).
        error_files.append(filename)
        os.remove(file_path)
        print(f"Deleted {filename} due to an AttributeError.")

    return error_files

# Folder whose JSON files are validated.
# NOTE(review): the download and keyword cells use 'downloaded_articles' —
# confirm that 'papers_json' is the intended folder here.
directory = 'papers_json'

# Validate every file and collect the names of the ones that were removed
error_files = process_json_files(directory)

# Report the removed files: a header line followed by one name per line
print("\n".join(["Deleted Error Files:", *error_files]))


Deleted Error Files:


# NLTK

If this is the first time you run this code, download the required NLTK resources first:
```python
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
```

In [None]:
import os
import re
import json
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the tokenizer resources used by nltk.word_tokenize further down
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)


def preprocess_keyword(keyword):
    """Normalise a keyword string and split it into tokens.

    The keyword is lower-cased, every parenthesised part (together with the
    whitespace directly before it) is removed, and the remainder is tokenised
    with ``nltk.word_tokenize``.

    Args:
        keyword: The raw keyword string.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    lowered = keyword.lower()
    # Strip "(...)" groups, e.g. abbreviations given in brackets
    without_parentheses = re.sub(r'\s*\([^)]*\)', '', lowered)
    return nltk.word_tokenize(without_parentheses)

def extract_compound_keywords_from_json(folder_path):
    """Collect multi-word keywords from the article JSON files in a folder.

    For every ``*.json`` file the keyword entries are read from
    ``['full-text-retrieval-response']['coredata']['dcterms:subject']``; the
    text of each entry is taken from its ``'$'`` field and passed through
    ``preprocess_keyword``. Keywords with more than one token are stored.

    Files that cannot be parsed, and files where any of the nodes above is
    missing, null or of an unexpected type, contribute no keywords instead of
    aborting the run.

    Args:
        folder_path: Path of the folder to scan (not recursive).

    Returns:
        dict[str, str]: Maps the space-joined tokens of each compound keyword
        to the same tokens joined with underscores.
    """
    compound_keywords_dict = {}

    # Sorted so the resulting dictionary has a reproducible insertion order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            print(f"Skipping {filename}: could not parse JSON ({exc})")
            continue

        # Walk down the structure; a missing, null or non-object node ends the walk
        node = data
        for key in ('full-text-retrieval-response', 'coredata', 'dcterms:subject'):
            node = node.get(key) if isinstance(node, dict) else None

        # A single keyword object is treated like a one-element list
        # (presumably how the API serialises an article with one keyword — TODO confirm).
        if isinstance(node, dict):
            keywords_data = [node]
        elif isinstance(node, list):
            keywords_data = node
        else:
            keywords_data = []

        for entry in keywords_data:
            if not isinstance(entry, dict):
                continue
            keyword = entry.get('$')
            if not isinstance(keyword, str):
                continue

            processed_words = preprocess_keyword(keyword)
            if len(processed_words) > 1:
                compound_keyword = '_'.join(processed_words)
                normal_keyword = ' '.join(processed_words)
                compound_keywords_dict[normal_keyword] = compound_keyword

    return compound_keywords_dict

def save_dict_as_py(dict_obj, output_file):
    """Write a dictionary to a Python source file.

    The file contains a single assignment, ``compound_keywords = <dict>``,
    followed by a newline, so the dictionary can later be imported from the
    generated module.

    Args:
        dict_obj: The dictionary to serialise (its string form is written).
        output_file: Path of the ``.py`` file to create or overwrite.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write("compound_keywords = {}\n".format(dict_obj))
        
# Python module that will receive the generated mapping
output_file = '_cpwords.py'

# Folder holding the downloaded article JSON files
folder_path = 'downloaded_articles'

# Build the mapping of compound keywords from the article metadata
compound_keywords = extract_compound_keywords_from_json(folder_path)

# Write the mapping to disk and confirm where it went
save_dict_as_py(compound_keywords, output_file)
print(f"Dictionary has been saved to {output_file}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s2589602\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\s2589602\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Dictionary has been saved to compound_keywords.py
