In [None]:
import requests
import yaml
import pandas as pd
import json
# Read the project configuration (YAML) from the parent directory.
with open('../config.yaml', 'r') as config_stream:
    config = yaml.safe_load(config_stream)

# Elsevier API key; retrieve_full_text uses it as its default credential.
elsevier_settings = config['elsevier_api']
api_key = elsevier_settings['api_key']

In [None]:
# Replace with a valid article DOI or EID for testing
# article_doi = "10.1016/j.trc.2023.104311"
def retrieve_full_text(article_doi, save_folder, api_key=api_key, timeout=30):
    """Download an article's full-text XML from the Elsevier API and return its abstract.

    On an HTTP 200 response the raw XML is written to
    ``{save_folder}/{article_doi with '/' replaced by '_'}.xml``, the article
    title is printed, and the text of the first ``dc:description`` element is
    returned. Every failure path (invalid DOI, non-200 status, network error,
    unparsable XML) prints a message and returns ``"N/A"`` instead of raising,
    so a batch run over many DOIs is not aborted by a single bad article.

    Args:
        article_doi: DOI of the article, e.g. "10.1016/j.trc.2023.104311".
        save_folder: Directory the XML file is written into; it must already exist.
        api_key: Elsevier API key sent in the ``X-ELS-APIKey`` header.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The text of the ``dc:description`` element, or ``"N/A"`` when the
        element is absent or the article could not be retrieved.
    """
    from xml.etree import ElementTree as ET

    # Sentinel returned on every path that does not yield an abstract. Binding it
    # up front avoids an UnboundLocalError at the final return statement.
    abstract_text = "N/A"

    # A missing DOI (e.g. NaN coming from a pandas column) cannot be looked up,
    # so skip the API call entirely.
    if not isinstance(article_doi, str) or not article_doi.strip():
        print(f"Skipping invalid DOI: {article_doi!r}")
        return abstract_text

    # API Endpoint for Article Retrieval
    url = f"https://api.elsevier.com/content/article/doi/{article_doi}"

    # Headers including API Key; ask for the XML representation
    headers = {
        "X-ELS-APIKey": api_key,
        "Accept": "application/xml",
    }

    # Optional: Specify parameters (e.g., view level)
    params = {
        "view": "FULL",  # Options: "REF", "FULL", "META"
    }

    # Dublin Core namespace used by the title / description elements
    dc_namespace = {'dc': 'http://purl.org/dc/elements/1.1/'}

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        if response.status_code == 200:
            print("Full Text Data Retrieved Successfully!")

            # Save the raw XML bytes; '/' in the DOI is not valid inside a file name
            unique_id = article_doi.replace('/', '_')
            with open(f'{save_folder}/{unique_id}.xml', 'wb') as file:
                file.write(response.content)

            # Extract abstract and title from the XML response
            root = ET.fromstring(response.content)
            abstract = root.find('.//dc:description', namespaces=dc_namespace)
            if abstract is not None:
                abstract_text = abstract.text
            title = root.find('.//dc:title', namespaces=dc_namespace)
            title_text = title.text if title is not None else "N/A"
            print(f"Title: {title_text}\n")

        elif response.status_code == 403:
            print("Access Denied. Check if you have proper entitlements.")
        else:
            print(f"Failed to retrieve full text. Status Code: {response.status_code}")
            print("Response:", response.text)
    except Exception as e:
        # Best-effort by design: report the problem and fall through to the sentinel.
        print(f"An error occurred: {e}")
    return abstract_text  # Extracted abstract, or "N/A" on any failure

In [None]:
# For every journal CSV in the journal-meta folder, download the full text of
# each article dated 2019 or later and write the metadata plus abstract to a
# per-journal CSV in the full-text folder.
import os
folder = '../journal-meta'
full_text_folder = '../journal-full-text'
# os.listdir returns entries in arbitrary order; sort for a reproducible run order.
csv_files = sorted(file for file in os.listdir(folder) if file.endswith('.csv'))
for csv_file in csv_files:
    # The part of the file name before the first '_' names the journal.
    journal_info = csv_file.split('_')[0]
    # Per-journal folder that receives the downloaded XML files.
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    save_folder = os.path.join(full_text_folder, journal_info)
    os.makedirs(save_folder, exist_ok=True)
    article_info = pd.read_csv(os.path.join(folder, csv_file))
    # First four-digit run in the date string is taken as the year.
    article_info['year'] = article_info['date'].str.extract(r'(\d{4})', expand=False)
    # get the month from the format of YYYY-MM-DD
    article_info['month'] = article_info['date'].str.extract(r'-(\d{2})-', expand=False)
    # Four-digit year strings order lexicographically the same as numerically;
    # rows with no year compare False and are dropped. The explicit copy makes
    # the column assignment below act on an independent frame rather than on a
    # slice of article_info (avoids SettingWithCopyWarning).
    article = article_info[article_info['year'] >= '2019'].copy()
    # Iterating the 'doi' column (instead of a row-wise DataFrame.apply) also
    # works when no rows survive the filter, producing an empty column.
    article['abstract'] = [retrieve_full_text(doi, save_folder) for doi in article['doi']]
    article.to_csv(os.path.join(full_text_folder, journal_info + '.csv'), index=False)