Import the OpenAI API client. Use your own API key!

In [None]:
from openai import OpenAI
# Use your own api key here!
# The empty string below is a placeholder: replace it with your key before
# running any cell that calls the API through `client`.
client = OpenAI(api_key="")

From your search, extract the DOIs and add the list as a text file (one DOI per line) in the same directory as this notebook

In [None]:
# INPUT: path to a plain-text file containing one DOI per line
doi_file = ".txt"

with open(doi_file) as f:
    # Strip all surrounding whitespace (not only '\n', so Windows '\r' and
    # stray spaces are removed too) and drop blank lines: an empty DOI would
    # otherwise be looked up as the bare "https://doi.org/" URL.
    stripped_lines = (line.strip() for line in f)
    doi_list = [doi for doi in stripped_lines if doi]

Load the function that fetches the web page for a given DOI and extracts its text with Beautiful Soup

In [None]:
from bs4 import BeautifulSoup
import requests

def get_data(doi, timeout=30):
    """Fetch the page a DOI resolves to and return the text of its body.

    Args:
        doi: DOI string; it is appended to ``https://doi.org/``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the whole run indefinitely.

    Returns:
        The text of the page's ``<body>`` with every run of whitespace
        collapsed to a single space.

    Raises:
        AttributeError: If the parsed page has no ``<body>`` element.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = "https://doi.org/" + doi

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep the exception type callers already handle, but make the cause
    # explicit instead of failing on ``None.get_text``.
    if soup.body is None:
        raise AttributeError(f"no <body> element found at {url}")

    # Join with a space: joining with '' would fuse all words together and
    # leave the model with unreadable text.
    return ' '.join(soup.body.get_text().split())

Set up the API call to analyse the extracted webpage

Edit as necessary the following inputs:
1. The preferred output format
2. The prompts for the system
3. OpenAI model to use

In [None]:
from pydantic import BaseModel

# INPUT: OpenAI preferred output format
# This model is passed as `response_format` in `analyse`, so the field names
# below define the keys of the JSON object requested from the model.
# Every field is a free-text string; add, remove or rename fields to change
# what is extracted. `process_data` reads the 'title' key, so keep that one.
class PaperExtraction(BaseModel):
    title: str
    population: str
    intervention: str
    control: str
    outcomes: str

# INPUT: Edit prompts and Choose model
# `system_message` is sent verbatim as the "system" role content in `analyse`.
# `process_data` compares the extracted title with the literal 'NO DATA', so
# keep that wording in the prompt if you edit it.
# `analyse` captures this value as a default argument when it is defined, so
# re-run the whole cell after editing the prompt.
# NOTE(review): the prompt reads "You are an assisting a researcher" -
# probably a typo; left unchanged here because it is runtime text.
system_message = """
    You are an assisting a researcher working on a systematic review. 
    For the given article extract the data in the requested format. 
    If no data is available respond NO DATA to all fields.
    """
# Model name passed as `model` to the chat completion call in `analyse`.
chat_gpt_model = "gpt-4o"

# Send extracted page text to the model and return its structured reply
def analyse(soup_content, system_message=system_message):
    """Ask the configured model to extract paper data from page text.

    Args:
        soup_content: Text of the web page to analyse.
        system_message: System prompt for the request; defaults to the
            module-level ``system_message`` as it was when this function
            was defined.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": "Content:\n " + soup_content}

    completion = client.beta.chat.completions.parse(
        model=chat_gpt_model,
        messages=[system_turn, user_turn],
        response_format=PaperExtraction,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

Analyse the data using the below code chunk

In [None]:
from tqdm import tqdm # to get progress bar

# Maps each DOI to the model's raw reply, or to None when nothing could be
# extracted for it (those DOIs are picked up again by `process_data`).
compiled_data = {}

for doi in tqdm(doi_list):
    try:
        compiled_data[doi] = analyse(get_data(doi))
    except AttributeError:
        # e.g. the fetched page had no <body> to read text from
        compiled_data[doi] = None
    except requests.RequestException as err:
        # One unreachable DOI (connection error, timeout, bad URL) must not
        # abort the whole batch: record it as empty and carry on.
        tqdm.write(f"{doi}: request failed ({err})")
        compiled_data[doi] = None

Process the data into an acceptable format and check DOIs with empty data

In [None]:
import json

def process_data(data=None):
    """Parse raw model replies in place and list the DOIs still lacking data.

    String values are decoded from JSON and replaced by the decoded object.
    Values that are already decoded (for example from a previous call) are
    left as they are, so the function is safe to call repeatedly.

    Args:
        data: Mapping of DOI to raw reply (JSON string), decoded reply
            (dict) or None. Defaults to the module-level ``compiled_data``.

    Returns:
        List of DOIs whose value is missing, is not valid JSON, is not a
        JSON object, or whose 'title' is the literal 'NO DATA'. The length
        of this list is also printed.
    """
    if data is None:
        data = compiled_data

    no_data = []
    # Only values of existing keys are reassigned, so mutating the mapping
    # while iterating over it is safe.
    for doi, record in data.items():
        if isinstance(record, str):
            try:
                record = json.loads(record)
            except json.JSONDecodeError:
                # Keep the unparsable reply for inspection, but report the
                # DOI so it can be retried.
                no_data.append(doi)
                continue
            data[doi] = record

        if not isinstance(record, dict) or record.get('title') == 'NO DATA':
            no_data.append(doi)

    print(len(no_data))
    return no_data

# DOIs that still have no extracted data; `process_data` also prints how many.
no_data = process_data()

# Re run previous cell as necessary

OPTIONAL: setting up selenium to bypass login

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Function to extract data using selenium
def get_selenium_data(doi, driver):
    """Open a DOI in a browser, wait for the user, then return the body text.

    The browser is pointed at the DOI's page and the function blocks on a
    console prompt so the user can log in or navigate first.

    Args:
        doi: DOI string; it is appended to ``https://doi.org/``.
        driver: Selenium WebDriver instance used to load the page.

    Returns:
        The text of the page's ``<body>`` with every run of whitespace
        collapsed to a single space.

    Raises:
        AttributeError: If the page source has no ``<body>`` element.
    """
    url = "https://doi.org/" + doi
    driver.get(url)

    # Keep prompting until the user submits an empty line (plain Enter);
    # any typed text just repeats the prompt.
    while input("Enter to extract data"):
        pass

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Keep the exception type callers already handle, but make the cause
    # explicit instead of failing on ``None.get_text``.
    if soup.body is None:
        raise AttributeError(f"no <body> element found at {url}")

    # Join with a space: joining with '' would fuse all words together.
    return ' '.join(soup.body.get_text().split())


In [None]:
driver = webdriver.Chrome()

# Retry every DOI that produced no data, this time through a real browser.
try:
    for doi in tqdm(no_data):
        try:
            compiled_data[doi] = analyse(get_selenium_data(doi, driver))
        except AttributeError:
            # Still nothing readable on the page: leave the entry as it was
            continue
finally:
    # quit() ends the whole browser session (close() only closes the current
    # window), and `finally` guarantees this also happens when the loop is
    # interrupted or raises.
    driver.quit()

# Parse the new replies and refresh the list of DOIs still lacking data so
# this cell can be re-run on the remainder.
no_data = process_data()

Store data into a csv file using pandas

In [None]:
import pandas as pd

# INPUT: name of the output file, without the '.csv' extension
csv_file_name = 'extracted_data'

# `compiled_data` is keyed by DOI, so the frame is built with one column per
# DOI and then transposed to get one row per DOI before writing it out.
df = pd.DataFrame(compiled_data).T
df.to_csv(f"{csv_file_name}.csv")