In [1]:
import os
from IPython.display import clear_output

import time

import csv
import requests
import pandas as pd

from tqdm import tqdm  
from pathlib import Path

from bs4 import BeautifulSoup


data_dir = Path('/Volumes/Juan/Data/crossref XML')


In [15]:
df = pd.read_csv(data_dir / '01_raw_data.csv')
df.sample().iloc[0]

indexed                   {'date-parts': [[2022, 4, 2]], 'date-time': '2...
reference-count                                                           0
publisher                             Japan Society for Clinical Anesthesia
license                                                                 NaN
content-domain               {'domain': [], 'crossmark-restriction': False}
short-container-title                                         ['J.J.S.C.A']
abstract                                                                NaN
DOI                                                      10.2199/jjsca.5.84
type                                                        journal-article
created                   {'date-parts': [[2011, 6, 29]], 'date-time': '...
page                                                                  84-87
update-policy                                                           NaN
source                                                             Crossref
is-reference

In [22]:
just_read = pd.read_csv(data_dir / 'doi_file.csv', header=None)

In [24]:
id_list = list(set(id_list).difference(just_read[0].str.lower()))

In [None]:
progress_bar = tqdm(total=20000)

new_dois = set()
i = 0
try:
    with open(data_dir / 'doi_file_2.csv', 'r') as file:
        for line in file:
            i+=1
            progress_bar.update(1)
            if i < 265141:
                continue
            
            # Split each line at the first comma and take the part before it
            doi = line.split(',', 1)[0].strip().lower()
            if doi[:3] != '10.':
                print(doi)
                break

            new_dois.add(doi)


except KeyboardInterrupt:
    print(line)
    raise
            
progress_bar.close()

        



 50%|█████     | 266030/530282 [24:02<12:33, 350.70it/s][A[A

In [20]:
def read_first_column(file_path):
    column_data = []
    with open(file_path, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            if row:  # Checking if the row is not empty
                column_data.append(row[0])  # Appending the first element of each row to the list
    return column_data

def fetch_data(doi: str):
    """Request function to query Crossref API.

    Args:
        doi (str): The DOI of an item, used for querying Crossref API

    Returns:
        JSON: with r.status_code == 200, returns JSON response
        None: r.status_code == 404 will return None as the resource was not found
        function: r.status_code == 504 returns the function to retry the query
    """
    base_url = 'https://doi.crossref.org/search/doi'
    params = {'pid': 'juan@alperin.ca',
             'format': 'unixsd',
             'doi': doi}
    try:
        r = requests.get(base_url, params=params)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content.decode('utf-8').replace('\n', '').replace('\r', ''), 'xml')
            return  ''.join(str(tag) for tag in soup.find_all()).replace('\n', '').replace('\r', '').replace('\t', '')
        elif r.status_code == 404:
            return None  
        elif r.status_code == 504:
            print(r.status_code)
            time.sleep(1)
            return fetch_data(doi)
        else:
            return None
    except Exception as e:
        print(f"Error fetching DOI {doi}: {e}")
        return None
    
def get_crossref(id_list: list):
    """Primary function for querying Crossref API and collecting responses

    Args:
        id_list (list): List of all DOIs to be queried.
    """
    chunk_size = 5000
    tmp = []
    
    print(f"Going after: {len(id_list)}.")
    
    file_path = data_dir / 'doi_file.csv'
    if file_path.is_file():
        print(f"The file {file_path} exists.")
        # cut -d',' -f1 doi_file.csv > dois_read.csv
    else:
        pd.DataFrame(columns=['DOI', 'message']).to_csv(file_path, mode='w', index=False)

#     already_read = pd.read_csv(data_dir / 'dois_read.csv')
#     print(f"Already read: {len(already_read)}.")
#     id_list = list(set(id_list).difference(already_read.DOI.str.lower()))
    print(f"Going after: {len(id_list)}.")
        
    # Record the starting time
    start_time = time.time()
    
   
    with tqdm(total=len(id_list)) as pbar:
        for i, doi in enumerate(id_list):
            try:
                result = fetch_data(doi)
                if result is not None:
                    tmp.append({'DOI': doi, 'message': result})
                    
                if i % chunk_size == 0 or (i+1) == len(id_list):
                    pd.DataFrame(tmp).to_csv(data_dir / 'doi_file.csv', mode='a', index=False, header=False)
                    tmp = []
                    end_time = time.time()
                    if i/3 > (end_time - start_time):
                        pause = i/3 - (end_time - start_time) 
                        print(f"Sleeping: {int(pause)} seconds")
                        time.sleep(pause)

                pbar.update(1)
            except KeyboardInterrupt:
                if len(tmp) > 1: 
                    pd.DataFrame(tmp).to_csv(data_dir / 'doi_file.csv', mode='a', index=False, header=False)                
                raise
            except Exception as err:                
                print(err)

                
                

In [25]:
get_crossref(id_list)

  0%|          | 0/106402 [00:00<?, ?it/s]

Going after: 106402.
Going after: 106402.


100%|██████████| 106402/106402 [13:19:05<00:00,  2.22it/s]   


# Fetching data ends here

In [19]:
import xml.etree.ElementTree as ET

x = df.sample().iloc[0]['message']

In [29]:
x = fetch_data('10.4138/atlgeol.2019.011')

In [2]:
import re

def find_all_xml_tags(xml_string, tag_name):
    pattern = re.compile(r'<{}([^>]*)>(.*?)</{}>'.format(tag_name, tag_name), re.DOTALL)
    matches = re.findall(pattern, xml_string)
    
    tag_list = []
    for attributes, content in matches:
        attributes_dict = dict(re.findall(r'\b(\S+?)\s*=\s*[\'"](.*?)[\'"]', attributes))
        tag_list.append({
            'attributes': attributes_dict,
            'content': content
        })

    return tag_list

# Example usage
xml_string = x
desired_tag_name = 'jats:abstract'

result = find_all_xml_tags(xml_string, desired_tag_name)
for i, match in enumerate(result):
    print("Match {}: {}".format(i + 1, match))

In [4]:
# dois = set()

df = pd.read_csv(data_dir / 'outfile_5.csv', header=None)
dois = set(df[0])
df.sample()

Unnamed: 0,0,1
110498,10.1186/s13102-023-00654-y,"<crossref_result version=""3.0"" xmlns=""http://w..."


In [14]:
def list_files_with_prefix(directory, prefix):
    file_list = [f for f in os.listdir(directory) if f.startswith(prefix) and os.path.isfile(os.path.join(directory, f))]
    return file_list

def reverse_sorted_files_by_mtime(directory, prefix):
    file_list = list_files_with_prefix(directory, prefix)
    file_list.sort(key=lambda x: os.path.getmtime(os.path.join(directory, x)), reverse=True)
    return file_list

# Example usage

file_prefixes = ['4', '3', '2', '1', 'a']


for file_prefix in file_prefixes: 
    clear_output(wait=True)
    sorted_files = reverse_sorted_files_by_mtime(data_dir, "doi_{}".format(file_prefix))
    # Using tqdm with a multiplier for multiple updates per iteration
    with tqdm(total=20000*len(sorted_files), desc="Processing {}".format(file_prefix), unit="item") as pbar:
        with open(data_dir / 'outfile_{}.csv'.format(file_prefix), 'a') as outfile:
            for filename in sorted_files:
                one_new_in_file = False
                
                # Your processing code goes here
                with open(data_dir / filename, 'r') as infile:
                    for line in infile:
                        # Update the progress bar
                        pbar.update(1)
                        
                        # Split each line at the first comma and take the part before it
                        doi = line.split(',', 1)[0].strip().lower()
                        if doi[:3].lower() == 'doi':
                            continue
                        
                        elif doi[:3] != '10.':
                            print("Something went wrong when reading: {}".format(doi))
                            break
                        
                        # skip line if we already have the DOI
                        if doi in dois: 
                            continue

                        # copy the line to a new outfile and record that we already have the DOI
                        outfile.write(line)   
                        dois.add(doi)                            
                        # if we don't find any new DOIs in the whole file, stop checking
                        # the rest of the files in this set
                        one_new_in_file = True

                # if no new DOIs found in the whole file, move onto next prefix
                if one_new_in_file == False:
                    break
                tqdm.write(f"Found dois: {len(dois)}", end="")
                      
        pbar.reset()

Processing a:   3%|▎         | 20000/620000 [01:54<59:10, 168.97item/s]  

Found dois: 354999

Processing a:   3%|▎         | 20925/620000 [02:01<1:17:00, 129.65item/s]

Found dois: 355924

Processing a:   7%|▋         | 40925/620000 [04:15<1:14:16, 129.94item/s]

Found dois: 375924

Processing a:  10%|▉         | 60925/620000 [06:29<57:11, 162.91item/s]  

Found dois: 395924

Processing a:  13%|█▎        | 80925/620000 [08:15<1:02:07, 144.63item/s]

Found dois: 410000

Processing a:   0%|          | 0/620000 [00:00<?, ?item/s]                


In [32]:
root = ET.fromstring(x)


ParseError: junk after document element: line 1, column 5943 (<string>)

In [60]:
for person in soup.crossref.contributors.findChildren():
    print(person)

<person_name contributor_role="author" sequence="first"> <given_name>Mirela</given_name> <surname>Feurdean</surname> </person_name>
<given_name>Mirela</given_name>
<surname>Feurdean</surname>
<person_name contributor_role="author" sequence="additional"> <given_name>Daniel</given_name> <surname>Matassa</surname> </person_name>
<given_name>Daniel</given_name>
<surname>Matassa</surname>
<person_name contributor_role="author" sequence="additional"> <given_name>Mohleen</given_name> <surname>Kang</surname> </person_name>
<given_name>Mohleen</given_name>
<surname>Kang</surname>
<person_name contributor_role="author" sequence="additional"> <given_name>Genevieve</given_name> <surname>Matthews</surname> </person_name>
<given_name>Genevieve</given_name>
<surname>Matthews</surname>
<person_name contributor_role="author" sequence="additional"> <given_name>Neil</given_name> <surname>Kothari</surname> </person_name>
<given_name>Neil</given_name>
<surname>Kothari</surname>
