In [19]:
import os
from pathlib import Path
import crossref_commons.retrieval
from time import sleep
from parsebib import bib2df
from tqdm import tqdm  # Import tqdm for progress tracking

dir_path = os.path.dirname(os.path.realpath(__file__))
os.chdir(dir_path)

RECURSION_DEPTH = 4

DIR_CWD = Path.cwd()
DIR_DATA = DIR_CWD / 'data'
DIR_OUTPUT = DIR_DATA / 'output'
DIR_INPUT = DIR_DATA / 'input'
DIR_TMP = DIR_DATA / 'tmp'

DIRS = [DIR_DATA, DIR_OUTPUT, DIR_INPUT, DIR_TMP]
for DIR in DIRS: DIR.mkdir(exist_ok=True)

FILE_BIB = DIR_INPUT / 'alphabet.bib'
FILE_TXT_OUT = DIR_OUTPUT / 'alphabet.txt'

df = bib2df(FILE_BIB)
if df.empty:
    print('No data found, check input file')
    exit()            

df['doi'] = df['doi'].str.replace(r'\\_', '_', regex=True)
bib_dois = df['doi'].dropna().tolist()

# setup crossref user agent using environmental variables
os.environ['CR_API_AGENT'] = "polite user agent; including mailto:scmm@pm.me"
os.environ['CR_API_MAILTO'] = "scmm@pm.me"

# function to get related dois
def get_related_dois(bib_dois, i=0):
    bib_dois_all = bib_dois.copy()
    bib_dois_extended = bib_dois.copy()
    sleep_duration = 0.1  # Initial sleep duration

    for j, bib_doi in enumerate(tqdm(bib_dois, desc=f'Iteration {i+1}')):
        sleep_duration = max(0, sleep_duration - 0.1*sleep_duration)  # Decrease sleep duration, down to a minimum of 0.5 seconds
        try:
            ref = crossref_commons.retrieval.get_publication_as_json(bib_doi)
            refs = [x['DOI'] for x in ref['reference'] if 'DOI' in x.keys() and x['DOI'] not in bib_dois_extended]
            refs_all = [x['DOI'] for x in ref['reference'] if 'DOI' in x.keys()]
            bib_dois_extended.extend(refs)
            bib_dois_all.extend(refs_all)
            sleep(sleep_duration)
        except Exception as e:
            if '503' in str(e):
                sleep_duration = min(10, sleep_duration + 0.1*sleep_duration)  # Increase sleep duration, up to a maximum of 10 seconds
                sleep(sleep_duration)
            pass
    
    print(f'\n Finished!')
    print(f'Length of doi list (inc. duplicates {len(bib_dois_all)}')
    print(f'Length of doi list (suppodedly no duplicates {len(bib_dois_extended)}')
    bib_dois_ext = list(set(bib_dois_extended))
    print(f'Length of doi set (actually no duplicates) {len(bib_dois_ext)}')

    filename = Path(f"{DIR_OUTPUT / FILE_TXT_OUT.stem}_{i}{FILE_TXT_OUT.suffix}")
    with open(filename, 'w') as f:
        for item in bib_dois_ext:
            f.write("%s\n" % item)
    print(f'Wrote to {FILE_TXT_OUT.name}\n')
    
    return bib_dois_ext

# repeat the process to get references of references and references of references of references and references of references..... 
# be aware that number of dois will increase exponentially
# something like --> unique_dois = (number in original)*25^recursion_depth 
for i in range(RECURSION_DEPTH):
    print(f'{"="*80}\n')
    print('\t** Searching for refererences of references **')
    print(f'{"-"*80}\n')
    print(f'\t Iteration {i+1}/{RECURSION_DEPTH}')
    print(f'\t Using {FILE_BIB.name} as input')
    print(f'\t Length of doi list {len(bib_dois)}')
    print(f'{"-"*80}\n')
    bib_dois = get_related_dois(bib_dois, i)
    
    print(f'{"="*80}\n')


	** Searching for refererences of references **
--------------------------------------------------------------------------------

	 Iteration 1/4
	 Using alphabet.bib as input
	 Length of doi list 1
--------------------------------------------------------------------------------



Iteration 1: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]



 Finished!
Length of doi list (inc. duplicates 27
Length of doi list (suppodedly no duplicates 27
Length of doi set (actually no duplicates) 27
Wrote to alphabet.txt



	** Searching for refererences of references **
--------------------------------------------------------------------------------

	 Iteration 2/4
	 Using alphabet.bib as input
	 Length of doi list 27
--------------------------------------------------------------------------------



Iteration 2: 100%|██████████| 27/27 [00:18<00:00,  1.49it/s]



 Finished!
Length of doi list (inc. duplicates 548
Length of doi list (suppodedly no duplicates 428
Length of doi set (actually no duplicates) 428
Wrote to alphabet.txt



	** Searching for refererences of references **
--------------------------------------------------------------------------------

	 Iteration 3/4
	 Using alphabet.bib as input
	 Length of doi list 428
--------------------------------------------------------------------------------



Iteration 3: 100%|██████████| 428/428 [04:32<00:00,  1.57it/s]



 Finished!
Length of doi list (inc. duplicates 10902
Length of doi list (suppodedly no duplicates 7291
Length of doi set (actually no duplicates) 7228
Wrote to alphabet.txt



	** Searching for refererences of references **
--------------------------------------------------------------------------------

	 Iteration 4/4
	 Using alphabet.bib as input
	 Length of doi list 7228
--------------------------------------------------------------------------------



Iteration 4: 100%|██████████| 7228/7228 [1:21:07<00:00,  1.48it/s]


 Finished!
Length of doi list (inc. duplicates 164062
Length of doi list (suppodedly no duplicates 95631
Length of doi set (actually no duplicates) 94958
Wrote to alphabet.txt





