# Get List of articles usnig crossref api

### import the required libraries

In [1]:
import requests
import pandas as pd
from time import time

### Set API address and parameters

In [2]:
api_url = "https://api.crossref.org/works"
query_params = {
    "query": "LiDAR SLAM",
    "rows": 1000,  # Adjust based on how many results you want per page
    # "offset": 0, # Deprecated use cursor instead
    "cursor": "*"  # Starting cursor
}

### Make test request

In [3]:
test_params = query_params.copy()
test_params["rows"] = 10

response = requests.get(api_url, params=test_params)
if response.status_code != 200: raise SystemExit("Bad response")

data = response.json()
Number_of_articles = data['message']['total-results']
print(f"Query: {data['message']['query']}")
print(f"The number of work matched criteria = {Number_of_articles}\n")

Query: {'start-index': 0, 'search-terms': 'LiDAR SLAM'}
The number of work matched criteria = 46933



### Function to get information about the article

In [14]:
import unicodedata

def preprocess_text(text):
    text = text.replace("\n", " ") # remove new lines
    text = text.replace('/',' / ') # ensure space between words
    text = text.replace(",", ", ").replace(",  ", ", ") # replace commas without space
    return unicodedata.normalize('NFKD', text) 

def get_article_info(item):
    Authors = item.get('author', [])
    article_info = {
        "title": preprocess_text(item.get('title', ['No Title'])[0]),
        "DOI": item.get('DOI'),
        "publisher": item.get('publisher'),
        "publisher_location": item.get('publisher-location'),
        "year": item.get('created', {}).get('date-parts', [[None]])[0][0],
        "type": item.get('type'),
        "reference-count": item.get('reference-count'),
        "is-referenced-by-count": item.get('is-referenced-by-count'),
        "authors": ', '.join([author.get('given', '') + " " + author.get('family', '') for author in Authors])
    }
    
    try: # use try because some fields may be empty
        first_author_affiliation = Authors[0].get('affiliation', [])
        affiliation_location = first_author_affiliation[0].get('name')
        article_info["publisher_affiliation"] = preprocess_text(affiliation_location)
    except Exception:
        article_info["publisher_affiliation"] = None
    
    return article_info

### Getting the list of articles and saving them into an array 

In [15]:
cur_query_params = query_params.copy()
articles = []  # List to store article information

start = time()
while True: 
    # make request
    response = requests.get(api_url, params=cur_query_params) 
    if response.status_code != 200: raise SystemExit(f"Bad response ({response.status_code}): {response.json()["message"]}")

    # read list of works    
    data = response.json()
    items = data['message']['items']

    # stop if no work remains
    if not items: break

    # get article info
    for item in items:
        articles.append(get_article_info(item))

    cur_query_params['cursor'] = data['message'].get('next-cursor')
    print(f"articles found: {len(articles)}/{Number_of_articles} ({(time() - start):.2f} sec)")

articles found: 1000/46933 (5.76 sec)
articles found: 2000/46933 (10.11 sec)
articles found: 3000/46933 (14.48 sec)
articles found: 4000/46933 (20.64 sec)
articles found: 5000/46933 (25.33 sec)
articles found: 6000/46933 (30.49 sec)
articles found: 7000/46933 (35.02 sec)
articles found: 8000/46933 (39.57 sec)
articles found: 9000/46933 (44.82 sec)
articles found: 10000/46933 (49.52 sec)
articles found: 11000/46933 (54.44 sec)
articles found: 12000/46933 (59.11 sec)
articles found: 13000/46933 (63.45 sec)
articles found: 14000/46933 (68.39 sec)
articles found: 15000/46933 (73.08 sec)
articles found: 16000/46933 (78.61 sec)
articles found: 17000/46933 (83.39 sec)
articles found: 18000/46933 (89.04 sec)
articles found: 19000/46933 (93.35 sec)
articles found: 20000/46933 (99.47 sec)
articles found: 21000/46933 (107.07 sec)
articles found: 22000/46933 (114.33 sec)
articles found: 23000/46933 (119.18 sec)
articles found: 24000/46933 (125.55 sec)
articles found: 25000/46933 (130.96 sec)
artic

### Create pandas dataframe 

In [16]:
df = pd.DataFrame(articles)
df.iloc[:].head()

Unnamed: 0,title,DOI,publisher,publisher_location,year,type,reference-count,is-referenced-by-count,authors,publisher_affiliation
0,D3VIL-SLAM: 3D Visual Inertial LiDAR SLAM for ...,10.1109/iv55152.2023.10186534,IEEE,,2023,proceedings-article,23.0,0.0,"Matteo Frosi, Matteo Matteucci",Information and Bioengineering Politecnico di ...
1,Visual and lidar-based SLAM by variational bay...,10.32657/10356/139813,Nanyang Technological University,,2020,dissertation,0.0,0.0,", Xiaoyue Jiang",
2,A Comparison of Outdoor 3D Reconstruction betw...,10.1109/cacs60074.2023.10325866,IEEE,,2023,proceedings-article,10.0,0.0,"Yi-Tian Hong, Han-Pang Huang","National Taiwan University, Department of Mech..."
3,Indoor mapping and positioning applications of...,10.51946/melid.927004,Turkiye lidar dergisi (Mersin University),,2021,journal-article,29.0,5.0,Mustafa ZEYBEK,
4,SC-LiDAR-SLAM: A Front-end Agnostic Versatile ...,10.1109/iceic54506.2022.9748644,IEEE,,2022,proceedings-article,46.0,10.0,"Giseop Kim, Seungsang Yun, Jeongyun Kim, Ayoun...","KAIST, Dept. of Civil and Envtl. Eng., Daejeon..."


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46933 entries, 0 to 46932
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   title                   46933 non-null  object 
 1   DOI                     46933 non-null  object 
 2   publisher               46933 non-null  object 
 3   publisher_location      3887 non-null   object 
 4   year                    46933 non-null  int64  
 5   type                    46931 non-null  object 
 6   reference-count         46898 non-null  float64
 7   is-referenced-by-count  46898 non-null  float64
 8   authors                 46933 non-null  object 
 9   publisher_affiliation   9170 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 3.6+ MB


### Save data

In [18]:
# Save the DataFrame to a CSV file with proper handling of newline characters
df.to_csv('slam_articles.csv', index=False, encoding='utf-8', quoting=1)