In [52]:
import requests
import pandas as pd
from time import time

In [53]:
api_url = "https://api.crossref.org/works"
query_params = {
    "query": "LiDAR SLAM",
    "rows": 1000,  # Adjust based on how many results you want per page
    # "offset": 0, # Deprecated use cursor instead
    "cursor": "*"  # Starting cursor
}

In [54]:
test_params = query_params.copy()
test_params["rows"] = 10

response = requests.get(api_url, params=test_params)
if response.status_code != 200: raise SystemExit("Bad response")

data = response.json()
Number_of_articles = data['message']['total-results']
print(f"Query: {data['message']['query']}")
print(f"The number of work matched criteria = {Number_of_articles}\n")

Query: {'start-index': 0, 'search-terms': 'LiDAR SLAM'}
The number of work matched criteria = 46781



In [61]:
def get_article_info(item):
    Authors = item.get('author', [])
    article_info = {
        "title": item.get('title', ['No Title'])[0].replace("\n", " ").replace("\xa0", " "),
        "publisher_location": item.get('publisher-location'),
        "year": item.get('created', {}).get('date-parts', [[None]])[0][0],
        "type": item.get('type'),
        "reference-count": item.get('reference-count'),
        "authors": ', '.join([author.get('given', '') + " " + author.get('family', '') for author in Authors])
    }
    try:
        first_author_affiliation = Authors[0].get('affiliation', [])
        affiliation_location = first_author_affiliation[0].get('name')
        article_info["publisher_affiliation"] = affiliation_location.replace("\n", " ").replace("\xa0", " ")
    except Exception:
        article_info["publisher_affiliation"] = "Unknown"
    
    return article_info

In [62]:
cur_query_params = query_params.copy()
articles = []  # List to store article information

start = time()
while True: 
    response = requests.get(api_url, params=cur_query_params)
    if response.status_code != 200: raise SystemExit(f"Bad response ({response.status_code}): {response.json()["message"]}")

    data = response.json()
    items = data['message']['items']

    if not items: break

    for item in items:
        articles.append(get_article_info(item))

    cur_query_params['cursor'] = data['message'].get('next-cursor')
    print(f"articles found: {len(articles)}/{Number_of_articles} ({(time() - start):.2f} sec)")

articles found: 1000/46781 (4.71 sec)
articles found: 2000/46781 (7.58 sec)
articles found: 3000/46781 (10.96 sec)
articles found: 4000/46781 (13.90 sec)
articles found: 5000/46781 (17.58 sec)
articles found: 6000/46781 (22.81 sec)
articles found: 7000/46781 (28.61 sec)
articles found: 8000/46781 (34.74 sec)
articles found: 9000/46781 (39.34 sec)
articles found: 10000/46781 (45.70 sec)
articles found: 11000/46781 (50.97 sec)
articles found: 12000/46781 (55.55 sec)
articles found: 13000/46781 (61.44 sec)
articles found: 14000/46781 (67.36 sec)
articles found: 15000/46781 (71.48 sec)
articles found: 16000/46781 (77.53 sec)
articles found: 17000/46781 (82.68 sec)
articles found: 18000/46781 (87.45 sec)
articles found: 19000/46781 (91.55 sec)
articles found: 20000/46781 (97.92 sec)
articles found: 21000/46781 (103.24 sec)
articles found: 22000/46781 (107.89 sec)
articles found: 23000/46781 (113.45 sec)
articles found: 24000/46781 (119.17 sec)
articles found: 25000/46781 (124.39 sec)
articl

In [57]:
df = pd.DataFrame(articles)
df.head()

Unnamed: 0,title,publisher_location,year,type,reference-count,authors,publisher_affiliation
0,D3VIL-SLAM: 3D Visual Inertial LiDAR SLAM for ...,,2023,proceedings-article,23.0,"Matteo Frosi, Matteo Matteucci",Information and Bioengineering Politecnico di ...
1,Visual and lidar-based SLAM by variational bay...,,2020,dissertation,0.0,", Xiaoyue Jiang",Unknown
2,A Comparison of Outdoor 3D Reconstruction betw...,,2023,proceedings-article,10.0,"Yi-Tian Hong, Han-Pang Huang","National Taiwan University,Department of Mecha..."
3,Indoor mapping and positioning applications of...,,2021,journal-article,29.0,Mustafa ZEYBEK,Unknown
4,SC-LiDAR-SLAM: A Front-end Agnostic Versatile ...,,2022,proceedings-article,46.0,"Giseop Kim, Seungsang Yun, Jeongyun Kim, Ayoun...","KAIST,Dept. of Civil and Envtl. Eng.,Daejeon,S..."


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46781 entries, 0 to 46780
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  46781 non-null  object 
 1   publisher_location     3883 non-null   object 
 2   year                   46781 non-null  int64  
 3   type                   46779 non-null  object 
 4   reference-count        46746 non-null  float64
 5   authors                46781 non-null  object 
 6   publisher_affiliation  46781 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 2.5+ MB


In [59]:
# Removal condition
condition = ~((df['publisher_location'].isna()) & (df['publisher_affiliation'] == 'Unknown'))

# Apply condition
df_filtered = df[condition]
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12545 entries, 0 to 46778
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  12545 non-null  object 
 1   publisher_location     3883 non-null   object 
 2   year                   12545 non-null  int64  
 3   type                   12544 non-null  object 
 4   reference-count        12545 non-null  float64
 5   authors                12545 non-null  object 
 6   publisher_affiliation  12545 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 784.1+ KB


In [60]:
# Save the DataFrame to a CSV file with proper handling of newline characters
df.to_csv('slam_articles.csv', index=False, encoding='utf-8', quoting=1)