In [96]:
import pandas as pd
import requests
import feedparser
from datetime import datetime as dt, timedelta as td

In [268]:
def query_arXiv(search_query='', categories=None, start_time=None, end_time=None, max_results=None):
    """
    Query the arXiv API to retrieve a list of articles based on specified criteria.
    
    Args:
        search_query (str): The search query string.
        categories (list): A list of categories (e.g., ['cs.AI', 'cs.CL']) to filter the search. Default is None.
        start_time (datetime): The start time for the search. Default is 24 hours ago.
        end_time (datetime): The end time for the search. Default is now.
        max_results (int): The maximum number of results to retrieve. Default is None (100 results). Set to a higher value for more results.
        
    Returns:
        pandas.DataFrame: A DataFrame containing article information including title, published date, authors, categories, and summary.
    """
    
    if not max_results:
        max_results = 100
        expand_search = True
    else:
        expand_search = False

    # Set default values if not provided
    if start_time is None:
        if dt.utcnow().hour > 18:
            start_time = dt.utcnow() - td(days=1)
        else:
            start_time = dt.utcnow() - td(days=2)
        start_time = start_time.replace(hour=18, minute=00)
    if end_time is None:
        end_time = dt.utcnow()

    start_str = dt.strftime(start_time, '%Y%m%d%H%M')
    end_str = dt.strftime(end_time, '%Y%m%d%H%M')

    left_par = '%28'
    right_par = '%29'

    if categories:
        categories_query = left_par + '+OR+'.join(['cat:' + cat for cat in categories]) + right_par
        if len(search_query) > 0:
            query = search_query + '+AND+' + categories_query
        else:
            query = categories_query
    
    query += f'+AND+submittedDate:[{start_str}+TO+{end_str}]'

    # Prepare the query parameters
    params = {
        'sortBy': 'submittedDate',
        'sortOrder': 'descending',
        'start': 0,
        'max_results': max_results
    }

    # Make the API request
    response = requests.get(f'http://export.arxiv.org/api/query?search_query={query}', params=params)
    if response.status_code == 200:
        # Parse the XML response using feedparser
        feed = feedparser.parse(response.content)
        if len(feed.entries) == max_results and expand_search: # Recursive call if there are more than 100 papers in the timeframe
            half_days = int((end_time-start_time).days/2)
            articles1 = query_arXiv(categories=categories, start_time=start_time, end_time=end_time-td(days=half_days))
            articles2 = query_arXiv(categories=categories, start_time=end_time-td(days=half_days), end_time=end_time)
            articles = pd.concat([articles1, articles2], ignore_index=True)
            return articles
        articles = pd.DataFrame(columns = ['id', 'arxiv_doi', 'title', 'published', 'authors', 'arxiv_primary_category',
                                           'categories', 'summary', 'arxiv_affiliation', 'arxiv_journal_ref'])
        
        for entry in feed.entries:
            etime = dt.strptime(entry.published, '%Y-%m-%dT%H:%M:%SZ')
            if 'arxiv_doi' in entry.keys():
                doi = entry.arxiv_doi
            else:
                doi = ''
            if 'arxiv_affiliation' in entry.keys():
                arx_aff = entry.arxiv_affiliation
            else:
                arx_aff = ''
            if 'arxiv_journal_ref' in entry.keys():
                jref = entry.arxiv_journal_ref
            else:
                jref = ''
            articles.loc[len(articles)] = [entry.id, doi, entry.title, etime, [author.name for author in entry.authors],
                                           entry.arxiv_primary_category, [tag['term'] for tag in entry.tags],
                                           entry.summary, arx_aff, jref]

        return articles
    else:
        print('Error:', response.status_code)
        return []

# Example usage
articles = query_arXiv(categories=['cs.AI', 'cs.CL'], start_time=dt.utcnow()-td(days=3))
print(len(articles))


216


In [275]:
articles.arxiv_affiliation.unique()

array(['', 'School of Computing, KAIST, Daejeon, Republic of Korea',
       'Imperial College London, UK', 'IETR, INSA Rennes', 'LAAS-ROC',
       'LIRIS, SyCoSMA', 'IRIT-SAMoVA'], dtype=object)

In [276]:
test = articles.drop(columns = ['categories']).join(articles['categories'].str.join(',').str.get_dummies(sep=',').astype(int))
test.drop(columns = ['id', 'arxiv_doi', 'title', 'published', 'authors', 'arxiv_primary_category', 'summary',
                     'arxiv_affiliation', 'arxiv_journal_ref']).sum()

 45Q05                                 1
 68T05                                 1
 90C31                                 1
15A29                                  1
68T01                                  1
68T40                                  1
F.2.2; I.2.7                           1
H.4.0                                  1
I.2.0; I.2.3; I.2.7; I.5.1; I.5.4      1
I.2.0; I.5.0                           1
I.2.10; I.3.7; I.4.0                   1
I.2.6                                  1
I.2; I.4; J.2                          1
I.4.9; J.2; I.2.1                      1
astro-ph.IM                            2
astro-ph.SR                            1
cond-mat.mtrl-sci                      1
cs.AI                                176
cs.CL                                 78
cs.CR                                  5
cs.CV                                 43
cs.CY                                  5
cs.DB                                  3
cs.DC                                  2
cs.FL           

In [202]:
if dt.utcnow().hour > 18:
    start = dt.strftime((dt.utcnow() - td(days=1)).date(), '%Y%m%d')+'1800'
else:
    start = dt.strftime((dt.utcnow() - td(days=2)).date(), '%Y%m%d')+'1800'
end = dt.strftime(dt.now(), '%Y%m%d%H%M')
print(start, end)
params = {
    'start': 0,
    'max_results': 100
}
response = requests.get(f'http://export.arxiv.org/api/query?search_query=%28cat:cs.AI+OR+cat:cs.CL%29+AND+submittedDate:[{start}+TO+{end}]', params=params)
feed = feedparser.parse(response.content)
print(len(feed.entries))

202308271800 202308291323
84


10

In [142]:
query = 'cat:cs.AI+OR+cat:cs.CL'
params1 = {
    'sortBy': 'lastUpdatedDate',
    'sortOrder': 'descending',
    'start': 0,
    'max_results': 100
}
params2 = {
    'start': 0,
    'max_results': 100
}
# Make the API request
response1 = requests.get(f'http://export.arxiv.org/api/query?search_query={query}', params=params1)
feed1 = feedparser.parse(response1.content)
response2 = requests.get(f'http://export.arxiv.org/api/query?search_query={query}&sort_by=lastUpdatedDate&sortOrder=descending', params=params2)
feed2 = feedparser.parse(response2.content)

# feed = feedparser.parse(response.content)
# len(feed.entries)

feed1.entries==feed2.entries

False

In [146]:
for i in range(10):
    print(feed1.entries[-1-i]['published'], feed2.entries[-1-i]['published'])

2023-08-24T02:03:29Z 2013-04-28T19:30:11Z
2023-07-07T04:20:36Z 2013-04-14T05:52:11Z
2023-07-06T16:28:35Z 2013-03-27T19:52:35Z
2023-04-04T23:41:20Z 2013-03-31T22:17:19Z
2023-08-28T06:05:18Z 2013-01-13T19:33:31Z
2023-08-21T10:37:49Z 2013-01-09T19:17:31Z
2023-02-26T08:43:08Z 2012-12-12T15:56:22Z
2023-08-28T06:10:26Z 2012-12-28T15:39:46Z
2023-08-28T06:15:14Z 2012-12-02T18:06:57Z
2023-08-28T06:17:15Z 2012-11-19T16:38:32Z
