# Data collection process

This notebook scrapes the arXiv website for papers in the category "cs.CV" (computer vision), "stat.ML" / "cs.LG" (Machine Learning) and "cs.AI" (Artificial Intelligence). The papers are then saved in the a csv file.

In [1]:
import arxiv
import pandas as pd 

from tqdm import tqdm
from pathlib import Path

In [2]:
PATH_DATA_BASE = Path.cwd().parent / "data"

# Scraping the arXiv website

Let's start by defining a list of keywords that we will use to query the arXiv API.

In [3]:
# You can added some keywords here to search for specific topics
query_keywords = [
    "\"image segmentation\"",
    "\"self-supervised learning\"",
    "\"representation learning\"",
    "\"image generation\"",
    "\"object detection\"",
    "\"transfer learning\"",
    "\"transformers\"",
    "\"adversarial training",
    "\"generative adversarial networks\"",
    "\"model compressions\"",
    "\"image segmentation\"",
    "\"few-shot learning\"",
    "\"natural language\"",
    "\"graph\"",
    "\"colorization\"",
    "\"depth estimation\"",
    "\"point cloud\"",
    "\"structured data\"",
    "\"optical flow\"",
    "\"reinforcement learning\"",
    "\"super resolution\"",
    "\"attention\"",
    "\"tabular\"",
    "\"unsupervised learning\"",
    "\"semi-supervised learning\"",
    "\"explainable\"",
    "\"radiance field\"",
    "\"decision tree\"",
    "\"time series\"",
    "\"molecule\"",
    "\"large language models\"",
    "\"llms\"",
    "\"language models\"",
    "\"image classification\"",
    "\"document image classification\"",
    "\"encoder\"",
    "\"decoder\"",
    "\"multimodal\"",
    "\"multimodal deep learning\"",
]

Afterwards, we define a function that creates a search object using the given query. It sets the maximum number of results for each category to 6000 and sorts them by the last updated date.

In [4]:
client = arxiv.Client(num_retries=20, page_size=500)


def query_with_keywords(query) -> tuple:
    """
    Query the arXiv API for research papers based on a specific query and filter results by selected categories.
    
    Args:
        query (str): The search query to be used for fetching research papers from arXiv.
    
    Returns:
        tuple: A tuple containing three lists - terms, titles, and abstracts of the filtered research papers.
        
            terms (list): A list of lists, where each inner list contains the categories associated with a research paper.
            titles (list): A list of titles of the research papers.
            abstracts (list): A list of abstracts (summaries) of the research papers.
            urls (list): A list of URLs for the papers' detail page on the arXiv website.
    """
    
    # Create a search object with the query and sorting parameters.
    search = arxiv.Search(
        query=query,
        max_results=6000,
        sort_by=arxiv.SortCriterion.LastUpdatedDate
    )
    
    # Initialize empty lists for terms, titles, abstracts, and urls.
    terms = []
    titles = []
    abstracts = []
    urls = []

    # For each result in the search...
    for res in tqdm(client.results(search), desc=query):
        # Check if the primary category of the result is in the specified list.
        if res.primary_category in ["cs.CV", "stat.ML", "cs.LG", "cs.AI"]:
            # If it is, append the result's categories, title, summary, and url to their respective lists.
            terms.append(res.categories)
            titles.append(res.title)
            abstracts.append(res.summary)
            urls.append(res.entry_id)

    # Return the four lists.
    return terms, titles, abstracts, urls

In [5]:
all_titles = []
all_abstracts = []
all_terms = []
all_urls = []

for query in query_keywords:
    terms, titles, abstracts, urls = query_with_keywords(query)
    all_titles.extend(titles)
    all_abstracts.extend(abstracts)
    all_terms.extend(terms)
    all_urls.extend(urls)

"image segmentation": 0it [00:00, ?it/s]

"image segmentation": 4744it [01:37, 48.73it/s]
"self-supervised learning": 0it [00:03, ?it/s]
"representation learning": 6000it [02:06, 47.32it/s]
"image generation": 5105it [02:52, 29.53it/s]
"object detection": 6000it [02:16, 43.83it/s]
"transfer learning": 6000it [03:04, 32.55it/s]
"transformers": 4501it [01:43, 49.91it/s]Bozo feed; consider handling: document declared as utf-8, but parsed as iso-8859-2
"transformers": 6000it [01:59, 50.11it/s]
"adversarial training: 0it [00:02, ?it/s]
"generative adversarial networks": 6000it [01:44, 57.57it/s]
"model compressions": 1154it [00:26, 44.14it/s]
"image segmentation": 4744it [01:18, 60.53it/s]
"few-shot learning": 0it [00:03, ?it/s]
"natural language": 6000it [02:36, 38.38it/s]
"graph": 6000it [02:28, 40.49it/s]
"colorization": 6000it [02:15, 44.19it/s]
"depth estimation": 2039it [00:46, 43.83it/s]
"point cloud": 6000it [02:09, 46.45it/s]
"structured data": 2810it [01:03, 44.46it/s]
"optical flow": 2087it [00:57, 36.44it/s]
"reinforcem

In [6]:
arxiv_data = pd.DataFrame({
    'titles': all_titles,
    'abstracts': all_abstracts,
    'terms': all_terms,
    'urls': all_urls
})

Finally, we export the DataFrame to a csv file.

In [7]:
arxiv_data.to_csv(PATH_DATA_BASE / "data.csv", index=False)