## Preparing the arXiv ID for SS

In [2]:
import re 

datas = []
unmatch = []
# Regular expression pattern to find and capture the numerical parts
pattern = r"https://arxiv.org/pdf/(\d{4}\.\d{5})\.pdf"


with open('data/reason.md', 'r') as f:
    for line in f.readlines():  
        if 'https://arxiv' in line:
            # Find all matches in the string
            matches = re.findall(pattern, line)
            datas.append(f'ARXIV:{matches[0]}')
        elif 'pdf' in line:
            unmatch.append(line)

datas[:5]

['ARXIV:2309.13625',
 'ARXIV:2310.13023',
 'ARXIV:2310.04560',
 'ARXIV:2310.04944',
 'ARXIV:2310.00299']

## Get TL;DR from SemanticScholar

In [3]:
from semanticscholar import SemanticScholar
sch = SemanticScholar()

In [3]:
def get_meta_data(paper):
    domains = ['title', 'authors', 'abstract', 'tldr', 'venue', 'referenceCount', 'citationCount', 'influentialCitationCount']

    meta_data = {}
    for domain in domains:
        if domain in paper.keys():
            if domain == 'authors':
                authors = []
                for author_domain in paper.__getattribute__(domain):
                    authors.append(author_domain['name'])
                meta_data['authors'] = authors
            else:
                domain_res = paper.__getattribute__(domain)
                if domain == 'tldr':
                    domain_res = str(domain_res)
                meta_data[domain] = domain_res
    return meta_data

In [5]:
from tqdm import tqdm 
import time

meta_datas = []
with open('result.json', 'a+') as fp:
    for data in tqdm(datas):
        try:
            paper = sch.get_paper(data)
            fp.writelines(get_meta_data(paper))
        except:
            fp.writelines({'Cannot find': data})
        
        meta_datas.append(get_meta_data(paper))
        time.sleep(2) # hold 2 sec

  2%|▏         | 2/111 [00:22<22:41, 12.49s/it]

## arXiv searching

In [8]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 10 most recent articles matching the keyword "quantum."
# -----
# sort_by
# Relevance 
# LastUpdatedDate 
# SubmittedDate
# -----
# order
# Ascending
# Descending

search = arxiv.Search(
  query = "Large Language Models",
  max_results = 10,
  sort_by = arxiv.SortCriterion.SubmittedDate,
  sort_order = arxiv.SortOrder.Descending,
)

results = client.results(search)
for r in client.results(search):
  print(r.title, r.published)

DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models 2024-02-29 18:59:58+00:00
Single Electron Quantum Dot in Two-Dimensional Transition Metal Dichalcogenides 2024-02-29 18:59:57+00:00
Panda-70M: Captioning 70M Videos with Multiple Cross-Modality Teachers 2024-02-29 18:59:50+00:00
Learning a Generalized Physical Face Model From Data 2024-02-29 18:59:31+00:00
Impact of weak lensing on bright standard siren analyses 2024-02-29 18:59:30+00:00
The Counterfeit Conundrum: Can Code Language Models Grasp the Nuances of Their Incorrect Generations? 2024-02-29 18:59:25+00:00
The All-Seeing Project V2: Towards General Relation Comprehension of the Open World 2024-02-29 18:59:17+00:00
Retrieval-Augmented Generation for AI-Generated Content: A Survey 2024-02-29 18:59:01+00:00
Lifelong Benchmarks: Efficient Model Evaluation in an Era of Rapid Progress 2024-02-29 18:58:26+00:00
Loose LIPS Sink Ships: Asking Questions in Battleship with Language-Informed Program Sampling 2

In [2]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 10 most recent articles matching the keyword "quantum."
search = arxiv.Search(
  query = "Large Language Models",
  max_results = 10,
  sort_by = arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)

# `results` is a generator; you can iterate over its elements one by one...
for r in client.results(search):
  print(r.title)
# ...or exhaust it into a list. Careful: this is slow for large results sets.
all_results = list(results)
print([r.title for r in all_results])

# For advanced query syntax documentation, see the arXiv API User Manual:
# https://arxiv.org/help/api/user-manual#query_details
search = arxiv.Search(query = "au:del_maestro AND ti:checkerboard")
first_result = next(client.results(search))
print(first_result)

# Search for the paper with ID "1605.08386v1"
search_by_id = arxiv.Search(id_list=["1605.08386v1"])
# Reuse client to fetch the paper, then print its title.
first_result = next(client.results(search))
print(first_result.title)

DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models
Single Electron Quantum Dot in Two-Dimensional Transition Metal Dichalcogenides
Panda-70M: Captioning 70M Videos with Multiple Cross-Modality Teachers
Learning a Generalized Physical Face Model From Data
Impact of weak lensing on bright standard siren analyses
The Counterfeit Conundrum: Can Code Language Models Grasp the Nuances of Their Incorrect Generations?
The All-Seeing Project V2: Towards General Relation Comprehension of the Open World
Retrieval-Augmented Generation for AI-Generated Content: A Survey
Lifelong Benchmarks: Efficient Model Evaluation in an Era of Rapid Progress
Loose LIPS Sink Ships: Asking Questions in Battleship with Language-Informed Program Sampling
['DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models', 'Single Electron Quantum Dot in Two-Dimensional Transition Metal Dichalcogenides', 'Panda-70M: Captioning 70M Videos with Multiple Cross-Modality Tea

In [None]:
import requests
import json

def fetch_and_categorize_papers(venue_id):
    url = f"https://api2.openreview.net/notes?content.venueid={venue_id}"
    response = requests.get(url)
    data = response.json()
    
    # Initialize dictionaries to hold categorized papers
    papers_by_type = {'Oral': [], 'Spotlight': [], 'Poster': []}
    
    # Iterate over all papers and categorize them
    if 'notes' in data:
        for note in data['notes']:
            venue_info = note['content'].get('venue', {})
            if 'value' in venue_info:
                venue_value = venue_info['value']
                if 'oral' in venue_value.lower():
                    papers_by_type['Oral'].append(note)
                elif 'spotlight' in venue_value.lower():
                    papers_by_type['Spotlight'].append(note)
                elif 'poster' in venue_value.lower():
                    papers_by_type['Poster'].append(note)
    
    return papers_by_type

# Usage of the function
venue_id = "ICLR.cc/2024/Conference"

# venue_id = "NeurIPS.cc/2023/Conference"
papers_by_type = fetch_and_categorize_papers(venue_id)

# Print the results to verify
# for paper_type, notes in papers_by_type.items():
#     print(f"\n{paper_type} Papers:")
#     if notes:
#         for note in notes[:5]:  # Limiting to first 5 papers for brevity
#             title = note.get('content', {}).get('title', 'No title available')
#             authors = ", ".join(note.get('content', {}).get('authors', []))
#             abstract = note.get('content', {}).get('abstract', 'No abstract available')
#             tldr = note.get('content', {}).get('TLDR', 'No TL;DR available')
#             keywords = note.get('content', {}).get('keywords', 'No keywords available')
#             link = f"https://openreview.net/forum?id={note['id']}"
#             print(f"Title: {title}")
#             print(f"Authors: {authors}")
#             print(f"Abstract: {abstract}")
#             print(f"TL;DR: {tldr}")
#             print(f"Keywords: {keywords}")
#             print(f"Link: {link}")
#             print("---")
#     else:
#         print("No papers found.")



In [36]:
for paper_type, notes in papers_by_type.items():
    print(f"\n{paper_type} Papers:")
    if notes:
        for note in notes: 
            title = note.get('content', {}).get('title', 'No title available').get('value')
            authors = ", ".join(note.get('content', {}).get('authors', []).get('value'))
            abstract = note.get('content', {}).get('abstract', 'No abstract available').get('value')

            tldr = note.get('content', {}).get('TLDR', '')
            if tldr:
                tldr = tldr['value']
            else:
                tldr = 'No TL;DR available'

            keywords = note.get('content', {}).get('keywords', 'No keywords available').get('value')
            keywords = '; '.join(keywords)


            if ('reasoning' in tldr.lower()) or ('reasoning' in keywords.lower()) or ('reasoning' in abstract.lower()) or ('reasoning' in title.lower()):
                # print(f"Reasoning Paper")

                # link = f"https://openreview.net/forum?id={note['id']}"
                print(f"Title: {title}")
                print(f"Authors: {authors}")
                # print(f"Abstract: {abstract}")
                print(f"TL;DR: {tldr}")
                print(f"Keywords: {keywords}")
                # print(f"Link: {link}")
                print("---")


Oral Papers:
Title: Phenomenal Yet Puzzling: Testing Inductive Reasoning Capabilities of Language Models with Hypothesis Refinement
Authors: Linlu Qiu, Liwei Jiang, Ximing Lu, Melanie Sclar, Valentina Pyatkin, Chandra Bhagavatula, Bailin Wang, Yoon Kim, Yejin Choi, Nouha Dziri, Xiang Ren
TL;DR: No TL;DR available
Keywords: language model; natural language processing; inductive reasoning
---
Title: MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts
Authors: Pan Lu, Hritik Bansal, Tony Xia, Jiacheng Liu, Chunyuan Li, Hannaneh Hajishirzi, Hao Cheng, Kai-Wei Chang, Michel Galley, Jianfeng Gao
TL;DR: We introduce MathVista, a novel benchmark for evaluating mathematical reasoning capabilities within visual contexts, and conduct extensive experiments on 11 foundation models.
Keywords: large language models; large multimodal models; mathematical reasoning; vision-language reasoning; foundation models and their evaluations
---
Title: SWE-bench: Can Language Mo

In [None]:
import requests
import json

# Test the API endpoint directly and print a portion of the response
url = "https://api2.openreview.net/notes?content.venueid=ICLR.cc/2024/Conference"
response = requests.get(url)
data = response.json()

# Print the entire response to understand its structure
print(json.dumps(data, indent=4))

# Check if specific expected keys are present and print the first few entries if available
if 'notes' in data and data['notes']:
    for note in data['notes'][:5]:  # Print details of the first 5 entries
        print("Title:", note.get('content', {}).get('title', 'No title available'))
        print("Authors:", ", ".join(note.get('content', {}).get('authors', [])))
        print("Abstract:", note.get('content', {}).get('abstract', 'No abstract available'))
        print("Link:", f"https://openreview.net/forum?id={note['id']}")
        print("---")
else:
    print("No notes found in the data.")

# import requests
# import json

# # Request a broad set of data to understand its structure
# url = "https://api2.openreview.net/notes?content.venueid=ICLR.cc/2024/Conference"
# response = requests.get(url)
# data = response.json()

# # Print a comprehensive view of the first few entries to inspect the structure
# print(json.dumps(data['notes'][:5], indent=4))  # Print details of the first 5 entries



In [16]:
import requests
import json

# Request a broad set of data to understand its structure
url = "https://api2.openreview.net/notes?content.venueid=NeurIPS.cc/2023/Conference"
response = requests.get(url)
data = response.json()

# Print a comprehensive view of the first few entries to inspect the structure
print(json.dumps(data['notes'][:5], indent=4))  # Print details of the first 5 entries

[
    {
        "id": "vq11gurmUY",
        "number": 15599,
        "cdate": 1683835182036,
        "tcdate": 1683835182036,
        "mdate": 1698949793552,
        "tmdate": 1698949793552,
        "signatures": [
            "NeurIPS.cc/2023/Conference/Submission15599/Authors"
        ],
        "readers": [
            "everyone"
        ],
        "writers": [
            "NeurIPS.cc/2023/Conference",
            "NeurIPS.cc/2023/Conference/Submission15599/Authors"
        ],
        "forum": "vq11gurmUY",
        "content": {
            "title": {
                "value": "Online PCA in Converging Self-consistent Field Equations"
            },
            "authors": {
                "value": [
                    "Xihan Li",
                    "Xiang Chen",
                    "Rasul Tutunov",
                    "Haitham Bou Ammar",
                    "Lei Wang",
                    "Jun Wang"
                ]
            },
            "authorids": {
                "value