## Preparing the arXiv IDs for Semantic Scholar

In [None]:
import re 

# Collect Semantic Scholar paper ids ("ARXIV:<id>") from the links in
# data/reason.md.  Lines that look like paper links but cannot be parsed are
# kept in `unmatch` for manual inspection.
datas = []
unmatch = []
# Capture the numeric arXiv identifier from an abs/ or pdf/ link.  Identifiers
# are YYMM.NNNN (4-digit sequence numbers) or YYMM.NNNNN (5-digit); a version
# suffix ("v2") or ".pdf" extension after the identifier is simply ignored.
# Dots are escaped so they only match a literal ".".
pattern = r"https://arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})"


with open('data/reason.md', 'r', encoding='utf-8') as f:
    for line in f:
        if 'https://arxiv' in line:
            # Only the first arXiv link on a line is used.
            match = re.search(pattern, line)
            if match:
                datas.append(f'ARXIV:{match.group(1)}')
            else:
                # An arXiv link in an unexpected shape: record it instead of
                # failing with IndexError on an empty match list.
                unmatch.append(line)
        elif 'pdf' in line:
            unmatch.append(line)

datas[:5]

## Get TL;DR from SemanticScholar

In [None]:
from semanticscholar import SemanticScholar
# Shared Semantic Scholar client, constructed with the library defaults
# (no arguments are passed).
sch = SemanticScholar()

In [None]:
def get_meta_data(paper):
    """Copy a fixed selection of fields from *paper* into a plain dict.

    A field is included only when ``paper.keys()`` reports it.  ``authors``
    is reduced to the list of each author's ``'name'`` entry, ``tldr`` is
    converted with ``str()``, and every other field is stored exactly as
    attribute access on *paper* returns it.
    """
    wanted_fields = (
        'title',
        'authors',
        'abstract',
        'tldr',
        'venue',
        'referenceCount',
        'citationCount',
        'influentialCitationCount',
    )

    summary = {}
    for field in wanted_fields:
        if field not in paper.keys():
            continue

        value = getattr(paper, field)
        if field == 'authors':
            value = [author['name'] for author in value]
        elif field == 'tldr':
            value = str(value)
        summary[field] = value
    return summary

In [None]:
import json
import time

from tqdm import tqdm

# Look up every collected id on Semantic Scholar.  Each outcome is appended to
# result.json as one JSON object per line; successful lookups are also kept in
# `meta_datas`.
meta_datas = []
with open('result.json', 'a+') as fp:
    for data in tqdm(datas):
        try:
            paper = sch.get_paper(data)
            meta_data = get_meta_data(paper)
        except Exception:
            # Best-effort crawl: note the id that failed and keep going.
            # `Exception` (not a bare except) lets Ctrl-C still stop the loop.
            fp.write(json.dumps({'Cannot find': data}) + '\n')
        else:
            # Serialise the whole record; writelines() on a dict would only
            # write its keys run together.
            fp.write(json.dumps(meta_data) + '\n')
            # Appended only on success, so a failed lookup never re-adds the
            # previous paper or hits an undefined `paper`.
            meta_datas.append(meta_data)

        time.sleep(2)  # hold 2 sec between requests

## arXiv searching

In [None]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 10 most recently submitted articles matching the query
# "Large Language Models".
# -----
# sort_by options:
#   Relevance, LastUpdatedDate, SubmittedDate
# -----
# sort_order options:
#   Ascending, Descending

search = arxiv.Search(
  query = "Large Language Models",
  max_results = 10,
  sort_by = arxiv.SortCriterion.SubmittedDate,
  sort_order = arxiv.SortOrder.Descending,
)

# Iterate the generator that was just created instead of building a second,
# identical one and leaving `results` unused.
results = client.results(search)
for r in results:
  print(r.title, r.published)

In [None]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 10 most recently submitted articles matching the query
# "Large Language Models".
search = arxiv.Search(
  query = "Large Language Models",
  max_results = 10,
  sort_by = arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)

# `results` is a generator; you can iterate over its elements one by one...
# (a fresh generator is used here so `results` is still unconsumed below)
for r in client.results(search):
  print(r.title)
# ...or exhaust it into a list. Careful: this is slow for large results sets.
all_results = list(results)
print([r.title for r in all_results])

# For advanced query syntax documentation, see the arXiv API User Manual:
# https://arxiv.org/help/api/user-manual#query_details
search = arxiv.Search(query = "au:del_maestro AND ti:checkerboard")
first_result = next(client.results(search))
print(first_result)

# Search for the paper with ID "1605.08386v1"
search_by_id = arxiv.Search(id_list=["1605.08386v1"])
# Reuse client to fetch the paper, then print its title.
# The ID search must be the one passed to the client; passing `search` would
# silently re-run the author/title query above.
first_result = next(client.results(search_by_id))
print(first_result.title)

In [None]:
import requests
import json

def _presentation_type(note):
    """Return 'Oral', 'Spotlight' or 'Poster' for *note*, or None if none apply."""
    venue_info = note['content'].get('venue', {})
    if 'value' not in venue_info:
        return None
    venue_value = venue_info['value'].lower()
    # Checked in this order so a venue string naming several types is filed
    # under the first one that matches.
    for category in ('Oral', 'Spotlight', 'Poster'):
        if category.lower() in venue_value:
            return category
    return None


def fetch_and_categorize_papers(venue_id, page_size=1000, timeout=30):
    """Fetch the notes of an OpenReview venue and group them by presentation type.

    Args:
        venue_id: Value for the ``content.venueid`` filter, e.g.
            ``"ICML.cc/2024/Conference"``.
        page_size: Number of notes requested per call (sent as ``limit``).
            NOTE(review): the API is expected to cap this at 1000 - confirm
            before raising it.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A dict with the keys ``'Oral'``, ``'Spotlight'`` and ``'Poster'``,
        each mapping to the list of raw note dicts whose venue string contains
        that word (case-insensitive).  Notes matching none are dropped.

    Raises:
        requests.RequestException: on connection problems or timeouts.
    """
    url = "https://api2.openreview.net/notes"

    # Initialize dictionaries to hold categorized papers
    papers_by_type = {'Oral': [], 'Spotlight': [], 'Poster': []}

    # A single request only returns one page of notes, so keep asking with an
    # increasing offset until a short (or empty) page comes back.
    offset = 0
    while True:
        response = requests.get(
            url,
            # Let requests build and encode the query string.
            params={'content.venueid': venue_id, 'limit': page_size, 'offset': offset},
            timeout=timeout,
        )
        data = response.json()

        # A response without 'notes' is treated as "no more papers".
        notes = data.get('notes') or []
        for note in notes:
            category = _presentation_type(note)
            if category is not None:
                papers_by_type[category].append(note)

        offset += len(notes)
        if len(notes) < page_size:
            break
        # Extra stop condition when the response reports a total count.
        total = data.get('count')
        if total is not None and offset >= total:
            break

    return papers_by_type

# Usage of the function
# Venue id handed to fetch_and_categorize_papers below.
venue_id = "ICML.cc/2024/Conference"

# Alternative venue id, left commented out:
# venue_id = "NeurIPS.cc/2023/Conference"
# Result: dict keyed by 'Oral' / 'Spotlight' / 'Poster', each a list of notes.
papers_by_type = fetch_and_categorize_papers(venue_id)

# Print the results to verify
# for paper_type, notes in papers_by_type.items():
#     print(f"\n{paper_type} Papers:")
#     if notes:
#         for note in notes[:5]:  # Limiting to first 5 papers for brevity
#             title = note.get('content', {}).get('title', 'No title available')
#             authors = ", ".join(note.get('content', {}).get('authors', []))
#             abstract = note.get('content', {}).get('abstract', 'No abstract available')
#             tldr = note.get('content', {}).get('TLDR', 'No TL;DR available')
#             keywords = note.get('content', {}).get('keywords', 'No keywords available')
#             link = f"https://openreview.net/forum?id={note['id']}"
#             print(f"Title: {title}")
#             print(f"Authors: {authors}")
#             print(f"Abstract: {abstract}")
#             print(f"TL;DR: {tldr}")
#             print(f"Keywords: {keywords}")
#             print(f"Link: {link}")
#             print("---")
#     else:
#         print("No papers found.")



In [None]:
def _content_value(note, key, default):
    """Return ``note['content'][key]['value']``, or *default* if it is missing.

    Content fields are wrapped as ``{'value': ...}``; a field that is absent
    (or has no value) yields *default* instead of raising AttributeError.
    """
    field = note.get('content', {}).get(key)
    value = field.get('value') if isinstance(field, dict) else field
    return default if value is None else value


# Print every paper whose title, abstract, TL;DR or keywords mention
# "reasoning", grouped by presentation type.
for paper_type, notes in papers_by_type.items():
    print(f"\n{paper_type} Papers:")
    for note in notes:
        title = _content_value(note, 'title', 'No title available')
        authors = ", ".join(_content_value(note, 'authors', []))
        abstract = _content_value(note, 'abstract', 'No abstract available')

        # An empty TL;DR is reported the same way as a missing one.
        tldr = _content_value(note, 'TLDR', '') or 'No TL;DR available'

        keywords = _content_value(note, 'keywords', None)
        if keywords is None:
            keywords = 'No keywords available'
        else:
            keywords = '; '.join(keywords)

        searchable = (tldr, keywords, abstract, title)
        if any('reasoning' in text.lower() for text in searchable):
            # link = f"https://openreview.net/forum?id={note['id']}"
            print(f"Title: {title}")
            print(f"Authors: {authors}")
            # print(f"Abstract: {abstract}")
            print(f"TL;DR: {tldr}")
            print(f"Keywords: {keywords}")
            # print(f"Link: {link}")
            print("---")

In [None]:
import requests
import json

# Test the API endpoint directly and print a portion of the response
url = "https://api2.openreview.net/notes?content.venueid=ICLR.cc/2024/Conference"
response = requests.get(url)
data = response.json()

# Print the entire response to understand its structure
print(json.dumps(data, indent=4))


def _note_field(content, key, default):
    """Return the plain value of ``content[key]``, or *default* if absent.

    Fields wrapped as ``{'value': ...}`` are unwrapped; without this the
    wrapper dict itself is printed and joining the authors yields "value".
    """
    field = content.get(key)
    if isinstance(field, dict):
        field = field.get('value')
    return default if field is None else field


# Check if specific expected keys are present and print the first few entries if available
if 'notes' in data and data['notes']:
    for note in data['notes'][:5]:  # Print details of the first 5 entries
        content = note.get('content', {})
        print("Title:", _note_field(content, 'title', 'No title available'))
        print("Authors:", ", ".join(_note_field(content, 'authors', [])))
        print("Abstract:", _note_field(content, 'abstract', 'No abstract available'))
        print("Link:", f"https://openreview.net/forum?id={note['id']}")
        print("---")
else:
    print("No notes found in the data.")

# import requests
# import json

# # Request a broad set of data to understand its structure
# url = "https://api2.openreview.net/notes?content.venueid=ICLR.cc/2024/Conference"
# response = requests.get(url)
# data = response.json()

# # Print a comprehensive view of the first few entries to inspect the structure
# print(json.dumps(data['notes'][:5], indent=4))  # Print details of the first 5 entries



In [None]:
import requests
import json

# Query the OpenReview notes endpoint for the NeurIPS 2023 venue id so the
# raw layout of the returned notes can be inspected.
url = "https://api2.openreview.net/notes?content.venueid=NeurIPS.cc/2023/Conference"
response = requests.get(url)
data = response.json()

# Dump only the first five notes, pretty-printed, to keep the output readable.
sample_notes = data['notes'][:5]
print(json.dumps(sample_notes, indent=4))

In [None]:
import os

import requests

# Send one chat-completion request to the Together API and print the raw
# response body.
url = "https://api.together.xyz/v1/chat/completions"

payload = {
    "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "temperature": 0.7,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "max_tokens": 512,
    "stop": ["</s>", "[/INST]"],
    "top_p": 0.7,
    "top_k": 50,
    "repetition_penalty": 1,
    "messages": [
        {
            "role": "system",
            "content": "You are a helpful travel agent"
        },
        {
            "role": "user",
            "content": "Tell me about San Francisco"
        }
    ]
}

# SECURITY: the bearer token must not live in the notebook.  It is read from
# the environment instead; any key that was previously committed here should
# be treated as leaked and revoked.
api_key = os.environ.get("TOGETHER_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the TOGETHER_API_KEY environment variable before running this cell."
    )

headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Bound the wait so a stalled connection cannot hang the notebook forever.
response = requests.post(url, json=payload, headers=headers, timeout=60)

print(response.text)