In [52]:
# 提取arxiv 源文件
import arxiv

import os  # used below; on a fresh kernel no earlier cell has imported it yet

# Free-text query; used for the arXiv search and as the base name of the saved archive.
query_name = input('input your paper here (only arxiv paper is ok)')

search = arxiv.Search(
    query=query_name,
    id_list=[],
    max_results=1,
    sort_by=arxiv.SortCriterion.Relevance,
    sort_order=arxiv.SortOrder.Descending,
)

# Take the single most relevant hit; report an empty result set with a readable
# error instead of letting a bare StopIteration escape from next().
paper = next(search.results(), None)
if paper is None:
    raise ValueError(f"No arXiv paper found for query: {query_name!r}")
print(paper.title)

directory = "./data/paper"
os.makedirs(directory, exist_ok=True)
# Download the source archive into `directory` under a name derived from the query.
# The call is left as the last expression so the notebook displays the saved path.
paper.download_source(dirpath=directory, filename=f"{query_name}.tar.gz")


ADBench: Anomaly Detection Benchmark


'./data/paper/ADBench.tar.gz'

In [53]:
# arxiv cleaner过一遍
import tarfile
import os
def extract_tex_files(dirpath,save_path):
    """Extract every regular ``.tex`` member of a gzip-compressed tar archive.

    Args:
        dirpath: Path of the ``.tar.gz`` archive to read (a file path, despite the name).
        save_path: Directory the ``.tex`` members are extracted into. It is created
            the first time a matching member is found.

    Members whose resolved destination would fall outside ``save_path`` (for example
    names containing ``../`` or absolute names) are skipped and reported, because the
    archive comes from an external source and must not write elsewhere on disk.
    """
    root = os.path.realpath(save_path)
    with tarfile.open(dirpath, 'r:gz') as tar:
        # Walk every file/folder entry stored in the archive.
        for member in tar.getmembers():
            # Only regular files with a .tex suffix are of interest.
            if not (member.isreg() and member.name.endswith('.tex')):
                continue
            target = os.path.realpath(os.path.join(root, member.name))
            try:
                inside_root = os.path.commonpath([root, target]) == root
            except ValueError:
                # Raised e.g. for paths on different drives; treat as unsafe.
                inside_root = False
            if not inside_root:
                print(f"Skipped unsafe path: {member.name}")
                continue
            os.makedirs(save_path, exist_ok=True)
            tar.extract(member, save_path)
            print(f"Extracted: {member.name}")

import subprocess

# Example usage: unpack the .tex sources of the downloaded archive, then run
# arxiv_latex_cleaner over the extracted directory.
dirpath = f'./data/paper/{query_name}.tar.gz'
# Strip only the trailing ".tar.gz" so the extraction directory sits next to the archive.
save_path = dirpath[:-len('.tar.gz')]
extract_tex_files(dirpath, save_path)
# Pass the arguments as a list (no shell): query_name is user input, so
# interpolating it into a shell command line would allow injection and would
# break on names containing spaces. check=True surfaces a failing cleaner run
# here rather than as a missing directory in a later cell.
subprocess.run(
    ['arxiv_latex_cleaner', save_path, '--config', 'cleaner_config.yaml'],
    check=True,
).returncode

Extracted: 00abstract.tex
Extracted: 1intro.tex
Extracted: 2related.tex
Extracted: 3setting.tex
Extracted: 4exp.tex
Extracted: 5discussion.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCROC_label_1.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCROC_label_0.25.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCPR_label_0.25.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/20news.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCROC_unsup.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCROC_label_0.75.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCPR_unsup.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCPR_label_0.5.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/datasets.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCPR_label_0.75.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/hyperparameter_grid.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCPR_label_1.tex
Extracted: ADBench-NeurIPS-Revision (1)/tables/AUCPR_label_0.1.tex
Ex

0

In [54]:
# 处理arXiv 文本内容
import os
import re

def find_main_tex_file(directory_path):
    """Return the first entry of ``directory_path`` that looks like the main .tex file.

    An entry qualifies when its name contains "main" and ends with ".tex".
    Entries are examined in the order ``os.listdir`` yields them; ``None`` is
    returned when nothing qualifies.
    """
    candidates = (
        entry
        for entry in os.listdir(directory_path)
        if "main" in entry and entry.endswith(".tex")
    )
    return next(candidates, None)

def replace_input_with_content(match, directory_path):
    r"""Return the text of the file named by an ``\input{...}`` regex match.

    Args:
        match: Regex match whose group 1 is the argument of the ``\input`` command.
        directory_path: Directory the referenced file is looked up in.

    Returns:
        The file's contents, or an empty string when the file does not exist.
    """
    target = match.group(1).strip()
    # \input{intro} and \input{intro.tex} both name intro.tex; only append the
    # extension when it is missing, otherwise "intro.tex.tex" is never found and
    # the section is silently dropped.
    file_name = target if target.endswith(".tex") else target + ".tex"
    file_path = os.path.join(directory_path, file_name)
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    return ""

def integrate_content(directory_path):
    r"""Read the main .tex file and flatten it into a single LaTeX string.

    Steps, in order: drop ``\input`` commands and (sub)sections whose title mentions
    "appendix", drop acknowledgement section headings, inline the remaining
    ``\input{...}`` files (one level), then strip ``\label{...}``, any ``\input``
    left over from inlined files, and a few layout-only commands.

    Args:
        directory_path: Directory containing the main .tex file and its inputs.

    Returns:
        The processed content of the main file.

    Raises:
        ValueError: If no main .tex file is found in ``directory_path``.
    """
    main_file_name = find_main_tex_file(directory_path)
    if not main_file_name:
        raise ValueError("主文件未找到。")

    main_file_path = os.path.join(directory_path, main_file_name)
    with open(main_file_path, 'r', encoding='utf-8') as f:
        main_content = f.read()

    # Drop \input{...} commands whose argument mentions "appendix".
    # [^}]* keeps the match inside a single pair of braces.
    main_content = re.sub(r"\\input\{[^}]*appendix[^}]*\}", "", main_content, flags=re.IGNORECASE)

    # Drop sections / subsections whose *title* mentions "appendix", together with
    # their body up to the next heading. The title part must not use ".*?" here:
    # under DOTALL it runs past the closing brace, so the first \section{...} plus
    # everything up to the first occurrence of the word "appendix" anywhere in the
    # document would be deleted.
    main_content = re.sub(
        r"\\section\*?\{[^}]*appendix[^}]*\}.*?(?=\\section|\Z)",
        "", main_content, flags=re.DOTALL | re.IGNORECASE)
    main_content = re.sub(
        r"\\subsection\*?\{[^}]*appendix[^}]*\}.*?(?=\\section|\\subsection|\Z)",
        "", main_content, flags=re.DOTALL | re.IGNORECASE)
    # Only the acknowledgement heading itself is removed; its body text is kept.
    main_content = re.sub(
        r"\\section\*?\{[^}]*acknowledgement[^}]*\}", "", main_content, flags=re.IGNORECASE)

    # Inline the files referenced by the remaining \input commands.
    pattern = r"\\input\{([^\}]+)\}"
    main_content = re.sub(pattern, lambda m: replace_input_with_content(m, directory_path), main_content)
    main_content = re.sub(r'\\label\{.*?\}', '', main_content)
    # \input commands that came in with the inlined files are not expanded further.
    main_content = re.sub(r'\\input\{.*?\}', '', main_content)
    commands_to_remove = [
        r'\\small',
        r'\\clearpage',
        r'\\newpage',
        r'\\normalsize',
        r'\\maketitle'
    ]
    for cmd in commands_to_remove:
        # The lookahead stops \small from eating the prefix of longer command
        # names such as \smallskip (which would leave a stray "skip" behind).
        main_content = re.sub(cmd + r'(?![a-zA-Z])', '', main_content)
    return main_content

def extract_content_between_document_tags(main_content):
    r"""Return the stripped text between ``\begin{document}`` and ``\end{document}``.

    Only the first such span is considered; an empty string is returned when the
    pair of tags is not present.
    """
    document_body = re.search(
        r"\\begin\{document\}(.*?)\\end\{document\}", main_content, re.DOTALL
    )
    return document_body.group(1).strip() if document_body else ""

def remove_empty_and_single_symbol_lines(content):
    """Drop lines that are blank or hold a single non-whitespace character.

    Kept lines are returned unchanged (their own whitespace included), joined
    with newlines.
    """
    kept = []
    for raw_line in content.split('\n'):
        # A stripped length of 0 (blank) or 1 (lone symbol) means the line is dropped.
        if len(raw_line.strip()) > 1:
            kept.append(raw_line)
    return '\n'.join(kept)

def remove_figure_environment(result_content):
    r"""Strip every ``figure`` / ``figure*`` environment, including its body.

    Matching is non-greedy and spans newlines, so each ``\begin{figure}`` is
    paired with the nearest following ``\end{figure}``.
    """
    figure_block = re.compile(
        r'\\begin\{figure\*?(\[[^\]]*\])?\}.*?\\end\{figure\*?\}', re.DOTALL
    )
    return figure_block.sub('', result_content)

def remove_newlines_within_sections(content):
    r"""Join the lines of every ``\section`` / ``\subsection`` chunk into one line.

    A chunk runs from a ``\section{...}`` or ``\subsection{...}`` heading up to the
    next ``\section`` / ``\subsection`` (or the end of the text). Inside a chunk each
    newline is replaced by a space, except a newline that directly follows a colon.
    Text before the first heading is left untouched.

    Earlier versions wrapped every LaTeX command in a placeholder before the
    substitution and removed it afterwards. The wrapping did not shield anything
    from the newline substitution, and the restore step could not find commands that
    themselves contained a newline (or overlapped another command), which left
    placeholder text in the result. The substitution is therefore applied directly.
    """
    def replace_newlines_in_match(match):
        # Flatten one chunk, keeping newlines that come right after a colon.
        return re.sub(r'(?<!:)\n', ' ', match.group(0))

    # Sections first (each chunk stops at the next heading of either level),
    # then the subsections that the first pass left untouched.
    for heading in (r'\\section', r'\\subsection'):
        pattern = '(' + heading + r'\{.*?\}.*?)(?=(\\section|\\subsection|\Z))'
        content = re.sub(pattern, replace_newlines_in_match, content, flags=re.DOTALL)

    return content

def add_newlines_before_commands(content):
    r"""Collapse the text onto one line, then start a new line before structural commands.

    Every newline not directly preceded by a colon becomes a space. Afterwards a
    newline is inserted in front of each ``\section{``, ``\subsection{`` and
    ``\begin{``.
    """
    # Collapse line breaks; a break right after a colon is kept.
    flattened = re.sub(r'(?<!:)\n', ' ', content)

    # Each entry serves as the search pattern and, prefixed with a newline,
    # as the replacement text.
    for marker in (r'\\section{', r'\\subsection{', r'\\begin{'):
        flattened = re.sub(marker, '\n' + marker, flattened)

    return flattened



# Directory holding the TeX files to process: the extraction directory name with
# an "_arXiv" suffix (presumably the output of arxiv_latex_cleaner — confirm).
directory_path = save_path+'_arXiv'

# Run the clean-up stages in order, each one consuming the previous result.
result_content = integrate_content(directory_path)
for stage in (
    extract_content_between_document_tags,
    remove_empty_and_single_symbol_lines,
    remove_figure_environment,
    remove_newlines_within_sections,
    add_newlines_before_commands,
):
    result_content = stage(result_content)

# Write the result to a new file next to the sources, prefixed with a fixed header.
output_file_path = os.path.join(directory_path, "integrated_main.tex")
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write("Full paper contents:" + result_content)

print(f"Integrated content has been written to {output_file_path}")


Integrated content has been written to ./data/paper/ADBench_arXiv/integrated_main.tex


In [55]:
# 爬取OpenReview 审稿意见
! pip install wordcloud nltk pandas imageio selenium tqdm
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /Users/ppwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ppwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/ppwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ppwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
# get openreview results
import os
import time
import argparse
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.edge.service import Service

from urllib.parse import quote_plus
# Chrome needs its own Service class; the `Service` imported above is the Edge one.
from selenium.webdriver.chrome.service import Service as ChromeService

# NOTE(review): absolute, machine-specific driver location — adjust per machine.
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
# Selenium 4 takes the driver location through a Service object; the
# `executable_path=` keyword and the `find_elements_by_*` helpers were removed.
driver = webdriver.Chrome(service=ChromeService(CHROMEDRIVER_PATH))

try:
    # Search OpenReview for query_name and follow the first result.
    # The query is URL-encoded so spaces, '&', '#', ... cannot corrupt the URL.
    search_link = f'https://openreview.net/search?term={quote_plus(query_name)}&group=all&content=all&source=all'
    driver.get(search_link)

    # XPath of the first result's title link
    xpath = '//*[@id="content"]/div/div/ul/li[1]/div/h4/a[1]'

    # Wait (up to 10 s) for the link to appear
    link_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )

    review_path = link_element.get_attribute("href")
    print(review_path)

    # Open the forum page and wait (up to 60 s) for the review notes to be present.
    driver.get(review_path)
    xpath = '//div[@id="note_children"]//span[@class="note_content_value"]/..'
    cond = EC.presence_of_element_located((By.XPATH, xpath))
    WebDriverWait(driver, 60).until(cond)

    elems = driver.find_elements(By.XPATH, xpath)
    assert len(elems), 'empty ratings'
    # Each matched element's text has the form "<Field>: <value>[: <more>]";
    # the piece after the first ': ' is taken as the value.
    ratings = [int(x.text.split(': ')[1]) for x in elems if x.text.startswith('Rating:')]
    decisions = [x.text.split(': ')[1] for x in elems if x.text.startswith('Decision:')]
    summary_elems = driver.find_elements(By.XPATH, "//span[contains(text(), 'Summary And Contributions:')]/following-sibling::span[@class='note_content_value markdown-rendered']")
    summaries = [' '.join(x.text for x in elem.find_elements(By.XPATH, ".//p")) for elem in summary_elems]

    # Strengths / weaknesses are collected from list items only, joined with ';'.
    strengths_elems = driver.find_elements(By.XPATH, "//span[contains(text(), 'Strengths:')]/following-sibling::span[@class='note_content_value markdown-rendered']")
    strengths_list = [';'.join(x.text for x in elem.find_elements(By.XPATH, ".//li")) for elem in strengths_elems]

    weaknesses_elems = driver.find_elements(By.XPATH, "//span[contains(text(), 'Weaknesses:')]/following-sibling::span[@class='note_content_value markdown-rendered']")
    weaknesses_list = [';'.join(x.text for x in elem.find_elements(By.XPATH, ".//li")) for elem in weaknesses_elems]
finally:
    # Always shut the browser down, even when a wait times out or parsing fails.
    driver.quit()

https://openreview.net/forum?id=foA_SFQ9zo0


In [58]:
import pandas as pd

# One row per rating; all other columns are aligned to that length.
n_reviews = len(ratings)

text_columns = {
    'Summary and Contributions': summaries,
    'Strengths': strengths_list,
    'Weaknesses': weaknesses_list,
}
for column, values in text_columns.items():
    if len(values) != n_reviews:
        print(f"Warning: {column!r} has {len(values)} entries for {n_reviews} ratings; padding/truncating.")

data = {
    'Ratings': ratings,
    # The decision (the first one scraped, if any) is repeated on every row.
    # Repeating the whole `decisions` list only lines up when it holds exactly one entry.
    'Decisions': (decisions[:1] or ['']) * n_reviews,
}
for column, values in text_columns.items():
    # Pad with empty strings / truncate so every column has n_reviews entries;
    # pandas raises ValueError when column lengths differ.
    data[column] = (list(values) + [''] * n_reviews)[:n_reviews]

# Build the data frame
df = pd.DataFrame(data)

# Save it as a CSV file next to the integrated paper
df.to_csv(f'{directory_path}/reviews_data.csv', index=False)
print(df)

   Ratings Decisions                          Summary and Contributions  \
0        7    Accept  This paper introduces ADBench, a tabular anoma...   
1        8    Accept  The authors propose a comprehensive Anomaly De...   
2        6    Accept  This paper presents ADBench, a comprehensive t...   
3        7    Accept  The paper proposes a comprehensive benchmark f...   
4        7    Accept  Although there are many existing AD benchmarks...   
5        6    Accept  This paper provides a detailed and thorough be...   

                                           Strengths  \
0  Large, well-designed benchmark for tabular ano...   
1  The authors have proposed a large collection o...   
2  ADBench has a large algorithm collection with ...   
3  the ADBench proposed in the paper includes the...   
4  This paper provides a comprehensive AD benchma...   
5                                                      

                                          Weaknesses  
0  At the end of the day, 

In [59]:
# Combine the paper and its reviews into a single prompt file
# 1. Read the integrated .tex file
# NOTE(review): query_name is hard-coded here and overrides the value entered in
# the first cell — confirm this is intended before using another paper.
query_name='ADBench'
dirpath = f'./data/paper/{query_name}.tar.gz'
save_path = dirpath.split('.tar.gz')[0]
directory_path = save_path+'_arXiv'
with open(f'{directory_path}/integrated_main.tex', 'r', encoding='utf-8') as tex_file:
    tex_content = tex_file.read()

# 2. Read the reviews .csv file from the directory the reviews cell wrote it to.
#    Opening 'reviews_data.csv' relative to the working directory would miss that
#    file (or pick up a stale copy).
with open(f'{directory_path}/reviews_data.csv', 'r', encoding='utf-8') as csv_file:
    csv_content = csv_file.read()

# 3. Concatenate the instruction, the paper and the reviews
combined_content = "作为人工智能顶级会议(NeurIPS, ICLR)的资深审稿人，请根据以下提供的论文和审稿意见，提供一个论文分享PPT的内容概要，要求PPT不超过10页。\n\n" + tex_content + '\n\nOpenReview contents:\n' + csv_content

# 4. Write the combined content to a .txt file
with open(f'{directory_path}/output.txt', 'w', encoding='utf-8') as txt_file:
    txt_file.write(combined_content)