### Setting

In [24]:
import os
import sys
from dotenv import load_dotenv

load_dotenv()
WORK_DIR = os.environ.get("WORK_DIR")
print(WORK_DIR)
sys.path.append(WORK_DIR)

import pymupdf4llm
from pathlib import Path
from glob import glob
from src.llm.llm import LLM
from src.notion.crud import NotionRepository

/PaperSurvey


### Paper Search

In [20]:
import arxiv
query = "Graph Embedding Community Detection"
max_results = 30

client = arxiv.Client()
search = arxiv.Search(
    query=query,
    max_results=max_results,
    sort_by=arxiv.SortCriterion.Relevance,
    sort_order=arxiv.SortOrder.Descending
)
results = list(client.results(search))

In [21]:
ids = []
for result in results:
    id = result.entry_id.split('/')[-1]
    title = result.title
    ids.append((id, title))

In [26]:
WORK_DIR = Path(WORK_DIR)
pdf_dir = WORK_DIR / "docs/GraphEmbeddingCommunityDetection"
pdf_dir

PosixPath('/PaperSurvey/docs/GraphEmbeddingCommunityDetection')

In [27]:
for id, title in ids:
    search = arxiv.Search(id_list=[id])
    paper = next(client.results(search))
    pdf_path = pdf_dir / f"{title}.pdf"
    paper.download_pdf(filename=pdf_path)


### Make Summary

In [28]:
paper_paths = glob("/PaperSurvey/docs/GraphEmbeddingCommunityDetection/*.pdf")
len(paper_paths)

30

In [30]:
pdfs = []
for paper_path in paper_paths:
    paper_md = pymupdf4llm.to_markdown(paper_path)
    pdfs.append(paper_md)

Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Variational Embeddings for Community Detection and Node Representation.pdf...
Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Change Detection in Noisy Dynamic Networks: A Spectral Embedding Approach.pdf...
Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Semantic Random Walk for Graph Representation Learning in Attributed Graphs.pdf...
Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Graph Vertex Embeddings: Distance, Regularization and Community Detection.pdf...
Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Iterative embedding and reweighting of complex networks reveals community structure.pdf...
Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Network community detection via neural embeddings.pdf...
Processing /PaperSurvey/docs/GraphEmbeddingCommunityDetection/Classic Graph Structural Features Outperform Factorization-Based Graph Embedding Methods on Com

In [36]:
from tqdm import tqdm

WORK_DIR = Path(WORK_DIR)
PROMPT_DIR = WORK_DIR / "prompts"

all_input_tokens = 0
all_output_tokens = 0

debug:bool = False

notion: NotionRepository = NotionRepository()

md_resutlts = []

for i, paper_path in enumerate(tqdm(paper_paths)):
    llm_client: LLM = LLM(base="openai", model="gpt-4o-mini")

    title = Path(paper_path).stem[0: len(str(paper_path))-4]
    paper_md: str = pdfs[i]
        
    prompt_path: Path = PROMPT_DIR / "v1.txt"
    prompt: str = open(prompt_path).read()
    prompt = prompt.replace("<<INPUT>>", paper_md)

    if debug:
        print(title)
        print(prompt)
        break

    llm_client.set_prompt(text=prompt)
    md_content, input_tokens, output_tokens = llm_client.get_response()

    md_resutlts.append(md_content)

    notion.create_markdown_page(title, md_content)

    all_input_tokens += input_tokens
    all_input_tokens += output_tokens

100%|██████████| 30/30 [15:18<00:00, 30.61s/it]


In [37]:
print(150 * 0.15 * all_input_tokens / 1000000 + 150 * 0.6 *  all_output_tokens / 1000000)

13.4860725
