In [None]:
import os, openai, json, llm_utils, re
import pandas as pd
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from Arxiv_API import Arxiv_API
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv), then read the settings
# this notebook needs from the process environment. Missing keys come back as None.
load_dotenv()
rootdir = os.environ.get("rootdir")
gemini_key = os.environ.get("gemini_key")
connection_str = os.environ.get("connection_str")

# Azure Blob Storage service client built from the connection string read above.
blob_service_client = BlobServiceClient.from_connection_string(connection_str)



In [None]:
# Fetch one article PDF from the Azure storage account and save it next to the
# other sample data.
with open("../SampleData/Data.pdf", "wb") as pdf_file:
    blob_data = blob_service_client.get_blob_client(
        container="samplepicture",
        blob="Data/Article_2306.00251v1.pdf",
    ).download_blob()
    pdf_file.write(blob_data.readall())


# Or query arXiv through Arxiv_API (called with downloadstuff=True) and keep the
# DataFrame it exposes as `.df`.
data = Arxiv_API(
    search=['math', 'physics', 'cs', 'econ', 'eess', 'q-bio', 'q-fin', 'stat'],
    start_date='2000-01-01',
    end_date='2024-12-31',
    start=0,
    sampling_unit_size=50,
    max_results=50,
    downloadstuff=True,
).df

In [3]:
# Or skip the downloads and read the records already stored in metadata.json.
with open('../SampleData/metadata.json') as outfile:
    outputs = json.loads(outfile.read())

# Tabulate the loaded records and render the resulting frame.
metadata = pd.DataFrame(data=outputs)
display(metadata)

Unnamed: 0,ID,PublishDate,Title,Authors,Journal_Ref,Comment,Abstract,Content,Primary_Cat,Category,PDF_link,file_path
0,9904016v1,1999-04-22T15:47:22Z,Brittle System Analysis,"Stephen F. Bush, John Hershey, Kirby Vosburgh",No Journal_Ref,No Comment,The goal of this paper is to define and analyz...,"arXiv:cs/9904016v1 [cs.NI] 22 Apr 1999BUSH, ...",cs.NI,No Category,http://arxiv.org/pdf/cs/9904016v1,/Users/tengli/Python/chatGPT/SampleData/cs/Art...
1,2412.05784v3,2024-12-08T02:30:23Z,ASC-Hook: fast and transparent system call hoo...,"Yang Shen, Min Xie, Wenzhe Zhang, Tao Wu",No Journal_Ref,"11 pages (including appendix), 6 figures, not ...",Intercepting system calls is crucial for tools...,ASC-Hook: fast and transparent system call hoo...,cs.AR,No Category,http://arxiv.org/pdf/2412.05784v3,/Users/tengli/Python/chatGPT/SampleData/cs/Art...
2,2207.01849v1,2022-07-05T07:27:04Z,Learnings from an Under the Hood Analysis of a...,"Pratik Mishra, Rekha Pitchumani, Yang Suk Kee",No Journal_Ref,No Comment,Conventional object-stores are built on top of...,\n 1 \n Learnings from an Under the Hood Anal...,cs.DB,No Category,http://arxiv.org/pdf/2207.01849v1,/Users/tengli/Python/chatGPT/SampleData/cs/Art...
3,0410044v5,2004-10-18T17:39:51Z,An Example of Clifford Algebras Calculations w...,Vladimir V. Kisil,"Advances in Applied Clifford Algebras, 15(2005...","20 pages, LaTeX2e, 12 PS graphics in one figur...",This example of Clifford algebras calculations...,arXiv:cs/0410044v5 [cs.MS] 9 Dec 2005arXiv: ...,cs.MS,No Category,http://arxiv.org/pdf/cs/0410044v5,/Users/tengli/Python/chatGPT/SampleData/cs/Art...
4,2407.10098v1,2024-07-14T06:36:19Z,Accelerator-as-a-Service in Public Clouds: An ...,"Jiechen Zhao, Ran Shu, Katie Lim, Zewen Fan, T...",No Journal_Ref,No Comment,I/O devices in public clouds have integrated i...,Accelerator-as-a-Service in Public Clouds:\nAn...,cs.OS,No Category,http://arxiv.org/pdf/2407.10098v1,/Users/tengli/Python/chatGPT/SampleData/cs/Art...
...,...,...,...,...,...,...,...,...,...,...,...,...
290,2111.12267v1,2021-11-24T04:55:59Z,The Practical Scope of the Central Limit Theorem,"David Draper, Erdong Guo",No Journal_Ref,"47 pages, 17 figures",The \textit{Central Limit Theorem (CLT)} is at...,The Practical Scope\nof the Central Limit Theo...,stat.OT,No Category,http://arxiv.org/pdf/2111.12267v1,/Users/tengli/Python/chatGPT/SampleData/stat/A...
291,2209.00636v1,2022-09-01T17:42:55Z,Testing for the Important Components of Poster...,"Dean Dustin, Bertrand Clarke",No Journal_Ref,No Comment,We give a decomposition of the posterior predi...,Testing for the Important Components of Poster...,stat.ME,No Category,http://arxiv.org/pdf/2209.00636v1,/Users/tengli/Python/chatGPT/SampleData/stat/A...
292,2304.04249v1,2023-04-09T14:51:58Z,Convergent estimators of variance of a spatial...,Ashwin K Seshadri,No Journal_Ref,No Comment,"In the geosciences, a recurring problem is one...",Convergent estimators of variance of a spatial...,math.ST,No Category,http://arxiv.org/pdf/2304.04249v1,/Users/tengli/Python/chatGPT/SampleData/stat/A...
293,2407.18835v3,2024-07-26T15:54:37Z,Robust Estimation of Polychoric Correlation,"Max Welz, Patrick Mair, Andreas Alfons",No Journal_Ref,"50 pages (30 main text), 13 figures (8 in main...",Polychoric correlation is often an important b...,Robust Estimation of Polychoric\nCorrelation∗\...,stat.ME,No Category,http://arxiv.org/pdf/2407.18835v3,/Users/tengli/Python/chatGPT/SampleData/stat/A...


In [None]:
def GetTitleAuthor_gpt(metadata=None):
    """Ask a GPT chat model for the title and author names of every paper.

    For each row, the ``Content`` text is passed through
    ``llm_utils.get_tokens_between_indices`` (``max_tokens=100``) and the result
    is sent as the user prompt, together with a fixed system prompt, to
    ``llm_utils.get_gpt_response``. The prompt and the response are printed.
    After all rows are processed, the responses are written in row order as a
    JSON list to ``all_results.json`` in the current working directory.

    Args:
        metadata: DataFrame with a ``Content`` column. When omitted, the
            notebook-level ``metadata`` global is used, so the original
            zero-argument call keeps working.

    Returns:
        None. The responses are only persisted to ``all_results.json``.
    """
    if metadata is None:
        # Backward compatibility: earlier versions always read the global frame.
        metadata = globals()["metadata"]

    engine = "gpt-3.5-turbo"
    system_prompt = "Please extract the title and author names:"
    input_max_tokens = 100

    def getEachRes(row):
        # Only a short prefix of the paper is sent; presumably the title and
        # authors sit at the very start of the extracted text.
        user_prompt = llm_utils.get_tokens_between_indices(
            text=row['Content'],
            engine=engine,
            max_tokens=input_max_tokens)
        print(user_prompt)

        response = llm_utils.get_gpt_response(user_prompt=user_prompt,
                                              system_prompt=system_prompt,
                                              engine=engine,
                                              max_completion=500)
        print("ChatGPT Response:", response)
        return response

    results = [
        getEachRes(row)
        for _, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Processing rows")
    ]

    # After all rows are processed, write the accumulated results
    with open("all_results.json", 'w') as f:
        json.dump(results, f)


# Run the GPT extraction over the notebook-level `metadata` frame; the responses
# are written to all_results.json in the current working directory.
GetTitleAuthor_gpt()

In [137]:
def GetTitleAuthor_gemini(metadata):
    """Ask Gemini for the title and author names of every paper in ``metadata``.

    For each row, the first 1000 characters of ``Content`` are appended to a
    fixed instruction and sent through ``llm_utils.get_gemini_response`` using
    the notebook-level ``gemini_key``. Each response is printed. After all rows
    are processed, the responses are written in row order as a JSON list to
    ``<rootdir>/results/all_results.json``.

    Args:
        metadata: DataFrame with a string ``Content`` column.

    Returns:
        None. The responses are only persisted to the JSON file.
    """
    user_prompt = "Please extract the title and author names: "
    # This is a character budget: the text is cut with a plain string slice,
    # not by counting tokens.
    max_chars = 1000

    # Create the output folder before any API call is made. Otherwise a missing
    # "results" directory would only surface after the loop and every response
    # would be lost.
    results_dir = os.path.join(rootdir, "results")
    os.makedirs(results_dir, exist_ok=True)

    def getEachRes(row):
        raw_text = row['Content']
        response = llm_utils.get_gemini_response(
            user_prompt=user_prompt + raw_text[:max_chars],
            api_key=gemini_key)

        print("Gemini response:", response)
        return response

    results = [
        getEachRes(row)
        for _, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Processing rows")
    ]

    # After all rows are processed, write the accumulated results
    with open(os.path.join(results_dir, "all_results.json"), 'w') as f:
        json.dump(results, f)


# Run the Gemini extraction on the loaded `metadata` frame; the responses are
# written to <rootdir>/results/all_results.json.
GetTitleAuthor_gemini(metadata=metadata)

Processing rows:  20%|██        | 1/5 [00:00<00:01,  2.17it/s]

Gemini response: Title: On randomized confidence intervals for the binomial probability

Author: Paul Kabaila



Processing rows:  40%|████      | 2/5 [00:01<00:01,  1.68it/s]

Gemini response: **Title:** Confidence-Aware Learning for Camouflaged Object Detection

**Authors:** Jiawei Liu, Jing Zhang, Nick Barnes



Processing rows:  60%|██████    | 3/5 [00:01<00:01,  1.60it/s]

Gemini response: **Title:** A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets

**Authors:** Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei



Processing rows:  80%|████████  | 4/5 [00:02<00:00,  1.88it/s]

Gemini response: Title: Confident AI

Author: Jim Davis



Processing rows: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]

Gemini response: Title: Simultaneous confidence bands for the integrated hazard function

Authors: Anna Dudek, Maciej Gołcwin, Jacek Leśkow






In [138]:
# Read the saved model responses back from <rootdir>/results/all_results.json.
results_path = os.path.join(rootdir, 'results', 'all_results.json')
with open(results_path) as outfile:
    outputs = json.load(outfile)

# Bare expression so the notebook renders the loaded list.
outputs

['Title: On randomized confidence intervals for the binomial probability\n\nAuthor: Paul Kabaila\n',
 '**Title:** Confidence-Aware Learning for Camouflaged Object Detection\n\n**Authors:** Jiawei Liu, Jing Zhang, Nick Barnes\n',
 '**Title:** A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets\n\n**Authors:** Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei\n',
 'Title: Confident AI\n\nAuthor: Jim Davis\n',
 'Title: Simultaneous confidence bands for the integrated hazard function\n\nAuthors: Anna Dudek, Maciej Gołcwin, Jacek Leśkow\n']

In [139]:
def parse_to_dict(string_list):
    """Parse "Title: ... / Author(s): ..." model responses into a DataFrame.

    Each response is searched for a ``Title:`` label and an ``Author:`` or
    ``Authors:`` label. The value is the rest of that line, with surrounding
    whitespace and markdown bold markers (``**``) removed. Despite the name,
    the return value is a DataFrame, not a dict.

    Every input produces exactly one output row, so rows stay aligned with
    ``string_list``. A field that cannot be found is stored as a missing value
    rather than being skipped; skipping would shift the later rows or make the
    two columns differ in length.

    Args:
        string_list: Iterable of response strings. Non-string entries (for
            example ``None`` from a failed request) yield a row of missing values.

    Returns:
        pandas.DataFrame with the columns ``Title`` and ``Author``.
    """
    # `\**` skips bold markers that directly follow the label ("**Title:** X").
    # `[^\n]*` takes the rest of the line whether or not a newline follows, so
    # a response without a trailing newline is still parsed.
    field_patterns = {
        "Title": re.compile(r"Title:\**\s*([^\n]*)"),
        "Author": re.compile(r"Authors?:\**\s*([^\n]*)"),
    }

    def extract(pattern, text):
        match = pattern.search(text)
        if match is None:
            return None
        # Strip whitespace, then bold markers, then any whitespace they hid.
        return match.group(1).strip().strip("*").strip()

    rows = []
    for string in string_list:
        text = string if isinstance(string, str) else ""
        rows.append({column: extract(pattern, text)
                     for column, pattern in field_patterns.items()})

    return pd.DataFrame(rows, columns=list(field_patterns))

# Let each cell show up to 100 characters so long titles and author lists stay
# readable, then parse the loaded responses and render the table.
pd.options.display.max_colwidth = 100
output_dict = parse_to_dict(string_list=outputs)
display(output_dict)

Unnamed: 0,Title,Author
0,On randomized confidence intervals for the binomial probability,Paul Kabaila
1,Confidence-Aware Learning for Camouflaged Object Detection,"Jiawei Liu, Jing Zhang, Nick Barnes"
2,A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets,"Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei"
3,Confident AI,Jim Davis
4,Simultaneous confidence bands for the integrated hazard function,"Anna Dudek, Maciej Gołcwin, Jacek Leśkow"
