In [None]:
import os, openai, json, llm_utils, re
import pandas as pd
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from Arxiv_API import Arxiv_API
from dotenv import load_dotenv

# Load settings first (load_dotenv is expected to populate the process
# environment from a local .env file) so the lookups below can see them.
load_dotenv()

# Each lookup yields None when the variable is not defined.
rootdir = os.environ.get("rootdir")
gemini_key = os.environ.get("gemini_key")
connection_str = os.environ.get("connection_str")

# Shared Azure Blob Storage client built from the configured connection string.
blob_service_client = BlobServiceClient.from_connection_string(
    conn_str=connection_str
)



In [None]:
# Download the PDF from the Azure storage account.
# The blob is fetched *before* the local file is opened: "wb" truncates the
# target immediately, so a failed download would otherwise leave an empty
# ../SampleData/Data.pdf behind.
blob_data = blob_service_client.get_blob_client(
    container="samplepicture",
    blob="Data/Article_2306.00251v1.pdf",
).download_blob()
pdf_bytes = blob_data.readall()

with open("../SampleData/Data.pdf", "wb") as pdf_file:
    pdf_file.write(pdf_bytes)


# Alternative source: query arXiv over the listed subject areas and keep the
# resulting `.df` attribute as `data`.
# NOTE(review): downloadstuff=True presumably also downloads the PDFs and
# writes the metadata file — confirm in Arxiv_API.
data = Arxiv_API(
    search=['math', 'physics', 'cs', 'econ', 'eess', 'q-bio', 'q-fin', 'stat'],
    start_date='2000-01-01',
    end_date='2024-12-31',
    start=0,
    max_results=50,
    downloadstuff=True,
).df

In [4]:
# Or load the previously saved records from metadata.json.
# The encoding is passed explicitly: JSON text is UTF-8, and relying on the
# platform default codec can fail on non-ASCII author names or paper content.
with open('../SampleData/metadata.json', encoding='utf-8') as metadata_file:
    outputs = json.load(metadata_file)

# One row per paper; the 'Content' column is what the extraction cells below read.
metadata = pd.DataFrame(outputs)
display(metadata)

Unnamed: 0,ID,PublishDate,Title,Authors,Journal_Ref,Comment,Abstract,Content,Primary_Cat,Category,PDF_link,file_path
0,9904016v1,1999-04-22T15:47:22Z,Brittle System Analysis,"Stephen F. Bush, John Hershey, Kirby Vosburgh",No Journal_Ref,No Comment,The goal of this paper is to define and analyz...,"arXiv:cs/9904016v1 [cs.NI] 22 Apr 1999BUSH, ...",cs.NI,No Category,http://arxiv.org/pdf/cs/9904016v1,/Users/tengli/Python/chatGPT/SampleData/cs/Art...
1,2501.06270v1,2025-01-09T19:38:21Z,Sectorial Exclusion Criteria in the Marxist An...,Jose Mauricio Gomez Julian,No Journal_Ref,No Comment,The long-term estimation of the Marxist averag...,1SECTORIAL EXCLUSION CRITERIA IN THE MARXIST A...,econ.GN,No Category,http://arxiv.org/pdf/2501.06270v1,/Users/tengli/Python/chatGPT/SampleData/econ/A...
2,1907.05351v1,2019-07-11T16:19:19Z,Optimized Sharing of Coefficients in Parallel ...,"M. Tunç Arslan, Onur Yorulmaz, Erdinç L. Atılgan",No Journal_Ref,"10 pages, submitted to IEEE Transactions on Si...",Filters are the basic and most important block...,1\nOptimized Sharing of Coefﬁcients in Paralle...,eess.SP,No Category,http://arxiv.org/pdf/1907.05351v1,/Users/tengli/Python/chatGPT/SampleData/eess/A...
3,2408.05119v1,2024-08-09T15:17:42Z,Acto-myosin clusters as active units shaping l...,"Karsten Kruse, Rémi Berthoz, Luca Barberi, Ann...",No Journal_Ref,No Comment,Stress generation by the actin cytoskeleton sh...,1 Acto -myosin clusters as active units sh...,q-bio.TO,No Category,http://arxiv.org/pdf/2408.05119v1,/Users/tengli/Python/chatGPT/SampleData/q-bio/...
4,2206.09877v1,2022-06-20T16:29:08Z,Efficient Pricing and Calibration of High-Dime...,"Lech A. Grzelak, Juliusz Jablecki, Dariusz Gat...",No Journal_Ref,"23 pages, 21 figures",This paper studies equity basket options -- i....,Ecient Pricing and Calibration of High-Dimens...,q-fin.CP,No Category,http://arxiv.org/pdf/2206.09877v1,/Users/tengli/Python/chatGPT/SampleData/q-fin/...
5,1211.1183v2,2012-11-06T11:26:30Z,KernSmoothIRT: An R Package for Kernel Smoothi...,"Angelo Mazza, Antonio Punzo, Brian McGuire",No Journal_Ref,No Comment,Item response theory (IRT) models are a class ...,JSS\nKernSmoothIRT : AnRPackage for Kernel\nSm...,stat.CO,No Category,http://arxiv.org/pdf/1211.1183v2,/Users/tengli/Python/chatGPT/SampleData/stat/A...


In [None]:
def GetTitleAuthor_gpt(metadata=None):
    """Ask the GPT model for the title and authors of each paper and save the replies.

    For every row, the ``Content`` text is passed through
    ``llm_utils.get_tokens_between_indices`` (with ``max_tokens=100``) to build
    the user prompt, which is then sent together with a fixed system prompt via
    ``llm_utils.get_gpt_response``. All replies are collected in row order and
    written as a JSON list to ``all_results.json`` in the current working
    directory once every row has been processed.

    Args:
        metadata: DataFrame with a ``Content`` column. When omitted, the
            notebook-level ``metadata`` frame is used, so existing
            ``GetTitleAuthor_gpt()`` calls keep working.

    Returns:
        None. The replies are printed as they arrive and persisted to disk.
    """
    if metadata is None:
        # Backward-compatible fallback to the frame defined in an earlier cell.
        metadata = globals()["metadata"]

    engine = "gpt-3.5-turbo"
    system_prompt = "Please extract the title and author names:"
    # Budget handed to llm_utils for the user prompt and for the completion.
    # NOTE(review): presumably the first 100 tokens of the paper are enough to
    # cover title and authors — confirm against llm_utils and real inputs.
    input_max_tokens = 100
    max_completion = 500

    def getEachRes(row):
        """Build the prompt for one row, query the model, and return its reply."""
        user_prompt = llm_utils.get_tokens_between_indices(
            text=row['Content'],
            engine=engine,
            max_tokens=input_max_tokens)
        print(user_prompt)

        response = llm_utils.get_gpt_response(user_prompt=user_prompt,
                                              system_prompt=system_prompt,
                                              engine=engine,
                                              max_completion=max_completion)
        print("ChatGPT Response:", response)
        return response

    results = []
    for _, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Processing rows"):
        results.append(getEachRes(row))

    # After all rows are processed, write the accumulated results
    with open("all_results.json", 'w') as f:
        json.dump(results, f)


GetTitleAuthor_gpt(metadata=metadata)

In [137]:
def GetTitleAuthor_gemini(metadata):
    """Ask Gemini for the title and authors of each paper and save the replies.

    For every row, a fixed instruction followed by the first 1000 characters
    of the ``Content`` column is sent through ``llm_utils.get_gemini_response``.
    The replies are collected in row order and written as a JSON list to
    ``<rootdir>/results/all_results.json`` once every row has been processed.

    Args:
        metadata: DataFrame with a ``Content`` column holding the paper text.

    Returns:
        None. The replies are printed as they arrive and persisted to disk.
    """
    user_prompt = "Please extract the title and author names: "
    # This is a character budget (plain string slice), not a token count.
    max_chars = 1000

    # Create the output folder up front: discovering that it is missing only
    # when writing would throw away every response already paid for.
    results_dir = os.path.join(rootdir, "results")
    os.makedirs(results_dir, exist_ok=True)

    def getEachRes(row):
        """Query Gemini with the leading slice of one row's content."""
        raw_text = row['Content']
        response = llm_utils.get_gemini_response(
            user_prompt=user_prompt + raw_text[:max_chars],
            api_key=gemini_key)

        print("Gemini response:", response)
        return response

    results = []
    for _, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Processing rows"):
        results.append(getEachRes(row))

    # After all rows are processed, write the accumulated results
    with open(os.path.join(results_dir, "all_results.json"), 'w') as f:
        json.dump(results, f)


GetTitleAuthor_gemini(metadata=metadata)

Processing rows:  20%|██        | 1/5 [00:00<00:01,  2.17it/s]

Gemini response: Title: On randomized confidence intervals for the binomial probability

Author: Paul Kabaila



Processing rows:  40%|████      | 2/5 [00:01<00:01,  1.68it/s]

Gemini response: **Title:** Confidence-Aware Learning for Camouflaged Object Detection

**Authors:** Jiawei Liu, Jing Zhang, Nick Barnes



Processing rows:  60%|██████    | 3/5 [00:01<00:01,  1.60it/s]

Gemini response: **Title:** A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets

**Authors:** Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei



Processing rows:  80%|████████  | 4/5 [00:02<00:00,  1.88it/s]

Gemini response: Title: Confident AI

Author: Jim Davis



Processing rows: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]

Gemini response: Title: Simultaneous confidence bands for the integrated hazard function

Authors: Anna Dudek, Maciej Gołcwin, Jacek Leśkow






In [138]:
# Read back the list of raw model responses saved by the extraction cell.
results_path = os.path.join(rootdir, 'results', 'all_results.json')
with open(results_path) as results_file:
    outputs = json.load(results_file)

outputs

['Title: On randomized confidence intervals for the binomial probability\n\nAuthor: Paul Kabaila\n',
 '**Title:** Confidence-Aware Learning for Camouflaged Object Detection\n\n**Authors:** Jiawei Liu, Jing Zhang, Nick Barnes\n',
 '**Title:** A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets\n\n**Authors:** Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei\n',
 'Title: Confident AI\n\nAuthor: Jim Davis\n',
 'Title: Simultaneous confidence bands for the integrated hazard function\n\nAuthors: Anna Dudek, Maciej Gołcwin, Jacek Leśkow\n']

In [139]:
def parse_to_dict(string_list):
    """Parse raw LLM responses into a DataFrame of titles and authors.

    Each response is searched for a ``Title:`` line and an ``Author:`` /
    ``Authors:`` line. Markdown bold markers around the label or the value
    (e.g. ``**Title:** Foo``) are removed, and the final line does not need a
    trailing newline.

    Args:
        string_list: Iterable of response strings, one per paper.

    Returns:
        pandas.DataFrame with columns ``Title`` and ``Author`` and exactly one
        row per input string, in input order. A field that cannot be found is
        left missing (None/NaN) so that titles and authors always stay aligned
        with the response they came from.
    """
    def _field(label_pattern, text):
        # Skip bold markers and whitespace right after the colon, then capture
        # the rest of the line; `$` covers a last line with no newline.
        match = re.search(rf"(?:{label_pattern}):\**\s*(.*?)(?:\n|$)", text)
        if match is None:
            return None
        # Remove surrounding whitespace and any leftover `*` markers, then
        # whitespace again so nothing is left dangling next to the markers.
        return match.group(1).strip().strip("*").strip()

    records = [
        {"Title": _field("Title", text), "Author": _field("Authors?", text)}
        for text in string_list
    ]
    return pd.DataFrame(records, columns=["Title", "Author"])

# Allow up to 100 characters per cell so long titles are not cut off, then
# parse the raw responses into a table and render it.
pd.set_option('display.max_colwidth', 100)
output_dict = parse_to_dict(outputs)
display(output_dict)

Unnamed: 0,Title,Author
0,On randomized confidence intervals for the binomial probability,Paul Kabaila
1,Confidence-Aware Learning for Camouflaged Object Detection,"Jiawei Liu, Jing Zhang, Nick Barnes"
2,A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets,"Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei"
3,Confident AI,Jim Davis
4,Simultaneous confidence bands for the integrated hazard function,"Anna Dudek, Maciej Gołcwin, Jacek Leśkow"
