In [None]:
import os, openai, json, llm_utils, re
import pandas as pd
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from Arxiv_API import Arxiv_API
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Project root, Gemini API key and Azure Storage connection string are all read
# from environment variables; each is None when the variable is not set.
rootdir = os.environ.get("rootdir")
gemini_key = os.environ.get("gemini_key")
connection_str = os.environ.get("connection_str")

# Azure Blob Storage client built from the connection string above.
blob_service_client = BlobServiceClient.from_connection_string(
    conn_str=connection_str
)



In [None]:
# Download the PDF from the Azure storage account.
# The blob is read completely BEFORE the destination file is opened, so a failed
# download cannot leave an empty or truncated ../SampleData/Data.pdf behind.
blob_data = blob_service_client.get_blob_client(
    container="samplepicture",
    blob="Data/Article_2306.00251v1.pdf",
).download_blob()
pdf_bytes = blob_data.readall()
with open("../SampleData/Data.pdf", "wb") as pdf_file:
    pdf_file.write(pdf_bytes)


# Or download PDFs from arXiv and save their metadata (handled by the project's
# Arxiv_API helper); the resulting table is exposed through its `.df` attribute.
data = Arxiv_API(
    file_dir='../SampleData',
    search='Bayesian',
    start_date='2024-01-01',
    end_date='2024-11-30',
    start=0,
    max_results=2,
    downloadstuff=True,
).df

In [136]:
# Or load the data from metadata.json.
# JSON text is UTF-8; the encoding is passed explicitly so non-ASCII characters
# (ligatures in the extracted PDF text, accented author names) are not decoded
# with the platform's default encoding.
with open('../SampleData/metadata.json', encoding='utf-8') as outfile:
    outputs = json.load(outfile)

# One row per article; the 'Content' column is what the LLM cells below consume.
metadata = pd.DataFrame(outputs)
display(metadata)

Unnamed: 0,ID,PublishDate,Title,Authors,Journal_Ref,Comment,Abstract,Content,Primary_Cat,Category,PDF_link,file_path
0,1302.6659v1,2013-02-27T04:03:48Z,On randomized confidence intervals for the binomial probability,Paul Kabaila,No Journal_Ref,No Comment,"Suppose that X_1,X_2,...,X_n are independent and identically Bernoulli(theta)\ndistributed. Also...",arXiv:1302.6659v1 [math.ST] 27 Feb 2013On randomized conﬁdence intervals for the binomial\npro...,math.ST,No Category,http://arxiv.org/pdf/1302.6659v1,../SampleData/Article_1302.6659v1.pdf
1,2106.11641v1,2021-06-22T09:49:23Z,Confidence-Aware Learning for Camouflaged Object Detection,"Jiawei Liu, Jing Zhang, Nick Barnes",No Journal_Ref,No Comment,Confidence-aware learning is proven as an effective solution to prevent\nnetworks becoming overc...,Conﬁdence-Aware Learning for Camouﬂaged Object Detection\nJiawei Liu\nANU\njiawei.liu3@anu.edu.a...,cs.CV,No Category,http://arxiv.org/pdf/2106.11641v1,../SampleData/Article_2106.11641v1.pdf
2,1701.05455v1,2017-01-19T15:04:54Z,A Weighted Model Confidence Set: Applications to Local and Mixture Model\n Confidence Sets,"Amir T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei",No Journal_Ref,No Comment,"This article provides a weighted model confidence set, whenever underling\nmodel has been misspe...",arXiv:1701.05455v1 [stat.AP] 19 Jan 2017A Weighted Model Conﬁdence Set: Applications to Local ...,stat.AP,No Category,http://arxiv.org/pdf/1701.05455v1,../SampleData/Article_1701.05455v1.pdf
3,2202.05957v1,2022-02-12T02:26:46Z,Confident AI,Jim Davis,No Journal_Ref,No Comment,"In this paper, we propose ""Confident AI"" as a means to designing Artificial\nIntelligence (AI) a...",Condent AI\nJim Davis\nDept. Computer Science and Engineering\nOhio State University\nColumbus ...,cs.AI,No Category,http://arxiv.org/pdf/2202.05957v1,../SampleData/Article_2202.05957v1.pdf
4,0601346v1,2006-01-14T10:54:32Z,Simultaneous confidence bands for the integrated hazard function,"Anna Dudek, Maciej Gocwin, Jacek Leskow",No Journal_Ref,"15 pages, 4 figures",The construction of the simultaneous confidence bands for the integrated\nhazard function is con...,arXiv:math/0601346v1 [math.ST] 14 Jan 2006Simultaneous conﬁdence bands for the integrated haza...,math.ST,No Category,http://arxiv.org/pdf/math/0601346v1,../SampleData/Article_0601346v1.pdf


In [None]:
def GetTitleAuthor_gpt(metadata=None, output_path="all_results.json"):
    """Ask the GPT chat model for the title and author names of every article.

    For each entry of the 'Content' column, a token-limited excerpt of the text is
    sent as the user prompt together with a fixed system prompt. The responses are
    collected in row order and written to `output_path` as a JSON list once every
    row has been processed.

    Args:
        metadata: DataFrame with a 'Content' column. When omitted, the
            notebook-level `metadata` frame is used, so the original
            zero-argument call keeps working.
        output_path: Path of the JSON file that receives the list of responses.
            Defaults to "all_results.json" in the current working directory.

    Returns:
        None. The results are only written to disk.
    """
    if metadata is None:
        # Backward-compatible fallback to the frame defined in an earlier cell.
        metadata = globals()["metadata"]

    engine = "gpt-3.5-turbo"
    system_prompt = "Please extract the title and author names:"
    input_max_tokens = 100

    def getEachRes(raw_text):
        """Query the model for one article's text and return its response."""
        # Presumably returns the leading part of raw_text limited to
        # `max_tokens` tokens -- confirm against llm_utils.
        user_prompt = llm_utils.get_tokens_between_indices(
            text=raw_text,
            engine=engine,
            max_tokens=input_max_tokens)
        print(user_prompt)

        response = llm_utils.get_gpt_response(user_prompt=user_prompt,
                                              system_prompt=system_prompt,
                                              engine=engine,
                                              max_completion=500)
        print("ChatGPT Response:", response)
        return response

    # Iterate over the column directly instead of indexing rows by position.
    results = [
        getEachRes(raw_text)
        for raw_text in tqdm(metadata['Content'], desc="Processing rows")
    ]

    # After all rows are processed, write the accumulated results
    with open(output_path, 'w') as f:
        json.dump(results, f)


GetTitleAuthor_gpt(metadata=metadata)

In [137]:
def GetTitleAuthor_gemini(metadata):
    """Ask Gemini for the title and author names of every article in `metadata`.

    For each entry of the 'Content' column, the first characters of the text are
    appended to a fixed instruction and sent to Gemini. The responses are
    collected in row order and written as a JSON list to
    <rootdir>/results/all_results.json once every row has been processed.

    Args:
        metadata: DataFrame with a 'Content' column holding the article text.

    Returns:
        None. The results are only written to disk.
    """
    user_prompt = "Please extract the title and author names: "
    # This is a CHARACTER budget, not a token budget: the text is cut with a
    # plain string slice below.
    max_chars = 1000

    def getEachRes(raw_text):
        """Query Gemini for one article's text and return its response."""
        response = llm_utils.get_gemini_response(
            user_prompt=user_prompt + raw_text[0:max_chars],
            api_key=gemini_key,
        )

        print("Gemini response:", response)
        return response

    # Iterate over the column directly instead of indexing rows by position.
    results = [
        getEachRes(raw_text)
        for raw_text in tqdm(metadata['Content'], desc="Processing rows")
    ]

    # After all rows are processed, write the accumulated results.
    # Create the results directory first so a fresh checkout does not fail with
    # FileNotFoundError when opening the output file.
    results_dir = os.path.join(rootdir, "results")
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, "all_results.json"), 'w') as f:
        json.dump(results, f)


GetTitleAuthor_gemini(metadata=metadata)

Processing rows:  20%|██        | 1/5 [00:00<00:01,  2.17it/s]

Gemini response: Title: On randomized confidence intervals for the binomial probability

Author: Paul Kabaila



Processing rows:  40%|████      | 2/5 [00:01<00:01,  1.68it/s]

Gemini response: **Title:** Confidence-Aware Learning for Camouflaged Object Detection

**Authors:** Jiawei Liu, Jing Zhang, Nick Barnes



Processing rows:  60%|██████    | 3/5 [00:01<00:01,  1.60it/s]

Gemini response: **Title:** A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets

**Authors:** Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei



Processing rows:  80%|████████  | 4/5 [00:02<00:00,  1.88it/s]

Gemini response: Title: Confident AI

Author: Jim Davis



Processing rows: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]

Gemini response: Title: Simultaneous confidence bands for the integrated hazard function

Authors: Anna Dudek, Maciej Gołcwin, Jacek Leśkow






In [138]:
# Read back the list of raw LLM responses saved under <rootdir>/results.
results_path = os.path.join(rootdir, 'results', 'all_results.json')
with open(results_path) as outfile:
    outputs = json.load(outfile)

# Bare last expression so the notebook renders the loaded list.
outputs

['Title: On randomized confidence intervals for the binomial probability\n\nAuthor: Paul Kabaila\n',
 '**Title:** Confidence-Aware Learning for Camouflaged Object Detection\n\n**Authors:** Jiawei Liu, Jing Zhang, Nick Barnes\n',
 '**Title:** A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets\n\n**Authors:** Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei\n',
 'Title: Confident AI\n\nAuthor: Jim Davis\n',
 'Title: Simultaneous confidence bands for the integrated hazard function\n\nAuthors: Anna Dudek, Maciej Gołcwin, Jacek Leśkow\n']

In [139]:
def _first_field(pattern, text):
    """Return the cleaned first capture group of `pattern` in `text`, or None if absent."""
    match = re.search(pattern, text)
    if match is None:
        return None
    # Remove surrounding whitespace and any leftover markdown bold markers
    # (e.g. a trailing "**"), then whitespace exposed by removing them.
    return match.group(1).strip().strip("*").strip()


def parse_to_dict(string_list):
    """Parse LLM responses of the form 'Title: ...' / 'Author(s): ...' into a table.

    Both plain labels ("Title: X") and markdown-bold labels ("**Title:** X") are
    accepted, and the value runs to the end of its line, so a field on the final
    line is found even without a trailing newline.

    Every input string yields exactly one output row. A field that cannot be found
    is recorded as a missing value (None), which keeps titles and authors aligned
    with their source string instead of producing columns of unequal length.

    Args:
        string_list: Iterable of response strings.

    Returns:
        pandas.DataFrame with columns "Title" and "Author", one row per input string.
    """
    # `[\s*]*` skips whitespace and markdown asterisks between the label and the
    # value; `(.*)` then captures the rest of that line ('.' stops at a newline).
    title_pattern = r"Title:[\s*]*(.*)"
    author_pattern = r"Authors?:[\s*]*(.*)"

    titles = []
    authors = []
    for string in string_list:
        titles.append(_first_field(title_pattern, string))
        authors.append(_first_field(author_pattern, string))

    result = {"Title": titles, "Author": authors}
    return pd.DataFrame(result)

# Allow up to 100 characters per cell so long titles and author lists stay readable.
pd.set_option('display.max_colwidth', 100)

# Turn the raw responses loaded above into a Title/Author table and render it.
output_dict = parse_to_dict(string_list=outputs)
display(output_dict)

Unnamed: 0,Title,Author
0,On randomized confidence intervals for the binomial probability,Paul Kabaila
1,Confidence-Aware Learning for Camouflaged Object Detection,"Jiawei Liu, Jing Zhang, Nick Barnes"
2,A Weighted Model Confidence Set: Applications to Local and Mixture Model Confidence Sets,"Amir.T. Payandeh Najafabadi, Ghobad Barmalzan, Shahla Aghaei"
3,Confident AI,Jim Davis
4,Simultaneous confidence bands for the integrated hazard function,"Anna Dudek, Maciej Gołcwin, Jacek Leśkow"
