In [1]:
import os, json, llm_utils, config
import pandas as pd
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient
from Arxiv_API import Arxiv_API

# Read all configured settings once, then bind the three this notebook uses.
keys = config.get_all_variables()

rootdir = keys['rootdir']                # base directory; results go under <rootdir>/results
gemini_key = keys['gemini_key']          # handed to the Gemini helper as api_key
connection_str = keys['connection_str']  # handed to BlobServiceClient.from_connection_string

  from .autonotebook import tqdm as notebook_tqdm


# Data Collection

In [None]:
# Download the PDF from the Azure storage account.
blob_service_client = BlobServiceClient.from_connection_string(conn_str=connection_str)

# Fetch the blob BEFORE opening the destination: opening with "wb" truncates
# the file, so a failed download would otherwise leave an empty Data.pdf behind.
blob_data = blob_service_client.get_blob_client(
    container="samplepicture",
    blob="Data/Article_2306.00251v1.pdf",
).download_blob()
pdf_bytes = blob_data.readall()

with open("../SampleData/Data.pdf", "wb") as pdf_file:
    pdf_file.write(pdf_bytes)

In [None]:
# Or go through the Arxiv_API wrapper instead (called with downloadstuff=True)
# and keep the wrapper's `.df` attribute in `data`.
data = Arxiv_API(
    search=['math', 'physics', 'cs', 'econ', 'eess', 'q-bio', 'q-fin', 'stat'],
    start_date='2000-01-01',
    end_date='2024-12-31',
    start=0,
    sampling_unit_size=50,
    max_results=100,
    downloadstuff=True,
).df

In [2]:
# Or load the previously saved records from metadata.json.
# JSON text is UTF-8 by specification; naming the encoding makes titles and
# author names with non-ASCII characters decode identically on every platform
# instead of depending on the locale's default encoding.
with open('../SampleData/metadata.json', encoding='utf-8') as metadata_file:
    outputs = json.load(metadata_file)

metadata = pd.DataFrame(outputs)
display(metadata.iloc[0:1,])

Unnamed: 0,ID,PublishDate,Title,Authors,Journal_Ref,Comment,Abstract,Content,Primary_Cat,Category,PDF_link,file_path
0,2305.09528v1,2022-10-02T11:45:44Z,German to Spanish translation of Einstein's wo...,"Enrique M. Padilla, Birgit L. Emberger, Manuel...",No Journal_Ref,No Comment,In 1926 Albert Einstein gave a clear explanati...,German to Spanish translation of Einstein's wo...,physics.hist-ph,No Category,http://arxiv.org/pdf/2305.09528v1,/Users/tengli/Python/chatGPT/SampleData/physic...


# Define functions to run models

In [16]:

def getEachRes(user_prompt, row, max_chars=100):
    """Build the prompt for one paper by inserting the start of its text.

    Args:
        user_prompt: Template string with a ``{}`` placeholder for the paper text.
        row: Mapping-like record (for example a DataFrame row) that has a
            ``'Content'`` entry supporting slicing.
        max_chars: How many leading characters of ``row['Content']`` to insert.
            This is a character count, not a token count. Defaults to 100,
            the limit that used to be hard-coded.

    Returns:
        ``user_prompt`` formatted with the first ``max_chars`` characters of
        the row's content.
    """
    excerpt = row['Content'][:max_chars]
    return user_prompt.format(excerpt)



def GetTitleAuthor_gpt(metadata=None, output_path="all_results.json"):
    """Query a GPT chat model for the title/authors of every paper and save the replies.

    Args:
        metadata: DataFrame with a ``'Content'`` column, one paper per row.
            When omitted, the notebook-level ``metadata`` frame is used, which
            is what the original zero-argument call relied on.
        output_path: JSON file the list of responses is written to. Defaults
            to ``"all_results.json"`` in the current working directory.

    Returns:
        None. The raw responses are written to ``output_path`` as a JSON list.
    """
    if metadata is None:
        # Backward-compatible fallback for existing ``GetTitleAuthor_gpt()`` calls.
        metadata = globals()["metadata"]

    engine = "gpt-3.5-turbo"
    system_prompt = "Please extract the title and author names:"
    input_max_tokens = 100

    def extract_from_row(row):
        """Send the leading part of one paper's text to the model.

        NOTE(review): ``get_tokens_between_indices`` presumably trims the text
        to ``max_tokens`` tokens -- confirm against ``llm_utils``.
        """
        user_prompt = llm_utils.get_tokens_between_indices(
            text=row['Content'],
            engine=engine,
            max_tokens=input_max_tokens)
        print(user_prompt)

        response = llm_utils.get_gpt_response(user_prompt=user_prompt,
                                              system_prompt=system_prompt,
                                              engine=engine,
                                              max_completion=500)
        print("ChatGPT Response:", response)
        return response

    results = [
        extract_from_row(metadata.iloc[i])
        for i in tqdm(range(len(metadata)), desc="Processing rows")
    ]

    # Written once, after every row is processed, so the file holds one complete list.
    with open(output_path, 'w') as f:
        json.dump(results, f)



def GetTitleAuthor_gemini(metadata, user_prompt, api_key):
    """Ask Gemini for the title and authors of each paper and save the answers.

    Args:
        metadata: DataFrame with a ``'Content'`` column, one paper per row.
        user_prompt: Prompt template with a ``{}`` placeholder; it is filled
            per row by ``getEachRes``.
        api_key: Key forwarded to ``llm_utils.get_gemini_response``.

    Returns:
        None. The parsed responses are written as one JSON list to
        ``<rootdir>/results/gemini_results.json``.

    Raises:
        json.JSONDecodeError: If a response is not valid JSON.
    """
    results = []
    for i in tqdm(range(len(metadata)), desc="Processing rows"):
        row = metadata.iloc[i]
        raw_text = getEachRes(user_prompt, row)
        response = llm_utils.get_gemini_response(user_prompt=raw_text, api_key=api_key)
        parsed = json.loads(response)
        if isinstance(parsed, list):
            # A list response contributes each of its records.
            results.extend(parsed)
        else:
            # ``results += parsed`` on a dict would add only its keys as bare
            # strings; keep a non-list response as a single record instead.
            results.append(parsed)

    # Create the output directory up front so a missing folder cannot discard
    # the responses after all rows have been processed.
    results_dir = os.path.join(rootdir, "results")
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, "gemini_results.json"), 'w') as f:
        json.dump(results, f)



def GetTitleAuthor_llama(metadata, user_prompt):
    """Ask the Llama helper for the title and authors of each paper and save the answers.

    Args:
        metadata: DataFrame with a ``'Content'`` column, one paper per row.
        user_prompt: Prompt template with a ``{}`` placeholder; it is filled
            per row by ``getEachRes``.

    Returns:
        None. One parsed response per row is written as a JSON list to
        ``<rootdir>/results/llama_results.json``.

    Raises:
        json.JSONDecodeError: If a response is not valid JSON.
    """
    results = []
    for i in tqdm(range(len(metadata)), desc="Processing rows"):
        row = metadata.iloc[i]
        raw_text = getEachRes(user_prompt, row)
        response = llm_utils.get_llama_response(user_prompt=raw_text)
        parsed = json.loads(response)
        print(parsed)
        results.append(parsed)

    # Create the output directory up front so a missing folder cannot discard
    # the responses after all rows have been processed.
    results_dir = os.path.join(rootdir, "results")
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, "llama_results.json"), 'w') as f:
        json.dump(results, f)



# Prompt template passed to the extraction helpers; "{}" receives the paper text.
prompt_message = (
    'Extract the title and author names from this research paper: "{}". '
    "If a field cannot be found, return 'NA'. "
)


In [17]:
# Run the Gemini extraction on the first five papers, then read the saved
# results back from disk and show them.
GetTitleAuthor_gemini(
    metadata=metadata.iloc[0:5,],
    user_prompt=prompt_message,
    api_key=gemini_key,
)

with open(os.path.join(rootdir, 'results', 'gemini_results.json')) as results_file:
    outputs = json.load(results_file)

outputs

Processing rows: 100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


[{'Authors': ['Enrique M. '],
  'Title': "German to Spanish translation of Einstein's work on the formation of meanders in rivers."},
 {'Authors': ['Steven N. Shore', 'Vclav Pavl\nk'],
  'Title': 'How a fake Kepler portrait became iconic'},
 {'Authors': ['NA'],
  'Title': 'Metáforas científicas no discurso jornalístico'},
 {'Authors': ['İzzet S'],
  'Title': 'Review Article : Integral Role of Physics in Advancing Pharmacy Education and Research'},
 {'Authors': ['NA'], 'Title': 'HOLLYWOOD BLOCKBUSTERS'}]

In [18]:
# Run the Llama extraction on the first five papers, then read the saved
# results back from disk and show them.
GetTitleAuthor_llama(
    metadata=metadata.iloc[0:5,],
    user_prompt=prompt_message,
)

with open(os.path.join(rootdir, 'results', 'llama_results.json')) as results_file:
    outputs = json.load(results_file)

outputs

Processing rows:   0%|          | 0/5 [00:00<?, ?it/s]2025-01-22 13:56:29,280 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Processing rows:  20%|██        | 1/5 [00:02<00:08,  2.15s/it]

{'Title': "German to Spanish translation of Einstein's work on the formation of meanders in rivers.", 'Authors': ['Enrique M.']}


2025-01-22 13:56:31,100 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Processing rows:  40%|████      | 2/5 [00:03<00:05,  1.96s/it]

{'Title': 'How a fake Kepler portrait became iconic', 'Authors': ['Steven N. Shore', 'Vaclav Pavlik']}


2025-01-22 13:56:33,493 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Processing rows:  60%|██████    | 3/5 [00:06<00:04,  2.16s/it]

{'Title': 'Metáforas científicas no discurso jornalístico (Scientific Metaphors in the journalistic discourse)', 'Authors': ['NA']}


2025-01-22 13:56:35,546 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Processing rows:  80%|████████  | 4/5 [00:08<00:02,  2.11s/it]

{'Title': 'Review  Article : Integral Role of Physics in Advancing Pharmacy Education and Research', 'Authors': ['İzzet S']}


2025-01-22 13:56:36,989 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Processing rows: 100%|██████████| 5/5 [00:09<00:00,  1.97s/it]

{'Title': 'HOLLYWOOD BLOCKBUSTERS', 'Authors': ['NA']}





[{'Title': "German to Spanish translation of Einstein's work on the formation of meanders in rivers.",
  'Authors': ['Enrique M.']},
 {'Title': 'How a fake Kepler portrait became iconic',
  'Authors': ['Steven N. Shore', 'Vaclav Pavlik']},
 {'Title': 'Metáforas científicas no discurso jornalístico (Scientific Metaphors in the journalistic discourse)',
  'Authors': ['NA']},
 {'Title': 'Review  Article : Integral Role of Physics in Advancing Pharmacy Education and Research',
  'Authors': ['İzzet S']},
 {'Title': 'HOLLYWOOD BLOCKBUSTERS', 'Authors': ['NA']}]

In [21]:
# Read both result files from <rootdir>/results -- the directory the extraction
# helpers write to -- rather than a path relative to the working directory, so
# this cell cannot pick up files from a different folder.
results_dir = os.path.join(rootdir, 'results')

with open(os.path.join(results_dir, 'llama_results.json'), encoding='utf-8') as results_file:
    json_data = json.load(results_file)
df_llama = pd.json_normalize(json_data)

with open(os.path.join(results_dir, 'gemini_results.json'), encoding='utf-8') as results_file:
    json_data = json.load(results_file)
df_gemini = pd.json_normalize(json_data)

# Show the two frames one after the other for a side-by-side read.
display(df_llama)
display(df_gemini)

Unnamed: 0,Title,Authors
0,German to Spanish translation of Einstein's wo...,[Enrique M.]
1,How a fake Kepler portrait became iconic,"[Steven N. Shore, Vaclav Pavlik]"
2,Metáforas científicas no discurso jornalístico...,[NA]
3,Review Article : Integral Role of Physics in ...,[İzzet S]
4,HOLLYWOOD BLOCKBUSTERS,[NA]


Unnamed: 0,Authors,Title
0,[Enrique M. ],German to Spanish translation of Einstein's wo...
1,"[Steven N. Shore, Vclav Pavl\nk]",How a fake Kepler portrait became iconic
2,[NA],Metáforas científicas no discurso jornalístico
3,[İzzet S],Review Article : Integral Role of Physics in A...
4,[NA],HOLLYWOOD BLOCKBUSTERS


In [45]:
# Titles and authors recorded in the metadata for the first ten papers.
metadata[['Title', 'Authors']].head(10)

Unnamed: 0,Title,Authors
0,German to Spanish translation of Einstein's wo...,"Enrique M. Padilla, Birgit L. Emberger, Manuel..."
1,How a fake Kepler portrait became iconic,"Steven N. Shore, Václav Pavlík"
2,Scientific Metaphors in the journalistic disco...,"Osame Kinouchi, Angélica A. Mandrá"
3,Review Article: Integral Role of Physics in Ad...,İzzet Sakallı
4,Hollywood Blockbusters: Unlimited Fun but Limi...,"C. J. Efthimiou, R. A. Llewellyn"
5,Ab Initio Study of Different Acid Molecules In...,"Aleksey A. Zakharenko, S. Karthikyan, K. S. Kim"
6,Medium-scale thermospheric gravity waves in th...,"Garima Malhotra, Timothy Fuller-Rowell, Tzu-We..."
7,Climate Engineering Responses to Climate Emerg...,"J. J. Blackstock, D. S. Battisti, K. Caldeira,..."
8,Revealing pre-earthquake signatures in atmosph...,"Dimitar Ouzounov, Sergey Pulinets, Dmitry Davi..."
9,Coherent radiation reaction effects in laser-v...,"P. W. Smorenburg, L. P. J. Kamp, G. A. Geloni,..."
