In [17]:
import os, json, time, llm_utils, config, re
import pandas as pd, numpy as np
from azure.storage.blob import BlobServiceClient
from Arxiv_API import Arxiv_API
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Settings are read once from the project-local `config` module.
keys=config.get_all_variables()
rootdir=keys['rootdir']
gemini_key=keys['gemini_key']
connection_str=keys['connection_str']

# Folder that holds metadata.json. The default is the original machine-specific
# location; set the LLM_SAMPLE_DATA_DIR environment variable to point the
# notebook at another folder without editing the code.
SampleData_dir=os.environ.get('LLM_SAMPLE_DATA_DIR', '/Volumes/Lacie External Hard Drive/LLMSampleData')


# Data Collection

In [None]:
# Download the PDF from the Azure storage account
blob_service_client = BlobServiceClient.from_connection_string(conn_str=connection_str)
blob_client = blob_service_client.get_blob_client(container="samplepicture", blob="Data/Article_2306.00251v1.pdf")

# Fetch the bytes BEFORE opening the destination: opening with "wb" truncates
# the file, so a failed download would otherwise leave an empty Data.pdf behind.
blob_data = blob_client.download_blob()
pdf_bytes = blob_data.readall()

with open("../SampleData/Data.pdf", "wb") as pdf_file:
    pdf_file.write(pdf_bytes)

In [None]:
# Alternative data source: query arXiv through the project's Arxiv_API wrapper
# (per the original note, this downloads the PDFs and saves the metadata).
arxiv_categories = ['math','physics','cs','econ','eess','q-bio','q-fin','stat']
data = Arxiv_API(
    search=arxiv_categories,
    start_date='2000-01-01',
    end_date='2024-12-31',
    start=0,
    sampling_unit_size=200,
    max_results=1000,
    downloadstuff=True,
).df

In [18]:
# Or load the data from metadata.json
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale default.
with open(os.path.join(SampleData_dir, 'metadata.json'), encoding='utf-8') as metadata_file:
    outputs=json.load(metadata_file)

metadata=pd.DataFrame(outputs)
# 'Authors' arrives as one comma-separated string; turn it into a list of trimmed names.
metadata['Authors']=metadata['Authors'].apply(lambda x: [author.strip() for author in x.split(',')])
# Keep only the part of the category before the first dot (a value like 'math.LO' becomes 'math').
metadata['Primary_Cat']=metadata['Primary_Cat'].apply(lambda x: x.split('.')[0])

display(metadata)

Unnamed: 0,ID,PublishDate,Title,Authors,Journal_Ref,Comment,Abstract,Content,Primary_Cat,Category,PDF_link,file_path
0,2012.13850v1,2020-12-27T02:25:29Z,Generalized spaces for constructive algebra,[Ingo Blechschmidt],No Journal_Ref,"Chapter for an upcoming collection ""Proof and ...",The purpose of this contribution is to give a ...,2012.13850v1 [math.LO] 27 Dec 2020\n\narXiv\n\...,math,No Category,http://arxiv.org/pdf/2012.13850v1,/Users/tengli/Python/chatGPT/SampleData/math/A...
1,2412.05250v1,2024-12-06T18:30:30Z,Constructing projective modules,[Aravind Asok],No Journal_Ref,138 pages; comments welcome!,We discuss elements of a social history of the...,arXiv:2412.05250v1 [math.HO] 6 Dec 2024\n\nCon...,math,No Category,http://arxiv.org/pdf/2412.05250v1,/Users/tengli/Python/chatGPT/SampleData/math/A...
2,2201.06408v1,2022-01-17T13:48:30Z,Quantalic spectra of semirings,[Graham Manuell],No Journal_Ref,My PhD thesis from 2019/2020. 117 pages,Spectrum constructions appear throughout mathe...,Quantalic spectra of\nsemirings\n\nGraham Manu...,math,No Category,http://arxiv.org/pdf/2201.06408v1,/Users/tengli/Python/chatGPT/SampleData/math/A...
3,0409250v3,2004-09-15T14:01:51Z,Von Neumann coordinatization is not first-order,[Friedrich Wehrung],"Journal of Mathematical Logic 6, no. 1 (2006) ...",No Comment,"A lattice L is coordinatizable, if it is isomo...",math/0409250v3 [math.GM] 28 Jan 2006\n\ne\ne\n...,math,No Category,http://arxiv.org/pdf/math/0409250v3,/Users/tengli/Python/chatGPT/SampleData/math/A...
4,0509245v2,2005-09-11T20:22:45Z,Analysis in J_2,[Nik Weaver],No Journal_Ref,31 pages,This is an expository paper in which I explain...,math/0509245v2 [math.LO] 12 Sep 2005\n\ne\ne\n...,math,No Category,http://arxiv.org/pdf/math/0509245v2,/Users/tengli/Python/chatGPT/SampleData/math/A...
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,1704.01732v1,2017-04-06T07:42:17Z,A Mathematically Sensible Explanation of the C...,[Yiping Cheng],No Journal_Ref,7 pages,"In statistics education, the concept of popula...",1704.01732v1 [stat.OT] 6 Apr 2017\n\ne\ne\n\na...,stat,No Category,http://arxiv.org/pdf/1704.01732v1,/Users/tengli/Python/chatGPT/SampleData/stat/A...
1596,1704.03812v15,2017-04-12T16:09:00Z,A New Theoretical Interpretation of Measuremen...,"[Huisheng Shi, Xiaoming Ye, Cheng Xing, Shijun...",Discrete Dynamics in Nature and Society(2020),"20 pages, 7 figures",The traditional measurement theory interprets ...,Discrete Dynamics in Nature and Society\nhttps...,stat,No Category,http://arxiv.org/pdf/1704.03812v15,/Users/tengli/Python/chatGPT/SampleData/stat/A...
1597,1704.06292v1,2017-04-19T14:54:43Z,Remark On Variance Bounds,[R. Sharma],No Journal_Ref,No Comment,It is shown that the formula for the variance ...,arXiv:1704.06292v1 [stat.OT] 19 Apr 2017\n\nRe...,stat,No Category,http://arxiv.org/pdf/1704.06292v1,/Users/tengli/Python/chatGPT/SampleData/stat/A...
1598,1704.07512v1,2017-04-25T01:55:24Z,Information vs. Uncertainty as the Foundation ...,"[Grey Nearing, Hoshin Gupta]",No Journal_Ref,No Comment,Information accounting provides a better found...,Information vs. Uncertainty as the Foundation ...,stat,No Category,http://arxiv.org/pdf/1704.07512v1,/Users/tengli/Python/chatGPT/SampleData/stat/A...


# Randomization for RCBD

In [3]:
# Seed NumPy's global RNG so the randomization plan below is reproducible
np.random.seed(100)

# Blocks of the design and the two treatments applied inside every block
# (the experiment loop runs 'M1' with Llama and anything else with Gemini)
blocks = ['math', 'physics', 'cs', 'econ', 'eess', 'q-bio', 'q-fin', 'stat']
treatments = ["M1", "M2"]

# Treatment labels repeated once per block, in plan order
trt_levels = np.tile(treatments, len(blocks))

# One row per (block, treatment) pair
randplan = pd.DataFrame({
    'Block': np.repeat(blocks, len(treatments)),
    'Treatment': trt_levels
})


def _shuffle_within_block(block_treatments):
    """Return one block's treatments in a random order drawn from the seeded global RNG."""
    return np.random.permutation(block_treatments)


# Randomize the treatment order independently inside each block
randplan['Assign'] = randplan.groupby('Block')['Treatment'].transform(_shuffle_within_block)
print(randplan)


      Block Treatment Assign
0      math        M1     M1
1      math        M2     M2
2   physics        M1     M1
3   physics        M2     M2
4        cs        M1     M2
5        cs        M2     M1
6      econ        M1     M2
7      econ        M2     M1
8      eess        M1     M1
9      eess        M2     M2
10    q-bio        M1     M1
11    q-bio        M2     M2
12    q-fin        M1     M2
13    q-fin        M2     M1
14     stat        M1     M2
15     stat        M2     M1


# Run models

In [5]:
# Inspect the rows at positions 195-199 of the q-fin subset
qfin_rows = metadata.loc[metadata['Primary_Cat'] == 'q-fin']
qfin_rows.iloc[195:200]

Unnamed: 0,ID,PublishDate,Title,Authors,Journal_Ref,Comment,Abstract,Content,Primary_Cat,Category,PDF_link,file_path
1395,2112.10209v1,2021-12-19T17:48:40Z,Option Pricing Model with Transaction Costs,"[F. G. Bellora, G. Mazzei, M. Maurette]",MACI 6 2017 p.569-573,5 pages,The author presents alternatives to the Black-...,OPTION PRICING MODEL WITH TRANSACTION COSTS\n\...,q-fin,No Category,http://arxiv.org/pdf/2112.10209v1,/Users/tengli/Python/chatGPT/SampleData/q-fin/...
1396,2205.13942v1,2022-05-27T12:40:57Z,Deep Generators on Commodity Markets; applicat...,"[Nicolas Boursin, Carl Remlinger, Joseph Mikae...",No Journal_Ref,15 pages,Driven by the good results obtained in compute...,in.RM]| 27 May 2022\n\n2205.13942v1 [q-f\n\ne\...,q-fin,No Category,http://arxiv.org/pdf/2205.13942v1,/Users/tengli/Python/chatGPT/SampleData/q-fin/...
1397,2401.09955v2,2024-01-18T13:05:04Z,Consistent asset modelling with random coeffic...,"[Felix L. Wolf, Griselda Deelstra, Lech A. Grz...",No Journal_Ref,No Comment,We explore a stochastic model that enables cap...,in.PR] 10 Apr 2024\n\n2401.09955v2_ [q-f\n\ne\...,q-fin,No Category,http://arxiv.org/pdf/2401.09955v2,/Users/tengli/Python/chatGPT/SampleData/q-fin/...
1398,2402.15936v1,2024-02-24T23:52:43Z,Optimizing Neural Networks for Bermudan Option...,"[Vikranth Lokeshwar Dhandapani, Shashi Jain]",No Journal_Ref,34 pages,This paper presents a Monte-Carlo-based artifi...,Optimizing Neural Networks for Bermudan\n\nOpt...,q-fin,No Category,http://arxiv.org/pdf/2402.15936v1,/Users/tengli/Python/chatGPT/SampleData/q-fin/...
1399,2409.04233v1,2024-09-06T12:40:00Z,Pricing and hedging of decentralised lending c...,"[Lukasz Szpruch, Marc Sabaté Vidales, Tanut Tr...",No Journal_Ref,No Comment,We study the loan contracts offered by decentr...,in.PR] 6 Sep 2024\n\n2409.04233v1 [q-f\n\ne\nO...,q-fin,No Category,http://arxiv.org/pdf/2409.04233v1,/Users/tengli/Python/chatGPT/SampleData/q-fin/...


In [6]:
# Prompt template handed to the experiment object.
# NOTE(review): the "{}" placeholder is presumably filled with the paper text inside llm_utils — verify there.
prompt_message = "Extract the title and author names from this research paper: \"{}\". If a field cannot be found, return 'NA'. Do not alter the original information in any way. Extract it and keep it as original as possible."

exp1=llm_utils.TitleAuthorExperiment(user_prompt=prompt_message, keys=keys)

# NOTE(review): the loop starts at plan row 13 rather than 0, so only the last
# rows of the randomization plan are executed by this cell. This looks like a
# resumed run — confirm before relying on a fresh-kernel re-run.
first_run = 13
for run in range(first_run, len(randplan)):
    block_name = randplan.loc[run, 'Block']
    assigned_model = randplan.loc[run, 'Assign']

    # Work on a copy of this block's papers
    data = metadata[metadata["Primary_Cat"] == block_name].copy()

    # 'M1' is run with Llama; every other assignment is run with Gemini
    run_model = exp1.GetTitleAuthor_llama if assigned_model == 'M1' else exp1.GetTitleAuthor_gemini
    run_model(metadata=data)


Processing rows: 100%|██████████| 200/200 [18:50<00:00,  5.65s/it]
Processing rows: 100%|██████████| 200/200 [36:28<00:00, 10.94s/it]
Processing rows: 100%|██████████| 200/200 [18:52<00:00,  5.66s/it]


In [9]:

# Load the stored experiment results and flatten the JSON records into a table.
# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying
# on the platform's locale default.
with open(exp1.resultpath['AllResults'], encoding='utf-8') as results_file:
    outputs=json.load(results_file)
df_all = pd.json_normalize(outputs)
display(df_all)
# Persist the table; the evaluation cell reads this CSV back.
df_all.to_csv("../results/exp_results.csv")

Unnamed: 0,Title,Authors,ID,Primary_Cat,TrueTitle,TrueAuthors,Model
0,Generalized spaces for constructive algebra,[Ingo Blechschmidt],2012.13850v1,math,Generalized spaces for constructive algebra,[Ingo Blechschmidt],Llama
1,Constructing projective modules,[Aravind Asok],2412.05250v1,math,Constructing projective modules,[Aravind Asok],Llama
2,Quantalic spectra of semirings,[Graham Manuell],2201.06408v1,math,Quantalic spectra of semirings,[Graham Manuell],Llama
3,VON NEUMANN COORDINATIZATION IS NOT FIRST-ORDER,[FRIEDRICH WEHRUNG],0409250v3,math,Von Neumann coordinatization is not first-order,[Friedrich Wehrung],Llama
4,Analysis In J2,[Nik Weaver],0509245v2,math,Analysis in J_2,[Nik Weaver],Llama
...,...,...,...,...,...,...,...
3195,A Mathematically Sensible Explanation of the C...,[Yiping Cheng],1704.01732v1,stat,A Mathematically Sensible Explanation of the C...,[Yiping Cheng],Llama
3196,A New Theoretical Interpretation of Measuremen...,"[Huisheng Shi, Xiaoming Ye, Cheng Xing, Shijun...",1704.03812v15,stat,A New Theoretical Interpretation of Measuremen...,"[Huisheng Shi, Xiaoming Ye, Cheng Xing, Shijun...",Llama
3197,Remark On Variance Bounds,[R. Sharma],1704.06292v1,stat,Remark On Variance Bounds,[R. Sharma],Llama
3198,Information vs. Uncertainty as the Foundation ...,"[Grey S. Nearing, Hoshin V. Gupta]",1704.07512v1,stat,Information vs. Uncertainty as the Foundation ...,"[Grey Nearing, Hoshin Gupta]",Llama


In [8]:
# List each distinct (category, model) combination present in the results
df_all.loc[:, ['Primary_Cat', 'Model']].drop_duplicates()

Unnamed: 0,Primary_Cat,Model
0,math,Llama
200,math,Gemini
400,physics,Llama
600,physics,Gemini
800,cs,Gemini
1000,cs,Llama
1200,econ,Gemini
1400,econ,Llama
1600,eess,Llama
1800,eess,Gemini


In [None]:
# Score every extracted title against its reference title with TF-IDF cosine similarity

df = pd.read_csv("../results/exp_results.csv")
# Missing titles become the literal 'NA' so the vectorizer always receives a string
df["Title"] = df["Title"].fillna('NA')

# One vectorizer instance, re-fitted on each (extracted, reference) pair
# NOTE(review): with default settings the vectorizer is expected to lowercase its
# input, so case-only differences would still score 1.0 — confirm this matches
# the intended strictness of the evaluation.
vectorizer = TfidfVectorizer()


def _pair_similarity(extracted, reference):
    """Fit TF-IDF on the two texts and return their cosine similarity rounded to 2 decimals."""
    pair_matrix = vectorizer.fit_transform([extracted, reference])
    similarity = cosine_similarity(pair_matrix[0], pair_matrix[1])
    return round(similarity[0][0], ndigits=2)


res = [
    _pair_similarity(extracted_title, true_title)
    for extracted_title, true_title in zip(df["Title"], df["TrueTitle"])
]

df["cosine_sim_Title"] = res

In [29]:
# Score every extracted author list against the reference authors with TF-IDF
# cosine similarity.
# pandas reads a bare "NA" cell in a CSV as a missing value, and the vectorizer
# needs strings, so missing entries are filled with 'NA' here, the same way the
# title column is handled when the CSV is loaded.
# NOTE(review): whether 'Authors' is ever missing depends on what llm_utils writes — confirm.
authors_extracted = df["Authors"].fillna('NA')

res=[]
for text1, text2 in zip(authors_extracted, df["TrueAuthors"]):
    # Fit TF-IDF on just this pair: row 0 is the extracted text, row 1 the reference
    tfidf_matrix = vectorizer.fit_transform([text1, text2])

    # Compute cosine similarity between the two rows, rounded to 2 decimals
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    res.append(round(cosine_sim[0][0], ndigits=2))

df["cosine_sim_Auth"]=res

# Things to consider

When extracting information and producing a response, the model may make small adjustments that do not fundamentally invalidate the answer. For example, the original author name might have only its first letter capitalized, while the model returns the name in all capital letters. In this experiment, such an adjustment is still considered wrong, because the purpose of the experiment is to examine the model's ability to extract information exactly as it appears. Any alteration of the information might have an adverse effect, even though such a slight change may not be as disastrous as one might think. Fine-tuning the model and prompt engineering might improve the results.