In [2]:
import contextlib
import io
import pickle
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [9]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load artifacts that were produced by a trusted pipeline.

# Load data
modelfile = "cleaned_dataset.pkl"
# `with` guarantees the handle is closed even if unpickling raises.
with open(modelfile, "rb") as file:
    df = pickle.load(file)

# Load the PCA-reduced embeddings
modelfile = "embeddings_pca.pkl"
with open(modelfile, "rb") as file:
    embeddings_pca = pickle.load(file)

# Load clustering model and encoder (stored together in one dict)
modelfile = "clustering_encoder_model.pkl"
with open(modelfile, "rb") as file:
    model = pickle.load(file)

# Unpack the individual components of the saved bundle
k_means = model['kmeans']
encoder = model['encoder']
pca = model['pca']

In [19]:
# Bundle everything get_similar_papers() needs into one dict so it can be
# passed around as a single argument.
cluster_model = dict(
    dataframe=df,
    kmeans=k_means,
    encoding_model=encoder,
    pca=pca,
    embeddings_pca=embeddings_pca,
)

In [10]:
# Preview the first rows of the loaded dataset
df.head()

Unnamed: 0,id,title,authors,abstract,category,link,cleaned_title,cleaned_authors,cleaned_abstract,cleaned_data,final_cluster
0,704.0001,Calculation of prompt diphoton production cros...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",A fully differential calculation in perturba...,hep-ph,https://arxiv.org/abs/0704.0001,calculation of prompt diphoton production cros...,c balazs e l berger p m nadolsky cp yuan,a fully differential calculation in perturba...,calculation of prompt diphoton production cros...,5
1,704.0003,The evolution of the Earth-Moon system based o...,Hongjun Pan,The evolution of Earth-Moon system is descri...,physics.gen-ph,https://arxiv.org/abs/0704.0003,the evolution of the earthmoon system based on...,hongjun pan,the evolution of earthmoon system is describ...,the evolution of the earthmoon system based on...,18
2,704.0006,Bosonic characters of atomic Cooper pairs acro...,Y. H. Pong and C. K. Law,We study the two-particle wave function of p...,cond-mat.mes-hall,https://arxiv.org/abs/0704.0006,bosonic characters of atomic cooper pairs acro...,y h pong and c k law,we study the twoparticle wave function of pa...,bosonic characters of atomic cooper pairs acro...,6
3,704.0007,Polymer Quantum Mechanics and its Continuum Limit,"Alejandro Corichi, Tatjana Vukasinac and Jose ...",A rather non-standard quantum representation...,gr-qc,https://arxiv.org/abs/0704.0007,polymer quantum mechanics and its continuum limit,alejandro corichi tatjana vukasinac and jose a...,a rather nonstandard quantum representation ...,polymer quantum mechanics and its continuum li...,10
4,704.0008,Numerical solution of shock and ramp compressi...,Damian C. Swift,A general formulation was developed to repre...,cond-mat.mtrl-sci,https://arxiv.org/abs/0704.0008,numerical solution of shock and ramp compressi...,damian c swift,a general formulation was developed to repre...,numerical solution of shock and ramp compressi...,3


In [11]:
# Text normalisation helper
def preprocess_abstract(abstract):
    """Return `abstract` lowercased with punctuation/symbols stripped.

    Every character that is neither a word character (`\\w`: letters, digits,
    underscore) nor whitespace is removed. Whitespace, including newlines,
    is kept exactly as-is.
    """
    lowered = abstract.lower()
    # Drop anything that is not a word character or whitespace.
    return re.sub(r'[^\w\s]', '', lowered)

In [12]:
def fetch_arxiv_paper(link, timeout=30):
    """Download an arXiv abstract page and extract its title and abstract.

    Parameters
    ----------
    link : str
        URL of the abstract page, e.g. ``https://arxiv.org/abs/2406.06709``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    tuple of (str, str)
        ``(title, abstract)`` with the leading ``Title:`` / ``Abstract:``
        labels and surrounding whitespace removed.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    ValueError
        If the page does not contain the expected title/abstract elements.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def _extract(tag, css_class, label):
        """Return the text of the first matching element, minus its leading label."""
        node = soup.find(tag, {'class': css_class})
        if node is None:
            raise ValueError(
                f"Could not find <{tag} class=\"{css_class}\"> in page: {link}"
            )
        text = node.text.strip()
        # Strip only the leading label; a plain .replace() would also delete
        # the same word if it happens to occur inside the title/abstract.
        if text.startswith(label):
            text = text[len(label):]
        return text.strip()

    title = _extract('h1', 'title mathjax', 'Title:')
    abstract = _extract('blockquote', 'abstract mathjax', 'Abstract:')

    return title, abstract

In [13]:
# Smoke test: fetch one arXiv abstract page (performs a live HTTP request)
title, abstract = fetch_arxiv_paper('https://arxiv.org/abs/2406.06709')

In [14]:
# Display the abstract fetched in the previous cell
abstract

"In this paper, we present a comprehensive toolbox for studying Carrollian stretched horizons, encompassing their geometry, dynamics, symplectic geometry, symmetries, and corresponding Noether charges. We introduce a precise definition of ruled stretched Carrollian structures (sCarrollian structures) on any surface, generalizing the conventional Carrollian structures of null surfaces, along with the notions of sCarrollian connection and sCarrollian stress tensor. Our approach unifies the sCarrollian (intrinsic) and stretched horizon (embedding) perspectives, providing a universal framework for any causal surface, whether timelike or null. We express the Einstein equations in sCarrollian variables and discuss the phase space symplectic structure of the sCarrollian geometry. Through Noether's theorem, we derive the Einstein equation and canonical charge and compute the evolution of the canonical charge along the transverse (radial) direction. The latter can be interpreted as a spin-2 sym

In [15]:
def get_cleaned_data(title, abstract):
    """Clean the title and the abstract and join them with a single space.

    Both pieces go through `preprocess_abstract`; the result is
    ``"<cleaned title> <cleaned abstract>"``.
    """
    cleaned_parts = [preprocess_abstract(text) for text in (title, abstract)]
    return " ".join(cleaned_parts)

In [16]:
def get_similar_papers(link, model, num_papers):
    """Find the papers in the dataset most similar to the paper at `link`.

    The paper's title and abstract are fetched, cleaned, encoded, projected
    with the fitted PCA, assigned to a k-means cluster (printed for
    information only), and then ranked against every stored embedding by
    cosine similarity.

    Parameters
    ----------
    link : str
        URL of the arXiv abstract page of the query paper.
    model : dict
        Bundle with the keys ``'dataframe'``, ``'kmeans'``,
        ``'encoding_model'``, ``'pca'`` and ``'embeddings_pca'``.
        Assumes row ``i`` of ``embeddings_pca`` corresponds to positional
        row ``i`` of the dataframe -- TODO confirm against the pipeline that
        produced the pickles.
    num_papers : int
        Number of similar papers to return. Values <= 0 yield empty results.

    Returns
    -------
    tuple
        ``(titles, abstracts, links, similarities, top_indices)`` where the
        first three are lists for the selected papers, ``similarities`` is the
        flat array of cosine similarities against *all* stored embeddings, and
        ``top_indices`` holds the positions of the selected papers ordered
        from most to least similar.
    """
    # Get paper title and abstract
    title, abstract = fetch_arxiv_paper(link)

    # Clean
    cleaned_data = get_cleaned_data(title, abstract)

    # Unpack the model bundle
    df = model['dataframe']
    k_means = model['kmeans']
    encoder = model['encoding_model']
    pca = model['pca']
    embeddings_pca = model['embeddings_pca']

    # Encode cleaned data as a single-row 2-D array, as expected by pca/k-means
    cleaned_data_transformed = encoder.encode(cleaned_data).reshape(1, -1)

    # Project into the same PCA space as the stored embeddings
    cleaned_data_transformed_pca = pca.transform(cleaned_data_transformed)

    # Assign cluster
    pred = k_means.predict(cleaned_data_transformed_pca)

    # Print
    print(f"Given Paper: {title}\n")
    print(f"Abstract: {abstract}\n")
    print(f'Predicted Cluster: {pred.item()}\n')

    # Rank every stored paper by cosine similarity to the query
    similarities = cosine_similarity(cleaned_data_transformed_pca, embeddings_pca).flatten()

    # Reverse first, then take the head. The previous `[-num_papers:][::-1]`
    # form returned *all* indices for num_papers == 0 (because -0 == 0) and
    # arbitrary slices for negative values; clamping makes both cases empty.
    top_k = max(num_papers, 0)
    top_indices = similarities.argsort()[::-1][:top_k]

    # One positional lookup for all selected rows instead of three per paper
    top_rows = df.iloc[top_indices]
    similar_paper_titles = top_rows['title'].tolist()
    similar_paper_abstracts = top_rows['abstract'].tolist()
    similar_paper_links = top_rows['link'].tolist()

    return similar_paper_titles, similar_paper_abstracts, similar_paper_links, similarities, top_indices

In [17]:
# Pretty-print the recommendations
def show_similar_papers(link, model, num_papers):
    """Look up similar papers for `link`, print them, and return the raw results.

    For each recommended paper the title, abstract, link and its similarity
    score are printed. The return value is the unchanged 5-tuple produced by
    `get_similar_papers`.
    """
    results = get_similar_papers(link, model, num_papers)
    titles, abstracts, links, similarities, indices = results

    for paper_title, paper_abstract, paper_link, row in zip(titles, abstracts, links, indices):
        print(f"Title: {paper_title}\n")
        print(f"Abstract: {paper_abstract}\n")
        print(f"Link: {paper_link}\n")
        # `similarities` covers the whole dataset, so index it by row position
        print(f"Similarity: {similarities[row]}")
        print("\n")

    return titles, abstracts, links, similarities, indices

In [21]:
num_papers = 10

#Paper Link
link = 'http://www.arxiv.org/abs/1602.03837'

# Capture stdout explicitly. `%%capture cap` only binds `cap` after the cell
# has finished, so reading `cap.stdout` inside the same cell either raises
# NameError on a fresh kernel or silently writes the output of a previous run.
# stderr is left untouched, as it was with --no-stderr.
stdout_buffer = io.StringIO()
with contextlib.redirect_stdout(stdout_buffer):
    titles, abstracts, links, similarities, indices = show_similar_papers(link, cluster_model, num_papers)

# Explicit UTF-8 so non-ASCII characters in titles/abstracts do not depend on
# the platform's default encoding. The `with` block closes the file.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(stdout_buffer.getvalue())