In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
JSON_PATH = "/content/drive/MyDrive/arxiv/arxiv-metadata-oai-snapshot.json"

In [12]:
import json
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
print("Stopwords loaded:", len(stop_words))
print("Lemmatizer ready:", lemmatizer.lemmatize("running"))

Stopwords loaded: 198
Lemmatizer ready: running


In [14]:
def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

In [15]:
MAX_PAPERS = 100000   # adjust based on RAM (50kâ€“200k safe)

data = []
with open(JSON_PATH, "r") as f:
    for i, line in enumerate(f):
        if i >= MAX_PAPERS:
            break
        paper = json.loads(line)
        data.append({
            "title": paper.get("title", ""),
            "abstract": paper.get("abstract", ""),
            "categories": paper.get("categories", "")
        })

df = pd.DataFrame(data)
df["text"] = (df["title"] + " " + df["abstract"]).apply(clean_text)

print("Total papers loaded:", len(df))
df.head()

Total papers loaded: 100000


Unnamed: 0,title,abstract,categories,text
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,calculation prompt diphoton production cross s...
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,sparsity certifying graph decomposition descri...
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,evolution earth moon system based dark matter ...
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,determinant stirling cycle number count unlabe...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,dyadic lambda alpha lambda alpha paper show co...


In [16]:
df

Unnamed: 0,title,abstract,categories,text
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,calculation prompt diphoton production cross s...
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,sparsity certifying graph decomposition descri...
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,evolution earth moon system based dark matter ...
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,determinant stirling cycle number count unlabe...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,dyadic lambda alpha lambda alpha paper show co...
...,...,...,...,...
99995,Multiple Time Dimensions,The possibility of physics in multiple time ...,physics.gen-ph physics.class-ph,multiple time dimension possibility physic mul...
99996,Depth Zero Representations of Nonlinear Covers...,"We generalize the methods of Moy-Prasad, in ...",math.RT math.NT,depth zero representation nonlinear cover p ad...
99997,Decting Errors in Reversible Circuits With Inv...,Reversible logic is experience renewed inter...,cs.AR,decting error reversible circuit invariant rel...
99998,Unveiling the birth and evolution of the HII r...,"Based on a multiwavelength study, the ISM ar...",astro-ph,unveiling birth evolution hii region sh based ...


In [17]:
# =========================
# Exploratory Data Analysis
# =========================

print(f"Total papers used for experiments: {len(df)}")

# Text length statistics
df["text_length"] = df["text"].apply(lambda x: len(x.split()))
print("\nText Length Statistics:")
print(df["text_length"].describe())

# Category distribution
print("\nTop 10 Categories:")
print(df["categories"].value_counts().head(10))

Total papers used for experiments: 100000

Text Length Statistics:
count    100000.000000
mean         82.189320
std          39.017539
min           5.000000
25%          54.000000
50%          75.000000
75%         105.000000
max         318.000000
Name: text_length, dtype: float64

Top 10 Categories:
categories
astro-ph             16266
hep-ph                5326
quant-ph              4228
hep-th                3704
cond-mat.mtrl-sci     2083
gr-qc                 1953
cond-mat.mes-hall     1709
hep-ex                1517
cond-mat.str-el       1504
nucl-th               1350
Name: count, dtype: int64


## Exploratory Data Analysis Insights

- The experiment uses 100,000 research papers sampled from the full arXiv corpus (~1.7M papers).
- The average abstract length is ~82 words, with some abstracts exceeding 300 words.
- The dataset spans multiple scientific domains, with a higher concentration in physics-related categories such as astro-ph, hep-ph, and quant-ph.
- Due to the length and technical nature of abstracts, semantic embedding models are more suitable than keyword-based methods.

In [18]:
!pip install -q sentence-transformers

In [19]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
document_embeddings = embedder.encode(
    df["text"].tolist(),
    batch_size=32,
    show_progress_bar=True
)

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [21]:
np.save("/content/drive/MyDrive/arxiv/document_embeddings.npy", document_embeddings)

In [22]:
def recommend_papers(query, top_k=5):
    query_clean = clean_text(query)
    query_embedding = embedder.encode([query_clean])

    scores = cosine_similarity(query_embedding, document_embeddings)[0]
    top_idx = np.argsort(scores)[-top_k:][::-1]

    results = df.iloc[top_idx][["title", "categories"]].copy()
    results["similarity_score"] = scores[top_idx]

    return results.reset_index(drop=True)


In [23]:
query = "transformer models for language understanding"
recommend_papers(query, top_k=5)

Unnamed: 0,title,categories,similarity_score
0,A Formal Model of Dictionary Structure and Con...,cs.CL,0.480431
1,Generating models for temporal representations,cs.CL,0.471002
2,A computer simulation of language families,physics.soc-ph,0.462464
3,Translating a first-order modal language to re...,cs.LO cs.DB,0.417354
4,Scalar and gauge translation-invariant noncomm...,hep-th math-ph math.MP,0.408938


In [24]:
# =========================
# TF-IDF Baseline Model
# =========================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

tfidf = TfidfVectorizer(
    max_features=30000,   # memory-safe for 100k docs
    stop_words="english"
)

tfidf_matrix = tfidf.fit_transform(df["text"])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (100000, 30000)


In [27]:
def tfidf_recommend(query, top_k=5):
    query_clean = clean_text(query)
    q_vec = tfidf.transform([query_clean])

    scores = cosine_similarity(q_vec, tfidf_matrix)[0]
    top_idx = np.argsort(scores)[-top_k:][::-1]

    results = df.iloc[top_idx][["title", "categories"]].copy()
    results["similarity_score"] = scores[top_idx]

    return results

In [28]:
query = "transformer models for language understanding"

print("ðŸ”¹ SBERT Results:")
print(recommend_papers(query, top_k=5))

print("\nðŸ”¹ TF-IDF Results:")
print(tfidf_recommend(query, top_k=5))

ðŸ”¹ SBERT Results:
                                               title              categories  \
0  A Formal Model of Dictionary Structure and Con...                   cs.CL   
1     Generating models for temporal representations                   cs.CL   
2         A computer simulation of language families          physics.soc-ph   
3  Translating a first-order modal language to re...             cs.LO cs.DB   
4  Scalar and gauge translation-invariant noncomm...  hep-th math-ph math.MP   

   similarity_score  
0          0.480431  
1          0.471002  
2          0.462464  
3          0.417354  
4          0.408938  

ðŸ”¹ TF-IDF Results:
                                                   title  \
82640  The absoption refrigerator as a thermal transf...   
28428                                  What's in a Name?   
4843   Characteristics of Switchable Superconducting ...   
93653  A simple branching model that reproduces langu...   
23133         A computer simulation of langua

## Results and Discussion

This project compares a keyword-based retrieval method (TF-IDF) with a semantic embedding-based method (Sentence-BERT) for research paper recommendation.

TF-IDF represents documents as sparse vectors based on word frequencies and computes similarity using cosine similarity. It relies on exact word overlap and does not capture semantic meaning. As a result, documents with common keywords may receive high similarity scores even if they are contextually irrelevant.

Sentence-BERT (SBERT) generates dense semantic embeddings that capture the contextual meaning of text. Cosine similarity in this embedding space reflects semantic closeness rather than lexical overlap, making SBERT more suitable for long and technical abstracts.

In the observed results, SBERT retrieves papers that are semantically aligned with the query, while TF-IDF often retrieves keyword-matched but irrelevant documents. Although TF-IDF produces higher cosine similarity values, these scores are not directly comparable to SBERT scores because they are computed in fundamentally different vector spaces.

Overall, SBERT provides more accurate and meaningful recommendations by capturing semantic relationships, whereas TF-IDF is limited to surface-level keyword matching.

In [37]:
BASE_PATH = "/content/drive/MyDrive/arxiv"
# Create a processed data folder if it doesn't exist
processed_dir = os.path.join(BASE_PATH, "processed")

os.makedirs(processed_dir, exist_ok=True)

# Save processed dataframe
processed_file_path = os.path.join(processed_dir, "processed_data.csv")
df.to_csv(processed_file_path, index=False)

print("âœ… Processed data saved successfully at:")
print(processed_file_path)

âœ… Processed data saved successfully at:
/content/drive/MyDrive/arxiv/processed/processed_data.csv
