In [None]:
# Mount Google Drive at /content/drive; the dataset path and all output paths
# used by later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path of the arXiv metadata snapshot on the mounted Drive. The loading cell
# reads it line by line and parses each line as one JSON record.
JSON_PATH = "/content/drive/MyDrive/arxiv/arxiv-metadata-oai-snapshot.json"

In [None]:
import json
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK corpora required below: the stop-word list and WordNet
# (the latter backs the lemmatizer).
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

# Module-level helpers shared by clean_text: one reusable lemmatizer instance
# and the English stop words held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [None]:
# Sanity check: the stop-word set is populated and the lemmatizer is callable.
# NOTE(review): lemmatize() is called without a part-of-speech argument here
# and in clean_text — presumably that means noun lemmatization only; confirm
# against the NLTK docs if verb forms matter.
print("Stopwords loaded:", len(stop_words))
print("Lemmatizer ready:", lemmatizer.lemmatize("running"))

In [None]:
def clean_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps: lower-case the input, replace every character that is not an ASCII
    lower-case letter or whitespace with a space, split on whitespace, drop
    tokens found in ``stop_words``, and lemmatize what remains.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned text, or ``""`` for non-string input or when no token
        survives the filtering.
    """
    if not isinstance(text, str):
        return ""

    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())

    kept = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return " ".join(kept)

In [None]:
MAX_PAPERS = 100000   # adjust based on RAM (50k–200k safe)

# Stream the snapshot (one JSON record per line) and keep only the fields used
# downstream, stopping once MAX_PAPERS records have been collected.
PAPER_FIELDS = ["title", "abstract", "categories"]

data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(JSON_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if len(data) >= MAX_PAPERS:
            break
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # inside json.loads.
        if not line.strip():
            continue
        paper = json.loads(line)
        # `or ""` also covers fields that are present but null, which would
        # otherwise break the string concatenation below.
        data.append({field: paper.get(field) or "" for field in PAPER_FIELDS})

# Passing `columns` keeps the expected schema even if no record was read.
df = pd.DataFrame(data, columns=PAPER_FIELDS)

# Single cleaned text field (title + abstract) used by both retrieval models.
df["text"] = (df["title"] + " " + df["abstract"]).apply(clean_text)

print("Total papers loaded:", len(df))
df.head()

In [None]:
# Display the loaded DataFrame (columns: title, abstract, categories, text).
df

In [None]:
# =========================
# Exploratory Data Analysis
# =========================

n_papers = len(df)
print(f"Total papers used for experiments: {n_papers}")

# Word count of each cleaned document (whitespace-separated tokens), stored as
# a new column and then summarized.
df["text_length"] = df["text"].map(lambda doc: len(doc.split()))
length_summary = df["text_length"].describe()
print("\nText Length Statistics:")
print(length_summary)

# Ten most frequent values of the raw `categories` string.
top_categories = df["categories"].value_counts().head(10)
print("\nTop 10 Categories:")
print(top_categories)

## Exploratory Data Analysis Insights

- The experiment uses the first 100,000 records of the arXiv metadata snapshot (~1.7M papers in total); the subset is taken in file order rather than randomly sampled.
- The average length of the cleaned text (title + abstract, after stop-word removal and lemmatization) is ~82 words, with some documents exceeding 300 words.
- The dataset spans multiple scientific domains, with a higher concentration in physics-related categories such as astro-ph, hep-ph, and quant-ph.
- Due to the length and technical nature of abstracts, semantic embedding models are more suitable than keyword-based methods.

In [None]:
!pip install -q sentence-transformers

In [None]:
# Sentence-Transformers model used to embed both the corpus and every query.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Embed every cleaned document in batches of 32. The texts are passed in df
# row order, and recommend_papers later maps score positions back to df rows
# with iloc, so df must not be reordered or filtered after this cell.
document_embeddings = embedder.encode(
    df["text"].tolist(),
    batch_size=32,
    show_progress_bar=True
)

In [None]:
# Persist the embedding matrix to Drive as a .npy file.
np.save("/content/drive/MyDrive/arxiv/document_embeddings.npy", document_embeddings)

In [None]:
def recommend_papers(query, top_k=5):
    """Return the papers most similar to ``query`` in sentence-embedding space.

    The query is preprocessed with ``clean_text`` (the same cleaning applied to
    the corpus), embedded with ``embedder`` and compared against
    ``document_embeddings`` by cosine similarity.

    Args:
        query: Free-text search string.
        top_k: Maximum number of papers to return. Values <= 0 give an empty
            result; values larger than the corpus return every paper.

    Returns:
        DataFrame with columns ``title``, ``categories`` and
        ``similarity_score``, ordered from most to least similar, with a fresh
        0..n-1 index.
    """
    query_clean = clean_text(query)
    query_embedding = embedder.encode([query_clean])

    scores = cosine_similarity(query_embedding, document_embeddings)[0]

    # Clamp before slicing: the previous `[-top_k:]` slice returned the whole
    # corpus for top_k == 0 (since -0 == 0) and an arbitrary slice for
    # negative values.
    k = max(0, min(top_k, len(scores)))
    top_idx = np.argsort(scores)[::-1][:k]

    results = df.iloc[top_idx][["title", "categories"]].copy()
    results["similarity_score"] = scores[top_idx]

    return results.reset_index(drop=True)


In [None]:
# Example: the five highest-scoring papers from the embedding-based
# recommender for a sample query.
query = "transformer models for language understanding"
recommend_papers(query, top_k=5)

In [None]:
# =========================
# TF-IDF Baseline Model
# =========================
# Keyword-based baseline fitted on the same cleaned "text" column that is fed
# to the embedding model, so both recommenders see identical input.

# NOTE(review): cosine_similarity and numpy are already imported in the import
# cell near the top of the notebook; the repeats below are redundant but harmless.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# The text has already had NLTK stop words removed by clean_text;
# stop_words="english" additionally applies scikit-learn's own list.
tfidf = TfidfVectorizer(
    max_features=30000,   # memory-safe for 100k docs
    stop_words="english"
)

# Learn the vocabulary and build the document-term matrix in one pass; the
# rows follow df row order.
tfidf_matrix = tfidf.fit_transform(df["text"])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

In [None]:
def tfidf_recommend(query, top_k=5):
    """Return the papers most similar to ``query`` under the TF-IDF baseline.

    The query is preprocessed with ``clean_text``, vectorized with the fitted
    ``tfidf`` vectorizer and compared against ``tfidf_matrix`` by cosine
    similarity.

    Args:
        query: Free-text search string.
        top_k: Maximum number of papers to return. Values <= 0 give an empty
            result; values larger than the corpus return every paper.

    Returns:
        DataFrame with columns ``title``, ``categories`` and
        ``similarity_score``, ordered from most to least similar. The index
        holds the matching row labels of ``df`` (it is not reset).
    """
    query_clean = clean_text(query)
    q_vec = tfidf.transform([query_clean])

    scores = cosine_similarity(q_vec, tfidf_matrix)[0]

    # Clamp before slicing: the previous `[-top_k:]` slice returned the whole
    # corpus for top_k == 0 (since -0 == 0) and an arbitrary slice for
    # negative values.
    k = max(0, min(top_k, len(scores)))
    top_idx = np.argsort(scores)[::-1][:k]

    results = df.iloc[top_idx][["title", "categories"]].copy()
    results["similarity_score"] = scores[top_idx]

    return results

In [None]:
# Run the same query through both recommenders for a side-by-side comparison.
query = "transformer models for language understanding"

# The labels previously contained mis-decoded UTF-8 bytes instead of the
# intended bullet emoji.
print("🔹 SBERT Results:")
print(recommend_papers(query, top_k=5))

print("\n🔹 TF-IDF Results:")
print(tfidf_recommend(query, top_k=5))

## Results and Discussion

This project compares a keyword-based retrieval method (TF-IDF) with a semantic embedding-based method (Sentence-BERT) for research paper recommendation.

TF-IDF represents documents as sparse vectors based on word frequencies and computes similarity using cosine similarity. It relies on exact word overlap and does not capture semantic meaning. As a result, documents with common keywords may receive high similarity scores even if they are contextually irrelevant.

Sentence-BERT (SBERT) generates dense semantic embeddings that capture the contextual meaning of text. Cosine similarity in this embedding space reflects semantic closeness rather than lexical overlap, making SBERT more suitable for long and technical abstracts.

In the observed results, SBERT retrieves papers that are semantically aligned with the query, while TF-IDF often retrieves keyword-matched but irrelevant documents. Although TF-IDF produces higher cosine similarity values, these scores are not directly comparable to SBERT scores because they are computed in fundamentally different vector spaces.

Overall, in this qualitative comparison, SBERT provides more accurate and meaningful recommendations by capturing semantic relationships, whereas TF-IDF is limited to surface-level keyword matching.

In [None]:
# `os` is not imported anywhere else in the notebook; without this import the
# cell fails with NameError on the first os.path.join call.
import os

BASE_PATH = "/content/drive/MyDrive/arxiv"
# Create a processed data folder if it doesn't exist
processed_dir = os.path.join(BASE_PATH, "processed")

os.makedirs(processed_dir, exist_ok=True)

# Save processed dataframe (all current columns; the row index is not written)
processed_file_path = os.path.join(processed_dir, "processed_data.csv")
df.to_csv(processed_file_path, index=False)

# The success message previously contained mis-decoded UTF-8 bytes instead of
# the intended check-mark emoji.
print("✅ Processed data saved successfully at:")
print(processed_file_path)