In [1]:
import pandas as pd

# Read the full arXiv export; only its "Abstract" column is used from here on.
dataset = pd.read_csv("arxiv_abstracts.csv")

# Save the abstract column (together with its row index) to its own CSV and
# load that file back, so `abstracts` ends up as a DataFrame read from disk
# rather than a Series view of `dataset`.
dataset["Abstract"].to_csv("abstracts.csv")
abstracts = pd.read_csv("abstracts.csv")

# Preprocessing

In [4]:
import re

# Define preprocessing function
def preprocess_text(text):
    """Normalise one abstract into plain alphanumeric text.

    Missing values (anything ``pd.isnull`` reports as null) become an empty
    string. Otherwise line breaks are turned into spaces, ``http...`` URLs
    are removed, every character that is not an ASCII letter, a digit or
    whitespace is deleted, and whitespace runs are collapsed to one space.

    Args:
        text: The raw abstract string, or a null value.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Guard clause: nothing to clean for a missing abstract.
    if pd.isnull(text):
        return ""

    cleaned = text.replace("\n", " ")

    # Applied in order: URLs must go before punctuation is stripped,
    # otherwise "http://..." would no longer be recognisable as a URL.
    substitutions = (
        (r"http\S+", ""),           # drop URLs
        (r"[^a-zA-Z0-9\s]", ""),    # drop special characters
        (r"\s+", " "),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Check that the 'Abstract' column exists before cleaning.
if "Abstract" in abstracts.columns:
    # Clean every abstract and keep only the cleaned text. The name is rebound
    # instead of using inplace=True, so the frame that was loaded is not
    # mutated behind the reader's back. errors="ignore" tolerates an input
    # that was saved without the "Unnamed: 0" index column.
    abstracts = abstracts.assign(
        **{"Cleaned abstract": abstracts["Abstract"].apply(preprocess_text)}
    ).drop(columns=["Abstract", "Unnamed: 0"], errors="ignore")
    abstracts.to_csv("cleaned_abstracts.csv", index=False)
else:
    # NOTE: execution continues, so the load below reads whatever
    # cleaned_abstracts.csv is already on disk.
    print("Error: Column 'Abstract' not found in the data.")

# preprocess_text returns "" for missing abstracts. With read_csv defaults those
# empty fields (and literal text such as "NA" or "null") come back as NaN
# floats, which are not valid tokenizer input. Read every field verbatim as
# a string so the list below contains only str values.
cleaned_abstracts = pd.read_csv(
    "cleaned_abstracts.csv", dtype=str, keep_default_na=False
)
cleaned_abstracts_list = cleaned_abstracts["Cleaned abstract"].tolist()

# Text embeddings

In [6]:
#!pip install git+https://github.com/huggingface/transformers.git -q

In [13]:
import torch
from transformers import ModernBertModel, ModernBertConfig

# Load the pretrained ModernBERT-base checkpoint. Constructing the model from
# a bare ModernBertConfig() leaves every weight randomly initialised, so the
# embeddings computed from it would carry no learned information.
model = ModernBertModel.from_pretrained("answerdotai/ModernBERT-base")
model.eval()  # inference only: make sure dropout layers are disabled

# Kept so existing references to `configuration` still work; it is now the
# configuration of the loaded checkpoint.
configuration = model.config

In [23]:
from transformers import AutoTokenizer

# Identifier of the checkpoint whose tokenizer is loaded below.
model_id = "answerdotai/ModernBERT-base"
# Tokenizer used by modernbert_embedding to turn raw text into model inputs.
tokenizer = AutoTokenizer.from_pretrained(model_id)

def modernbert_embedding(text):
    """Embed one string or a batch of strings by mean-pooling token states.

    The inputs are tokenized with padding, run through the module-level
    ``model`` without gradient tracking, and the last hidden states are
    averaged over the real tokens only. Padding positions are excluded via
    the attention mask, so a text's embedding does not depend on how long
    the other texts in its batch are.

    Args:
        text: A string or a list of strings to embed.

    Returns:
        A tensor of shape (batch_size, hidden_size).
    """
    # truncation=True caps over-long inputs at the tokenizer's maximum length
    # instead of passing them to the model unbounded.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)
    # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp avoids a division by zero should a row contain no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1.0)
    return summed / token_counts  # (batch_size, hidden_size)

In [15]:
import os

batch_size = 16
os.makedirs("embeddings", exist_ok=True)

# Ceiling division gives the true number of batches. The previous
# `len // batch_size + 1` reported one batch too many whenever the number of
# abstracts was an exact multiple of batch_size.
num_batches = (len(cleaned_abstracts_list) + batch_size - 1) // batch_size

# Embed the abstracts batch by batch and save each batch to its own file,
# so that not all embeddings have to be held in memory at once.
for i in range(0, len(cleaned_abstracts_list), batch_size):
    batch = cleaned_abstracts_list[i:i + batch_size]
    batch_embeddings = modernbert_embedding(batch)
    batch_file_path = f"embeddings/batch_{i // batch_size}.pt"
    torch.save(batch_embeddings, batch_file_path)
    print(f"Processed batch {i // batch_size + 1}/{num_batches}")

Processed batch 1/350
Processed batch 2/350
Processed batch 3/350
Processed batch 4/350
Processed batch 5/350
Processed batch 6/350
Processed batch 7/350
Processed batch 8/350
Processed batch 9/350
Processed batch 10/350
Processed batch 11/350
Processed batch 12/350
Processed batch 13/350
Processed batch 14/350
Processed batch 15/350
Processed batch 16/350
Processed batch 17/350
Processed batch 18/350
Processed batch 19/350
Processed batch 20/350
Processed batch 21/350
Processed batch 22/350
Processed batch 23/350
Processed batch 24/350
Processed batch 25/350
Processed batch 26/350
Processed batch 27/350
Processed batch 28/350
Processed batch 29/350
Processed batch 30/350
Processed batch 31/350
Processed batch 32/350
Processed batch 33/350
Processed batch 34/350
Processed batch 35/350
Processed batch 36/350
Processed batch 37/350
Processed batch 38/350
Processed batch 39/350
Processed batch 40/350
Processed batch 41/350
Processed batch 42/350
Processed batch 43/350
Processed batch 44/3