In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 117592
    })
})

In [4]:
# Hold out 20% of the rows for validation. The shuffle seed is pinned so that
# the partition (and every metric derived from it) is the same on each
# Restart & Run All instead of changing from run to run.
split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [5]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 94073
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 23519
    })
})

In [6]:
dataset_train = split_datasets["train"]

In [7]:
dataset_train

Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
    num_rows: 94073
})

In [8]:
dataset_train = dataset_train.remove_columns(['Unnamed: 0','Unnamed: 0.1'])

In [9]:
dataset_train

Dataset({
    features: ['title', 'abstract'],
    num_rows: 94073
})

In [10]:
dataset_train[0]

{'title': 'Improving trajectory calculations using deep learning inspired single image superresolution',
 'abstract': 'Lagrangian trajectory or particle dispersion models as well as\nsemi-Lagrangian advection schemes require meteorological data such as wind,\ntemperature and geopotential at the exact spatio-temporal locations of the\nparticles that move independently from a regular grid. Traditionally, this\nhigh-resolution data has been obtained by interpolating the meteorological\nparameters from the gridded data of a meteorological model or reanalysis, e.g.\nusing linear interpolation in space and time. However, interpolation errors are\na large source of error for these models. Reducing them requires meteorological\ninput fields with high space and time resolution, which may not always be\navailable and can cause severe data storage and transfer problems. Here, we\ninterpret this problem as a single image superresolution task. We interpret\nmeteorological fields available at their 

In [11]:
dataset_train = dataset_train.map(lambda x, idx: { 'index': idx }, with_indices=True)

Map:   0%|          | 0/94073 [00:00<?, ? examples/s]

In [12]:
dataset_train

Dataset({
    features: ['title', 'abstract', 'index'],
    num_rows: 94073
})

In [13]:
dataset_train[4]

{'title': 'Rank-to-engage: New Listwise Approaches to Maximize Engagement',
 'abstract': '  For many internet businesses, presenting a given list of items in an order\nthat maximizes a certain metric of interest (e.g., click-through-rate, average\nengagement time etc.) is crucial. We approach the aforementioned task from a\nlearning-to-rank perspective which reveals a new problem setup. In traditional\nlearning-to-rank literature, it is implicitly assumed that during the training\ndata generation one has access to the \\emph{best or desired} order for the\ngiven list of items. In this work, we consider a problem setup where we do not\nobserve the desired ranking. We present two novel solutions: the first solution\nis an extension of already existing listwise learning-to-rank\ntechnique--Listwise maximum likelihood estimation (ListMLE)--while the second\none is a generic machine learning based framework that tackles the problem in\nits entire generality. We discuss several challenges as

In [14]:
dataset_train[20572]

{'title': 'Contrastive Predictive Coding Based Feature for Automatic Speaker\n  Verification',
 'abstract': '  This thesis describes our ongoing work on Contrastive Predictive Coding (CPC)\nfeatures for speaker verification. CPC is a recently proposed representation\nlearning framework based on predictive coding and noise contrastive estimation.\nWe focus on incorporating CPC features into the standard automatic speaker\nverification systems, and we present our methods, experiments, and analysis.\nThis thesis also details necessary background knowledge in past and recent work\non automatic speaker verification systems, conventional speech features, and\nthe motivation and techniques behind CPC.\n',
 'index': 20572}

In [15]:
import numpy as np
all_indexes = np.array(dataset_train['index'])

In [16]:
import random

def generate_hard_negatives(example, num_negatives=10,dataset=dataset_train):
    """Build one contrastive record (query, positive, negatives) for a training row.

    Despite the name, the negatives are drawn uniformly at random (without
    replacement) from the other rows; no similarity-based mining is performed.

    Args:
        example: Row providing the 'index', 'title' and 'abstract' keys.
        num_negatives: Number of negative abstracts to attach to the row.
        dataset: Collection indexed by the values stored in the module-level
            ``all_indexes``; each row must provide an 'abstract' key.

    Returns:
        dict with 'query' (the title), 'positive' (the row's own abstract) and
        'negatives' (list of ``num_negatives`` abstracts from other rows).

    Raises:
        ValueError: Propagated from ``random.sample`` when there are not
            enough other rows to draw ``num_negatives`` negatives from.
    """
    query_index = example['index']

    # Draw one spare candidate so the query's own row can be discarded if it is
    # picked. Dropping it and truncating keeps the draw uniform over the other
    # rows while costing O(num_negatives) per call, instead of copying and
    # list-converting the whole index array for every example.
    # Assumes the values in all_indexes are unique.
    candidate_positions = random.sample(range(len(all_indexes)), num_negatives + 1)
    sampled_negatives = [
        int(all_indexes[pos])
        for pos in candidate_positions
        if all_indexes[pos] != query_index
    ][:num_negatives]

    return {
        "query": example['title'],
        "positive": example['abstract'],
        "negatives": [dataset[idx]['abstract'] for idx in sampled_negatives]
    }

In [17]:
processed_data_train = dataset_train.map(generate_hard_negatives, remove_columns=dataset_train.column_names)

Map:   0%|          | 0/94073 [00:00<?, ? examples/s]

In [18]:
processed_data_train

Dataset({
    features: ['query', 'positive', 'negatives'],
    num_rows: 94073
})

In [19]:
processed_data_train[0]

{'query': 'Improving trajectory calculations using deep learning inspired single image superresolution',
 'positive': 'Lagrangian trajectory or particle dispersion models as well as\nsemi-Lagrangian advection schemes require meteorological data such as wind,\ntemperature and geopotential at the exact spatio-temporal locations of the\nparticles that move independently from a regular grid. Traditionally, this\nhigh-resolution data has been obtained by interpolating the meteorological\nparameters from the gridded data of a meteorological model or reanalysis, e.g.\nusing linear interpolation in space and time. However, interpolation errors are\na large source of error for these models. Reducing them requires meteorological\ninput fields with high space and time resolution, which may not always be\navailable and can cause severe data storage and transfer problems. Here, we\ninterpret this problem as a single image superresolution task. We interpret\nmeteorological fields available at their 

In [20]:
# Materialise the mapped training rows as plain dicts, renaming the 'query'
# field to 'anchor' (the key ContrastiveDataset reads).
contrastive_pairs_train = [
    {
        "anchor": record["query"],
        "positive": record["positive"],
        "negatives": record["negatives"],
    }
    for record in processed_data_train
]

In [21]:
len(contrastive_pairs_train)

94073

In [22]:
from torch.utils.data import DataLoader

In [23]:
class ContrastiveDataset:
    """Map-style view over a sequence of contrastive records.

    Each record is a mapping with 'anchor', 'positive' and 'negatives' keys;
    indexing returns those three values as a tuple, in that order.
    """

    def __init__(self, pairs):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.pairs = pairs

    def __len__(self):
        """Number of records available."""
        size = len(self.pairs)
        return size

    def __getitem__(self, idx):
        """Return (anchor, positive, negatives) for the record at ``idx``."""
        record = self.pairs[idx]
        return tuple(record[field] for field in ("anchor", "positive", "negatives"))

In [24]:
contrastive_dataset_train = ContrastiveDataset(contrastive_pairs_train)

In [25]:
data_loader_train = DataLoader(contrastive_dataset_train, batch_size=8, shuffle=True)

In [26]:
len(data_loader_train)

11760

In [27]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [28]:
model = AutoModel.from_pretrained("bert-base-uncased")

In [29]:
from peft import LoraConfig, get_peft_model

In [30]:
# LoRA adapter configuration for the encoder. Only the task type is set
# explicitly; every other LoraConfig option is left at whatever default the
# installed peft version provides.
# NOTE(review): confirm those defaults (rank, scaling, target modules) before
# comparing results across environments.
lora_config = LoraConfig(
    task_type= "FEATURE_EXTRACTION"
)

In [31]:
lora_model = get_peft_model(model, lora_config)

In [32]:
def cosine_distance(x, y):
    """Return 1 - cosine similarity of ``x`` and ``y`` along the last dimension."""
    similarity = torch.nn.functional.cosine_similarity(x, y, dim=-1)
    return 1 - similarity

In [33]:
def info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn, temperature=1.0):
    """InfoNCE loss: classify the positive against the negatives for each anchor.

    Negated distances are used as logits, with the positive placed in column 0,
    and the loss is the cross-entropy against class 0 averaged over the batch.

    Args:
        anchor_embedding: Anchor embeddings, one row per query.
        positive_embedding: Embeddings of the matching positives, row-aligned
            with ``anchor_embedding``.
        negative_embedding: Iterable of embedding tensors; each entry holds one
            negative per anchor row and yields one logit column.
        distance_fn: Callable ``(x, y) -> distances`` returning one distance per
            row (lower means more similar).
        temperature: Positive scalar dividing the logits before the softmax.
            The default of 1.0 leaves the logits unscaled; smaller values
            sharpen the distribution.

    Returns:
        Scalar tensor with the mean cross-entropy loss.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    pos_dist = distance_fn(anchor_embedding, positive_embedding)
    neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)

    # Column 0 is the positive, so the target class is 0 for every row.
    logits = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1) / temperature
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    # Functional form avoids constructing a CrossEntropyLoss module on every call.
    return torch.nn.functional.cross_entropy(logits, labels)

In [34]:
import torch
import torch.optim as optim
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

In [35]:
num_epochs=3

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
device

device(type='cuda')

In [38]:
torch.cuda.is_available()

True

In [39]:
lora_model = lora_model.to(device)

In [40]:
dataset_val = split_datasets["test"]

In [41]:
dataset_val

Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
    num_rows: 23519
})

In [42]:
dataset_val = dataset_val.remove_columns(['Unnamed: 0','Unnamed: 0.1'])

In [43]:
dataset_val = dataset_val.map(lambda x, idx: { 'index': idx }, with_indices=True)

Map:   0%|          | 0/23519 [00:00<?, ? examples/s]

In [44]:
dataset_val

Dataset({
    features: ['title', 'abstract', 'index'],
    num_rows: 23519
})

In [45]:
all_indexes_val = np.array(dataset_val['index'])

In [46]:
def generate_hard_negatives_val(example, num_negatives=10,dataset=dataset_val):
    """Build one contrastive record (query, positive, negatives) for a validation row.

    Despite the name, the negatives are drawn uniformly at random (without
    replacement) from the other validation rows; no similarity-based mining
    is performed.

    Args:
        example: Row providing the 'index', 'title' and 'abstract' keys.
        num_negatives: Number of negative abstracts to attach to the row.
        dataset: Collection indexed by the values stored in the module-level
            ``all_indexes_val``; each row must provide an 'abstract' key.

    Returns:
        dict with 'query' (the title), 'positive' (the row's own abstract) and
        'negatives' (list of ``num_negatives`` abstracts from other rows).

    Raises:
        ValueError: Propagated from ``random.sample`` when there are not
            enough other rows to draw ``num_negatives`` negatives from.
    """
    query_index = example['index']

    # Draw one spare candidate so the query's own row can be discarded if it is
    # picked. Dropping it and truncating keeps the draw uniform over the other
    # rows while costing O(num_negatives) per call, instead of copying and
    # list-converting the whole index array for every example.
    # Assumes the values in all_indexes_val are unique.
    candidate_positions = random.sample(range(len(all_indexes_val)), num_negatives + 1)
    sampled_negatives = [
        int(all_indexes_val[pos])
        for pos in candidate_positions
        if all_indexes_val[pos] != query_index
    ][:num_negatives]

    return {
        "query": example['title'],
        "positive": example['abstract'],
        "negatives": [dataset[idx]['abstract'] for idx in sampled_negatives]
    }

In [47]:
processed_data_val = dataset_val.map(generate_hard_negatives_val, remove_columns=dataset_val.column_names)

Map:   0%|          | 0/23519 [00:00<?, ? examples/s]

In [48]:
# Materialise the mapped validation rows as plain dicts, renaming the 'query'
# field to 'anchor' (the key ContrastiveDataset reads).
contrastive_pairs_val = [
    {
        "anchor": record["query"],
        "positive": record["positive"],
        "negatives": record["negatives"],
    }
    for record in processed_data_val
]

In [49]:
contrastive_dataset_val = ContrastiveDataset(contrastive_pairs_val)

In [50]:
data_loader_val = DataLoader(contrastive_dataset_val, batch_size=8, shuffle=True)

In [51]:
len(data_loader_val)

2940

In [52]:
def evaluate_mrr(model, data_loader_val, distance_fn):
    """Compute the Mean Reciprocal Rank of the positive among its negatives.

    For every query the positive and all negatives are scored by negated
    distance to the anchor; the reciprocal of the positive's 1-based rank is
    averaged over all queries seen.

    Relies on the module-level ``tokenizer`` and ``device``. The model is
    switched to eval mode and is left in that mode on return.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``; the vector
            at sequence position 0 is used as the text embedding.
        data_loader_val: Iterable of batches where ``batch[0]`` holds anchor
            texts, ``batch[1]`` positive texts and ``batch[2]`` an iterable of
            negative-text groups (each group is encoded as one batch).
        distance_fn: Callable ``(x, y) -> distances`` returning one distance
            per row (lower means more similar).

    Returns:
        The MRR as a float, or 0.0 when the loader yields no queries.
    """
    model.eval()

    def embed(texts):
        # Encode a batch of texts and take the hidden state at position 0.
        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        return model(**encoded).last_hidden_state[:, 0, :]

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text = batch[0]
            positive_text = batch[1]
            negative_texts = batch[2]

            anchor_embedding = embed(anchor_text)
            positive_embedding = embed(positive_text)
            negative_embedding = [embed(neg) for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            # Column 0 is the positive; higher value means more similar.
            all_similarities = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)

            sorted_indices = torch.sort(all_similarities, dim=1, descending=True).indices

            # Position of column 0 (the positive) in each row's ranking, 1-based.
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1
            total_rr += torch.sum(1.0 / positive_rank.float()).item()
            num_queries += len(positive_rank)

    # An empty loader would otherwise raise ZeroDivisionError.
    if num_queries == 0:
        return 0.0
    return total_rr / num_queries

In [53]:
import os
save_dir ="/dss/dsshome1/07/ra65bex2/srawat/baseline"

In [54]:
import time
epoch_metrics = []

In [55]:
torch.cuda.empty_cache()

In [56]:
# Fine-tune the LoRA-wrapped encoder with the InfoNCE objective. After every
# epoch: save a checkpoint, report the mean training loss, evaluate MRR on the
# validation loader and append the numbers to epoch_metrics.

# Create the checkpoint directory up front; otherwise a missing directory only
# surfaces when torch.save fails after the first (multi-hour) epoch.
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    start_time = time.time()
    # evaluate_mrr() switches the model to eval mode, so re-enable training mode each epoch.
    lora_model.train()

    total_loss = 0.0
    for batch in data_loader_train:

        anchor_texts = batch[0]
        positive_texts = batch[1]
        negative_texts = batch[2]

        anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        # The hidden state at sequence position 0 is used as the text embedding.
        anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
        positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]
        # Each entry of negative_texts is encoded as its own batch.
        negative_embedding = [lora_model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

        loss = info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn=cosine_distance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader_train)

    # NOTE(review): this pickles the whole model object, so loading it requires
    # the same class definitions to be importable. Kept as-is so existing
    # checkpoint consumers keep working.
    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}.pth")
    torch.save(lora_model, save_path)
    print(f"EPOCH {epoch+1}:")
    print(f"Checkpoint saved: {save_path}")
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss}")
    mrr_validation = evaluate_mrr(lora_model, data_loader_val, cosine_distance)
    print(f"Mean Reciprocal Rank (MRR) for validation set: {mrr_validation:.4f}")
    end_time = time.time()
    # Epoch duration includes checkpointing and validation.
    elapsed_minutes = (end_time - start_time) / 60
    print(f"Epoch {epoch+1} took {elapsed_minutes:.4f} minutes.")
    print(f"\n")
    epoch_metrics.append({
        'epoch': epoch + 1,
        'training_loss': avg_loss,
        'mrr_validation': mrr_validation,
        'time_taken_minutes': elapsed_minutes
    })

EPOCH 1:
Checkpoint saved: /dss/dsshome1/07/ra65bex2/srawat/baseline/checkpoint_epoch_1.pth
Epoch 1/3, Training Loss: 1.9245748056440937


Mean Reciprocal Rank (MRR) for validation set: 0.8883
Epoch 1 took 181.2205 minutes.




EPOCH 2:
Checkpoint saved: /dss/dsshome1/07/ra65bex2/srawat/baseline/checkpoint_epoch_2.pth
Epoch 2/3, Training Loss: 1.8470888216073822


Mean Reciprocal Rank (MRR) for validation set: 0.8869
Epoch 2 took 181.0148 minutes.




EPOCH 3:
Checkpoint saved: /dss/dsshome1/07/ra65bex2/srawat/baseline/checkpoint_epoch_3.pth
Epoch 3/3, Training Loss: 1.8241808064332625


Mean Reciprocal Rank (MRR) for validation set: 0.8932
Epoch 3 took 180.9587 minutes.




In [57]:
import json

In [58]:
# Write the per-epoch metrics next to the checkpoints. The path is built with
# os.path.join, matching how the checkpoint paths are built in the training loop.
metrics_path = os.path.join(save_dir, 'epoch_metrics.json')
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(epoch_metrics, f)