In [97]:
from datasets import load_dataset

In [98]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")

In [99]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 117592
    })
})

In [100]:
# Fixed seed so the 80/20 train/validation partition is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [101]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 94073
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 23519
    })
})

In [102]:
dataset_train = split_datasets["train"]

In [103]:
dataset_train

Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
    num_rows: 94073
})

In [104]:
dataset_train = dataset_train.remove_columns(['Unnamed: 0','Unnamed: 0.1'])

In [105]:
dataset_train

Dataset({
    features: ['title', 'abstract'],
    num_rows: 94073
})

In [106]:
dataset_train[0]

{'title': 'Predicting Visual Overlap of Images Through Interpretable Non-Metric Box\n  Embeddings',
 'abstract': "  To what extent are two images picturing the same 3D surfaces? Even when this\nis a known scene, the answer typically requires an expensive search across\nscale space, with matching and geometric verification of large sets of local\nfeatures. This expense is further multiplied when a query image is evaluated\nagainst a gallery, e.g. in visual relocalization. While we don't obviate the\nneed for geometric verification, we propose an interpretable image-embedding\nthat cuts the search in scale space to essentially a lookup.\n  Our approach measures the asymmetric relation between two images. The model\nthen learns a scene-specific measure of similarity, from training examples with\nknown 3D visible-surface overlaps. The result is that we can quickly identify,\nfor example, which test image is a close-up version of another, and by what\nscale factor. Subsequently, local featu

In [107]:
dataset_train = dataset_train.map(lambda x, idx: { 'index': idx }, with_indices=True)

Map:   0%|          | 0/94073 [00:00<?, ? examples/s]

In [108]:
dataset_train

Dataset({
    features: ['title', 'abstract', 'index'],
    num_rows: 94073
})

In [109]:
dataset_train[4]

{'title': 'A Probabilistic Framework to Node-level Anomaly Detection in\n  Communication Networks',
 'abstract': '  In this paper we consider the task of detecting abnormal communication volume\noccurring at node-level in communication networks. The signal of the\ncommunication activity is modeled by means of a clique stream: each occurring\ncommunication event is instantaneous and activates an undirected subgraph\nspanning over a set of equally participating nodes. We present a probabilistic\nframework to model and assess the communication volume observed at any single\nnode. Specifically, we employ non-parametric regression to learn the\nprobability that a node takes part in a certain event knowing the set of other\nnodes that are involved. On the top of that, we present a concentration\ninequality around the estimated volume of events in which a node could\nparticipate, which in turn allows us to build an efficient and interpretable\nanomaly scoring function. Finally, the superior p

In [110]:
dataset_train[20572]

{'title': 'Self-Assembling Modular Networks for Interpretable Multi-Hop Reasoning',
 'abstract': '  Multi-hop QA requires a model to connect multiple pieces of evidence\nscattered in a long context to answer the question. The recently proposed\nHotpotQA (Yang et al., 2018) dataset is comprised of questions embodying four\ndifferent multi-hop reasoning paradigms (two bridge entity setups, checking\nmultiple properties, and comparing two entities), making it challenging for a\nsingle neural network to handle all four. In this work, we present an\ninterpretable, controller-based Self-Assembling Neural Modular Network (Hu et\nal., 2017, 2018) for multi-hop reasoning, where we design four novel modules\n(Find, Relocate, Compare, NoOp) to perform unique types of language reasoning.\nBased on a question, our layout controller RNN dynamically infers a series of\nreasoning modules to construct the entire network. Empirically, we show that\nour dynamic, multi-hop modular network achieves signifi

In [111]:
import numpy as np
all_indexes = np.array(dataset_train['index'])

In [112]:
import random

def generate_hard_negatives(example, num_negatives=10,dataset=dataset_train):
    """Build one contrastive training record from a title/abstract row.

    The row's title becomes the query, its own abstract the positive, and
    ``num_negatives`` abstracts from *other* rows are drawn uniformly at random
    without replacement. Despite the function name, no hardness criterion is
    applied: the negatives are plain random samples.

    Args:
        example: Row with 'index', 'title' and 'abstract' keys. 'index' is
            expected to be the row's position inside ``dataset``.
        num_negatives: How many negative abstracts to draw.
        dataset: Collection supporting ``len()`` and integer indexing whose
            rows expose an 'abstract' key.

    Returns:
        Dict with keys 'query' (title), 'positive' (abstract) and
        'negatives' (list of ``num_negatives`` abstracts).

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num_negatives``
            is larger than the number of candidate rows.
    """
    query_index = int(example['index'])
    total_rows = len(dataset)

    if 0 <= query_index < total_rows:
        # Draw from a range that is one element short, then shift every draw at
        # or after the query position up by one. This is uniform over all rows
        # except the query row, without materializing a fresh candidate array
        # (which made the full map quadratic in the dataset size).
        draws = random.sample(range(total_rows - 1), num_negatives)
        positions = [pos + 1 if pos >= query_index else pos for pos in draws]
    else:
        # The index does not address a row of `dataset`, so nothing needs excluding.
        positions = random.sample(range(total_rows), num_negatives)

    negatives = [dataset[pos]['abstract'] for pos in positions]

    return {
        "query": example['title'],
        "positive": example['abstract'],
        "negatives": negatives
    }

In [None]:
processed_data_train = dataset_train.map(generate_hard_negatives, remove_columns=dataset_train.column_names)

Map:   0%|          | 0/94073 [00:00<?, ? examples/s]

In [None]:
processed_data_train

In [None]:
processed_data_train[0]

In [None]:
# Materialize the processed training split as a list of plain dicts, renaming
# the 'query' field to 'anchor' on the way.
contrastive_pairs_train = [
    {
        "anchor": row["query"],
        "positive": row["positive"],
        "negatives": row["negatives"],
    }
    for row in processed_data_train
]

In [None]:
len(contrastive_pairs_train)

In [None]:
from torch.utils.data import DataLoader

In [None]:
class ContrastiveDataset:
    """Indexable view over a sequence of contrastive records.

    Each record is a mapping with 'anchor', 'positive' and 'negatives' keys;
    indexing yields them as an ``(anchor, positive, negatives)`` tuple.
    """

    _FIELDS = ("anchor", "positive", "negatives")

    def __init__(self, pairs):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.pairs = pairs

    def __len__(self):
        """Number of records."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the record at ``idx`` as (anchor, positive, negatives)."""
        record = self.pairs[idx]
        return tuple(record[field] for field in self._FIELDS)

In [None]:
contrastive_dataset_train = ContrastiveDataset(contrastive_pairs_train)

In [None]:
data_loader_train = DataLoader(contrastive_dataset_train, batch_size=8, shuffle=True)

In [None]:
len(data_loader_train)

In [None]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
from peft import LoraConfig, get_peft_model

In [None]:
lora_config = LoraConfig(
    task_type= "FEATURE_EXTRACTION"
)

In [None]:
lora_model = get_peft_model(model, lora_config)

In [None]:
def cosine_distance(x, y):
    """Return one minus the cosine similarity of ``x`` and ``y`` along their last dimension."""
    similarity = torch.nn.functional.cosine_similarity(x, y, dim=-1)
    return 1 - similarity

In [None]:
def info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn, temperature=1.0):
    """InfoNCE loss over one positive and several negatives per anchor.

    Negated distances are used as logits, with the positive placed in column 0,
    so the loss is the cross-entropy of picking column 0 for every anchor.

    Args:
        anchor_embedding: Batch of anchor embeddings.
        positive_embedding: Batch of positive embeddings, aligned with the anchors.
        negative_embedding: Iterable of embedding batches, one per negative
            slot, each aligned with the anchors.
        distance_fn: Callable ``(x, y) -> distances``. It is expected to return
            one distance per row, since the result is unsqueezed to a column
            and concatenated along dim 1.
        temperature: Positive scale the logits are divided by. Values below 1
            sharpen the softmax; the default of 1.0 leaves the logits unscaled.

    Returns:
        Scalar tensor: mean cross-entropy over the batch.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    pos_dist = distance_fn(anchor_embedding, positive_embedding)
    neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)

    # Smaller distance means larger logit; column 0 holds the positive.
    logits = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1) / temperature
    # The correct "class" is always column 0.
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    return torch.nn.functional.cross_entropy(logits, labels)

In [None]:
import torch
import torch.optim as optim
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

In [None]:
num_epochs=3

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

In [None]:
torch.cuda.is_available()

In [None]:
lora_model = lora_model.to(device)

In [None]:
dataset_val = split_datasets["test"]

In [None]:
dataset_val

In [None]:
dataset_val = dataset_val.remove_columns(['Unnamed: 0','Unnamed: 0.1'])

In [None]:
dataset_val = dataset_val.map(lambda x, idx: { 'index': idx }, with_indices=True)

In [None]:
dataset_val

In [None]:
all_indexes_val = np.array(dataset_val['index'])

In [None]:
def generate_hard_negatives_val(example, num_negatives=10,dataset=dataset_val):
    """Build one contrastive validation record from a title/abstract row.

    The row's title becomes the query, its own abstract the positive, and
    ``num_negatives`` abstracts from *other* rows are drawn uniformly at random
    without replacement. Despite the function name, no hardness criterion is
    applied: the negatives are plain random samples.

    Args:
        example: Row with 'index', 'title' and 'abstract' keys. 'index' is
            expected to be the row's position inside ``dataset``.
        num_negatives: How many negative abstracts to draw.
        dataset: Collection supporting ``len()`` and integer indexing whose
            rows expose an 'abstract' key.

    Returns:
        Dict with keys 'query' (title), 'positive' (abstract) and
        'negatives' (list of ``num_negatives`` abstracts).

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num_negatives``
            is larger than the number of candidate rows.
    """
    query_index = int(example['index'])
    total_rows = len(dataset)

    if 0 <= query_index < total_rows:
        # Draw from a range that is one element short, then shift every draw at
        # or after the query position up by one. This is uniform over all rows
        # except the query row, without materializing a fresh candidate array
        # (which made the full map quadratic in the dataset size).
        draws = random.sample(range(total_rows - 1), num_negatives)
        positions = [pos + 1 if pos >= query_index else pos for pos in draws]
    else:
        # The index does not address a row of `dataset`, so nothing needs excluding.
        positions = random.sample(range(total_rows), num_negatives)

    negatives = [dataset[pos]['abstract'] for pos in positions]

    return {
        "query": example['title'],
        "positive": example['abstract'],
        "negatives": negatives
    }

In [None]:
processed_data_val = dataset_val.map(generate_hard_negatives_val, remove_columns=dataset_val.column_names)

In [None]:
# Materialize the processed validation split as a list of plain dicts, renaming
# the 'query' field to 'anchor' on the way.
contrastive_pairs_val = [
    {
        "anchor": row["query"],
        "positive": row["positive"],
        "negatives": row["negatives"],
    }
    for row in processed_data_val
]

In [None]:
contrastive_dataset_val = ContrastiveDataset(contrastive_pairs_val)

In [None]:
# Validation batches are not shuffled: the per-query reciprocal rank does not
# depend on batch order, and a fixed order keeps batch composition (and hence
# padding lengths) identical from one evaluation to the next.
data_loader_val = DataLoader(contrastive_dataset_val, batch_size=8, shuffle=False)

In [None]:
len(data_loader_val)

In [None]:
def evaluate_mrr(model, data_loader_val, distance_fn):
    """Compute the Mean Reciprocal Rank of the positive among its negatives.

    For every query in every batch, the positive and all negatives are ranked
    by negated distance to the anchor; the reciprocal of the positive's
    1-based rank is averaged over all queries.

    Relies on the notebook-level ``tokenizer`` and ``device`` globals. The
    model is switched to eval mode and left in it; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``; the
            first-token vector is used as the text embedding.
        data_loader_val: Iterable of batches where ``batch[0]`` holds anchor
            texts, ``batch[1]`` positive texts and ``batch[2]`` a sequence with
            one batch of texts per negative slot (assumed to be what the
            DataLoader's default collation yields for ContrastiveDataset
            items; confirm if a custom collate_fn is introduced).
        distance_fn: Callable ``(x, y) -> distances`` returning one distance per row.

    Returns:
        The MRR as a float, or 0.0 when the loader yields no queries.
    """
    model.eval()

    def _first_token_embedding(texts):
        # Tokenize, move to the target device and keep the first-token vector.
        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        return model(**encoded).last_hidden_state[:, 0, :]

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text, positive_text, negative_texts = batch[0], batch[1], batch[2]

            anchor_embedding = _first_token_embedding(anchor_text)
            positive_embedding = _first_token_embedding(positive_text)
            negative_embedding = [_first_token_embedding(neg) for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            # Column 0 is the positive; the remaining columns are the negatives.
            all_similarities = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)

            _, sorted_indices = torch.sort(all_similarities, dim=1, descending=True)

            # Position of column 0 in the sorted order, +1 to make the rank 1-based.
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1
            total_rr += torch.sum(1.0 / positive_rank.float()).item()
            num_queries += len(positive_rank)

    # An empty loader previously raised ZeroDivisionError; report 0.0 instead.
    if num_queries == 0:
        return 0.0
    return total_rr / num_queries

In [None]:
import os
save_dir ="/dss/dsshome1/07/ra65bex2/srawat/baseline"

In [None]:
import time
epoch_metrics = []

In [None]:
torch.cuda.empty_cache()

In [None]:
# Create the checkpoint directory up front: torch.save does not create missing
# parent directories, so a missing save_dir would otherwise only surface as an
# error after the first full epoch of training.
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    start_time = time.time()
    # evaluate_mrr switches the model to eval mode at the end of each epoch,
    # so training mode has to be re-enabled here.
    lora_model.train()

    total_loss = 0.0
    for batch in data_loader_train:
        anchor_texts = batch[0]
        positive_texts = batch[1]
        negative_texts = batch[2]

        anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        # The first-token vector of the last hidden state is the text embedding.
        anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
        positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]
        # One forward pass per negative slot; each element of negative_texts is
        # iterated as a batch of texts.
        negative_embedding = [lora_model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

        loss = info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn=cosine_distance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average per-batch loss, computed once and reused for logging and metrics.
    avg_train_loss = total_loss / len(data_loader_train)

    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}.pth")
    # NOTE(review): this pickles the whole wrapped model object, not just the
    # adapter weights; consider the PEFT save_pretrained API if smaller,
    # code-independent checkpoints are wanted.
    torch.save(lora_model, save_path)
    print(f"EPOCH {epoch+1}:")
    print(f"Checkpoint saved: {save_path}")
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")
    mrr_validation = evaluate_mrr(lora_model, data_loader_val, cosine_distance)
    print(f"Mean Reciprocal Rank (MRR) for validation set: {mrr_validation:.4f}")
    end_time = time.time()
    # The reported duration covers training, checkpointing and validation.
    elapsed_minutes = (end_time - start_time) / 60
    print(f"Epoch {epoch+1} took {elapsed_minutes:.4f} minutes.")
    print("\n")
    epoch_metrics.append({
        'epoch': epoch + 1,
        'training_loss': avg_train_loss,
        'mrr_validation': mrr_validation,
        'time_taken_minutes': elapsed_minutes
    })

In [None]:
import json

In [None]:
# Persist the per-epoch metrics collected during training as a JSON array
# next to the checkpoints.
metrics_path = save_dir + '/epoch_metrics.json'
with open(metrics_path, 'w') as metrics_file:
    metrics_file.write(json.dumps(epoch_metrics))