In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 117592
    })
})

In [4]:
# Fixed seed so the 80/20 split (and every artefact derived from it) is
# reproducible across kernel restarts instead of changing on each run.
split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [5]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 94073
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
        num_rows: 23519
    })
})

In [6]:
dataset_train = split_datasets["train"]

In [7]:
dataset_train

Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'title', 'abstract'],
    num_rows: 94073
})

In [8]:
dataset_train = dataset_train.remove_columns(['Unnamed: 0','Unnamed: 0.1'])

In [9]:
dataset_train

Dataset({
    features: ['title', 'abstract'],
    num_rows: 94073
})

In [10]:
dataset_train[0]

{'title': 'A Privacy-Preserving and Trustable Multi-agent Learning Framework',
 'abstract': "  Distributed multi-agent learning enables agents to cooperatively train a\nmodel without requiring to share their datasets. While this setting ensures\nsome level of privacy, it has been shown that, even when data is not directly\nshared, the training process is vulnerable to privacy attacks including data\nreconstruction and model inversion attacks. Additionally, malicious agents that\ntrain on inverted labels or random data, may arbitrarily weaken the accuracy of\nthe global model. This paper addresses these challenges and presents\nPrivacy-preserving and trustable Distributed Learning (PT-DL), a fully\ndecentralized framework that relies on Differential Privacy to guarantee strong\nprivacy protections of the agents' data, and Ethereum smart contracts to ensure\ntrustability. The paper shows that PT-DL is resilient up to a 50% collusion\nattack, with high probability, in a malicious trust mo

In [11]:
# Attach each row's own position as an 'index' column.
dataset_train = dataset_train.map(
    lambda _row, position: {'index': position},
    with_indices=True,
)

Map:   0%|          | 0/94073 [00:00<?, ? examples/s]

In [12]:
dataset_train

Dataset({
    features: ['title', 'abstract', 'index'],
    num_rows: 94073
})

In [13]:
dataset_train[4]

{'title': 'Simple Neighborhood Representative Pre-processing Boosts Outlier\n  Detectors',
 'abstract': '  Over the decades, traditional outlier detectors have ignored the group-level\nfactor when calculating outlier scores for objects in data by evaluating only\nthe object-level factor, failing to capture the collective outliers. To\nmitigate this issue, we present a method called neighborhood representative\n(NR), which empowers all the existing outlier detectors to efficiently detect\noutliers, including collective outliers, while maintaining their computational\nintegrity. It achieves this by selecting representative objects, scoring these\nobjects, then applies the score of the representative objects to its collective\nobjects. Without altering existing detectors, NR is compatible with existing\ndetectors, while improving performance on real world datasets with +8% (0.72 to\n0.78 AUC) relative to state-of-the-art outlier detectors.\n',
 'index': 4}

In [14]:
dataset_train[20572]

{'title': 'UNOC: Understanding Occlusion for Embodied Presence in Virtual Reality',
 'abstract': '  Tracking body and hand motions in the 3D space is essential for social and\nself-presence in augmented and virtual environments. Unlike the popular 3D pose\nestimation setting, the problem is often formulated as inside-out tracking\nbased on embodied perception (e.g., egocentric cameras, handheld sensors). In\nthis paper, we propose a new data-driven framework for inside-out body\ntracking, targeting challenges of omnipresent occlusions in optimization-based\nmethods (e.g., inverse kinematics solvers). We first collect a large-scale\nmotion capture dataset with both body and finger motions using optical markers\nand inertial sensors. This dataset focuses on social scenarios and captures\nground truth poses under self-occlusions and body-hand interactions. We then\nsimulate the occlusion patterns in head-mounted camera views on the captured\nground truth using a ray casting algorithm and 

In [15]:
import numpy as np
all_indexes = np.array(dataset_train['index'])

In [16]:
import random

def generate_hard_negatives(example, num_negatives=10, dataset=dataset_train):
    """Build one contrastive record (query, positive, negatives) for a row.

    Despite the name, the negatives are drawn uniformly at random from the
    other rows of ``dataset``; no hardness mining is performed.

    Args:
        example: Row providing ``'index'``, ``'title'`` and ``'abstract'``.
            ``example['index']`` is treated as the row's position in
            ``dataset`` (negatives are looked up by position).
        num_negatives: How many distinct negative abstracts to draw.
        dataset: Sized, position-indexable collection of rows that each
            expose an ``'abstract'`` field.

    Returns:
        dict with keys ``"query"`` (the title), ``"positive"`` (the row's own
        abstract) and ``"negatives"`` (list of ``num_negatives`` abstracts
        taken from other rows).

    Raises:
        ValueError: If ``num_negatives`` exceeds the number of candidate rows
            (raised by ``random.sample``).
    """
    query_index = int(example['index'])
    pool_size = len(dataset)

    # Sample positions directly rather than materialising a filtered copy of
    # every index for each example; that copy made the surrounding map
    # quadratic in the dataset size.
    own_row_in_pool = 0 <= query_index < pool_size
    candidate_count = pool_size - 1 if own_row_in_pool else pool_size
    picks = random.sample(range(candidate_count), num_negatives)

    negatives = []
    for pick in picks:
        # Picks come from a range that is one short; shifting everything at or
        # above the query position by one skips the query row while keeping
        # the draw uniform over all remaining rows.
        position = pick + 1 if own_row_in_pool and pick >= query_index else pick
        negatives.append(dataset[position]['abstract'])

    return {
        "query": example['title'],
        "positive": example['abstract'],
        "negatives": negatives
    }

In [17]:
processed_data_train = dataset_train.map(generate_hard_negatives, remove_columns=dataset_train.column_names)

Map:   0%|          | 0/94073 [00:00<?, ? examples/s]

In [None]:
processed_data_train

In [None]:
processed_data_train[0]

In [None]:
# Copy the mapped dataset into a plain list of dicts, renaming the
# "query" field to "anchor" along the way.
contrastive_pairs_train = []
for item in processed_data_train:
    query, positive, negatives = item["query"], item["positive"], item["negatives"]
    contrastive_pairs_train.append(
        dict(anchor=query, positive=positive, negatives=negatives)
    )

In [None]:
len(contrastive_pairs_train)

In [None]:
from torch.utils.data import DataLoader

In [None]:
class ContrastiveDataset:
    """Thin map-style wrapper around a sequence of contrastive records.

    Each record is expected to be a mapping with the keys ``"anchor"``,
    ``"positive"`` and ``"negatives"``.
    """

    _FIELDS = ("anchor", "positive", "negatives")

    def __init__(self, pairs):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.pairs = pairs

    def __len__(self):
        """Number of stored records."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the record at ``idx`` as an (anchor, positive, negatives) tuple."""
        record = self.pairs[idx]
        return tuple(record[field] for field in self._FIELDS)

In [None]:
contrastive_dataset_train = ContrastiveDataset(contrastive_pairs_train)

In [None]:
data_loader_train = DataLoader(contrastive_dataset_train, batch_size=32, shuffle=True)

In [None]:
len(data_loader_train)

In [None]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
from peft import LoraConfig, get_peft_model

In [None]:
lora_config = LoraConfig(
    task_type= "FEATURE_EXTRACTION"
)

In [None]:
lora_model = get_peft_model(model, lora_config)

In [None]:
def cosine_distance(x, y):
    """Return ``1 - cosine_similarity(x, y)`` computed along the last dimension."""
    similarity = torch.nn.functional.cosine_similarity(x, y, dim=-1)
    return 1 - similarity

In [None]:
def info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn, temperature=1.0):
    """InfoNCE-style loss: classify the positive against a set of negatives.

    Negated distances are used as logits, with the positive placed in
    column 0, and cross-entropy is taken against class 0 for every row.

    Args:
        anchor_embedding: Batch of anchor embeddings.
        positive_embedding: Batch of positive embeddings, row-aligned with the
            anchors.
        negative_embedding: Iterable of negative embedding batches; each item
            is compared against the anchors as a whole.
        distance_fn: Callable ``(anchor, other) -> distances``. The code
            requires it to return one value per batch row (1-D), because the
            result is unsqueezed / stacked into a ``(batch, 1 + negatives)``
            logit matrix.
        temperature: Positive scale the logits are divided by. The default of
            1.0 reproduces the unscaled loss; smaller values sharpen the
            softmax.

    Returns:
        Scalar tensor holding the mean cross-entropy over the batch.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    pos_dist = distance_fn(anchor_embedding, positive_embedding)
    neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)

    # Smaller distance means a larger logit; column 0 is always the positive.
    logits = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1) / temperature
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    return torch.nn.functional.cross_entropy(logits, labels)

In [None]:
import torch
import torch.optim as optim
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

In [None]:
num_epochs=3

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

In [None]:
torch.cuda.is_available()

In [None]:
lora_model = lora_model.to(device)

In [None]:
dataset_val = split_datasets["test"]

In [None]:
dataset_val

In [None]:
dataset_val = dataset_val.remove_columns(['Unnamed: 0','Unnamed: 0.1'])

In [None]:
# Attach each validation row's own position as an 'index' column.
dataset_val = dataset_val.map(
    lambda _row, position: {'index': position},
    with_indices=True,
)

In [None]:
dataset_val

In [None]:
all_indexes_val = np.array(dataset_val['index'])

In [None]:
def generate_hard_negatives_val(example, num_negatives=10, dataset=dataset_val):
    """Build one contrastive validation record (query, positive, negatives).

    Despite the name, the negatives are drawn uniformly at random from the
    other rows of ``dataset``; no hardness mining is performed.

    Args:
        example: Row providing ``'index'``, ``'title'`` and ``'abstract'``.
            ``example['index']`` is treated as the row's position in
            ``dataset`` (negatives are looked up by position).
        num_negatives: How many distinct negative abstracts to draw.
        dataset: Sized, position-indexable collection of rows that each
            expose an ``'abstract'`` field.

    Returns:
        dict with keys ``"query"`` (the title), ``"positive"`` (the row's own
        abstract) and ``"negatives"`` (list of ``num_negatives`` abstracts
        taken from other rows).

    Raises:
        ValueError: If ``num_negatives`` exceeds the number of candidate rows
            (raised by ``random.sample``).
    """
    query_index = int(example['index'])
    pool_size = len(dataset)

    # Sample positions directly rather than materialising a filtered copy of
    # every index for each example; that copy made the surrounding map
    # quadratic in the dataset size.
    own_row_in_pool = 0 <= query_index < pool_size
    candidate_count = pool_size - 1 if own_row_in_pool else pool_size
    picks = random.sample(range(candidate_count), num_negatives)

    negatives = []
    for pick in picks:
        # Picks come from a range that is one short; shifting everything at or
        # above the query position by one skips the query row while keeping
        # the draw uniform over all remaining rows.
        position = pick + 1 if own_row_in_pool and pick >= query_index else pick
        negatives.append(dataset[position]['abstract'])

    return {
        "query": example['title'],
        "positive": example['abstract'],
        "negatives": negatives
    }

In [None]:
processed_data_val = dataset_val.map(generate_hard_negatives_val, remove_columns=dataset_val.column_names)

In [None]:
# Copy the mapped validation set into a plain list of dicts, renaming the
# "query" field to "anchor" along the way.
contrastive_pairs_val = []
for item in processed_data_val:
    query, positive, negatives = item["query"], item["positive"], item["negatives"]
    contrastive_pairs_val.append(
        dict(anchor=query, positive=positive, negatives=negatives)
    )

In [None]:
contrastive_dataset_val = ContrastiveDataset(contrastive_pairs_val)

In [None]:
# Evaluation gains nothing from shuffling; a fixed order keeps batch
# composition (and therefore padding) identical from one evaluation to the next.
data_loader_val = DataLoader(contrastive_dataset_val, batch_size=32, shuffle=False)

In [None]:
len(data_loader_val)

In [None]:
def evaluate_mrr(model, data_loader_val, distance_fn):
    """Compute the Mean Reciprocal Rank of the positive among its negatives.

    For every anchor, the positive and all negatives are scored by negated
    distance, sorted in descending order, and the reciprocal of the positive's
    1-based rank is accumulated.

    Relies on the notebook-level ``tokenizer`` and ``device`` globals.
    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        data_loader_val: Iterable of batches laid out as
            ``(anchor_texts, positive_texts, negative_texts)``, where
            ``negative_texts`` is iterated and each item is tokenized as a
            batch of its own.
        distance_fn: Callable ``(anchor, other) -> distances`` returning one
            value per batch row.

    Returns:
        float: Mean reciprocal rank over all queries, or ``0.0`` when the
        loader yields no queries (previously a ``ZeroDivisionError``).
    """
    model.eval()

    def _first_token_embedding(texts):
        # Tokenize a batch of texts and keep the hidden state at position 0
        # (the [CLS] slot for the BERT tokenizer loaded in this notebook).
        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        return model(**encoded).last_hidden_state[:, 0, :]

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text, positive_text, negative_texts = batch[0], batch[1], batch[2]

            anchor_embedding = _first_token_embedding(anchor_text)
            positive_embedding = _first_token_embedding(positive_text)
            negative_embedding = [_first_token_embedding(neg) for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            # Column 0 holds the positive; the remaining columns hold the negatives.
            all_similarities = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)

            _, sorted_indices = torch.sort(all_similarities, dim=1, descending=True)

            # Where column 0 landed after sorting gives the positive's rank (+1 for 1-based).
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1
            total_rr += torch.sum(1.0 / positive_rank.float()).item()
            num_queries += len(positive_rank)

    # An empty loader has no queries to average over; report 0.0 instead of dividing by zero.
    if num_queries == 0:
        return 0.0
    return total_rr / num_queries

In [None]:
import os
save_dir ="/dss/dsshome1/07/ra65bex2/srawat/baseline"

In [None]:
import time
epoch_metrics = []

In [None]:
torch.cuda.empty_cache()

In [None]:
# Create the checkpoint directory up front so a missing folder cannot abort
# the run only after a full epoch of training has already been spent.
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    start_time = time.time()
    # evaluate_mrr leaves the model in eval mode, so re-enable training mode every epoch.
    lora_model.train()

    total_loss = 0.0
    for batch in data_loader_train:

        anchor_texts = batch[0]
        positive_texts = batch[1]
        # Iterated below, with each item tokenized as its own batch
        # (presumably one entry per negative slot after collation; verify).
        negative_texts = batch[2]

        anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        # Use the hidden state at position 0 as the sequence embedding.
        anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
        positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]
        negative_embedding = [lora_model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

        loss = info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn=cosine_distance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses; computed once and reused for the log line and the metrics record.
    avg_train_loss = total_loss / len(data_loader_train)

    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}.pth")
    # NOTE(review): this pickles the entire model object rather than just its weights.
    # Left unchanged so existing checkpoint loaders keep working.
    torch.save(lora_model, save_path)
    print(f"EPOCH {epoch+1}:")
    print(f"Checkpoint saved: {save_path}")
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")
    mrr_validation = evaluate_mrr(lora_model, data_loader_val, cosine_distance)
    print(f"Mean Reciprocal Rank (MRR) for validation set: {mrr_validation:.4f}")
    end_time = time.time()
    # Wall-clock time covers training, checkpointing and validation for this epoch.
    elapsed_minutes = (end_time - start_time) / 60
    print(f"Epoch {epoch+1} took {elapsed_minutes:.4f} minutes.")
    print("\n")
    epoch_metrics.append({
        'epoch': epoch + 1,
        'training_loss': avg_train_loss,
        'mrr_validation': mrr_validation,
        'time_taken_minutes': elapsed_minutes
    })

In [None]:
import json

In [None]:
# Write the collected per-epoch metrics as JSON inside the checkpoint directory.
with open(f"{save_dir}/epoch_metrics.json", mode='w') as f:
    json.dump(epoch_metrics, fp=f)