In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('ms_marco', 'v1.1')

In [3]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})

In [4]:
dataset = load_dataset('ms_marco', 'v1.1', split='train')

In [5]:
dataset

Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 82326
})

In [6]:
type(dataset)

datasets.arrow_dataset.Dataset

In [7]:
def preprocess(example):
    positive_passages=[]
    for p in enumerate(example['passages']["is_selected"]):
        if p[1]==1:
            positive_passages.append(example["passages"]["passage_text"][p[0]])
    negative_passages = []
    for p in enumerate(example['passages']["is_selected"]):
        if p[1]==0:
            negative_passages.append(example["passages"]["passage_text"][p[0]])
    if (len(positive_passages)>0 and len(negative_passages)>=5):
        positive = positive_passages[0]
        negatives = negative_passages[:5]
        return {
            "query": example["query"],
            "positive": positive,
            "negatives": negatives
        }
    else:
        return {"query": None, "positive": None, "negatives": None}

In [8]:
processed_data = dataset.map(preprocess, remove_columns=dataset.column_names)

In [9]:
processed_data = processed_data.filter(lambda x: x['query'] is not None and x['positive'] is not None)

In [10]:
contrastive_pairs = []
for item in processed_data:
    query = item["query"]
    positive = item["positive"]
    negatives = item["negatives"]
    contrastive_pairs.append({
        "anchor": query,
        "positive": positive,
        "negatives": negatives
    })

In [11]:
len(contrastive_pairs)

74538

In [12]:
from torch.utils.data import DataLoader

In [13]:
class ContrastiveDataset:
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        item = self.pairs[idx]
        return item["anchor"], item["positive"], item["negatives"]

In [14]:
contrastive_dataset = ContrastiveDataset(contrastive_pairs[0:4])

In [15]:
data_loader = DataLoader(contrastive_dataset, batch_size=2, shuffle=True)

In [16]:
len(data_loader)

2

In [17]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [18]:
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
from peft import LoraConfig, get_peft_model

In [20]:
lora_config = LoraConfig(
    task_type= "FEATURE_EXTRACTION"
)

In [21]:
lora_model = get_peft_model(model, lora_config)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [22]:
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 109,777,152 || trainable%: 0.2686


In [23]:
import torch

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [25]:
import torch.optim as optim
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

In [26]:
num_epochs=2

In [27]:
import torch.nn.functional as F

In [39]:
def compute_laplacian(similarity_matrix):
    similarity_matrix = similarity_matrix.float()

    degree_matrix = torch.diag(torch.sum(similarity_matrix, dim=1))

    laplacian_matrix = degree_matrix - similarity_matrix

    return laplacian_matrix

In [84]:
def harmonic_distance(all_embeddings):
    harmonic_distances = []
    for idx in range(0, len(all_embeddings), 7):
        sample_embeddings=all_embeddings[idx:idx+7]
        sim=F.cosine_similarity(sample_embeddings.unsqueeze(1), sample_embeddings.unsqueeze(0), dim=2)
        laplacian_matrix = compute_laplacian(sim)
        laplacian_pseudo_inv = torch.pinverse(laplacian_matrix)
        anchor_node=torch.zeros(len(laplacian_matrix), dtype=torch.float32)
        anchor_node[0] = 1        
        distances = []
        for i in range(1,7):
            node=torch.zeros(len(laplacian_matrix), dtype=torch.float32)
            node[i] = 1
            diff = anchor_node - node
            dist = torch.sqrt(torch.matmul(torch.matmul(diff.T, laplacian_pseudo_inv), diff))
            distances.append(dist)
        harmonic_distances.append(torch.stack(distances))
    return(harmonic_distances)

In [52]:
all_embeddings

tensor([[-0.2465, -0.2117,  0.5187,  ..., -0.1181,  0.1114,  0.6934],
        [-0.4711, -0.2889,  0.6605,  ...,  0.0811,  0.0191,  0.4664],
        [-0.8228,  0.0446, -0.1155,  ...,  0.1651, -0.5674,  0.7006],
        ...,
        [-0.1158, -0.0063,  0.0353,  ..., -0.1332,  0.0306,  0.1985],
        [ 0.2141, -0.1693, -0.0944,  ...,  0.4810,  0.2314,  0.9298],
        [ 0.0433, -0.1788,  0.0576,  ..., -0.4694,  0.1978,  0.5300]],
       grad_fn=<CatBackward0>)

In [87]:
negative_embedding = [torch.tensor([
    [-0.1, 0.2, 0.9],  # Negatives for the first anchor
     [0.8, -0.5, 0.1]]),
     torch.tensor([[-0.3, -0.2, -0.4],
     [0.1, -0.1, 0.5]]),
    torch.tensor([[0.5, 0.6, -0.2],   # Negatives for the second anchor
     [0.7, 0.3, -0.9]]),
     torch.tensor([[-0.8, -0.6, 0.3],
     [-0.4, 0.4, 0.2]]),
     torch.tensor([[-0.9, -0.6, 0.3],
     [-0.4, 0.4, 0.1]])]

In [88]:
anchor_embedding = torch.tensor([
    [0.45, 0.15, -0.35],  # Similar to the first anchor
    [-0.25, 0.38, 0.85]   # Similar to the second anchor
])

In [89]:
positive_embedding = torch.tensor([
    [0.85, 0.15, -0.35],  # Similar to the first anchor
    [-0.35, 0.38, 0.85]   # Similar to the second anchor
])

In [90]:
batch_embeddings = []
for i in range(len(anchor_embedding)):  
    sample_embeddings = torch.cat(
        [anchor_embedding[i:i+1], 
        positive_embedding[i:i+1],  
        torch.stack([neg[i] for neg in negative_embedding])],
        dim=0
    )
    batch_embeddings.append(sample_embeddings)

all_embeddings = torch.cat(batch_embeddings, dim=0)
all_embeddings

tensor([[ 0.4500,  0.1500, -0.3500],
        [ 0.8500,  0.1500, -0.3500],
        [-0.1000,  0.2000,  0.9000],
        [-0.3000, -0.2000, -0.4000],
        [ 0.5000,  0.6000, -0.2000],
        [-0.8000, -0.6000,  0.3000],
        [-0.9000, -0.6000,  0.3000],
        [-0.2500,  0.3800,  0.8500],
        [-0.3500,  0.3800,  0.8500],
        [ 0.8000, -0.5000,  0.1000],
        [ 0.1000, -0.1000,  0.5000],
        [ 0.7000,  0.3000, -0.9000],
        [-0.4000,  0.4000,  0.2000],
        [-0.4000,  0.4000,  0.1000]])

In [96]:
for idx in range(0, len(all_embeddings), 7):
    sample_embeddings=all_embeddings[idx:idx+7]
    sim=F.cosine_similarity(sample_embeddings.unsqueeze(1), sample_embeddings.unsqueeze(0), dim=2)
    laplacian_matrix = compute_laplacian(sim)
    laplacian_pseudo_inv = torch.pinverse(laplacian_matrix)
    print(laplacian_pseudo_inv)

tensor([[1022920.5625, 1022910.3125, 1022928.2500, 1022934.6250, 1022911.4375,
         1022908.1250, 1022908.2500],
        [1022915.9375, 1022896.7500, 1022944.9375, 1022967.1250, 1022894.8125,
         1022886.9375, 1022881.3750],
        [1022921.0625, 1022932.1875, 1022910.9375, 1022902.9375, 1022938.2500,
         1022935.6875, 1022941.4375],
        [1022922.2500, 1022949.1250, 1022897.6875, 1022875.9375, 1022957.0000,
         1022956.6875, 1022967.4375],
        [1022916.5625, 1022894.3750, 1022950.5625, 1022974.5625, 1022866.4375,
         1022870.1875, 1022866.2500],
        [1022916.1250, 1022889.3750, 1022950.9375, 1022977.1250, 1022873.1250,
         1022866.6875, 1022869.5625],
        [1022916.3750, 1022883.9375, 1022956.8125, 1022988.0000, 1022869.2500,
         1022869.6875, 1022852.8750]])
tensor([[996714.6875, 996714.5000, 996714.5000, 996714.3750, 996714.3125,
         996714.3125, 996714.3125],
        [996714.4375, 996714.9375, 996714.5625, 996714.5000, 996714.43

In [95]:
eigenvalues[0]

tensor(-3.8278+0.j)

In [76]:
def harmonic_loss(distances):
    logits=-torch.stack(distances, dim=0)
    print(logits)
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)
    loss = torch.nn.CrossEntropyLoss()(logits, labels)
    return loss

In [45]:
dataset_val = load_dataset('ms_marco', 'v1.1', split='validation')
processed_data_val = dataset_val.map(preprocess, remove_columns=dataset_val.column_names)
processed_data_val = processed_data_val.filter(lambda x: x['query'] is not None and x['positive'] is not None)
contrastive_pairs_val = []
for item in processed_data_val:
    query = item["query"]
    positive = item["positive"]
    negatives = item["negatives"]
    contrastive_pairs_val.append({
        "anchor": query,
        "positive": positive,
        "negatives": negatives
    })
contrastive_dataset_val = ContrastiveDataset(contrastive_pairs_val[0:4])
data_loader_val = DataLoader(contrastive_dataset_val, batch_size=2, shuffle=True)
len(data_loader_val)

2

In [49]:
def evaluate_mrr(model, data_loader_val):
    model.eval()

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text = batch[0]
            positive_text = batch[1]
            negative_texts = batch[2]

            anchor_input = tokenizer(anchor_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            positive_input = tokenizer(positive_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

            anchor_embedding = model(**anchor_input).last_hidden_state[:, 0, :]
            positive_embedding = model(**positive_input).last_hidden_state[:, 0, :]
            negative_embedding = [model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]
            batch_embeddings = []
    
            for i in range(len(anchor_embedding)):
                sample_embeddings = torch.cat(
                    [anchor_embedding[i:i+1], 
                    positive_embedding[i:i+1], 
                    torch.stack([neg[i] for neg in negative_embedding])],
                    dim=0
                )
                batch_embeddings.append(sample_embeddings)

            all_embeddings = torch.cat(batch_embeddings, dim=0)
            distances=harmonic_distance(all_embeddings)
            all_similarities=-torch.stack(distances, dim=0)

            sorted_similarities, sorted_indices = torch.sort(all_similarities, dim=1, descending=True)

            # Find the rank of the first relevant (positive) document
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1  # +1 to make rank 1-based
            total_rr += torch.sum(1.0 / positive_rank.float()).item()  # Reciprocal rank
            num_queries += len(positive_rank)

    mrr = total_rr / num_queries
    return mrr



In [83]:
for epoch in range(num_epochs):
    lora_model.train() 
    
    total_loss = 0.0
    for batch in data_loader:
        
        anchor_texts = batch[0]
        positive_texts = batch[1]
        negative_texts = batch[2]
    
        anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
        positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]
        negative_embedding = [lora_model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

        batch_embeddings = []
        
        for i in range(len(anchor_embedding)): 
            sample_embeddings = torch.cat(
                [anchor_embedding[i:i+1],  
                positive_embedding[i:i+1],  
                torch.stack([neg[i] for neg in negative_embedding])],
                dim=0
            )
            batch_embeddings.append(sample_embeddings)

        all_embeddings = torch.cat(batch_embeddings, dim=0)
        
        
        distances=harmonic_distance(all_embeddings)

        loss = harmonic_loss(distances)
        
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step()  
        
        total_loss += loss.item()
    print(total_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(data_loader)}")
    mrr_validation = evaluate_mrr(lora_model, data_loader_val)
    #mrr_train = evaluate_mrr(lora_model, data_loader)
    #print(f"Mean Reciprocal Rank (MRR) for training set: {mrr_train:.4f}")
    print(f"Mean Reciprocal Rank (MRR) for validation set: {mrr_validation:.4f}")

tensor([[-0., -0., -0., nan, -0., nan],
        [-0., -0., -0., -0., -0., -0.]], grad_fn=<NegBackward0>)


_LinAlgError: linalg.svd: The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated singular values (error code: 6).