In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('ms_marco', 'v1.1')

In [3]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})

In [4]:
dataset = load_dataset('ms_marco', 'v1.1', split='train')

In [5]:
dataset

Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 82326
})

In [6]:
type(dataset)

datasets.arrow_dataset.Dataset

In [7]:
def preprocess(example):
    """Turn one dataset record into a query / positive / negatives triple.

    Passages whose ``is_selected`` flag equals 1 are treated as positives and
    those whose flag equals 0 as negatives. When at least one positive and at
    least five negatives exist, the first positive and the first five
    negatives are returned together with the query. Otherwise every field of
    the returned dict is ``None`` so the record can be filtered out later.
    """
    passages = example['passages']
    selection_flags = passages["is_selected"]

    # Flags and texts are parallel lists, so a flag's position selects its text.
    positive_passages = [
        passages["passage_text"][position]
        for position, flag in enumerate(selection_flags)
        if flag == 1
    ]
    negative_passages = [
        passages["passage_text"][position]
        for position, flag in enumerate(selection_flags)
        if flag == 0
    ]

    # Guard clause: not enough material for a contrastive example.
    if len(positive_passages) == 0 or len(negative_passages) < 5:
        return {"query": None, "positive": None, "negatives": None}

    return {
        "query": example["query"],
        "positive": positive_passages[0],
        "negatives": negative_passages[:5],
    }

In [8]:
processed_data = dataset.map(preprocess, remove_columns=dataset.column_names)

In [9]:
processed_data = processed_data.filter(lambda x: x['query'] is not None and x['positive'] is not None)

In [10]:
# Re-key every processed record: the query becomes the "anchor", while the
# positive passage and the list of negatives are carried over unchanged.
contrastive_pairs = [
    {
        "anchor": item["query"],
        "positive": item["positive"],
        "negatives": item["negatives"],
    }
    for item in processed_data
]

In [11]:
len(contrastive_pairs)

74538

In [12]:
from torch.utils.data import DataLoader

In [13]:
class ContrastiveDataset:
    """Index-addressable view over a sequence of contrastive pairs.

    Every element of ``pairs`` is looked up with the keys ``"anchor"``,
    ``"positive"`` and ``"negatives"``.
    """

    def __init__(self, pairs):
        # Kept by reference; the sequence is not copied.
        self.pairs = pairs

    def __len__(self):
        """Return how many pairs are stored."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(anchor, positive, negatives)`` for the pair at ``idx``."""
        record = self.pairs[idx]
        return tuple(record[key] for key in ("anchor", "positive", "negatives"))

In [14]:
contrastive_dataset = ContrastiveDataset(contrastive_pairs[0:4])

In [15]:

data_loader = DataLoader(contrastive_dataset, batch_size=2, shuffle=True)

In [16]:
len(data_loader)

2

In [17]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [18]:
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
from peft import LoraConfig, get_peft_model

In [20]:
# Only the task type is set; every other LoRA setting is left at the library
# default. The model printout further down shows the resulting adapters
# (lora_A with out_features=8 on the attention query projection).
lora_config = LoraConfig(task_type="FEATURE_EXTRACTION")

In [21]:
lora_model = get_peft_model(model, lora_config)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [22]:
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 109,777,152 || trainable%: 0.2686


In [70]:
anchor_embedding = torch.tensor([
    [0.45, 0.15, -0.35],  # Similar to the first anchor
    [-0.25, 0.38, 0.85]   # Similar to the second anchor
])

In [71]:
anchor_embedding.shape

torch.Size([2, 3])

In [72]:
y = torch.tensor([
    [7.0, 8.0, 9.0],  # First vector
    [10.0, 11.0, 12.0] # Second vector
])

In [73]:
negative_embedding = [torch.tensor([
    [-0.1, 0.2, 0.9],  # Negatives for the first anchor
     [0.8, -0.5, 0.1]]),
     torch.tensor([[-0.3, -0.2, -0.4],
     [0.1, -0.1, 0.5]]),
    torch.tensor([[0.5, 0.6, -0.2],   # Negatives for the second anchor
     [0.7, 0.3, -0.9]]),
     torch.tensor([[-0.8, -0.6, 0.3],
     [-0.4, 0.4, 0.2]]),
     torch.tensor([[-0.9, -0.6, 0.3],
     [-0.4, 0.4, 0.1]])]

In [85]:
# Lay the toy embeddings out sample by sample: the anchor row, the matching
# row of `y` (standing in for the positive), then that sample's row taken from
# each tensor in `negative_embedding`.
batch_embeddings = []

for i in range(len(anchor_embedding)):
    sample_negatives = torch.stack([neg[i] for neg in negative_embedding])
    sample_embeddings = torch.cat(
        [anchor_embedding[i:i+1], y[i:i+1], sample_negatives],
        dim=0,
    )
    batch_embeddings.append(sample_embeddings)

# One flat (num_samples * rows_per_sample, dim) matrix, groups back to back.
all_embeddings = torch.cat(batch_embeddings, dim=0)

In [102]:
all_embeddings

tensor([[ 0.4500,  0.1500, -0.3500],
        [ 7.0000,  8.0000,  9.0000],
        [-0.1000,  0.2000,  0.9000],
        [-0.3000, -0.2000, -0.4000],
        [ 0.5000,  0.6000, -0.2000],
        [-0.8000, -0.6000,  0.3000],
        [-0.9000, -0.6000,  0.3000],
        [-0.2500,  0.3800,  0.8500],
        [10.0000, 11.0000, 12.0000],
        [ 0.8000, -0.5000,  0.1000],
        [ 0.1000, -0.1000,  0.5000],
        [ 0.7000,  0.3000, -0.9000],
        [-0.4000,  0.4000,  0.2000],
        [-0.4000,  0.4000,  0.1000]])

In [103]:
sim=cosine_similarity(all_embeddings, all_embeddings)

In [104]:
sim

array([[ 1.00000000e+00,  1.46151394e-01, -6.03652894e-01,
        -7.87523612e-02,  8.10077667e-01, -9.01783586e-01,
        -9.06752169e-01, -6.21149242e-01,  1.73145324e-01,
         4.47034836e-01, -4.73378688e-01,  9.71223056e-01,
        -5.37186265e-01, -4.57717270e-01],
       [ 1.46151394e-01,  9.99999940e-01,  6.96774721e-01,
        -9.73246753e-01,  5.78836381e-01, -5.29513001e-01,
        -5.37270665e-01,  6.65787041e-01,  9.99618590e-01,
         1.89198583e-01,  6.07952893e-01, -4.87171337e-02,
         2.63251334e-01,  1.62474632e-01],
       [-6.03652894e-01,  6.96774721e-01,  9.99999940e-01,
        -7.40889490e-01, -1.47125080e-01,  2.37555638e-01,
         2.30556175e-01,  9.68650997e-01,  6.77307069e-01,
        -1.02299146e-01,  8.71601939e-01, -7.49993026e-01,
         5.39163828e-01,  3.94196808e-01],
       [-7.87523612e-02, -9.73246753e-01, -7.40889490e-01,
         9.99999881e-01, -4.37620789e-01,  4.26873296e-01,
         4.46662486e-01, -6.56831563e-01, -9.

In [106]:
import numpy as np
# Graph Laplacian of the similarity matrix: row sums on the diagonal (the
# degree matrix) minus the similarities themselves.
row_degrees = np.sum(sim, axis=1)
degree_matrix = np.diag(row_degrees)
laplacian_matrix = degree_matrix - sim
laplacian_matrix.shape

(14, 14)

In [108]:

laplacian_pseudo_inv.shape

(14, 14)

In [109]:
len(sim)

14

In [130]:
harmonic_distances

[[-0.5, 0.0, -0.5, 0.5, -1.5, -1.5], [0.5, 0.0, 0.5, -0.5, 0.5, 0.5]]

In [134]:
torch.tensor(harmonic_distances)

tensor([[-0.5000,  0.0000, -0.5000,  0.5000, -1.5000, -1.5000],
        [ 0.5000,  0.0000,  0.5000, -0.5000,  0.5000,  0.5000]],
       dtype=torch.float64)

In [138]:
(-torch.tensor(harmonic_distances)).size(0)

2

In [56]:
embeddings.detach().cpu().numpy()

array([[[ 0.45,  0.15, -0.35],
        [-0.25,  0.38,  0.85]],

       [[ 7.  ,  8.  ,  9.  ],
        [10.  , 11.  , 12.  ]],

       [[-0.1 ,  0.2 ,  0.9 ],
        [ 0.8 , -0.5 ,  0.1 ]],

       [[-0.3 , -0.2 , -0.4 ],
        [ 0.1 , -0.1 ,  0.5 ]],

       [[ 0.5 ,  0.6 , -0.2 ],
        [ 0.7 ,  0.3 , -0.9 ]],

       [[-0.8 , -0.6 ,  0.3 ],
        [-0.4 ,  0.4 ,  0.2 ]],

       [[-0.9 , -0.6 ,  0.3 ],
        [-0.4 ,  0.4 ,  0.1 ]]], dtype=float32)

In [23]:
import torch

In [113]:
def cosine_distance(x, y):
    """Return ``1 - cosine_similarity(x, y)`` computed along the last dimension."""
    similarity = torch.nn.functional.cosine_similarity(x, y, dim=-1)
    return 1 - similarity

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Smoke-test a single batch: print the raw texts, then embed them and print
# the embeddings taken at sequence position 0.
encode_options = dict(return_tensors='pt', padding=True, truncation=True, max_length=512)

for batch in data_loader:
    anchor_texts, positive_texts, negative_texts = batch[0], batch[1], batch[2]
    for texts in (anchor_texts, positive_texts, negative_texts):
        print(texts)
        print(len(texts))

    anchor_inputs = tokenizer(anchor_texts, **encode_options).to(device)
    positive_inputs = tokenizer(positive_texts, **encode_options).to(device)

    anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
    print(anchor_embedding)
    print(anchor_embedding.shape)
    positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]

    # Each entry of negative_texts is tokenized and embedded on its own.
    negative_embedding = []
    for neg in negative_texts:
        neg_inputs = tokenizer(neg, **encode_options).to(device)
        negative_embedding.append(lora_model(**neg_inputs).last_hidden_state[:, 0, :])
    print(negative_embedding)
    print(len(negative_embedding))

    pos_dist = cosine_distance(anchor_embedding, positive_embedding)
    # Only the first batch is inspected.
    break

('how long do you need for sydney and surrounding areas', 'was ronald reagan a democrat')
[('Sydney, New South Wales, Australia is located in a coastal basin bordered by the Pacific Ocean to the east, the Blue Mountains to the west, the Hawkesbury River to the north and the Woronora Plateau to the south. The Sydney Statistical Division, used for census data, is the unofficial metropolitan area and covers 12,145 km² (4,689 mi²). This area includes the Central Coast and Blue Mountains as well as broad swathes of national park and other non-urban land.', 'In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Democratic candidates; however, his views grew more conservative over time, and in the early 1960s he officially became a Republican. In November 1984, Ronald Reagan was reelected in a landslide, defeating Walter Mondale and his running mate Geraldine Ferraro (1935-), the first female vice-presidential candidate from a major U.S. political party.'

In [50]:
contrastive_pairs[0]

{'anchor': 'what is rba',
 'positive': 'Results-Based Accountability® (also known as RBA) is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole. RBA is also used by organizations to improve the performance of their programs. Creating Community Impact with RBA. Community impact focuses on conditions of well-being for children, families and the community as a whole that a group of leaders is working collectively to improve. For example: “Residents with good jobs,” “Children ready for school,” or “A safe and clean neighborhood”.',
 'negatives': ["Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a 

In [133]:
def info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn, temperature=1.0):
    """InfoNCE-style contrastive loss built from a pairwise distance function.

    Distances are negated to form logits, the positive is placed in column 0,
    and cross-entropy against class 0 is returned, so the loss decreases as
    the anchor moves closer to its positive than to its negatives.

    Args:
        anchor_embedding: Tensor of anchor embeddings.
        positive_embedding: Tensor of positive embeddings, passed to
            ``distance_fn`` together with the anchors.
        negative_embedding: Iterable of tensors, one per negative; each is
            passed to ``distance_fn`` together with the anchors.
        distance_fn: Callable ``(anchor, other) -> distances``. Its result is
            unsqueezed at dim 1 for the positive and stacked on the last dim
            for the negatives, so it is expected to return one value per
            batch row.
        temperature: Positive scale the logits are divided by. The default
            of 1.0 leaves the logits unscaled.

    Returns:
        Scalar tensor with the mean cross-entropy over the batch.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    pos_dist = distance_fn(anchor_embedding, positive_embedding)
    neg_dist = torch.stack(
        [distance_fn(anchor_embedding, neg) for neg in negative_embedding],
        dim=-1,
    )

    # Smaller distance -> larger logit; column 0 is the positive.
    logits = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1) / temperature
    # The correct "class" for every row is therefore index 0.
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    return torch.nn.functional.cross_entropy(logits, labels)

In [115]:
# Distances from each anchor to its positive (`y`) and to every negative.
pos_dist = cosine_distance(anchor_embedding, y)
per_negative = [cosine_distance(anchor_embedding, neg) for neg in negative_embedding]
neg_dist = torch.stack(per_negative, dim=-1)

# Negated distances act as logits; column 0 belongs to the positive.
logits = torch.cat([(-pos_dist).unsqueeze(1), -neg_dist], dim=1)

In [117]:
logits.shape

torch.Size([2, 6])

In [134]:
import torch.optim as optim
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

In [53]:
num_epochs=2

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [55]:
print(lora_model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDi

In [56]:
processed_data[0]['positive']

'Results-Based Accountability® (also known as RBA) is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole. RBA is also used by organizations to improve the performance of their programs. Creating Community Impact with RBA. Community impact focuses on conditions of well-being for children, families and the community as a whole that a group of leaders is working collectively to improve. For example: “Residents with good jobs,” “Children ready for school,” or “A safe and clean neighborhood”.'

In [57]:
tokenizer(processed_data[0]['positive'], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)


{'input_ids': tensor([[  101,  3463,  1011,  2241, 17842, 29656,  1006,  2036,  2124,  2004,
         21144,  2050,  1007,  2003,  1037, 28675,  2126,  1997,  3241,  1998,
          2635,  2895,  2008,  4279,  2064,  2224,  2000,  5335,  1996,  3268,
          1997,  2336,  1010,  3360,  1010,  2945,  1010,  6001,  1998,  1996,
          2451,  2004,  1037,  2878,  1012, 21144,  2050,  2003,  2036,  2109,
          2011,  4411,  2000,  5335,  1996,  2836,  1997,  2037,  3454,  1012,
          4526,  2451,  4254,  2007, 21144,  2050,  1012,  2451,  4254,  7679,
          2006,  3785,  1997,  2092,  1011,  2108,  2005,  2336,  1010,  2945,
          1998,  1996,  2451,  2004,  1037,  2878,  2008,  1037,  2177,  1997,
          4177,  2003,  2551, 13643,  2000,  5335,  1012,  2005,  2742,  1024,
          1523,  3901,  2007,  2204,  5841,  1010,  1524,  1523,  2336,  3201,
          2005,  2082,  1010,  1524,  2030,  1523,  1037,  3647,  1998,  4550,
          5101,  1524,  1012,   102]])

In [122]:
def compute_laplacian(similarity_matrix):
    """Return the graph Laplacian ``D - S`` of a similarity matrix ``S``.

    ``D`` is the diagonal matrix whose entries are the row sums of ``S``.
    """
    row_sums = np.sum(similarity_matrix, axis=1)
    return np.diag(row_sums) - similarity_matrix

In [121]:
from sklearn.metrics.pairwise import cosine_similarity

In [126]:
def harmonic_distance(laplacian_matrix, group_size=7):
    """Distances from each group's anchor node to the other nodes of its group.

    Nodes are assumed to be laid out in consecutive groups of ``group_size``
    rows, the first node of each group being the anchor. For an anchor ``a``
    and a group member ``j`` the distance is the quadratic form
    ``(e_a - e_j)^T L^+ (e_a - e_j)``, where ``L^+`` is the Moore-Penrose
    pseudo-inverse of the Laplacian and ``e_k`` is the k-th unit vector.

    Args:
        laplacian_matrix: Square array-like Laplacian.
        group_size: Number of consecutive nodes per group, anchor included.
            The default of 7 matches one anchor, one positive and five
            negatives.

    Returns:
        A list with one entry per group; each entry is a list of
        ``group_size - 1`` distances, in node order.

    Raises:
        ValueError: If ``group_size`` is smaller than 2 or the number of nodes
            is not a multiple of ``group_size``.
    """
    laplacian = np.asarray(laplacian_matrix)
    num_nodes = len(laplacian)

    if group_size < 2:
        raise ValueError(f"group_size must be at least 2, got {group_size}")
    if num_nodes % group_size != 0:
        raise ValueError(
            f"number of nodes ({num_nodes}) is not a multiple of group_size ({group_size})"
        )

    # A Laplacian built from row sums is singular, so the pseudo-inverse has
    # to discard its (numerically) zero singular value. The cutoff must match
    # the precision the matrix was computed in: with float32 input a fixed
    # 1e-15 cutoff keeps that singular value and the resulting distances are
    # dominated by cancellation error.
    if np.issubdtype(laplacian.dtype, np.floating):
        input_eps = np.finfo(laplacian.dtype).eps
    else:
        input_eps = np.finfo(np.float64).eps
    laplacian_pseudo_inv = np.linalg.pinv(
        laplacian.astype(np.float64), rcond=num_nodes * input_eps
    )

    harmonic_distances = []
    for anchor_idx in range(0, num_nodes, group_size):
        distances = []
        for node_idx in range(anchor_idx + 1, anchor_idx + group_size):
            # e_anchor - e_node
            difference = np.zeros(num_nodes)
            difference[anchor_idx] = 1.0
            difference[node_idx] = -1.0
            distances.append(difference @ laplacian_pseudo_inv @ difference)
        harmonic_distances.append(distances)
    return harmonic_distances

In [127]:
def harmonic_loss(anchor_embedding, positive_embedding, negative_embedding):
    """Contrastive loss based on harmonic distances in a cosine-similarity graph.

    All embeddings of the batch become nodes of one graph, ordered sample by
    sample as anchor, positive, then that sample's negatives. Edge weights are
    pairwise cosine similarities. For every anchor the quadratic form
    ``(e_a - e_j)^T L^+ (e_a - e_j)`` is evaluated against the other members
    of its own group, with ``L^+`` the pseudo-inverse of the graph Laplacian.

    Everything is computed with torch operations rather than the NumPy-based
    ``compute_laplacian`` / ``harmonic_distance`` helpers, so the result stays
    attached to the autograd graph and ``loss.backward()`` works.

    NOTE(review): the previous body stopped after computing the distances and
    returned nothing. Reducing them to a scalar with cross-entropy over the
    negated distances (positive in column 0) mirrors ``info_nce_loss`` and
    the scratch cells that negate ``torch.tensor(harmonic_distances)``.
    Confirm this is the intended objective.

    Args:
        anchor_embedding: Tensor indexed by sample on dim 0.
        positive_embedding: Tensor indexed by sample on dim 0.
        negative_embedding: Sequence of tensors, one per negative slot, each
            indexed by sample on dim 0.

    Returns:
        Scalar tensor: mean cross-entropy over the batch.
    """
    # (batch, num_negatives, dim): negative k of sample i sits at [i, k].
    negatives = torch.stack(list(negative_embedding), dim=1)
    groups = torch.cat(
        [anchor_embedding.unsqueeze(1), positive_embedding.unsqueeze(1), negatives],
        dim=1,
    )
    batch_size, group_size = groups.size(0), groups.size(1)

    # Flatten to (batch * group_size, dim); each group stays contiguous.
    all_embeddings = groups.reshape(-1, groups.size(-1))

    # Cosine similarity matrix via unit-normalized rows.
    unit = torch.nn.functional.normalize(all_embeddings, dim=-1)
    sim = unit @ unit.T

    laplacian = torch.diag(sim.sum(dim=1)) - sim
    laplacian_pseudo_inv = torch.linalg.pinv(laplacian)

    # Row index of each anchor and of the remaining members of its group.
    anchor_idx = torch.arange(
        0, batch_size * group_size, group_size, device=groups.device
    ).unsqueeze(1)
    other_idx = anchor_idx + torch.arange(1, group_size, device=groups.device)

    # Expanded quadratic form: P[a, a] + P[j, j] - P[a, j] - P[j, a].
    diagonal = torch.diagonal(laplacian_pseudo_inv)
    distances = (
        diagonal[anchor_idx]
        + diagonal[other_idx]
        - laplacian_pseudo_inv[anchor_idx, other_idx]
        - laplacian_pseudo_inv[other_idx, anchor_idx]
    )

    # Smaller distance -> larger logit; the positive is column 0 of each row.
    logits = -distances
    labels = torch.zeros(batch_size, dtype=torch.long, device=logits.device)
    return torch.nn.functional.cross_entropy(logits, labels)

In [71]:
# The tokenized batches are moved to `device` below, so the model's parameters
# have to live there as well; otherwise the forward pass fails on a GPU machine.
lora_model.to(device)

for epoch in range(num_epochs):
    # Put the PEFT-wrapped model (the one actually called below) in training mode.
    lora_model.train()

    total_loss = 0.0
    for batch in data_loader:
        # Each batch carries the anchors, the positives and the negatives, in that order.
        anchor_texts = batch[0]
        positive_texts = batch[1]
        negative_texts = batch[2]

        anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        # The hidden state at sequence position 0 is used as the text embedding.
        anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
        positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]
        # Every entry of negative_texts is tokenized and embedded separately.
        negative_embedding = [lora_model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

        loss = harmonic_loss(anchor_embedding, positive_embedding, negative_embedding)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(data_loader)}")

Epoch 1/2, Loss: 1.802066683769226
Epoch 2/2, Loss: 1.7613139748573303
