In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('ms_marco', 'v1.1')

In [3]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})

In [4]:
dataset = load_dataset('ms_marco', 'v1.1', split='train')

In [5]:
dataset

Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 82326
})

In [6]:
type(dataset)

datasets.arrow_dataset.Dataset

In [7]:
def preprocess(example, num_negatives=5):
    """Turn one MS MARCO record into a (query, positive, negatives) triple.

    Args:
        example: A record with a ``"query"`` entry and a ``"passages"`` mapping
            holding the parallel lists ``"is_selected"`` (1 for a selected
            passage, 0 otherwise) and ``"passage_text"``.
        num_negatives: How many unselected passages to keep. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        A dict with the keys ``"query"``, ``"positive"`` (the first selected
        passage) and ``"negatives"`` (the first ``num_negatives`` unselected
        passages). When the record has no selected passage, or fewer than
        ``num_negatives`` unselected ones, every value is ``None`` so the row
        can be filtered out afterwards.
    """
    passages = example["passages"]
    positive_passages = []
    negative_passages = []
    # Single pass over the parallel lists; flags other than 0/1 are ignored.
    for flag, text in zip(passages["is_selected"], passages["passage_text"]):
        if flag == 1:
            positive_passages.append(text)
        elif flag == 0:
            negative_passages.append(text)

    if positive_passages and len(negative_passages) >= num_negatives:
        return {
            "query": example["query"],
            "positive": positive_passages[0],
            "negatives": negative_passages[:num_negatives],
        }
    # Placeholder row: keeps the output schema stable for records that
    # cannot supply a full triple.
    return {"query": None, "positive": None, "negatives": None}

In [8]:
processed_data = dataset.map(preprocess, remove_columns=dataset.column_names)

In [9]:
# Drop the all-None placeholder rows that preprocess emits for queries lacking
# a selected passage or enough unselected passages.
processed_data = processed_data.filter(lambda x: x['query'] is not None and x['positive'] is not None)

In [10]:
# Materialise the filtered rows as plain dicts, renaming "query" to "anchor".
# A comprehension keeps per-row temporaries out of the notebook namespace.
contrastive_pairs = [
    {
        "anchor": item["query"],
        "positive": item["positive"],
        "negatives": item["negatives"],
    }
    for item in processed_data
]

In [11]:
len(contrastive_pairs)

74538

In [12]:
from torch.utils.data import DataLoader

In [13]:
class ContrastiveDataset:
    """Indexable view over contrastive records.

    Every record in ``pairs`` is a mapping with the keys ``"anchor"``,
    ``"positive"`` and ``"negatives"``; indexing returns those three values
    as a tuple in that order.
    """

    def __init__(self, pairs):
        # The sequence is stored by reference, not copied.
        self.pairs = pairs

    def __len__(self):
        """Return the number of stored records."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(anchor, positive, negatives)`` for the record at ``idx``."""
        record = self.pairs[idx]
        anchor, positive, negatives = (
            record[key] for key in ("anchor", "positive", "negatives")
        )
        return anchor, positive, negatives

In [14]:
# Smoke-test subset: only the first 4 pairs are wrapped, so anything built on
# this dataset sees just those 4 examples.
contrastive_dataset = ContrastiveDataset(contrastive_pairs[0:4])

In [15]:

data_loader = DataLoader(contrastive_dataset, batch_size=2, shuffle=True)

In [16]:
len(data_loader)

2

In [17]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [18]:
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
from peft import LoraConfig, get_peft_model

In [20]:
# Only the task type is given; every other LoRA setting stays at the library default.
lora_config = LoraConfig(task_type="FEATURE_EXTRACTION")

In [21]:
lora_model = get_peft_model(model, lora_config)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [22]:
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 109,777,152 || trainable%: 0.2686


In [23]:
import torch

In [24]:
def cosine_distance(x, y):
    """Return ``1 - cosine_similarity(x, y)`` taken along the last dimension."""
    similarity = torch.nn.functional.cosine_similarity(x, y, dim=-1)
    return 1 - similarity

In [25]:
x = torch.tensor([
    [1.0, 2.0, 3.0],  # First vector
    [4.0, 5.0, 6.0]   # Second vector
])

y = torch.tensor([
    [7.0, 8.0, 9.0],  # First vector
    [10.0, 11.0, 12.0] # Second vector
])

In [26]:
torch.nn.functional.cosine_similarity(x, y, dim=-1)

tensor([0.9594, 0.9961])

In [27]:
torch.nn.functional.cosine_similarity(x, y, dim=-2)

tensor([0.9339, 0.9693, 0.9839])

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
# Peek at a single batch to see how the DataLoader's default collation lays
# out the three fields returned by the dataset.
for batch in data_loader:
    anchor_texts = batch[0]
    positive_texts = batch[1]
    negative_texts = batch[2]
    for texts in (anchor_texts, positive_texts, negative_texts):
        print(texts)
        print(len(texts))
    # Only the first batch is inspected.
    break


('price to install tile in shower', 'was ronald reagan a democrat')
2
('1 Install ceramic tile floor to match shower-Average prices for installation are between $11 to $22 per square foot; 2  A light/fan combination-Averages at $180 and one hour of installation; 3  Insulate and re-finish ceilings and walls-Fiberglass wall insulation with R-30 value will cost $2.25 per square foot.', "From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (even more than, four years earlier, Liberal Republican Gerald Ford), supported former California governor (and former Democrat) Ronald Reagan.")
2
[('In regards to tile inst

In [105]:
anchor_embedding = torch.tensor([
    [0.45, 0.15, -0.35],  # Toy embedding of the first anchor
    [-0.25, 0.38, 0.85]   # Toy embedding of the second anchor
])

In [106]:
# Toy negatives: a list of 5 tensors, one per negative slot. Inside each
# tensor, row 0 belongs to the first anchor and row 1 to the second, so a
# row-wise distance against the anchor batch yields one value per anchor.
negative_embedding = [
    torch.tensor(rows)
    for rows in (
        [[-0.1, 0.2, 0.9], [0.8, -0.5, 0.1]],
        [[-0.3, -0.2, -0.4], [0.1, -0.1, 0.5]],
        [[0.5, 0.6, -0.2], [0.7, 0.3, -0.9]],
        [[-0.8, -0.6, 0.3], [-0.4, 0.4, 0.2]],
        [[-0.9, -0.6, 0.3], [-0.4, 0.4, 0.1]],
    )
]

In [107]:
len(negative_embedding)

5

In [108]:
[cosine_distance(anchor_embedding, neg) for neg in negative_embedding]

[tensor([1.6037, 1.3335]),
 tensor([1.0788, 0.2774]),
 tensor([0.1899, 1.7267]),
 tensor([1.9018, 0.2704]),
 tensor([1.9068, 0.3915])]

In [109]:
torch.stack([cosine_distance(anchor_embedding, neg) for neg in negative_embedding],dim=-1)

tensor([[1.6037, 1.0788, 0.1899, 1.9018, 1.9068],
        [1.3335, 0.2774, 1.7267, 0.2704, 0.3915]])

In [110]:
positive_embedding = torch.tensor([
    [0.85, 0.15, -0.35],  # Similar to the first anchor
    [-0.35, 0.38, 0.85]   # Similar to the second anchor
])

In [119]:
cosine_distance(anchor_embedding, positive_embedding).shape

torch.Size([2])

In [121]:
cosine_distance(anchor_embedding, positive_embedding).unsqueeze(1)

tensor([[0.0393],
        [0.0047]])

In [120]:
cosine_distance(anchor_embedding, positive_embedding).unsqueeze(1).shape

torch.Size([2, 1])

In [122]:
torch.stack([cosine_distance(anchor_embedding, neg) for neg in negative_embedding],dim=-1).shape

torch.Size([2, 5])

In [124]:
torch.cat([-cosine_distance(anchor_embedding, positive_embedding).unsqueeze(1), -torch.stack([cosine_distance(anchor_embedding, neg) for neg in negative_embedding],dim=-1)], dim=1)

tensor([[-0.0393, -1.6037, -1.0788, -0.1899, -1.9018, -1.9068],
        [-0.0047, -1.3335, -0.2774, -1.7267, -0.2704, -0.3915]])

In [130]:
torch.cat([-cosine_distance(anchor_embedding, positive_embedding).unsqueeze(1), -torch.stack([cosine_distance(anchor_embedding, neg) for neg in negative_embedding],dim=-1)], dim=1)

tensor([[-0.0393, -1.6037, -1.0788, -0.1899, -1.9018, -1.9068],
        [-0.0047, -1.3335, -0.2774, -1.7267, -0.2704, -0.3915]])

In [131]:
torch.zeros(2, dtype=torch.long)

tensor([0, 0])

In [132]:
torch.nn.CrossEntropyLoss()(torch.cat([-cosine_distance(anchor_embedding, positive_embedding).unsqueeze(1), -torch.stack([cosine_distance(anchor_embedding, neg) for neg in negative_embedding],dim=-1)], dim=1), torch.zeros(2, dtype=torch.long))

tensor(1.1501)

In [133]:
def info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn, temperature=1.0):
    """InfoNCE loss with the positive in class slot 0.

    Negated distances are used as logits, so a smaller distance means a
    higher score. With a bounded distance (cosine distance lies in [0, 2])
    and ``temperature=1`` the logits span at most 2 units, which keeps the
    loss close to ``log(1 + number_of_negatives)``; a smaller temperature
    widens that range.

    Args:
        anchor_embedding: Batch of anchor embeddings.
        positive_embedding: Embeddings paired row-by-row with the anchors.
        negative_embedding: Non-empty iterable of tensors, one per negative
            slot, each paired row-by-row with the anchors.
        distance_fn: Callable ``(a, b)`` returning a 1-D tensor holding one
            distance per batch row.
        temperature: Positive divisor applied to the logits. The default of
            1.0 leaves the logits unscaled.

    Returns:
        Scalar tensor: mean cross-entropy over the batch.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    pos_dist = distance_fn(anchor_embedding, positive_embedding)
    neg_dist = torch.stack(
        [distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1
    )

    # Column 0 holds the positive, columns 1.. hold the negatives.
    logits = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1) / temperature
    # The correct "class" for every row is therefore index 0.
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    return torch.nn.functional.cross_entropy(logits, labels)

In [134]:
import torch.optim as optim
# Give AdamW only the parameters that require gradients; the remaining
# weights are frozen and never receive an update.
trainable_params = [p for p in lora_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-5)

In [53]:
num_epochs=2

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [55]:
print(lora_model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDi

In [56]:
processed_data[0]['positive']

'Results-Based Accountability® (also known as RBA) is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole. RBA is also used by organizations to improve the performance of their programs. Creating Community Impact with RBA. Community impact focuses on conditions of well-being for children, families and the community as a whole that a group of leaders is working collectively to improve. For example: “Residents with good jobs,” “Children ready for school,” or “A safe and clean neighborhood”.'

In [57]:
tokenizer(processed_data[0]['positive'], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)


{'input_ids': tensor([[  101,  3463,  1011,  2241, 17842, 29656,  1006,  2036,  2124,  2004,
         21144,  2050,  1007,  2003,  1037, 28675,  2126,  1997,  3241,  1998,
          2635,  2895,  2008,  4279,  2064,  2224,  2000,  5335,  1996,  3268,
          1997,  2336,  1010,  3360,  1010,  2945,  1010,  6001,  1998,  1996,
          2451,  2004,  1037,  2878,  1012, 21144,  2050,  2003,  2036,  2109,
          2011,  4411,  2000,  5335,  1996,  2836,  1997,  2037,  3454,  1012,
          4526,  2451,  4254,  2007, 21144,  2050,  1012,  2451,  4254,  7679,
          2006,  3785,  1997,  2092,  1011,  2108,  2005,  2336,  1010,  2945,
          1998,  1996,  2451,  2004,  1037,  2878,  2008,  1037,  2177,  1997,
          4177,  2003,  2551, 13643,  2000,  5335,  1012,  2005,  2742,  1024,
          1523,  3901,  2007,  2204,  5841,  1010,  1524,  1523,  2336,  3201,
          2005,  2082,  1010,  1524,  2030,  1523,  1037,  3647,  1998,  4550,
          5101,  1524,  1012,   102]])

In [71]:
def _embed_cls(texts):
    """Tokenize ``texts`` and return the first-token hidden state of each sequence."""
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    return lora_model(**inputs).last_hidden_state[:, 0, :]


# The tokenized inputs are moved to `device`, so the model has to live there
# as well; otherwise a CUDA run fails with a device mismatch.
lora_model.to(device)

for epoch in range(num_epochs):
    # Switch the wrapper that performs the forward passes into training mode.
    lora_model.train()

    total_loss = 0.0
    for batch in data_loader:
        anchor_texts = batch[0]
        positive_texts = batch[1]
        negative_texts = batch[2]

        anchor_embedding = _embed_cls(anchor_texts)
        positive_embedding = _embed_cls(positive_texts)
        # One forward pass per entry of negative_texts.
        negative_embedding = [_embed_cls(neg) for neg in negative_texts]

        loss = info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn=cosine_distance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(data_loader)}")

Epoch 1/2, Loss: 1.802066683769226
Epoch 2/2, Loss: 1.7613139748573303
