In [2]:
from datasets import load_dataset
import random
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dataset("dbpedia_14")

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 70000
    })
})

In [4]:
dataset_train = load_dataset("dbpedia_14", split="train")
dataset_train

Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 560000
})

In [5]:
dataset_train[0]

{'label': 0,
 'title': 'E. D. Abbott Ltd',
 'content': ' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.'}

In [6]:
# Category names for the integer "label" field, listed in label order:
# the position of a name in the tuple is the label id it belongs to (0-13).
LABEL_TO_CATEGORY = dict(enumerate((
    "Company",
    "Educational Institution",
    "Artist",
    "Athlete",
    "Office Holder",
    "Mean Of Transportation",
    "Building",
    "Natural Place",
    "Village",
    "Animal",
    "Plant",
    "Album",
    "Film",
    "Written Work",
)))

In [None]:
# Group the "content" text of every training row under its category name.
# Buckets are created on first sight of a category and keep dataset order.
category_to_samples = {}
for sample in dataset_train:
    category = LABEL_TO_CATEGORY[sample["label"]]
    category_to_samples.setdefault(category, []).append(sample["content"])

In [8]:
# For each category, the negative pool is every description that belongs to
# any *other* category, flattened into a single list.
category_negatives = {
    target: [
        text
        for source, texts in category_to_samples.items()
        if source != target
        for text in texts
    ]
    for target in LABEL_TO_CATEGORY.values()
}

In [None]:
def preprocess(example, num_negatives=5):
    """Turn one labelled row into a (query, positive, negatives) example.

    Reads the module-level ``LABEL_TO_CATEGORY`` and ``category_negatives``.

    Args:
        example: Mapping with an integer ``"label"`` and a text ``"content"``.
        num_negatives: How many negative descriptions to attach.

    Returns:
        A dict with keys ``"query"``, ``"positive"`` and ``"negatives"``
        (a list of ``num_negatives`` texts), or ``None`` when the label has
        no category name or the category has no negative pool.
    """
    category = LABEL_TO_CATEGORY.get(example["label"])
    if category is None or category not in category_negatives:
        return None

    pool = category_negatives[category]
    if len(pool) >= num_negatives:
        # Sample without replacement so one example never carries the same
        # pool entry twice (random.choices could repeat an entry).
        negatives = random.sample(pool, k=num_negatives)
    else:
        # Pool too small for distinct picks: fall back to sampling with
        # replacement so the list still has exactly num_negatives items.
        negatives = random.choices(pool, k=num_negatives)

    return {
        "query": f"Tell me about {category.lower()}.",
        "positive": example["content"],
        "negatives": negatives,
    }

In [11]:
# Apply preprocess to every training row; all original columns of
# dataset_train are removed, leaving only what preprocess returns.
processed_data_train = dataset_train.map(preprocess, remove_columns=dataset_train.column_names)

Map: 100%|██████████| 560000/560000 [03:02<00:00, 3061.17 examples/s]


In [14]:
# Keep only rows that have both a query and a positive text.
# NOTE(review): in the recorded run the row count is 560000 before and after,
# so this filter removed nothing.
processed_data_train = processed_data_train.filter(lambda x: x['query'] is not None and x['positive'] is not None)

Filter: 100%|██████████| 560000/560000 [00:53<00:00, 10402.70 examples/s]


In [15]:
processed_data_train

Dataset({
    features: ['query', 'positive', 'negatives'],
    num_rows: 560000
})

In [16]:
processed_data_train[0]

{'query': 'Tell me about company.',
 'positive': ' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.',
 'negatives': [" Bowman's Castle also known as Nemacolin Castle was built in present-day Brownsville Pennsylvania at the western terminus of the Nemacolin's Trail on the east bank of the Monongahela river. It replaced a wooden trading post built near the site of Fort Burd the latter built by British colonists during the French and Indian War. Construction on the castle including addition of a crenellated tower continued through the Victorian era when it was considered an engineering marvel.",
  ' RFA Olwen (A122) was an Ol-class fast fleet tanker of the Royal Fleet Auxiliary.The lead ship of her class and launched in 1964 as RFA Olynthus the second ship to bear this name she was renamed Olw

In [48]:
# Materialise the processed training rows as plain dicts, renaming the
# "query" field to "anchor" (the key ContrastiveDataset reads).
contrastive_pairs_train = [
    {
        "anchor": row["query"],
        "positive": row["positive"],
        "negatives": row["negatives"],
    }
    for row in processed_data_train
]

KeyboardInterrupt: 

In [21]:
contrastive_pairs_train[500000]

{'anchor': 'Tell me about film.',
 'positive': ' Rojo Amanecer (Red Dawn) is a 1989 Silver Ariel Award-winning Mexican film directed by Jorge Fons. It is a film about the Tlatelolco Massacre in the section of Tlatelolco in Mexico City in the evening of October 2 1968. It focuses on the day of a middle-class Mexican family living in one of the apartment buildings surrounding the Plaza de Tlatelolco and is based on testimonials from witnesses and victims. It stars Héctor Bonilla María Rojo the Bichir Brothers Eduardo Palomo and others.',
 'negatives': [" Annuska Johanna Maria 'Anouska' van der Zee (born April 5 1976 in Utrecht) is a retired Dutch racing cyclist. She participated both on track and at the road.Van der Zee represented the Netherlands at the 2004 Summer Olympics in Athens where she took part in the road race but did not reach the finish. After the Olympics she ended her career.",
  ' Mirrors is the second studio album from German singer Sandra released in 1986.',
  " La Cuca

In [22]:
len(contrastive_pairs_train)

560000

In [23]:
from torch.utils.data import DataLoader

In [24]:
class ContrastiveDataset:
    """Indexable view over a sequence of contrastive-pair dicts.

    Each element of ``pairs`` must provide the keys ``"anchor"``,
    ``"positive"`` and ``"negatives"``; indexing yields them as a 3-tuple.
    """

    def __init__(self, pairs):
        # Stored by reference; the sequence is not copied.
        self.pairs = pairs

    def __len__(self):
        """Number of pairs available."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(anchor, positive, negatives)`` for the pair at ``idx``."""
        record = self.pairs[idx]
        anchor, positive, negatives = (
            record[key] for key in ("anchor", "positive", "negatives")
        )
        return anchor, positive, negatives

In [25]:
# NOTE(review): only the first 10 training pairs are wrapped here, so the
# loader built from this dataset sees 10 examples, not the full 560000.
# Presumably a smoke-test subset - confirm before a real training run.
contrastive_dataset_train = ContrastiveDataset(contrastive_pairs_train[0:10])

In [26]:
data_loader_train = DataLoader(contrastive_dataset_train, batch_size=5, shuffle=True)

In [27]:
len(data_loader_train)

2

In [28]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [29]:
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
from peft import LoraConfig, get_peft_model

In [31]:
# LoRA adapter configuration. Only task_type is set explicitly; every other
# option is left at whatever LoraConfig defaults to.
lora_config = LoraConfig(
    task_type= "FEATURE_EXTRACTION"
)

In [32]:
lora_model = get_peft_model(model, lora_config)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [33]:
import torch

In [34]:
def cosine_distance(x, y):
    """Return ``1 - cosine_similarity(x, y)`` taken over the last dimension."""
    similarity = torch.nn.functional.cosine_similarity(x, y, dim=-1)
    return 1 - similarity

In [35]:
def info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn, temperature=1.0):
    """InfoNCE loss: classify the positive against a set of negatives.

    Negated distances are used as logits; column 0 holds the positive and the
    remaining columns hold the negatives, so the target class is always 0.

    Args:
        anchor_embedding: Batch of anchor embeddings.
        positive_embedding: Batch of positive embeddings, row-aligned with
            the anchors.
        negative_embedding: Iterable of embedding batches, one per negative
            slot, each row-aligned with the anchors.
        distance_fn: Callable ``(x, y) -> distances`` returning one distance
            per row (a 1-D tensor for 2-D inputs).
        temperature: Positive scale the logits are divided by. The default of
            1.0 reproduces the unscaled loss; smaller values sharpen the
            softmax.

    Returns:
        Scalar tensor: mean cross-entropy over the batch.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    pos_dist = distance_fn(anchor_embedding, positive_embedding)
    # One column per negative slot.
    neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)

    # Smaller distance => larger logit.
    logits = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1) / temperature
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    return torch.nn.functional.cross_entropy(logits, labels)

In [36]:
import torch.optim as optim
# AdamW over everything lora_model.parameters() yields, learning rate 5e-5.
# NOTE(review): presumably only the LoRA adapter weights require gradients
# here - confirm, since all parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

In [37]:
num_epochs=3

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [39]:
device

device(type='cpu')

In [40]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.0.1+cpu
False


In [41]:
torch.cuda.is_available()

False

In [42]:
lora_model = lora_model.to(device)

In [43]:
dataset_val = load_dataset("dbpedia_14", split="test")
dataset_val

Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 70000
})

In [None]:
# Group the "content" text of every validation row under its category name.
# Buckets are created on first sight of a category and keep dataset order.
category_to_samples_val = {}
for sample in dataset_val:
    category = LABEL_TO_CATEGORY[sample["label"]]
    category_to_samples_val.setdefault(category, []).append(sample["content"])

In [45]:
# For each category, the validation negative pool is every validation
# description that belongs to any *other* category, flattened into one list.
category_negatives_val = {
    target: [
        text
        for source, texts in category_to_samples_val.items()
        if source != target
        for text in texts
    ]
    for target in LABEL_TO_CATEGORY.values()
}

In [None]:
def preprocess_val(example, num_negatives=5):
    """Turn one labelled validation row into a (query, positive, negatives) example.

    Reads the module-level ``LABEL_TO_CATEGORY`` and ``category_negatives_val``.

    Args:
        example: Mapping with an integer ``"label"`` and a text ``"content"``.
        num_negatives: How many negative descriptions to attach.

    Returns:
        A dict with keys ``"query"``, ``"positive"`` and ``"negatives"``
        (a list of ``num_negatives`` texts), or ``None`` when the label has
        no category name or the category has no negative pool.
    """
    category = LABEL_TO_CATEGORY.get(example["label"])
    if category is None or category not in category_negatives_val:
        return None

    pool = category_negatives_val[category]
    if len(pool) >= num_negatives:
        # Sample without replacement so one example never carries the same
        # pool entry twice (random.choices could repeat an entry).
        negatives = random.sample(pool, k=num_negatives)
    else:
        # Pool too small for distinct picks: fall back to sampling with
        # replacement so the list still has exactly num_negatives items.
        negatives = random.choices(pool, k=num_negatives)

    return {
        "query": f"Tell me about {category.lower()}.",
        "positive": example["content"],
        "negatives": negatives,
    }

In [47]:
# Apply preprocess_val to every validation row; all original columns of
# dataset_val are removed, leaving only what preprocess_val returns.
processed_data_val = dataset_val.map(preprocess_val, remove_columns=dataset_val.column_names)

Map: 100%|██████████| 70000/70000 [01:33<00:00, 750.35 examples/s] 


In [49]:
# Keep only validation rows that have both a query and a positive text.
processed_data_val = processed_data_val.filter(lambda x: x['query'] is not None and x['positive'] is not None)

Filter: 100%|██████████| 70000/70000 [00:08<00:00, 7896.43 examples/s] 


In [50]:
# Materialise the processed validation rows as plain dicts, renaming the
# "query" field to "anchor" (the key ContrastiveDataset reads).
contrastive_pairs_val = [
    {
        "anchor": row["query"],
        "positive": row["positive"],
        "negatives": row["negatives"],
    }
    for row in processed_data_val
]

In [51]:
# NOTE(review): only the first 10 validation pairs are wrapped here, so any
# metric computed from this dataset is based on 10 examples. Presumably a
# smoke-test subset - confirm before reporting results.
contrastive_dataset_val = ContrastiveDataset(contrastive_pairs_val[0:10])

In [52]:
data_loader_val = DataLoader(contrastive_dataset_val, batch_size=5, shuffle=True)

In [53]:
len(data_loader_val)

2

In [54]:
def evaluate_mrr(model, data_loader_val, distance_fn):
    """Compute the mean reciprocal rank of the positive among its negatives.

    For every query the positive and all negatives are scored by negated
    distance to the anchor; the positive's rank is one plus the number of
    negatives that score strictly higher, so ties resolve in favour of the
    positive and the result does not depend on sort order.

    Reads the module-level ``tokenizer`` and ``device``. Puts ``model`` into
    eval mode and leaves it there.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``; the vector
            at sequence position 0 is used as the embedding.
        data_loader_val: Iterable of batches indexable as
            ``(anchor_texts, positive_texts, negative_texts)``, where
            ``negative_texts`` is an iterable of text batches, one per
            negative slot.
        distance_fn: Callable ``(x, y) -> distances`` returning one distance
            per row.

    Returns:
        The MRR as a float, or ``0.0`` when the loader yields no queries.
    """
    model.eval()

    def _embed(texts):
        # Tokenise a batch of texts and take the position-0 hidden state.
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        return model(**inputs).last_hidden_state[:, 0, :]

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text = batch[0]
            positive_text = batch[1]
            negative_texts = batch[2]

            anchor_embedding = _embed(anchor_text)
            positive_embedding = _embed(positive_text)
            negative_embedding = [_embed(neg) for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            # Column 0 is the positive, remaining columns are the negatives.
            all_similarities = torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)

            # 1-based rank of the positive: count negatives that beat it.
            beaten_by = (all_similarities[:, 1:] > all_similarities[:, :1]).sum(dim=1)
            positive_rank = beaten_by + 1

            total_rr += torch.sum(1.0 / positive_rank.float()).item()
            num_queries += positive_rank.numel()

    if num_queries == 0:
        # Empty loader: report 0.0 rather than dividing by zero.
        return 0.0
    return total_rr / num_queries

In [55]:
import os
# Directory that checkpoints and epoch_metrics.json are written to.
# NOTE(review): hard-coded absolute path into one user's home directory;
# consider making this configurable.
save_dir ="/dss/dsshome1/07/ra65bex2/srawat"

In [56]:
import time
epoch_metrics = []

In [None]:
# Training loop: one pass over data_loader_train per epoch, then checkpoint,
# validate with MRR, and record the epoch's metrics in epoch_metrics.
for epoch in range(num_epochs):
    start_time = time.time()
    lora_model.train()  # evaluate_mrr leaves the model in eval mode

    total_loss = 0.0
    for batch in data_loader_train:
        # Each batch is (anchors, positives, negatives).
        anchor_texts = batch[0]
        positive_texts = batch[1]
        negative_texts = batch[2]

        anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        # The position-0 hidden state is used as the sequence embedding.
        anchor_embedding = lora_model(**anchor_inputs).last_hidden_state[:, 0, :]
        positive_embedding = lora_model(**positive_inputs).last_hidden_state[:, 0, :]
        # Each element of negative_texts is tokenised and encoded as its own batch.
        negative_embedding = [lora_model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

        loss = info_nce_loss(anchor_embedding, positive_embedding, negative_embedding, distance_fn=cosine_distance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss, computed once and reused for printing and logging.
    avg_loss = total_loss / len(data_loader_train)

    # Make sure the target directory exists so a missing folder cannot make
    # the save fail after a full epoch of training.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}.pth")
    # NOTE(review): this pickles the whole model object; loading it requires
    # the same class definitions to be importable. Saving only the weights
    # would be more portable but changes the checkpoint format.
    torch.save(lora_model, save_path)
    print(f"EPOCH {epoch+1}:")
    print(f"Checkpoint saved: {save_path}")
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss}")
    mrr_validation = evaluate_mrr(lora_model, data_loader_val, cosine_distance)
    #mrr_train = evaluate_mrr(lora_model, data_loader_train, cosine_distance)
    #print(f"Mean Reciprocal Rank (MRR) for training set: {mrr_train:.4f}")
    print(f"Mean Reciprocal Rank (MRR) for validation set: {mrr_validation:.4f}")
    end_time = time.time()
    # Elapsed time covers training, checkpointing and validation.
    elapsed_minutes = (end_time - start_time) / 60
    print(f"Epoch {epoch+1} took {elapsed_minutes:.4f} minutes.")
    print("\n")
    epoch_metrics.append({
        'epoch': epoch + 1,
        'training_loss': avg_loss,
        'mrr_validation': mrr_validation,
        'time_taken_minutes': elapsed_minutes
    })

In [None]:
import json

In [None]:
# Persist the per-epoch metrics list as JSON next to the checkpoints.
metrics_path = save_dir + '/epoch_metrics.json'
with open(metrics_path, 'w') as metrics_file:
    json.dump(epoch_metrics, metrics_file)