In [1]:
from beir.datasets.data_loader import GenericDataLoader

  from tqdm.autonotebook import tqdm


In [2]:
data_path = '/dss/dsshome1/07/ra65bex2/srawat/scidocs/scidocs'

In [3]:
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

  0%|          | 0/25657 [00:00<?, ?it/s]

In [4]:
import random
import re
import nltk
from nltk.corpus import stopwords, wordnet
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
def clean_text(text):
    return re.sub(r'[^\w\s.!?]', '', text)
def select_words(text, percentage):
    words = [w for w in text.split() if w.lower() not in stop_words]
    return random.sample(words, min(int(percentage*len(words)), len(words)))
def introduce_typo(word):
    if len(word) > 1:
        idx = random.randint(0, len(word) - 1)
        return word[:idx] + random.choice('abcdefghijklmnopqrstuvwxyz') + word[idx+1:]
    return word
def introduce_noise(word):
    noise_chars = ['@', '#', '$', '%', '&', '*']
    if len(word) > 1:
        idx = random.randint(0, len(word) - 1)
        return word[:idx] + random.choice(noise_chars) + word[idx+1:]
    return word
def replace_with_synonym(word):
    synonyms = [syn.lemmas()[0].name() for syn in wordnet.synsets(word) if syn.lemmas()]
    return random.choice(synonyms) if synonyms else word
def corrupt_word(word, method):
    if method == 'typo':
        return introduce_typo(word)
    elif method == 'noise':
        return introduce_noise(word)
    elif method == 'synonym':
        return replace_with_synonym(word)
    return word
def corrupt_text(text):
    corrupted_words = []  
    words_to_corrupt=select_words(clean_text(text), percentage=0.5)
    for word in clean_text(text).split():
        if word in words_to_corrupt:
            corruption_method = random.choice(['typo', 'noise', 'synonym'])
            corrupted_words.append(corrupt_word(word, corruption_method))
        else:
            corrupted_words.append(word)
    return ' '.join(corrupted_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /dss/dsshome1/07/ra65bex2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /dss/dsshome1/07/ra65bex2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
contrastive_pairs=[]
query_lengths = []
import random
c=0
for query_id, relevant_docs in qrels.items():
    try:
        query_text = queries[query_id]
        query_lengths.append(len(query_text.split()))
        for doc_id in relevant_docs:
            positive = corpus[doc_id]["text"]
        #print(relevant_docs)
        positive_doc_ids = set(relevant_docs)
        all_doc_ids = set(corpus.keys())
        negative_doc_ids = all_doc_ids - positive_doc_ids
        negative_doc_ids=list(negative_doc_ids)
        negative_doc_samples = random.sample(negative_doc_ids, k=5)
        negatives=[]
        for neg_doc_id in negative_doc_samples:
            negative_doc_text = corpus[neg_doc_id]["text"]
            negatives.append(negative_doc_text)
        contrastive_pairs.append({
            "anchor": corrupt_text(query_text),
            "positive": positive,
            "negatives": negatives
        })
    except:
        c=c+1

In [6]:
c

0

In [7]:
len(contrastive_pairs)

1000

In [8]:
contrastive_pairs[0:2]

[{'anchor': 'A Direct Saarch Me$hod to solve Economic murder Problem with ValvePoint consequence',
  'positive': 'Many applications in speech, robotics, finance, and biology deal with sequential data, where ordering matters and recurrent structures are common. However, this structure cannot be easily captured by standard kernel functions. To model such structure, we propose expressive closed-form kernel functions for Gaussian processes. The resulting model, GP-LSTM, fully encapsulates the inductive biases of long short-term memory (LSTM) recurrent networks, while retaining the non-parametric probabilistic advantages of Gaussian processes. We learn the properties of the proposed kernels by optimizing the Gaussian process marginal likelihood using a new provably convergent semi-stochastic gradient procedure, and exploit the structure of these kernels for scalable training and prediction. This approach provides a practical representation for Bayesian LSTMs. We demonstrate state-of-the-art

In [9]:
len(qrels)

1000

In [10]:
from torch.utils.data import DataLoader

In [11]:
class ContrastiveDataset:
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        item = self.pairs[idx]
        return item["anchor"], item["positive"], item["negatives"]

In [12]:
contrastive_dataset = ContrastiveDataset(contrastive_pairs)

In [13]:
data_loader = DataLoader(contrastive_dataset, batch_size=32, shuffle=True)

In [14]:
len(data_loader)

32

In [15]:
import torch
file_path_baseline ="/dss/dsshome1/07/ra65bex2/srawat/contrastive_learning/v1.1/app_baseline/checkpoint_epoch_3.pth"
lora_model_baseline = torch.load(file_path_baseline)

  lora_model_baseline = torch.load(file_path_baseline)


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lora_model_baseline = lora_model_baseline.to(device)

In [17]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [18]:
def cosine_distance(x, y):
    return 1 - torch.nn.functional.cosine_similarity(x, y, dim=-1)

In [19]:
def evaluate_mrr_baseline(model, data_loader_val, distance_fn):
    model.eval()

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text = batch[0]
            positive_text = batch[1]
            negative_texts = batch[2]

            anchor_input = tokenizer(anchor_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            positive_input = tokenizer(positive_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

            anchor_embedding = model(**anchor_input).last_hidden_state[:, 0, :]
            positive_embedding = model(**positive_input).last_hidden_state[:, 0, :]
            negative_embedding = [model(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :] for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            all_similarities=torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)

            sorted_similarities, sorted_indices = torch.sort(all_similarities, dim=1, descending=True)

            # Find the rank of the first relevant (positive) document
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1  # +1 to make rank 1-based
            total_rr += torch.sum(1.0 / positive_rank.float()).item()  # Reciprocal rank
            num_queries += len(positive_rank)

    mrr = total_rr / num_queries
    return mrr

In [None]:
mrr_validation_baseline = evaluate_mrr_baseline(lora_model_baseline, data_loader, cosine_distance)
print(mrr_validation_baseline)

In [None]:
file_path_average="/dss/dsshome1/07/ra65bex2/srawat/contrastive_learning/v1.1/app_average/average_checkpoint_epoch_3.pth"
lora_model_average = torch.load(file_path_average)
lora_model_average = lora_model_average.to(device)

In [None]:
def avg_embedding(inputs, model):
    input=model(**inputs)
    input_last_hidden_state=input.last_hidden_state
    input_attention_mask = inputs['attention_mask']
    input_masked_embeddings = input_last_hidden_state * input_attention_mask.unsqueeze(-1)
    input_sum_embeddings = torch.sum(input_masked_embeddings, dim=1)
    input_token_counts = torch.sum(input_attention_mask, dim=1).unsqueeze(-1)
    input_avg_embeddings = input_sum_embeddings / input_token_counts
    return(input_avg_embeddings)

In [None]:
def evaluate_mrr_average(model, data_loader_val, distance_fn):
    model.eval()  

    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text = batch[0]
            positive_text = batch[1]
            negative_texts = batch[2]

            anchor_input = tokenizer(anchor_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            positive_input = tokenizer(positive_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

            anchor_embedding = avg_embedding(anchor_input, model)
            positive_embedding = avg_embedding(positive_input, model)
            negative_embedding = [avg_embedding(tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device), model) for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            all_similarities=torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)
            
            sorted_similarities, sorted_indices = torch.sort(all_similarities, dim=1, descending=True)

            # Find the rank of the first relevant (positive) document
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1  # +1 to make rank 1-based
            total_rr += torch.sum(1.0 / positive_rank.float()).item()  # Reciprocal rank
            num_queries += len(positive_rank)

    mrr = total_rr / num_queries
    return mrr

In [None]:
mrr_validation_average = evaluate_mrr_average(lora_model_average, data_loader, cosine_distance)
print(mrr_validation_average)

In [None]:
file_path_hyperbolic="/dss/dsshome1/07/ra65bex2/srawat/contrastive_learning/v1.1/0.1hyperbolic/hyperbolic_lora_checkpoint_epoch_3.pth"
lora_model_hyperbolic = torch.load(file_path_hyperbolic)
lora_model_hyperbolic = lora_model_hyperbolic.to(device)

In [None]:
def lorentzian_distance(x, y):
    
    dot_product = torch.sum(x * y, dim=-1)
    norm_x = torch.norm(x, dim=-1)
    norm_y = torch.norm(y, dim=-1)
    
    distance = torch.acosh(-dot_product + torch.sqrt((1 + norm_x**2) * (1 + norm_y**2)))
    return distance

In [None]:
def expm_o(v, c=1.0):
    c = torch.tensor(c)
    vspace = v
    vnorm = torch.norm(v, p=2, dim=-1, keepdim=True)
    xspace = torch.sinh(torch.sqrt(c) * vnorm) * vspace / (torch.sqrt(c) * vnorm)
    batch_min = xspace.min(dim=1, keepdim=True).values
    batch_max = xspace.max(dim=1, keepdim=True).values
    xspace_scaled=(xspace - batch_min) / (batch_max - batch_min)
    return xspace_scaled

In [None]:
def evaluate_mrr_hyperbolic(model1, data_loader_val, distance_fn):
    model1.eval()
    
    total_rr = 0.0
    num_queries = 0

    with torch.no_grad():
        for batch in data_loader_val:
            anchor_text = batch[0]
            positive_text = batch[1]
            negative_texts = batch[2]

            anchor_input = tokenizer(anchor_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            positive_input = tokenizer(positive_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

            anchor_embedding = expm_o(model1(**anchor_input).last_hidden_state[:, 0, :])
            positive_embedding = expm_o(model1(**positive_input).last_hidden_state[:, 0, :])
            negative_embedding = [expm_o(model1(**tokenizer(neg, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)).last_hidden_state[:, 0, :]) for neg in negative_texts]

            pos_dist = distance_fn(anchor_embedding, positive_embedding)
            neg_dist = torch.stack([distance_fn(anchor_embedding, neg) for neg in negative_embedding], dim=-1)
            all_similarities=torch.cat([-pos_dist.unsqueeze(1), -neg_dist], dim=1)

            sorted_similarities, sorted_indices = torch.sort(all_similarities, dim=1, descending=True)

            # Find the rank of the first relevant (positive) document
            positive_rank = (sorted_indices == 0).nonzero(as_tuple=True)[1] + 1  # +1 to make rank 1-based
            total_rr += torch.sum(1.0 / positive_rank.float()).item()  # Reciprocal rank
            num_queries += len(positive_rank)
            
    mrr = total_rr / num_queries
    return mrr

In [None]:
mrr_validation_hyperbolic = evaluate_mrr_hyperbolic(model1=lora_model_hyperbolic, data_loader_val=data_loader,distance_fn=lorentzian_distance)
print(mrr_validation_hyperbolic)

In [None]:
import statistics
print("Median Query Length:", statistics.median(query_lengths))

In [None]:
mrr_validation_baseline

In [None]:
mrr_validation_average

In [None]:
mrr_validation_hyperbolic