In [5]:
!pip install transformers datasets torch



In [6]:
from datasets import load_dataset

dataset = load_dataset("ms_marco", "v1.1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
from transformers import DPRQuestionEncoder, DPRContextEncoder
from transformers import DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer

# question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

# question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# context_encoder_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

base_path = "/content/drive/My Drive/Colab Notebooks/DPR/"


In [10]:
def saveModels(question_encoder, context_encoder, base_path = "/content/drive/My Drive/Colab Notebooks/DPR/"):
  question_encoder.save_pretrained(f"{base_path}question_encoder")
  context_encoder.save_pretrained(f"{base_path}context_encoder")

# saveModels(question_encoder, context_encoder)

In [11]:
def saveTokenizers(question_encoder_tokenizer, context_encoder_tokenizer, base_path = "/content/drive/My Drive/Colab Notebooks/DPR/"):
  question_encoder_tokenizer.save_pretrained(f"{base_path}question_encoder_tokenizer")
  context_encoder_tokenizer.save_pretrained(f"{base_path}context_encoder_tokenizer")
# saveTokenizers(question_encoder_tokenizer, context_encoder_tokenizer)

In [12]:
def getEncoders(base_path = "/content/drive/My Drive/Colab Notebooks/DPR/"):
  question_encoder = DPRQuestionEncoder.from_pretrained(f"{base_path}question_encoder")
  context_encoder = DPRContextEncoder.from_pretrained(f"{base_path}context_encoder")
  return question_encoder, context_encoder

In [13]:
def getTokenizers(base_path = "/content/drive/My Drive/Colab Notebooks/DPR/"):
  question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(f"{base_path}question_encoder_tokenizer")
  context_encoder_tokenizer = DPRContextEncoderTokenizer.from_pretrained(f"{base_path}context_encoder_tokenizer")
  return question_encoder_tokenizer, context_encoder_tokenizer

In [14]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_paraphrase_response(input_text,num_return_sequences,num_beams):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering
import random


def generate_qa_pairs(passages):
    # Initialize tokenizer and model for question generation
    qg_tokenizer = AutoTokenizer.from_pretrained('mrm8488/t5-base-finetuned-question-generation-ap')
    qg_model = AutoModelForSeq2SeqLM.from_pretrained('mrm8488/t5-base-finetuned-question-generation-ap')

    # Initialize tokenizer and model for question answering
    qa_tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')
    qa_model = AutoModelForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')

    results = []

    for passage in passages:
        # Generate a question from the passage
        input_text = "generate question: " + passage
        inputs = qg_tokenizer.encode_plus(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        outputs = qg_model.generate(
            inputs['input_ids'],
            max_length=64,
            num_beams=5,
            early_stopping=True
        )
        question = qg_tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Prepare the model input for answering the question using the passage
        answer_input = f"context: {passage} question: {question}"
        answer_inputs = qa_tokenizer(answer_input, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        answer_outputs = qa_model(**answer_inputs)
        start_positions = torch.argmax(answer_outputs.start_logits)
        end_positions = torch.argmax(answer_outputs.end_logits) + 1
        answer = qa_tokenizer.decode(answer_inputs['input_ids'][0][start_positions:end_positions])

        results.append({
            'passage': passage,
            'question': question,
            'answer': answer
        })

    return results

def get_random_passage(passages):
    # Select a random index from the array of passages
    index = random.randint(0, len(passages) - 1)
    # Retrieve the passage at that index
    passage = passages[index]
    return index, passage

def createNewDataset(datasetToModify):
  data_to_generate = [dict(item) for item in datasetToModify]
  for item in data_to_generate:
    passages = item['passages']['passage_text']
    is_selected = [0] * len(passages)

    index, selected_passage = get_random_passage(passages)
    is_selected[index] = 1
    qa_pair = generate_qa_pairs([selected_passage])

    item['passages']['is_selected'] = is_selected
    item['query'] = qa_pair[0]['question']
    item['generated_answer'] = qa_pair[0]['answer']

  return data_to_generate


In [16]:
import concurrent.futures
import numpy as np


def preprocess_data(data):
    dpr_data = []
    for item in data['train']:
        query = item['query']
        passages = item['passages']['passage_text']
        is_selected = item['passages']['is_selected']

        for passage, selected in zip(passages, is_selected):
            if selected:  # This passage is positive
                dpr_data.append({'query': query, 'passage': get_paraphrase_response(passage,1,10)[0], 'label': 1})
            else:  # This passage is negative
                dpr_data.append({'query': query, 'passage': get_paraphrase_response(passage,1,10)[0], 'label': 0})
    return dpr_data

In [17]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_response_for_passage(passage, selected):
    """Function to handle the get_response call for a single passage."""
    try:
        # Assuming get_response returns a list and you want the first element
        response = get_paraphrase_response(passage, 1, 10)[0]
        return {'passage': response, 'label': int(selected)}
    except Exception as exc:
        print(f'Error processing passage {passage}: {exc}')
        return None

def process_item(item):
    """Function to process a single item with concurrent passage processing."""
    query = item['query']
    passages = item['passages']['passage_text']
    is_selected = item['passages']['is_selected']
    item_data = []

    with ThreadPoolExecutor(max_workers=100) as passage_executor:
        # Create a future for each passage
        passage_futures = [passage_executor.submit(get_response_for_passage, passage, selected)
                           for passage, selected in zip(passages, is_selected)]

        # As each future completes, append its result to item_data
        for future in as_completed(passage_futures):
            result = future.result()
            if result:  # Check to make sure an error didn't occur
                item_data.append({'query': query, **result})

    return item_data

def preprocess_for_dpr_concurrently(data):
    """Concurrent preprocessing for each item and its passages."""
    dpr_data = []
    with ThreadPoolExecutor(max_workers=100) as item_executor:
        # Submit all items for processing
        item_futures = [item_executor.submit(process_item, item) for item in data['train']]

        # Wait for all futures to complete and collect their results
        for future in as_completed(item_futures):
            try:
                item_results = future.result()
                dpr_data.extend(item_results)
            except Exception as exc:
                print(f'An item failed to process: {exc}')
                # Handle or log the error as appropriate

    return dpr_data


In [None]:
# print(preprocess_for_dpr_concurrent({'train':dataset['train'].select(range(10))}))

In [18]:
num_beams = 10
num_return_sequences = 1
context = "The ultimate test of your knowledge is your capacity to convey it to another."
print(get_paraphrase_response(context,num_return_sequences,num_beams))



['The test of your knowledge is your ability to convey it.']


In [19]:
from torch.utils.data import DataLoader, Dataset
import torch

class DPRDataset(Dataset):
    def __init__(self, queries, passages, labels, query_tokenizer, context_tokenizer, max_length=256):
        self.queries = queries
        self.passages = passages
        self.labels = labels
        self.query_tokenizer = query_tokenizer
        self.context_tokenizer = context_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        query_encodings = self.query_tokenizer(self.queries[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        passage_encodings = self.context_tokenizer(self.passages[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        return {
            'query_input_ids': query_encodings['input_ids'].squeeze(0),
            'query_attention_mask': query_encodings['attention_mask'].squeeze(0),
            'passage_input_ids': passage_encodings['input_ids'].squeeze(0),
            'passage_attention_mask': passage_encodings['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


def getDataLoader(data, question_encoder_tokenizer, context_encoder_tokenizer):
  queries = [item['query'] for item in preprocessed_data]
  passages = [item['passage'] for item in preprocessed_data]
  labels = [item['label'] for item in preprocessed_data]

  DPRdataset = DPRDataset(queries, passages, labels, question_encoder_tokenizer, context_encoder_tokenizer)
  DPRdataloader = DataLoader(DPRdataset, batch_size=16, shuffle=True)
  return DPRdataloader


In [20]:
import torch
from torch import nn
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import DPRQuestionEncoder, DPRContextEncoder
from transformers import DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer


def trainModel(data,  question_encoder, context_encoder,  question_encoder_tokenizer, context_encoder_tokenizer, num_epochs=1):
  DPRdataloader = getDataLoader(data, question_encoder_tokenizer, context_encoder_tokenizer)
  optimizer_qe = AdamW(question_encoder.parameters(), lr=1e-6)
  optimizer_ce = AdamW(context_encoder.parameters(), lr=1e-6)

  # Scheduler (optional)
  total_steps = len(DPRdataloader) * num_epochs
  scheduler_qe = get_linear_schedule_with_warmup(optimizer_qe, num_warmup_steps=0, num_training_steps=total_steps)
  scheduler_ce = get_linear_schedule_with_warmup(optimizer_ce, num_warmup_steps=0, num_training_steps=total_steps)

  # Loss function
  cosine_similarity = nn.CosineSimilarity(dim=1)
  margin_loss = nn.MarginRankingLoss(margin=0.01)

  # Move models to GPU if available
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  question_encoder.to(device)
  context_encoder.to(device)

  # Training loop
  for epoch in range(num_epochs):
      question_encoder.train()
      context_encoder.train()
      total_loss = 0
      for batch in DPRdataloader:
          optimizer_qe.zero_grad()
          optimizer_ce.zero_grad()

          # Move batch to device
          query_input_ids = batch['query_input_ids'].to(device)
          query_attention_mask = batch['query_attention_mask'].to(device)
          passage_input_ids = batch['passage_input_ids'].to(device)
          passage_attention_mask = batch['passage_attention_mask'].to(device)
          labels = batch['labels'].to(device)

          # Forward pass
          query_embeddings = question_encoder(query_input_ids, attention_mask=query_attention_mask).pooler_output
          passage_embeddings = context_encoder(passage_input_ids, attention_mask=passage_attention_mask).pooler_output

          # Calculate similarities and loss
          similarities = cosine_similarity(query_embeddings, passage_embeddings)
          target = torch.where(labels > 0, 1, -1).float()  # Convert labels to {-1, 1} for MarginRankingLoss
          loss = margin_loss(similarities, torch.zeros_like(similarities), target)

          # Backward and optimize
          loss.backward()
          optimizer_qe.step()
          optimizer_ce.step()
          scheduler_qe.step()
          scheduler_ce.step()

          total_loss += loss.item()

      avg_loss = total_loss / len(DPRdataloader)
      print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

  return question_encoder, context_encoder

In [21]:
import os

def saveChunkCounter(num_chunks, file_path='/content/drive/My Drive/data/number.txt'):
    # Extract the directory path from the file_path
    directory_path = os.path.dirname(file_path)

    # Create the directory if it does not exist
    os.makedirs(directory_path, exist_ok=True)

    # Write num_chunks to the file
    with open(file_path, 'w') as file:
        file.write(str(num_chunks))

# Example usage
# saveChunkCounter(0)

In [22]:
def readChunkCounter(file_path='/content/drive/My Drive/data/number.txt'):
    # Ensure the file exists
    if not os.path.exists(file_path):
        print("File does not exist.")
        return None

    # Read the number from the file
    with open(file_path, 'r') as file:
        num_chunks = file.read()

    # Convert the read string to an integer
    try:
        return int(num_chunks)
    except ValueError:
        print("Could not convert the file content to an integer.")
        return None

In [30]:
num_chunks_completed = int(readChunkCounter())
total_entries = len(dataset['train'])
chunk_size = 10
num_chunks = (total_entries / chunk_size) + (1 if total_entries % chunk_size else 0) - num_chunks_completed
print(f"Total entries: {total_entries}")
print(f"Chunk size: {chunk_size}")
print(f"Number of chunks: {num_chunks_completed}")
print(f"Number of chunks: {num_chunks}")
question_encoder, context_encoder = getEncoders()
question_encoder_tokenizer, context_encoder_tokenizer = getTokenizers()

for i in range(int(num_chunks)):
    start_idx = num_chunks_completed * chunk_size
    end_idx = (start_idx)  + (chunk_size)
    # Use min to ensure end_idx does not exceed the total number of entries
    end_idx = min(end_idx, total_entries)
    print(f'start_idx: {start_idx}, end_idx: {end_idx}')
    print('current chunk', num_chunks_completed)
    # Select the current chunk
    current_chunk = dataset['train'].select(range(start_idx, end_idx))
    preprocessed_data = preprocess_for_dpr_concurrently({'train': current_chunk})
    print(len(preprocessed_data))
    question_encoder, context_encoder = trainModel(preprocessed_data, question_encoder, context_encoder, question_encoder_tokenizer, context_encoder_tokenizer)
    saveModels(question_encoder, context_encoder)
    num_chunks_completed += 1
    saveChunkCounter(num_chunks_completed)

Total entries: 82326
Chunk size: 10
Number of chunks: 176
Number of chunks: 8057.6
start_idx: 1760, end_idx: 1770
current chunk 176
84
Epoch 1/1, Average Loss: 0.0025
start_idx: 1770, end_idx: 1780
current chunk 177
89
Epoch 1/1, Average Loss: 0.0026
start_idx: 1780, end_idx: 1790
current chunk 178


KeyboardInterrupt: 

In [23]:

def normalize_embeddings(embeddings):
    norms = embeddings.norm(p=2, dim=1, keepdim=True)
    return embeddings.div(norms)

def encode_query(query, question_encoder, tokenizer):
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(question_encoder.device) for k, v in inputs.items()}
    with torch.no_grad():
        query_embedding = question_encoder(**inputs).pooler_output
    # Normalize the embedding
    query_embedding = normalize_embeddings(query_embedding)
    return query_embedding

def encode_passages(passages, context_encoder, tokenizer):
    inputs = tokenizer(passages, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = {k: v.to(context_encoder.device) for k, v in inputs.items()}
    with torch.no_grad():
        passage_embeddings = context_encoder(**inputs).pooler_output
    # Normalize the embeddings
    passage_embeddings = normalize_embeddings(passage_embeddings)
    return passage_embeddings


In [26]:
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor
import nltk

# Initialize NLP tools and models (assuming these functions are properly defined)
device = torch.device("cpu")
question_encoder, context_encoder = getEncoders()
question_encoder_tokenizer, context_encoder_tokenizer = getTokenizers()

def preprocess_for_dpr_validation(validation_data):
    preprocessed_data = []
    for item in validation_data:
        query = item['query']
        passages = item['passages']['passage_text']
        labels = item['passages']['is_selected']
        preprocessed_item = {'query': query, 'passages': passages, 'labels': labels}
        preprocessed_data.append(preprocessed_item)
    return preprocessed_data

def calculate_rank_for_item(item):
    query, passages, labels = item['query'], item['passages'], item['labels']
    question_input = question_encoder_tokenizer(query, return_tensors='pt', padding=True, truncation=True).to(device)
    question_embedding = question_encoder(**question_input).pooler_output
    context_embeddings = []

    for passage in passages:
        context_input = context_encoder_tokenizer(passage, return_tensors='pt', padding=True, truncation=True, max_length=256).to(device)
        context_embedding = context_encoder(**context_input).pooler_output
        context_embeddings.append(context_embedding)

    context_embeddings = torch.cat(context_embeddings, dim=0)
    similarities = torch.matmul(question_embedding, context_embeddings.T).squeeze(0)
    ranked_indices = torch.argsort(similarities, descending=True)
    correct_indices = [i for i, label in enumerate(labels) if label == 1]

    if correct_indices:
        correct_index = correct_indices[0]
        rank = (ranked_indices == correct_index).nonzero(as_tuple=True)[0].item() + 1
        return 1.0 / rank
    return 0

def validate_and_calculate_mrr(preprocessed_data):
    with ThreadPoolExecutor(max_workers=10) as executor:
        ranks = list(executor.map(calculate_rank_for_item, preprocessed_data))
    mrr = np.mean([rank for rank in ranks if rank > 0]) if ranks else 0
    return mrr

# Load and preprocess the dataset
dataset = load_dataset("ms_marco", "v1.1", split='validation')
preprocessed_validation_data = preprocess_for_dpr_validation(dataset)

# Calculate the MRR score
mrr_score = validate_and_calculate_mrr(preprocessed_validation_data)
print(f"MRR Score: {mrr_score}")


MRR Score: 0.4561778183352468
