## Installation of Required Libraries
This cell contains code to installation of required libraries.


In [None]:
!pip install peft
!pip install sentence-transformers datasets


Collecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.27.2 peft-0.8.2
Collecting sentence-transformers
  Downloading sentence_transformers-2.4.0-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading di

## Import Libraries and Login to Hugging Face
This cell contains code to import libraries and login to hugging face.

In [None]:
from datasets import load_dataset

from huggingface_hub import login

login("hf_wLVCBnMQBLQM********************T") ## Kindly enter your hugging face secret Api Key

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load the MS MARCO Dataset
This cell contains code to load the ms marco dataset.

In [None]:

# Step 1: Load the MS MARCO dataset
benchmark_dataset = load_dataset("ms_marco", 'v1.1')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

## Setup and Load Model with PEFT
This cell contains code to setup and load model with peft.

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel, PeftConfig

def get_model(peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModel.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model

# Load the tokenizer and model
access_token = "hf_HDyQiklFWvnG*************"

tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = AutoModel.from_pretrained("Ranjithjames/rankllama_ms_marco_finetuned")


In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("castorini/rank_zephyr_7b_v1_full")
# model = AutoModelForCausalLM.from_pretrained("castorini/rank_zephyr_7b_v1_full")


tokenizer_config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Preprocess and Load Data for Training
This cell contains code to preprocess and load data for training.

In [None]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import default_data_collator

dataset=benchmark_dataset
# Select 25% of the data for training
train_test_split = dataset["train"].train_test_split(test_size=0.75)
# Use the smaller split for training
small_train_dataset = train_test_split["train"]

# Preprocess dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["query"], max_length=128, truncation=True)
    # Joining multiple answers into a single string
    answers_text = " ".join([answer["text"] for answer in examples["answers"]])
    model_inputs["labels"] = tokenizer(answers_text, max_length=128, truncation=True)["input_ids"]
    return model_inputs

small_tokenized_datasets = small_train_dataset.map(preprocess_function, batched=True)

# Create data loader
train_dataloader = DataLoader(small_tokenized_datasets, shuffle=True, batch_size=8, collate_fn=default_data_collator)

## Training Loop Setup
This cell contains code to training loop setup.

In [None]:
from transformers import AdamW

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

## Push Model to Hugging Face Hub
This cell contains code to push model to hugging face hub.

In [None]:

PEFT_MODEL = "Ranjithjames/rankllama_ms_marco_finetuned"

model.push_to_hub(
     PEFT_MODEL, use_auth_token=True)



model-00001-of-00006.safetensors:   0%|          | 0.00/4.84G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ranjithjames/rankllama_ms_marco_finetuned/commit/c340be8af7c1e24cb96f3ffe3b0abe132216dd46', commit_message='Upload model', commit_description='', oid='c340be8af7c1e24cb96f3ffe3b0abe132216dd46', pr_url=None, pr_revision=None, pr_num=None)

## Mount Google Drive and Load Tokenizer and Model from Drive
This cell contains code to mount google drive and load tokenizer and model from drive.

In [None]:
# import torch
# from transformers import AutoModel, AutoTokenizer
# from google.colab import drive

# # Mount Google Drive
# drive.mount('/content/drive')
# tokenizer_path = "/content/drive/My Drive/repllama_tokenizer"
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# model = AutoModel.from_pretrained("Ranjithjames/ms_marco_finetuned_llama_75")




Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.84G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

## Compute Similarity Scores
This cell contains code to parallel processing to compute similarity scores.

In [None]:

# Define query and passage inputs
query = "What is llama?"
title = "Llama"
passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
query_input = tokenizer(f'query: {query}</s>', return_tensors='pt')
passage_input = tokenizer(f'passage: {title} {passage}</s>', return_tensors='pt')

# Run the model forward to compute embeddings and query-passage similarity score
with torch.no_grad():
    # compute query embedding
    query_outputs = model(**query_input)
    query_embedding = query_outputs.last_hidden_state[0][-1]
    query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=0)

    # compute passage embedding
    passage_outputs = model(**passage_input)
    passage_embeddings = passage_outputs.last_hidden_state[0][-1]
    passage_embeddings = torch.nn.functional.normalize(passage_embeddings, p=2, dim=0)

    # compute similarity score
    score = torch.dot(query_embedding, passage_embeddings)
    print(score)


tensor(0.7905)


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define paths to save model and tokenizer
model_path = "/content/drive/My Drive/repllama_model"
tokenizer_path = "/content/drive/My Drive/repllama_tokenizer"

# Save model
model.save_pretrained(model_path)

# Save tokenizer
tokenizer.save_pretrained(tokenizer_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.84G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ranjithjames/ms_marco_finetuned_llama/commit/1ffc95e2e7e2a071afd957b7818a2165c323e910', commit_message='Upload model', commit_description='', oid='1ffc95e2e7e2a071afd957b7818a2165c323e910', pr_url=None, pr_revision=None, pr_num=None)

## Prepping data for predicitions

In [None]:
import pandas as pd

# Assuming benchmark_dataset is your dictionary
queries = benchmark_dataset['test']['query']
passages_lists = [item['passage_text'] for item in benchmark_dataset['test']['passages']]
selected_passages = [item['is_selected'] for item in benchmark_dataset['test']['passages']]

# Use list comprehension to create a list of dictionaries
rows = [{"Query": query, "Passage": passage, "Selected Passage": selected_passage}
        for query, passages, selected_passage in zip(queries, passages_lists, selected_passages)
        for passage, selected_passage in zip(passages, selected_passage)]

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(rows)


## Running Predictions on finetuned Model : This steps takes 1day + to run make sure you have enough compute before proceeding

In [None]:
import torch
import pandas as pd
from tqdm import tqdm
from torch.nn.parallel import DataParallel

# Assuming 'model' and 'tokenizer' are defined somewhere in your code

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming 'df' is your DataFrame

# Function to compute similarity score for a single row
def compute_similarity(row):
    query = row['Query'][0]
    passage = row['Passage'][0]
    query_input = tokenizer(f'query: {query}</s>', return_tensors='pt')
    passage_input = tokenizer(f'passage:  {passage}</s>', return_tensors='pt')
    # Move inputs to GPU if available
    query_input = query_input.to(device)
    passage_input = passage_input.to(device)
    # Run the model forward to compute embeddings and query-passage similarity score
    with torch.no_grad():
        # compute query embedding
        query_outputs = model(**query_input)
        query_embedding = query_outputs.last_hidden_state[0][-1]
        query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=0)

        # compute passage embedding
        passage_outputs = model(**passage_input)
        passage_embeddings = passage_outputs.last_hidden_state[0][-1]
        passage_embeddings = torch.nn.functional.normalize(passage_embeddings, p=2, dim=0)

        # compute similarity score
        score = torch.dot(query_embedding, passage_embeddings)
        return score.item()  # Return score as a scalar value

# Initialize the model
model = model.to(device)
if torch.cuda.device_count() > 1:
    model = DataParallel(model)

# Define number of processes to use
num_processes = 1  # Only 1 process is needed for GPU parallelism

# Split the DataFrame into chunks for processing
chunk_size = 1
df_chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

# Compute similarity scores for each chunk in parallel
similarity_scores = []
with tqdm(total=len(df)) as pbar:
    for chunk in df_chunks:
        scores = [compute_similarity(row) for index, row in chunk.iterrows()]
        similarity_scores.extend(scores)
        pbar.update(len(chunk))

# Assign similarity scores to the DataFrame
df['Similarity_Score'] = similarity_scores

# Display the DataFrame with the new column
print(df)


## Saving the output file to Googledrive

In [None]:
df.to_csv('/content/drive/My Drive/rank_llama_ms_marco_res.csv', index=False)
# Optionally, you can download the file to your local machine
from google.colab import files
files.download('/content/drive/My Drive/rank_llama_ms_marco_res.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Calculate Mean Reciprocal Rank
This cell contains code to calculate mean reciprocal rank.

In [3]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
# Optionally, you can download the file to your local machine
from google.colab import files

df=pd.read_csv('/content/drive/My Drive/rank_llama_ms_marco_res.csv')

# Group by Query column
grouped = df.groupby('Query')

# Initialize a list to store reciprocal ranks
reciprocal_ranks = []

# Iterate over each group
for name, group in grouped:

    # Sort the group by Similarity_Score
    group_sorted = group.sort_values(by='Similarity_Score', ascending=False).reset_index(drop=True)

    # Find the rank of the first occurrence where Selected Passage is 1
    rank = (group_sorted['Selected Passage'] == 1).idxmax() + 1 if (group_sorted['Selected Passage'] == 1).any() else None
    # Calculate reciprocal rank
    reciprocal_rank = 1 / rank if rank else 0
    # Append reciprocal rank to the list
    reciprocal_ranks.append(reciprocal_rank)

# Calculate Mean Reciprocal Rank
MRR = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank (MRR):", MRR)


Mean Reciprocal Rank (MRR): 0.423468574


# Paraphrasing the data

## Prepare Data for Paraphrasing Task
This cell contains code to prepare data for paraphrasing task.

In [None]:
import pandas as pd

# Assuming benchmark_dataset is your dictionary
queries = benchmark_dataset['train']['query']
passages_lists = [item['passage_text'] for item in benchmark_dataset['train']['passages']]
selected_passages = [item['is_selected'] for item in benchmark_dataset['train']['passages']]

# Use list comprehension to create a list of dictionaries
rows = [{"Query": query, "Passage": passage, "Selected Passage": selected_passage}
        for query, passages, selected_passage in zip(queries, passages_lists, selected_passages)
        for passage, selected_passage in zip(passages, selected_passage)]

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(rows)


## Paraphrasing Functionality
This cell contains code to paraphrasing functionality using T5 Models

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda"

tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids.to(device)

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res



## Batch Paraphrasing with DataLoader
This cell contains code to batch paraphrasing with dataloader.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# Define the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").cuda()

# Define the dataset class
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

# Define the paraphrase function
def paraphrase_batch(batch):
    inputs = tokenizer(
        ["paraphrase: " + text for text in batch],
        return_tensors="pt", padding=True,
        truncation=True,
        max_length=128
    ).to(model.device)

    num_beam_groups = 5 if len(batch) > 1 else 1  # Adjust num_beam_groups based on batch size
    diversity_penalty = 3.0 if num_beam_groups > 1 else 0.0  # Set diversity_penalty if using group beam search

    outputs = model.generate(**inputs, num_beams=5, num_beam_groups=num_beam_groups, num_return_sequences=5, diversity_penalty=diversity_penalty)
    paraphrases = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return paraphrases

# Create DataLoader for parallelization
texts = df['Query'].unique()
dataset = TextDataset(texts)
loader = DataLoader(dataset, batch_size=700)

# Create lists to store original questions and paraphrases
original_questions = []
paraphrases = []

# Generate paraphrases in batches
for batch_texts in tqdm(loader):
    batch_paraphrases = paraphrase_batch(batch_texts)
    original_questions.extend([text for text in batch_texts for _ in range(5)])
    paraphrases.extend(batch_paraphrases)
a
# Create a DataFrame from the lists
df_paraphrases = pd.DataFrame({
    'Question': original_questions,
    'Paraphrase': paraphrases
})

# Display the DataFrame
print(df_paraphrases)

## Save Paraphrases to CSV
This cell contains code to save paraphrases to csv.

In [None]:
df_paraphrases.to_csv('paraphrases.csv', index=False)

# RANK_ZEPHYR MODEL AND QUANTIZATION

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("castorini/rank_zephyr_7b_v1_full")
model = AutoModelForCausalLM.from_pretrained("castorini/rank_zephyr_7b_v1_full")

## Quantization

### Clone llama to help convert model to GGUF quantized model


In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -r llama.cpp/requirements.txt

### Download snapshot of original model

In [None]:
from huggingface_hub import snapshot_download
model_id="castorini/rank_zephyr_7b_v1_full"
snapshot_download(repo_id=model_id, local_dir="rank_zephyr",
                  local_dir_use_symlinks=False, revision="main")

### Quantize to 8 bit


In [None]:
!python llama.cpp/convert.py rank_zephyr --outfile vicuna-13b-v1.5.gguf --outtype q8_0

## Push quantized model to Huggingface

In [None]:
from huggingface_hub import HfApi
api = HfApi()

model_id = "Ranjithjames/rank_zephyr_gguf"
api.create_repo(model_id, exist_ok=True, repo_type="model")
api.upload_file(
    path_or_fileobj="/content/Rank_zephyr.5.gguf",
    path_in_repo="rank_zephyr_8.gguf",
    repo_id=model_id,
)

## Test load the quantized model directly

In [None]:
# from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
from transformers import AutoModelForCausalLM
model_2 = AutoModelForCausalLM.from_pretrained("Ranjithjames/rank_zephyr_gguf")


## Save and Load to Gdrive

In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')


# Define the directory in Google Drive where you want to save the model and tokenizer
save_directory = '/content/drive/MyDrive/Path/to/saved_models'

# Create the directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Save tokenizer and model to the specified directory
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

# Print confirmation message
print("Tokenizer and model saved successfully to Google Drive.")

from transformers import AutoTokenizer, AutoModelForCausalLM

# Define the directory where you saved the model and tokenizer
load_directory = '/content/drive/MyDrive/Path/to/saved_models'

# Load tokenizer and model from the specified directory
tokenizer = AutoTokenizer.from_pretrained(load_directory)
model = AutoModelForCausalLM.from_pretrained(load_directory)

# Print confirmation message
print("Tokenizer and model loaded successfully from Google Drive.")


## Download the quantized models locally

In [None]:
!huggingface-cli download  Ranjithjames/rank_zephyr_gguf rank_zephyr_8.gguf --local-dir . --local-dir-use-symlinks False

## Test working of model

In [None]:
llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a story writing assistant."},
        {
            "role": "user",
            "content": "Write a story about llamas."
        }
    ]
)

## Prepping data

In [None]:
import pandas as pd

# Assuming benchmark_dataset is your dictionary
queries = benchmark_dataset['test']['query']
passages_lists = [item['passage_text'] for item in benchmark_dataset['test']['passages']]
selected_passages = [item['is_selected'] for item in benchmark_dataset['test']['passages']]

# Use list comprehension to create a list of dictionaries
rows = [{"Query": query, "Passage": passage, "Selected Passage": selected_passage}
        for query, passages, selected_passage in zip(queries, passages_lists, selected_passages)
        for passage, selected_passage in zip(passages, selected_passage)]

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(rows)


## Finetuning Model

In [None]:

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch

def tokenize_data(row):
    inputs = tokenizer(row['Query'], row['Passage'], return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    inputs["labels"] = row["Selected Passage"]
    return inputs


df_tokenized = df.apply(tokenize_data, axis=1)


class MyDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        return self.tokenized_data[idx]

dataset = MyDataset(df_tokenized)

train_dataset, val_dataset = train_test_split(dataset, test_size=0.2)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# Step 4: Define Fine-Tuning Model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Step 5: Fine-Tuning Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        inputs = {k: v.squeeze(1).to(device) if k != 'labels' else v.to(device) for k, v in batch.items()}
        print("Input Shape:", inputs['input_ids'].shape)  # Print input shape for debugging
        print("Label Shape:", inputs['labels'].shape)  # Print label shape for debugging

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluation on validation set
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            inputs = {k: v.squeeze(1).to(device) if k != 'labels' else v.to(device) for k, v in batch.items()}
            print("Input Shape:", inputs['input_ids'].shape)  # Print input shape for debugging
            print("Label Shape:", inputs['labels'].shape)  # Print label shape for debugging

            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

            total_correct += torch.sum(predictions == inputs["labels"]).item()
            total_samples += inputs["labels"].size(0)

    accuracy = total_correct / total_samples
    print(f"Epoch {epoch + 1}, Validation Accuracy: {accuracy}")


## Computing Similarity

In [None]:
import torch
import pandas as pd
from tqdm.auto import tqdm

# Assuming 'model' and 'tokenizer' are defined and loaded

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Example DataFrame creation (this should be replaced with your actual DataFrame)
# df = pd.DataFrame({'Query': ['What is llama?', 'Tell me about llamas.'], 'Passage': ['Llamas are domestic animals.', 'Llamas have long necks.']})

def compute_similarity(row):
    query = row['Query']
    passage = row['Passage']
    combined_input = tokenizer.encode(f"query: {query} passage: {passage}", return_tensors='pt').to(device)

    with torch.no_grad():
        output = model(input_ids=combined_input, labels=combined_input)
        loss = output.loss.item()  # Lower loss indicates higher similarity

    return -loss  # Negate loss so higher values indicate higher similarity

# Use tqdm to show progress
similarity_scores = []
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Computing similarity"):
    score = compute_similarity(row)
    similarity_scores.append(score)

# Assign similarity scores to the DataFrame
df['Similarity_Score'] = similarity_scores

# Display the DataFrame wi th the new column
print(df)


## Computing MRR

In [4]:

import pandas as pd
from google.colab import drive

# # Mount Google Drive
# drive.mount('/content/drive')
# # Optionally, you can download the file to your local machine
# from google.colab import files

# df=pd.read_csv('/content/drive/My Drive/rank_zephyr_ms_marco_res_25.csv')

# # Group by Query column
grouped = df.groupby('Query')

# Initialize a list to store reciprocal ranks
reciprocal_ranks = []

# Iterate over each group
for name, group in grouped:

    # Sort the group by Similarity_Score
    group_sorted = group.sort_values(by='Similarity_Score', ascending=False).reset_index(drop=True)

    # Find the rank of the first occurrence where Selected Passage is 1
    rank = (group_sorted['Selected Passage'] == 1).idxmax() + 1 if (group_sorted['Selected Passage'] == 1).any() else None
    # Calculate reciprocal rank
    reciprocal_rank = 1 / rank if rank else 0
    # Append reciprocal rank to the list
    reciprocal_ranks.append(reciprocal_rank)

# Calculate Mean Reciprocal Rank
MRR = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank Quantized model (MRR):",MRR)


Mean Reciprocal Rank Quantized model (MRR): 0.39578456
