# AIR - Exercise in Google Colab

## Colab Preparation

Open via google drive -> right click: open with Colab

**Get a GPU**

Toolbar -> Runtime -> Change Runtime Type -> GPU

**Mount Google Drive**

* Download data and clone your github repo to your Google Drive folder
* Use Google Drive as connection between Github and Colab (Could also use direct github access, but re-submitting credentials might be annoying)
* Commit to Github locally from the synced drive

**Keep Alive**

When training google colab tends to kick you out, This might help: https://medium.com/@shivamrawat_756/how-to-prevent-google-colab-from-disconnecting-717b88a128c0

**Get Started**

Run the following script to mount google drive and install needed python packages. Pytorch comes pre-installed.

In [1]:
'''
from google.colab import drive
drive.mount('/content/drive')

!pip install -r ../requirements.txt
'''

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n!pip install -r ../requirements.txt\n"

In [2]:
import torch

print("Version:",torch.__version__)
print("Has GPU:",torch.cuda.is_available()) # check that 1 gpu is available
print("Random tensor:",torch.rand(10,device="cuda")) # check that pytorch works 

Version: 1.7.1
Has GPU: True
Random tensor: tensor([0.9229, 0.9077, 0.5947, 0.5328, 0.0282, 0.4172, 0.2058, 0.4862, 0.8398,
        0.3297], device='cuda:0')


# Main.py Replacement

-> add your code here

- Replace *air_test* with your google drive location in the sys.path.append()

In [20]:
from allennlp.common import Params, Tqdm
from allennlp.common.util import prepare_environment
from allennlp.data.dataloader import PyTorchDataLoader
prepare_environment(Params({})) # sets the seeds to be fixed

import torch

from allennlp.data.vocabulary import Vocabulary

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.nn.util import move_to_device

from data_loading import *
from model_knrm import *
from model_tk import *

import os
import pandas as pd

from torch.optim import Adam

from core_metrics import (
    unrolled_to_ranked_result,
    load_qrels,
    calculate_metrics_plain
)

# importlib.import_module('model_tk')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# change paths to your data directory
config = {
    "vocab_directory": "../Part-2/allen_vocab_lower_10",
    "pre_trained_embedding": "../Part-2/glove.42B.300d.txt",
    "model": "knrm",
    "train_data": "../Part-2/triples.train.tsv",
    "validation_data": "../Part-2/msmarco_tuples.validation.tsv",
    "test_data": "../Part-2/msmarco_tuples.test.tsv",
    
    "qrels" : "../Part-2/msmarco_qrels.txt",

    # Fira datasets
    'fira_2022_data_set': "../Part-2/fira-22.tuples.tsv",
    'fira_2022_baseline_qrels': "../Part-2/fira-22.baseline-qrels.tsv",
    'fira_2022_PART_1': "../Part-2/Final_Exercise_1.tsv",

    "validation_frequency": 200,

    # Part 3 data sets
    "msmarco_fira_21_qa_answers": "../Part-3/msmarco-fira-21.qrels.qa-answers.tsv",
    "msmarco_fira_21_qa_tuples": "../Part-3/msmarco-fira-21.qrels.qa-tuples.tsv"

}

Device: cuda


In [21]:
%%time
#
# data loading
#
print("Loading is starting...")
vocab = Vocabulary.from_files(config["vocab_directory"])
tokens_embedder = Embedding(vocab=vocab,
                           pretrained_file= config["pre_trained_embedding"],
                           embedding_dim=300,
                           trainable=True,
                           padding_index=0)
word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})
print("The loading has been completed!")

Loading is starting...


0it [00:00, ?it/s]

The loading has been completed!
Wall time: 29.6 s


In [22]:
# recommended default params for the models (but you may change them if you want)
if config["model"] == "knrm":
    model = KNRM(word_embedder, n_kernels=11)
elif config["model"] == "tk":
    model = TK(word_embedder, n_kernels=11, n_layers = 2, n_tf_dim = 300, n_tf_heads = 10)


# todo optimizer, loss 
loss_function = torch.nn.MarginRankingLoss(margin=1, reduction="mean").to(device)
optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=0.001)

print('Model',config["model"],'total parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

Model knrm total parameters: 94382412
Network: KNRM(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (cosine_module): CosineMatrixAttention()
  (dense): Linear(in_features=11, out_features=1, bias=True)
)


In [23]:
print("Load Training Data")
_triple_reader = IrTripleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_triple_reader = _triple_reader.read(config["train_data"])
_triple_reader.index_with(vocab)
train_loader = PyTorchDataLoader(_triple_reader, batch_size=32)
print("DONE!")

Load Training Data
DONE!


In [24]:
print("Load Validation Data")
_val_reader = IrLabeledTupleDatasetReader(
        lazy=True, max_doc_length=180, max_query_length=30
    )
_val_reader = _val_reader.read(config["validation_data"])
_val_reader.index_with(vocab)
val_loader = PyTorchDataLoader(_val_reader, batch_size=128)
print("DONE!")

Load Validation Data
DONE!


In [25]:
# This is the evaluation function in order to evaluate our models based on the corresponding data set
def evaluate_model(model, data_loader, p, Part_3 = False):
    model.eval()  # Set the model to evaluation mode
    all_scores = []

    print("Start Testing")

    with torch.no_grad():
        for batch in Tqdm.tqdm(data_loader):
            batch = move_to_device(batch, device)
            queries, docs = batch["query_tokens"], batch["doc_tokens"]
            scores = model(queries, docs).squeeze().tolist()
            ids_scores = zip(batch["query_id"], batch["doc_id"], scores)
            all_scores.extend(ids_scores)

    print("End Testing")
    print("===========")

    if Part_3:
        return all_scores

    print("Start Evaluation")

    # Process results
    results = {}
    for q_id, d_id, score in all_scores:
        if q_id not in results:
            results[q_id] = []
        results[q_id].append((d_id, score))

    # Convert to ranked results and calculate metrics
    ranked_results = unrolled_to_ranked_result(results)

    qrels = load_qrels(p)
    
    metrics = calculate_metrics_plain(ranked_results, qrels)
    metrics_list = [metrics[k] for k in ["MRR@10", "MRR@20", "nDCG@10", "nDCG@20"]]
    rounded_metrics = tuple(round(m, 3) for m in metrics_list)

    print("End Evaluation")

    return rounded_metrics

In [26]:
%%time
print("Start training Loop")
results = []

best = 0
last = 0
num_epochs = 3
for epoch in range(num_epochs):
    print("Starting epoch....")
    losses = []
    last = 0
    iteration = 0
    for batch in Tqdm.tqdm(train_loader):
        model.to(device)
        model.train()
        optimizer.zero_grad()
        batch = move_to_device(batch, device)

        # Positive and negative scores

        positive_scores = model.forward(batch["query_tokens"], batch["doc_pos_tokens"])
        negative_scores = model.forward(batch["query_tokens"], batch["doc_neg_tokens"]) 

        # Labels for MarginRankingLoss should be 1 for positive, -1 for negative
        target = torch.ones(batch["query_tokens"]["tokens"]["tokens"].shape[0], dtype=torch.float).to(device)

        # Compute loss
        current_loss = loss_function(positive_scores, negative_scores, target)

        # Backward pass and optimization
        current_loss.backward()
        optimizer.step()

        losses.append(current_loss.item())

        # Validation evaluation
        if iteration % config["validation_frequency"] == 0:
            mrr10, mrr20, ndcg10, ndcg20 = evaluate_model(model, val_loader, config['qrels'])
            print(f"Validation Results - Epoch: {epoch + 1}, MRR@10: {mrr10}, MRR@20: {mrr20}, nDCG@10: {ndcg10}, nDCG@20: {ndcg20}")

            if mrr10 > best:
                    best = mrr10
                    last = 0
            else:
                last += 1
            print('Validations Runs: ',last)
            if last >= 5 and best != 0:
                print("Early stopping, since MRR@10 did not improve for 5 validation runs.")
                break  
        iteration += 1

    print("Epoch Completed!")

    # avg_train_loss = sum(losses) / iteration
    # print(f"Epoch: {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}")
    print("Epoch Completed!")  
print("END OF TRAINING LOOP")

Start training Loop
Starting epoch....


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.095, MRR@20: 0.104, nDCG@10: 0.128, nDCG@20: 0.162
Validations Runs:  0
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.146, MRR@20: 0.154, nDCG@10: 0.181, nDCG@20: 0.214
Validations Runs:  0
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.144, MRR@20: 0.153, nDCG@10: 0.182, nDCG@20: 0.213
Validations Runs:  1
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.146, MRR@20: 0.154, nDCG@10: 0.183, nDCG@20: 0.214
Validations Runs:  2
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.146, MRR@20: 0.154, nDCG@10: 0.183, nDCG@20: 0.214
Validations Runs:  3
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.142, MRR@20: 0.151, nDCG@10: 0.179, nDCG@20: 0.211
Validations Runs:  4
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 1, MRR@10: 0.141, MRR@20: 0.15, nDCG@10: 0.177, nDCG@20: 0.211
Validations Runs:  5
Early stopping, since MRR@10 did not improve for 5 validation runs.
Epoch Completed!
Epoch Completed!
Starting epoch....


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 2, MRR@10: 0.141, MRR@20: 0.15, nDCG@10: 0.177, nDCG@20: 0.21
Validations Runs:  1
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 2, MRR@10: 0.138, MRR@20: 0.147, nDCG@10: 0.174, nDCG@20: 0.208
Validations Runs:  2
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 2, MRR@10: 0.136, MRR@20: 0.145, nDCG@10: 0.172, nDCG@20: 0.206
Validations Runs:  3
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 2, MRR@10: 0.136, MRR@20: 0.145, nDCG@10: 0.172, nDCG@20: 0.205
Validations Runs:  4
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 2, MRR@10: 0.136, MRR@20: 0.145, nDCG@10: 0.172, nDCG@20: 0.205
Validations Runs:  5
Early stopping, since MRR@10 did not improve for 5 validation runs.
Epoch Completed!
Epoch Completed!
Starting epoch....


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 3, MRR@10: 0.136, MRR@20: 0.145, nDCG@10: 0.172, nDCG@20: 0.205
Validations Runs:  1
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 3, MRR@10: 0.133, MRR@20: 0.142, nDCG@10: 0.169, nDCG@20: 0.202
Validations Runs:  2
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 3, MRR@10: 0.131, MRR@20: 0.14, nDCG@10: 0.167, nDCG@20: 0.201
Validations Runs:  3
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 3, MRR@10: 0.132, MRR@20: 0.141, nDCG@10: 0.167, nDCG@20: 0.201
Validations Runs:  4
Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Validation Results - Epoch: 3, MRR@10: 0.131, MRR@20: 0.14, nDCG@10: 0.167, nDCG@20: 0.201
Validations Runs:  5
Early stopping, since MRR@10 did not improve for 5 validation runs.
Epoch Completed!
Epoch Completed!
END OF TRAINING LOOP
Wall time: 14min 11s


In [27]:
"""
The following code block is not related to the training, evaluation, and testing of our models. 
However, the reason this code is used is either to save our models in .pth file format or to load 
the already saved models so that we do not have to retrain our models.
"""
# Save or Load the model
if config["model"] == "knrm":
    model_file_path = 'ModelKNRM.pth'
elif config["model"] == "tk":
    model_file_path = 'ModelTK.pth'

# Check if the model file exists
if os.path.exists(model_file_path):
    # Load the model
    model.load_state_dict(torch.load(model_file_path)) 
    print("Model loaded successfully.")
else:
    # Save the model
    torch.save(model.state_dict(), model_file_path)
    print("Model saved successfully.")


Model saved successfully.


In [28]:
'''
Use the msmarco_tuples.test.tsv input to feed the neural models and msmarco_qrels.txt qrels to evaluate the output
'''

# msmarco_tuples.test.tsv
# msmarco_qrels.txt
_test_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_test_reader = _test_reader.read(config["test_data"])
_test_reader.index_with(vocab)
test_loader = PyTorchDataLoader(_test_reader, batch_size=128)

mrr10, mrr20, ndcg10, ndcg20 = evaluate_model(model, test_loader, config['qrels'])
print(f"Testing Results - MRR@10: {mrr10}, MRR@20: {mrr20}, nDCG@10: {ndcg10}, nDCG@20: {ndcg20}")

Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Testing Results - MRR@10: 0.131, MRR@20: 0.141, nDCG@10: 0.169, nDCG@20: 0.205


In [29]:
'''
Use the fira-2022.tuples.tsv input to feed the neural models and fira-2022.baseline-qrels.tsv qrels to evaluate the output
'''

# fira-2022.tuples.tsv
# fira-2022.baseline-qrels.tsv

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["fira_2022_data_set"])
_tuple_reader.index_with(vocab)
fira_baseline_loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

mrr10, mrr20, ndcg10, ndcg20 = evaluate_model(model, fira_baseline_loader, config['fira_2022_baseline_qrels'])
print(f"Testing Results - MRR@10: {mrr10}, MRR@20: {mrr20}, nDCG@10: {ndcg10}, nDCG@20: {ndcg20}")

Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Testing Results - MRR@10: 0.95, MRR@20: 0.95, nDCG@10: 0.894, nDCG@20: 0.908


In [30]:
'''
Use the fira-2022.tuples.tsv input to feed the neural models and your qrels from part 1 to evaluate the output
'''

# fira-2022.tuples.tsv
# Final_Exercise_1.tsv

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["fira_2022_data_set"])
_tuple_reader.index_with(vocab)
fira_Part1_loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

mrr10, mrr20, ndcg10, ndcg20 = evaluate_model(model, fira_Part1_loader, config['fira_2022_PART_1'])
print(f"Testing Results - MRR@10: {mrr10}, MRR@20: {mrr20}, nDCG@10: {ndcg10}, nDCG@20: {ndcg20}")

Start Testing


0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

End Testing
Start Evaluation
End Evaluation
Testing Results - MRR@10: 0.727, MRR@20: 0.727, nDCG@10: 0.746, nDCG@20: 0.767


## Here the last bullet for part 3 starts

In [None]:
"""
The following code block is not related to the training, evaluation, and testing of our models.
"""
# Start the last bullet for part 3 using the msmarco test data set
# Our goal is to create a top-1 MSMARCO passage results from the best re-ranking model
# Then we take this top-1 data set (Format: query_id    doc_id	    question	    answer)
# and use the pre trained model in order to extract the relevant information
# Finally this is relevance information is evaluated using the 
# 1. msmarco-fira-21.qrels.qa-answers.tsv 
# 2. msmarco-fira-21.qrels.qa-tuples.tsv

# msmarco_tuples.test.tsv
# msmarco_qrels.txt
_test_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_test_reader = _test_reader.read(config["test_data"])
_test_reader.index_with(vocab)
test_loader = PyTorchDataLoader(_test_reader, batch_size=128)

##############################################################################################################
# Retrieve all the score for each (query id-document id) pair
all_scores = evaluate_model(model, test_loader, config['qrels'], True)
# Change it to data frame
all_scores_pd = pd.DataFrame(all_scores, columns=['query_id', 'doc_id', 'score'])
# Get the top-1 MSMARCO passage results from the best re-ranking model (last bullet for part 3 - Extractive QA)
top_1 = all_scores_pd.loc[all_scores_pd.groupby('query_id')['score'].idxmax()].reset_index(drop=True).drop(columns = ['score'])


column_names = ['query_id', 'doc_id', 'question', 'answer']
# Read the .tsv file with the specified column names
msmarco_test = pd.read_csv(config['test_data'], sep='\t', names=column_names)

# Change the query and doc id columns to int
top_1['query_id'] = top_1['query_id'].astype(int)
top_1['doc_id'] = top_1['doc_id'].astype(int)

# Final merge (using inner join) between the msmarco_test data set and the top-1
# The goal of merging is to have the combination of (query_id doc_id) with the 
# highest score (based on out re-ranking model (here is the tk model)) 
# and also the question and answer, based on the (query_id doc_id)
final_top_1 = pd.merge(msmarco_test, top_1, on=['query_id', 'doc_id'], how='inner')

# Save the data set for the part 3
filename_dataset = "../Part-2/" + config['model'] + "_dataset_for_part3.tsv"
final_top_1.to_csv(filename_dataset, sep='\t', index=False, header=False)