# AIR - Exercise in Google Colab

## Colab Preparation

Open via google drive -> right click: open with Colab

**Get a GPU**

Toolbar -> Runtime -> Change Runtime Type -> GPU

**Mount Google Drive**

* Download data and clone your github repo to your Google Drive folder
* Use Google Drive as connection between Github and Colab (Could also use direct github access, but re-submitting credentials might be annoying)
* Commit to Github locally from the synced drive

**Keep Alive**

When training google colab tends to kick you out, This might help: https://medium.com/@shivamrawat_756/how-to-prevent-google-colab-from-disconnecting-717b88a128c0

**Get Started**

Run the following script to mount google drive and install needed python packages. Pytorch comes pre-installed.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -r ../requirements.txt

In [1]:
import torch

print("Version:",torch.__version__)
print("Has GPU:",torch.cuda.is_available()) # check that 1 gpu is available
print("Random tensor:",torch.rand(10,device="cuda")) # check that pytorch works 

Version: 1.6.0
Has GPU: False


AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

# Main.py Replacement

-> add your code here

- Replace *air_test* with your google drive location in the sys.path.append()

In [2]:
from typing import Any, Callable, Tuple
from allennlp.common import Params, Tqdm
from allennlp.common.util import prepare_environment
from allennlp.data.dataloader import PyTorchDataLoader
prepare_environment(Params({})) # sets the seeds to be fixed

import torch

from allennlp.data.vocabulary import Vocabulary

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

from data_loading import *
from model_knrm.model_knrm import *
from model_tk.model_tk import *

from core_metrics.core_metrics import calculate_metrics_plain, load_qrels

In [3]:
# change paths to your data directory
config = {
    "vocab_directory": "../data/Part-2/allen_vocab_lower_10",
    "pre_trained_embedding": "../data/Part-2/glove.42B.300d.txt",
    "model": "knrm",
    "train_data": "../data/Part-2/triples.train.tsv",
    "validation_data": "../data/Part-2/tuples.validation.tsv",
    "test_data":"../data/Part-2/tuples.test.tsv",
    "qurels": "../data/Part-2/msmarco_qrels.txt"
}


In [4]:
# data loading
vocab = Vocabulary.from_files(config["vocab_directory"])
tokens_embedder = Embedding(vocab=vocab,
                           pretrained_file= config["pre_trained_embedding"],
                           embedding_dim=300,
                           trainable=True,
                           padding_index=0)
word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

0it [00:00, ?it/s]

In [5]:
# recommended default params for the models (but you may change them if you want)
if config["model"] == "knrm":
    model = KNRM(word_embedder, n_kernels=11)
elif config["model"] == "tk":
    model = TK(word_embedder, n_kernels=11, n_layers = 2, n_tf_dim = 300, n_tf_heads = 10)

In [6]:
criterion = torch.nn.HingeEmbeddingLoss()
optimizer = torch.optim.Adadelta(model.parameters())

print('Model',config["model"],'total parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

Model knrm total parameters: 94382411
Network: KNRM(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (fully_connected): Linear(in_features=11, out_features=1, bias=False)
)


In [9]:
def create_loader(data_path: str, create_loader: Callable[[], Any]) -> PyTorchDataLoader:
    _triple_reader = create_loader()
    _triple_reader = _triple_reader.read(data_path)
    _triple_reader.index_with(vocab)
    return PyTorchDataLoader(_triple_reader, batch_size=32)

loader = create_loader(
    config["train_data"], 
    lambda: IrTripleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30))

validation_loader = create_loader(
    config["validation_data"], 
    lambda: IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30))

In [10]:
qrel_dict = load_qrels(config["qurels"])

In [12]:
def train_batch(batch: Dict):
    query = batch["query_tokens"]
    doc_pos = batch["doc_pos_tokens"]
    doc_neg = batch["doc_neg_tokens"]

    # Zero your gradients for every batch!
    optimizer.zero_grad()

    # Make predictions for this batch
    out_pos = model.forward(query, doc_pos)
    out_neg = model.forward(query, doc_neg)

    # Compute the loss and its gradients
    loss = criterion(out_pos, out_neg)
    loss.backward()

    # Adjust learning weights
    optimizer.step()

def validation_batch(batch: Dict) -> List[Tuple[Any, Any, float]]:
    query = batch["query_tokens"]
    doc = batch["doc_tokens"]

    out = model.forward(query, doc)

    return list(zip(batch["query_id"], batch["doc_id"], out.float().tolist()))

In [13]:
# train
metrics = []
best_mrr_at_10 = -1

for epoch in range(10):
    for batch in Tqdm.tqdm(loader):
        train_batch(batch)
    
    unsorted_validations = [validation_batch(batch) for batch in Tqdm.tqdm(validation_loader)]
    sorted_output = sorted(unsorted_validations, key = lambda x: x[2])
    validations_dict = {}
    for query_id, doc_id, ranking in sorted_output:
        if query_id not in validations_dict:
            validations_dict[query_id] = list()
        validations_dict[query_id].append(doc_id)
    
    metrics.append(calculate_metrics_plain(validations_dict, qrels))

    is_best_model_yet = metrics[-1]['MRR@10'] > best_mrr_at_10
    if is_best_model_yet:
        best_mrr_at_10 = metrics[-1]['MRR@10']
        torch.save(model.state_dict(), f'./models/{config["model"]}.pt')

    if epoch > 2:
        no_improvement_since_last_epoch = metrics[-1]['MRR@10'] < metrics[-2]['MRR@10']
        if no_improvement_since_last_epoch:
            break

0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

RuntimeError: size mismatch, m1: [11 x 32], m2: [11 x 1] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:41

In [14]:
# eval (duplicate for validation inside train loop - but rename "loader", since
# otherwise it will overwrite the original train iterator, which is instantiated outside the loop)
#

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["test_data"])
_tuple_reader.index_with(vocab)
loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

for batch in Tqdm.tqdm(loader):
    # todo test loop 
    # todo evaluation
    pass

# %%

0it [00:00, ?it/s]

reading instances: 0it [00:00, ?it/s]

FileNotFoundError: file ../data/Part-2/tuples.test.tsv not found