# 2. BERT Rerank

## Imports

In [1]:
!pip install pytorch-pretrained-bert
!pip install livelossplot
!pip install nvidia-ml-py3
!pip install unidecode
import pandas as pd
import numpy as np
import os
import json
import unidecode
import re
import torch

from tqdm.auto import tqdm 
from tqdm import tqdm_notebook

from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME, BertForMultipleChoice
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                  BertTokenizer,
                                                  whitespace_tokenize)

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 21.7MB/s ta 0:00:01
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/ed/36/fd20c656fb4a4fbe8db367ea274c3465b81cb2e01ffc57b9980f0578e131/regex-2020.2.20-cp36-cp36m-manylinux1_x86_64.whl (690kB)
[K    100% |████████████████████████████████| 696kB 28.2MB/s ta 0:00:01
[31mfastai 1.0.59 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: regex, pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2 regex-2020.2.20
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting livelossplot
  Downloading https://fi

## Helper Functions

In [2]:
# function to get the IDs of the previous queries of a query in a session 
def get_lower_ids(session_df, query_id):
    session_id = int(query_id.split('_')[0])
    current_id = int(query_id.split('_')[1])
    all_ids = [int(x.split('_')[1]) for x in session_df['query_id'].tolist()]
    lower_ids = [x for x in all_ids if x < current_id]
    lower_ids = [str(session_id) + '_' + str(x) for x in lower_ids]
    return lower_ids

In [3]:
# function that strips all non-alphanumeric characters
def remove_non_alphanumeric(text):
    text = unidecode.unidecode(str(text))
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    return text

In [4]:
# function that returns a list of segment ids based on indexed tokens (BERT)
def get_segment_ids_from_index_tokens(indexed_tokens):
    segment_ids = []
    sep = False
    for i, token in enumerate(indexed_tokens):
        if token == 102:
            sep = True
        if sep:
            segment_ids.append(1)
        else:
            segment_ids.append(0)
    return segment_ids

In [5]:
def run_bert(data):
    activations = []
    for i in tqdm_notebook(range(len(data))):
        # convert inputs to PyTorch tensors
        tokens = data.iloc[i]['indexed_tokens']
        segment_ids = data.iloc[i]['segment_ids']
        
        # make sure the input fits
        token_size_diff = len(tokens) - 512
        if token_size_diff > 0:
            tokens = [tokens[0]] + tokens[token_size_diff:]
            segment_ids = [segment_ids[0]] + segment_ids[token_size_diff:]

        tokens_tensor = torch.tensor([tokens])
        segments_tensors = torch.tensor([segment_ids])

        # set everything to run on GPU
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')

        with torch.no_grad():
            prediction = bertmodel(tokens_tensor, segments_tensors) 
            activations.append(prediction)

    data['pooled_output'] = activations
    return data

## Load Data

CHANGE FOLLOWING FILENAMES

In [1]:
query_subset_filename = 'queries.dev.small.tsv'
anserini_output_filename = 'run_development_top100.tsv'
top100_query_ids_filename = 'top100_dev_small_query_ids.tsv'
output_filename = 'bert_run_development_top100.tsv'

In [7]:
models_dir = "../data/models/"
msmarco_dir = "../data/msmarco_files/"
anserini_output_dir = "../data/anserini_output/"
output_dir = "../data/output/"

In [8]:
# MSMARCO collection
msmarco_collection = pd.read_csv(msmarco_dir + 'collection.tsv',delimiter='\t',encoding='utf-8', header=None)
msmarco_collection.columns = ['passage_id', 'passage']

In [9]:
query_subset = pd.read_csv(msmarco_dir + query_subset_filename,delimiter='\t',encoding='utf-8', header=None)
query_subset.columns = ['query_id', 'query']

In [10]:
query_anserini_output = pd.read_csv(anserini_output_dir + anserini_output_filename,delimiter='\t',encoding='utf-8', header=None)
query_anserini_output.columns = ['query_id', 'passage_id', 'bm25_rank']

In [11]:
top100_query_ids = pd.read_csv(output_dir + top100_query_ids_filename,delimiter='\t',encoding='utf-8', header=None)
top100_query_ids.columns = ['query_id']

## Make BERT Dataframe

In [12]:
tqdm.pandas()
bert_df = top100_query_ids.copy()
bert_df = bert_df.merge(query_anserini_output,how='left',on=['query_id'])
bert_df = bert_df.merge(query_subset,how='left',on=['query_id'])
bert_df = bert_df.merge(msmarco_collection,how='left',on=['passage_id'])
bert_df['query'] = bert_df['query'].progress_apply(lambda x: remove_non_alphanumeric(x.lower()))
tqdm.pandas()
bert_df['passage'] = bert_df['passage'].progress_apply(lambda x: remove_non_alphanumeric(x.lower()))
bert_df['input_text'] = "[CLS] " + bert_df['query'] +" [SEP] " + bert_df['passage'] + " [SEP]"

HBox(children=(IntProgress(value=0, max=473568), HTML(value='')))




HBox(children=(IntProgress(value=0, max=473568), HTML(value='')))




In [13]:
bert_df.head(5)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text
0,2,1782337,1,androgen receptor define,enzalutamide is an androgen receptor inhibitor...,[CLS] androgen receptor define [SEP] enzaluta...
1,2,1001873,2,androgen receptor define,the ar gene provides instructions for making a...,[CLS] androgen receptor define [SEP] the ar g...
2,2,4339075,3,androgen receptor define,during androgen independent progression prost...,[CLS] androgen receptor define [SEP] during a...
3,2,6285817,4,androgen receptor define,the term sarms stands for aselective androgen ...,[CLS] androgen receptor define [SEP] the term...
4,2,3634076,5,androgen receptor define,sarms or selective androgen receptor modulator...,[CLS] androgen receptor define [SEP] sarms or...


## Load BERT Model

IMPORTANT NOTE: please make sure to run instance of ml.p2.xlarge instead of ml.t2.2xlarge --> otherwise no cuda available

In [14]:
bertmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', 2)
bertmodel.load_state_dict(torch.load(models_dir + 'fine_tuned_bert_base_uncased'))

bertmodel.eval()
bertmodel.to('cuda')

100%|██████████| 407873900/407873900 [00:18<00:00, 21895566.39B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [15]:
tqdm.pandas()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 815534.16B/s]


## Run Bert

Running Bert takes a long time on the dataset. Therefor, it is wise to first split the dataset in two subsets.

#### Make subsets

This is only necessary with large sets. Not if you use the development set

In [16]:
bert_df.shape

(473568, 6)

In [41]:
bert_df_pt1 = bert_df[:1654400]
bert_df_pt2 = bert_df[1654400:]

In [42]:
bert_df_pt1.shape

(1654400, 6)

In [43]:
bert_df_pt1.tail(2)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text
1654398,613748,6169808,99,what county is tonopah nv,the real property transfer tax is a tax collec...,[CLS] what county is tonopah nv [SEP] the rea...
1654399,613748,6987332,100,what county is tonopah nv,battle mountain nv sponsored topics battle ...,[CLS] what county is tonopah nv [SEP] battle ...


In [44]:
bert_df_pt2.shape

(1654300, 6)

In [45]:
bert_df_pt2.head(2)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text
1654400,613749,1988501,1,what county is tontogany oh in,currently our records indicate that there are ...,[CLS] what county is tontogany oh in [SEP] cur...
1654401,613749,1988505,2,what county is tontogany oh in,sex offenders registry statistics for tontogan...,[CLS] what county is tontogany oh in [SEP] sex...


#### Run Bert

In [17]:
bert_df['indexed_tokens'] = bert_df.progress_apply(lambda row: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(row['input_text'])), axis=1)

HBox(children=(IntProgress(value=0, max=473568), HTML(value='')))




In [18]:
bert_df.head(1)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens
0,2,1782337,1,androgen receptor define,enzalutamide is an androgen receptor inhibitor...,[CLS] androgen receptor define [SEP] enzaluta...,"[101, 1998, 22991, 10769, 9375, 102, 4372, 167..."


In [19]:
bert_df['segment_ids'] = bert_df.progress_apply(lambda row: get_segment_ids_from_index_tokens(row['indexed_tokens']), axis=1)

HBox(children=(IntProgress(value=0, max=473568), HTML(value='')))




In [20]:
bert_df.head(1)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens,segment_ids
0,2,1782337,1,androgen receptor define,enzalutamide is an androgen receptor inhibitor...,[CLS] androgen receptor define [SEP] enzaluta...,"[101, 1998, 22991, 10769, 9375, 102, 4372, 167...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [21]:
output_df = run_bert(bert_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(IntProgress(value=0, max=473568), HTML(value='')))




In [22]:
output_df.head(5)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens,segment_ids,pooled_output
0,2,1782337,1,androgen receptor define,enzalutamide is an androgen receptor inhibitor...,[CLS] androgen receptor define [SEP] enzaluta...,"[101, 1998, 22991, 10769, 9375, 102, 4372, 167...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(1.7580, device='cuda:0'), tensor(-1.7..."
1,2,1001873,2,androgen receptor define,the ar gene provides instructions for making a...,[CLS] androgen receptor define [SEP] the ar g...,"[101, 1998, 22991, 10769, 9375, 102, 1996, 120...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-0.7092, device='cuda:0'), tensor(2.2..."
2,2,4339075,3,androgen receptor define,during androgen independent progression prost...,[CLS] androgen receptor define [SEP] during a...,"[101, 1998, 22991, 10769, 9375, 102, 2076, 199...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(1.8136, device='cuda:0'), tensor(-1.8..."
3,2,6285817,4,androgen receptor define,the term sarms stands for aselective androgen ...,[CLS] androgen receptor define [SEP] the term...,"[101, 1998, 22991, 10769, 9375, 102, 1996, 274...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-0.9003, device='cuda:0'), tensor(2.4..."
4,2,3634076,5,androgen receptor define,sarms or selective androgen receptor modulator...,[CLS] androgen receptor define [SEP] sarms or...,"[101, 1998, 22991, 10769, 9375, 102, 18906, 52...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-0.4892, device='cuda:0'), tensor(2.0..."


In [23]:
output_df.to_csv(output_dir + output_filename,sep="\t", header=False,index=False)

In [24]:
output_df['score_bert'] = output_df.progress_apply(lambda row: row['pooled_output'].data[0][1].item(), axis=1)
output_df = output_df.drop(columns=['input_text', 'indexed_tokens', 'segment_ids', 'pooled_output'])

HBox(children=(IntProgress(value=0, max=473568), HTML(value='')))




In [25]:
output_df["bert_rank"] = output_df.groupby("query_id")["score_bert"].rank(ascending=0,method='dense')
output_df["bert_rank"] = output_df['bert_rank'].astype(int)

In [26]:
output_df.head(10)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,score_bert,bert_rank
0,2,1782337,1,androgen receptor define,enzalutamide is an androgen receptor inhibitor...,-1.713628,34
1,2,1001873,2,androgen receptor define,the ar gene provides instructions for making a...,2.283455,5
2,2,4339075,3,androgen receptor define,during androgen independent progression prost...,-1.800174,36
3,2,6285817,4,androgen receptor define,the term sarms stands for aselective androgen ...,2.404728,4
4,2,3634076,5,androgen receptor define,sarms or selective androgen receptor modulator...,2.02906,8
5,2,7496506,6,androgen receptor define,the binding of an androgen to the androgen rec...,-0.913418,22
6,2,4339068,7,androgen receptor define,the androgen receptor ar also known as nr3c...,2.824988,1
7,2,2022782,8,androgen receptor define,the androgen dihydrotestosterone binds to the ...,-0.705901,20
8,2,7496507,9,androgen receptor define,the gene for the syndrome is on the x chromoso...,-1.25611,29
9,2,4339072,10,androgen receptor define,in some cell types testosterone interacts dir...,-2.665199,58


In [27]:
output_df.to_csv(output_dir + output_filename,sep="\t", header=False,index=False)