## Install libraries

In [1]:
!pip install pytorch-pretrained-bert
!pip install livelossplot
!pip install nvidia-ml-py3
!pip install unidecode

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 27.5MB/s ta 0:00:01
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/1d/07/fb11080a1324bc8d7b68deb009a4c08bd675e0789a213028c58323c4aaab/regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl (675kB)
[K    100% |████████████████████████████████| 686kB 22.0MB/s ta 0:00:01
[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: regex, pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2 regex-2020.5.14
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting livelossplot
  Downloading https://fi

## Import Labrariers

In [2]:
import pandas as pd
import numpy as np
import os
import json
import unidecode
import re
import torch

from tqdm.auto import tqdm 
from tqdm import tqdm_notebook

from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME, BertForMultipleChoice
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                  BertTokenizer,
                                                  whitespace_tokenize)

## Helper Functions

In [3]:
# function to get the IDs of the previous queries of a query in a session 
def get_lower_ids(session_df, query_id):
    session_id = int(query_id.split('_')[0])
    current_id = int(query_id.split('_')[1])
    all_ids = [int(x.split('_')[1]) for x in session_df['query_id'].tolist()]
    lower_ids = [x for x in all_ids if x < current_id]
    lower_ids = [str(session_id) + '_' + str(x) for x in lower_ids]
    return lower_ids

In [4]:
# function that strips all non-alphanumeric characters
def remove_non_alphanumeric(text):
    text = unidecode.unidecode(str(text))
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    return text

In [5]:
# function that returns a list of segment ids based on indexed tokens (BERT)
def get_segment_ids_from_index_tokens(indexed_tokens):
    segment_ids = []
    sep = False
    for i, token in enumerate(indexed_tokens):
        if token == 102:
            sep = True
        if sep:
            segment_ids.append(1)
        else:
            segment_ids.append(0)
    return segment_ids

In [6]:
def run_bert(data):
    activations = []
    for i in tqdm_notebook(range(len(data))):
        # convert inputs to PyTorch tensors
        tokens = data.iloc[i]['indexed_tokens']
        segment_ids = data.iloc[i]['segment_ids']
        
        # make sure the input fits
        token_size_diff = len(tokens) - 512
        if token_size_diff > 0:
            tokens = [tokens[0]] + tokens[token_size_diff:]
            segment_ids = [segment_ids[0]] + segment_ids[token_size_diff:]

        tokens_tensor = torch.tensor([tokens])
        segments_tensors = torch.tensor([segment_ids])

        # set everything to run on GPU
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')

        with torch.no_grad():
            prediction = bertmodel(tokens_tensor, segments_tensors) 
            activations.append(prediction)

    data['pooled_output'] = activations
    return data

## Load Data

In [7]:
queries_filename = 'queries.dev.small.tsv'
anserini_output_filename = 'run_development_top100.tsv'
thesis_qrels_filename = 'thesis_relevance_labels.tsv'
output_filename = 'bert_thesis_dataset_top100.tsv'

In [8]:
models_dir = "../data/models/"
msmarco_dir = "../data/msmarco_files/"
anserini_output_dir = "../data/anserini_output/"
firebase_dir = "../data/firebase_data/"
output_dir = "../data/output/"

In [9]:
# MSMARCO collection
msmarco_collection = pd.read_csv(msmarco_dir + 'collection.tsv',delimiter='\t',encoding='utf-8', header=None)
msmarco_collection.columns = ['passage_id', 'passage']

In [11]:
queries_df = pd.read_csv(msmarco_dir + queries_filename,delimiter='\t',encoding='utf-8', header=None)
queries_df.columns = ['query_id', 'query']

In [20]:
query_anserini_output = pd.read_csv(anserini_output_dir + anserini_output_filename,delimiter='\t',encoding='utf-8', header=None)
query_anserini_output.columns = ['query_id', 'passage_id', 'bm25_rank']

In [12]:
qrels_df = pd.read_csv(firebase_dir + thesis_qrels_filename,delimiter='\t',encoding='utf-8',header=None)
qrels_df.columns = ['query_id','label1','passage_id','label2']

In [14]:
thesis_query_ids = list(np.unique(qrels_df['query_id'].values.tolist()))

## Make BERT DataFrame

In [22]:
tqdm.pandas()
query_ids_dict = {"query_id": thesis_query_ids}
bert_df = pd.DataFrame(query_ids_dict)
bert_df = bert_df.merge(query_anserini_output,how='left',on=['query_id'])
bert_df = bert_df.merge(query_subset,how='left',on=['query_id'])
bert_df = bert_df.merge(msmarco_collection,how='left',on=['passage_id'])
bert_df['query'] = bert_df['query'].progress_apply(lambda x: remove_non_alphanumeric(x.lower()))
tqdm.pandas()
bert_df['passage'] = bert_df['passage'].progress_apply(lambda x: remove_non_alphanumeric(x.lower()))
bert_df['input_text'] = "[CLS] " + bert_df['query'] +" [SEP] " + bert_df['passage'] + " [SEP]"

HBox(children=(FloatProgress(value=0.0, max=4200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4200.0), HTML(value='')))




In [25]:
bert_df.head(5)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text
0,9083,7067274,1,is considered the father ...,true hippocrates is considered the father of ...,[CLS] is considered the f...
1,9083,4429824,2,is considered the father ...,who was hippocrates and what contributions did...,[CLS] is considered the f...
2,9083,6653709,3,is considered the father ...,in the west hippocrates is considered a fathe...,[CLS] is considered the f...
3,9083,5066808,4,is considered the father ...,about edgar cayce considered by many to be th...,[CLS] is considered the f...
4,9083,1176430,5,is considered the father ...,niccola2 machiavelli can be considered the fat...,[CLS] is considered the f...


## Load BERT Model

In [26]:
bertmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', 2)
bertmodel.load_state_dict(torch.load(models_dir + 'fine_tuned_bert_base_uncased'))

bertmodel.eval()
bertmodel.to('cuda')

100%|██████████| 407873900/407873900 [00:18<00:00, 21822949.03B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [27]:
tqdm.pandas()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 763140.74B/s]


## Run BERT

In [28]:
bert_df.shape

(4200, 6)

In [29]:
bert_df['indexed_tokens'] = bert_df.progress_apply(lambda row: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(row['input_text'])), axis=1)

HBox(children=(FloatProgress(value=0.0, max=4200.0), HTML(value='')))




In [30]:
bert_df.head(1)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens
0,9083,7067274,1,is considered the father ...,true hippocrates is considered the father of ...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200..."


In [31]:
bert_df['segment_ids'] = bert_df.progress_apply(lambda row: get_segment_ids_from_index_tokens(row['indexed_tokens']), axis=1)

HBox(children=(FloatProgress(value=0.0, max=4200.0), HTML(value='')))




In [32]:
bert_df.head(1)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens,segment_ids
0,9083,7067274,1,is considered the father ...,true hippocrates is considered the father of ...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."


In [33]:
output_df = run_bert(bert_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=4200.0), HTML(value='')))




In [34]:
output_df.head(5)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens,segment_ids,pooled_output
0,9083,7067274,1,is considered the father ...,true hippocrates is considered the father of ...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-2.5108, device='cuda:0'), tensor(3.5..."
1,9083,4429824,2,is considered the father ...,who was hippocrates and what contributions did...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-1.8188, device='cuda:0'), tensor(3.0..."
2,9083,6653709,3,is considered the father ...,in the west hippocrates is considered a fathe...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-2.3673, device='cuda:0'), tensor(3.4..."
3,9083,5066808,4,is considered the father ...,about edgar cayce considered by many to be th...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(-2.2244, device='cuda:0'), tensor(3.3..."
4,9083,1176430,5,is considered the father ...,niccola2 machiavelli can be considered the fat...,[CLS] is considered the f...,"[101, 2003, 2641, 1996, 2269, 1997, 2715, 4200...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[[tensor(1.0835, device='cuda:0'), tensor(-0.4..."


In [35]:
output_df.to_csv(output_dir + output_filename,sep="\t", header=False,index=False)

In [36]:
output_df['score_bert'] = output_df.progress_apply(lambda row: row['pooled_output'].data[0][1].item(), axis=1)
output_df = output_df.drop(columns=['input_text', 'indexed_tokens', 'segment_ids', 'pooled_output'])

HBox(children=(FloatProgress(value=0.0, max=4200.0), HTML(value='')))




In [37]:
output_df["bert_rank"] = output_df.groupby("query_id")["score_bert"].rank(ascending=0,method='dense')
output_df["bert_rank"] = output_df['bert_rank'].astype(int)

In [38]:
output_df.head(10)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,score_bert,bert_rank
0,9083,7067274,1,is considered the father ...,true hippocrates is considered the father of ...,3.594209,1
1,9083,4429824,2,is considered the father ...,who was hippocrates and what contributions did...,3.096903,7
2,9083,6653709,3,is considered the father ...,in the west hippocrates is considered a fathe...,3.49077,2
3,9083,5066808,4,is considered the father ...,about edgar cayce considered by many to be th...,3.386449,4
4,9083,1176430,5,is considered the father ...,niccola2 machiavelli can be considered the fat...,-0.403724,42
5,9083,1176427,6,is considered the father ...,niccola2 machiavelli is the father of modern p...,-1.473451,54
6,9083,1176428,7,is considered the father ...,niccola2 machiavelli can be considered the fat...,-1.467033,53
7,9083,3766897,8,is considered the father ...,was a swedish botanist physician and zool...,-1.427717,52
8,9083,8221644,9,is considered the father ...,1 justus von liebig the father of modern nut...,2.079953,29
9,9083,7149238,10,is considered the father ...,antoine lavoisier is considered the father of ...,3.035284,10
