# BERT Try-out

## Imports

In [1]:
!pip install pytorch-pretrained-bert
!pip install livelossplot
!pip install nvidia-ml-py3
!pip install unidecode
import pandas as pd
import numpy as np
import os
import json
import unidecode
import re
import torch

from tqdm.auto import tqdm 
from tqdm import tqdm_notebook

from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME, BertForMultipleChoice
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                  BertTokenizer,
                                                  whitespace_tokenize)

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 26.7MB/s ta 0:00:01
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/8c/db/4b29a0adec5881542cd81cb5d1929b5c0787003c5740b3c921e627d9c2e5/regex-2019.12.9.tar.gz (669kB)
[K    100% |████████████████████████████████| 675kB 24.0MB/s ta 0:00:01
Building wheels for collected packages: regex
  Running setup.py bdist_wheel for regex ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/0d/fb/b3/a89169557229468c49ca64f6839418f22461f6ee0a74f342b1
Successfully built regex
[31mfastai 1.0.59 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: regex, pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.

[31mfastai 1.0.59 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: livelossplot
Successfully installed livelossplot-0.4.1
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting nvidia-ml-py3
  Downloading https://files.pythonhosted.org/packages/6d/64/cce82bddb80c0b0f5c703bbdafa94bfb69a1c5ad7a79cff00b482468f0d3/nvidia-ml-py3-7.352.0.tar.gz
Building wheels for collected packages: nvidia-ml-py3
  Running setup.py bdist_wheel for nvidia-ml-py3 ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/e4/1d/06/640c93f5270d67d0247f30be91f232700d19023f9e66d735c7
Successfully built nvidia-ml-py3
Installing collected packages: nvidia-ml-py3
Successfully installed nvidia-ml-py3-7.352.0
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

## Helper Functions

In [2]:
# function to get the IDs of the previous queries of a query in a session 
def get_lower_ids(session_df, query_id):
    session_id = int(query_id.split('_')[0])
    current_id = int(query_id.split('_')[1])
    all_ids = [int(x.split('_')[1]) for x in session_df['query_id'].tolist()]
    lower_ids = [x for x in all_ids if x < current_id]
    lower_ids = [str(session_id) + '_' + str(x) for x in lower_ids]
    return lower_ids

In [3]:
# function that strips all non-alphanumeric characters
def remove_non_alphanumeric(text):
    text = unidecode.unidecode(str(text))
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    return text

In [4]:
# function that returns a list of segment ids based on indexed tokens (BERT)
def get_segment_ids_from_index_tokens(indexed_tokens):
    segment_ids = []
    sep = False
    for i, token in enumerate(indexed_tokens):
        if token == 102:
            sep = True
        if sep:
            segment_ids.append(1)
        else:
            segment_ids.append(0)
    return segment_ids

In [5]:
def run_bert(data):
    activations = []
    for i in tqdm_notebook(range(len(data))):
        # convert inputs to PyTorch tensors
        tokens = data.iloc[i]['indexed_tokens']
        segment_ids = data.iloc[i]['segment_ids']
        
        # make sure the input fits
        token_size_diff = len(tokens) - 512
        if token_size_diff > 0:
            tokens = [tokens[0]] + tokens[token_size_diff:]
            segment_ids = [segment_ids[0]] + segment_ids[token_size_diff:]

        tokens_tensor = torch.tensor([tokens])
        segments_tensors = torch.tensor([segment_ids])

        # set everything to run on GPU
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')

        with torch.no_grad():
            prediction = bertmodel(tokens_tensor, segments_tensors) 
            activations.append(prediction)

    data['pooled_output'] = activations
    return data

## Load Data

In [6]:
models_dir = "../data/models/"
msmarco_dir = "../data/msmarco_files/"
anserini_output_dir = "../data/anserini_output/"

In [7]:
# MSMARCO collection
msmarco_collection = pd.read_csv(msmarco_dir + 'collection.tsv',delimiter='\t',encoding='utf-8', header=None)
msmarco_collection.columns = ['passage_id', 'passage']

In [9]:
query_subset = pd.read_csv(msmarco_dir + 'queries.train.subset.tsv',delimiter='\t',encoding='utf-8', header=None)
query_subset.columns = ['query_id', 'query']

In [10]:
query_anserini_output = pd.read_csv(anserini_output_dir + 'run_queries_subset.tsv',delimiter='\t',encoding='utf-8', header=None)
query_anserini_output.columns = ['query_id', 'passage_id', 'bm25_rank']

In [11]:
query_anserini_output.head(1)

Unnamed: 0,query_id,passage_id,bm25_rank
0,413367,670475,1


## Make BERT Dataframe

In [12]:
tqdm.pandas()
bert_df = query_anserini_output.copy()
bert_df = bert_df.merge(query_subset,how='left',on=['query_id'])
bert_df = bert_df.merge(msmarco_collection,how='left',on=['passage_id'])
bert_df['query'] = bert_df['query'].progress_apply(lambda x: remove_non_alphanumeric(x.lower()))
tqdm.pandas()
bert_df['passage'] = bert_df['passage'].progress_apply(lambda x: remove_non_alphanumeric(x.lower()))
bert_df['input_text'] = "[CLS] " + bert_df['query'] +" [SEP] " + bert_df['passage'] + " [SEP]"

HBox(children=(IntProgress(value=0, max=8086564), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8086564), HTML(value='')))




In [13]:
bert_df.head(5)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text
0,413367,670475,1,is it a footlong and them some,cold cut combo 5 50 footlong italian b m t...,[CLS] is it a footlong and them some [SEP] col...
1,413367,4359509,2,is it a footlong and them some,cheese amount on 6 inch sandwich double valu...,[CLS] is it a footlong and them some [SEP] che...
2,413367,7279276,3,is it a footlong and them some,footlong chicken bacon ranch melt with mayo...,[CLS] is it a footlong and them some [SEP] foo...
3,413367,5821389,4,is it a footlong and them some,calories in a footlong italian b m t there ar...,[CLS] is it a footlong and them some [SEP] cal...
4,413367,2340426,5,is it a footlong and them some,cheese amount on 6 inch sandwich double valu...,[CLS] is it a footlong and them some [SEP] che...


## Load BERT Model

In [14]:
bertmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', 2)
bertmodel.load_state_dict(torch.load(models_dir + 'fine_tuned_bert_base_uncased'))

bertmodel.eval()
bertmodel.to('cuda')

100%|██████████| 407873900/407873900 [00:22<00:00, 18372975.98B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [15]:
tqdm.pandas()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 1084499.08B/s]


## Test on small subset

In [17]:
small_bert_df = bert_df[:100].copy()

In [18]:
small_bert_df['indexed_tokens'] = small_bert_df.progress_apply(lambda row: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(row['input_text'])), axis=1)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [19]:
small_bert_df['segment_ids'] = small_bert_df.progress_apply(lambda row: get_segment_ids_from_index_tokens(row['indexed_tokens']), axis=1)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [20]:
output_df = run_bert(small_bert_df)

HBox(children=(IntProgress(value=0), HTML(value='')))




## Run Bert

In [29]:
output_df.tail(5)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,input_text,indexed_tokens,segment_ids,pooled_output
95,413367,8631041,96.0,is it a footlong and them some,magpie faq birdwatch received more queries a...,[CLS] is it a footlong and them some [SEP] mag...,"[101, 2003, 2009, 1037, 3329, 10052, 1998, 206...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[tensor(3.5071, device='cuda:0'), tensor(-4.4..."
96,413367,3835076,97.0,is it a footlong and them some,then for the unclean person they shall take so...,[CLS] is it a footlong and them some [SEP] the...,"[101, 2003, 2009, 1037, 3329, 10052, 1998, 206...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[tensor(3.5581, device='cuda:0'), tensor(-4.5..."
97,413367,4662852,98.0,is it a footlong and them some,it depends on how you categorize them for ins...,[CLS] is it a footlong and them some [SEP] it ...,"[101, 2003, 2009, 1037, 3329, 10052, 1998, 206...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[tensor(3.4378, device='cuda:0'), tensor(-4.2..."
98,413367,4951657,99.0,is it a footlong and them some,a 100g of raw watermelon fruit contains ...,[CLS] is it a footlong and them some [SEP] ...,"[101, 2003, 2009, 1037, 3329, 10052, 1998, 206...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[tensor(3.5575, device='cuda:0'), tensor(-4.5..."
99,413367,6515568,100.0,is it a footlong and them some,if you ve decided to start a small farm busine...,[CLS] is it a footlong and them some [SEP] if ...,"[101, 2003, 2009, 1037, 3329, 10052, 1998, 206...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[tensor(3.5080, device='cuda:0'), tensor(-4.5..."


In [30]:
output_df['score_bert'] = output_df.progress_apply(lambda row: row['pooled_output'].data[0][1].item(), axis=1)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [31]:
# remove unneeded columns
output_df = output_df.drop(columns=['input_text', 'indexed_tokens', 'segment_ids', 'pooled_output'])

# assign a rank to the scores
output_df["bert_rank"] = output_df.groupby("query_id")["score_bert"].rank(ascending=0,method='dense')
output_df["bert_rank"] = output_df['bert_rank'].astype(int)

In [32]:
output_df

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,score_bert,bert_rank
0,413367,670475,1.0,is it a footlong and them some,cold cut combo 5 50 footlong italian b m t...,-3.164047,29
1,413367,4359509,2.0,is it a footlong and them some,cheese amount on 6 inch sandwich double valu...,-3.241947,33
2,413367,7279276,3.0,is it a footlong and them some,footlong chicken bacon ranch melt with mayo...,-3.161703,28
3,413367,5821389,4.0,is it a footlong and them some,calories in a footlong italian b m t there ar...,-3.973064,53
4,413367,2340426,5.0,is it a footlong and them some,cheese amount on 6 inch sandwich double valu...,-3.364845,35
5,413367,6756670,6.0,is it a footlong and them some,footlong quarter pound coney this sonic class...,-2.786904,21
6,413367,3179620,7.0,is it a footlong and them some,unhealthiest options on the subway menu 1 fo...,-3.985879,56
7,413367,2240246,8.0,is it a footlong and them some,it should be 10 99 a foot but it depends on ...,1.892739,6
8,413367,1243418,9.0,is it a footlong and them some,news subway april 2014 featured 5 footlong ...,-2.421246,19
9,413367,2240245,10.0,is it a footlong and them some,how much is a 3ft subway sandwich it should b...,-2.166980,16
