In [1]:
!pip install pytorch-pretrained-bert
!pip install livelossplot
!pip install nvidia-ml-py3
!pip install unidecode

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 10.8MB/s ta 0:00:01
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/1d/07/fb11080a1324bc8d7b68deb009a4c08bd675e0789a213028c58323c4aaab/regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl (675kB)
[K    100% |████████████████████████████████| 686kB 21.8MB/s ta 0:00:01
[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: regex, pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2 regex-2020.5.14
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting livelossplot
  Downloading https://fi

[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: livelossplot
Successfully installed livelossplot-0.5.1
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting nvidia-ml-py3
  Downloading https://files.pythonhosted.org/packages/6d/64/cce82bddb80c0b0f5c703bbdafa94bfb69a1c5ad7a79cff00b482468f0d3/nvidia-ml-py3-7.352.0.tar.gz
Building wheels for collected packages: nvidia-ml-py3
  Running setup.py bdist_wheel for nvidia-ml-py3 ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/e4/1d/06/640c93f5270d67d0247f30be91f232700d19023f9e66d735c7
Successfully built nvidia-ml-py3
Installing collected packages: nvidia-ml-py3
Successfully installed nvidia-ml-py3-7.352.0
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [2]:
import pandas as pd
import numpy as np
import os
import json
import unidecode
import re
import torch

from tqdm.auto import tqdm 
from tqdm import tqdm_notebook

from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME, BertForMultipleChoice
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                  BertTokenizer,
                                                  whitespace_tokenize)

In [3]:
def run_bert(data, model=None, device='cuda', max_len=512):
    """Run a BERT model over every row of ``data`` and attach the outputs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``indexed_tokens`` and ``segment_ids``.
        Each cell is a list of ints; the two lists of a row are assumed to
        have the same length.
    model : callable, optional
        Called as ``model(tokens_tensor, segments_tensor)``. When omitted,
        the notebook-level ``bertmodel`` is used.
    device : str or torch.device, optional
        Device the input tensors are moved to before the forward pass.
    max_len : int, optional
        Maximum number of tokens passed to the model for one row.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object (modified in place) with a new
        ``pooled_output`` column holding one model output per row. Tensor
        outputs are moved to the CPU before being stored.
    """
    if model is None:
        # Fall back to the model created at notebook level.
        model = bertmodel

    activations = []
    # Iterate over the two columns directly; this avoids materialising a
    # full row Series for every record.
    rows = zip(data['indexed_tokens'], data['segment_ids'])
    for tokens, segment_ids in tqdm(rows, total=len(data)):
        # Make sure the input fits: keep the first token plus the tail of
        # the sequence. One extra token is dropped to make room for the
        # retained first token, so the result is exactly max_len long.
        overflow = len(tokens) - max_len
        if overflow > 0:
            tokens = [tokens[0]] + tokens[overflow + 1:]
            segment_ids = [segment_ids[0]] + segment_ids[overflow + 1:]

        # Batch of one sequence, moved to the target device.
        tokens_tensor = torch.tensor([tokens]).to(device)
        segments_tensor = torch.tensor([segment_ids]).to(device)

        with torch.no_grad():
            prediction = model(tokens_tensor, segments_tensor)

        # Keep results on the CPU so one small device allocation per row
        # does not stay alive for the lifetime of the DataFrame.
        if torch.is_tensor(prediction):
            prediction = prediction.cpu()
        activations.append(prediction)

    data['pooled_output'] = activations
    return data

In [4]:
# Directory layout, relative to the notebook's working directory.
# Every path keeps its trailing slash because file names are appended to
# it with plain string concatenation.
models_dir = "../data/models/"
output_dir = "../data/output/"
data_dir = "../data/Rodrigo_Experiment/"

In [54]:
# File names for the subset processed in this run: the scored result is
# written under output_df_filename, the model inputs are read from
# bert_df_filename.
output_df_filename = 'bert_run_msmarco_leaderboard_df_subset_28.tsv'
bert_df_filename = 'bert_msmarco_leaderboard_df_subset_28.tsv'

In [55]:
# The TSV is read without a header row; column names are attached afterwards.
bert_df = pd.read_csv(
    data_dir + bert_df_filename,
    sep='\t',
    encoding='utf-8',
    header=None,
)
# Assigning exactly eight names makes pandas raise if the file has a
# different number of columns.
bert_df.columns = [
    'query_id',
    'passage_id',
    'bm25_rank',
    'query',
    'passage',
    'input_text',
    'indexed_tokens',
    'segment_ids',
]

In [56]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Both list columns are decoded from their text form with json.loads.
# Cells that already hold a list (for example because this cell was run a
# second time) are passed through unchanged instead of raising a TypeError.
for list_col in ('indexed_tokens', 'segment_ids'):
    bert_df[list_col] = bert_df[list_col].progress_apply(
        lambda cell: cell if isinstance(cell, list) else json.loads(cell)
    )

HBox(children=(FloatProgress(value=0.0, max=228208.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=228208.0), HTML(value='')))




In [57]:
# Build the 'bert-base-uncased' sequence classifier with two output labels.
bertmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Overwrite its weights with the state dict stored under models_dir.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
bertmodel.load_state_dict(torch.load(models_dir + 'fine_tuned_bert_base_uncased'))

# Inference only: switch to eval mode, then move the model to the GPU.
bertmodel.eval().to('cuda')

# Tokenizer matching the base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [58]:
# run_bert adds a 'pooled_output' column to the frame it receives and
# returns that same frame, so output_df and bert_df refer to one object.
output_df = run_bert(bert_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=228208.0), HTML(value='')))




In [47]:
# Take element [0][1] of each stored model output as the row's BERT score.
# NOTE(review): index 1 is presumably the "relevant" class — confirm against
# the labels used for fine-tuning.
# The apply runs over the single 'pooled_output' column; a row-wise
# (axis=1) apply would build a full Series per record just to read one cell.
output_df['score_bert'] = output_df['pooled_output'].progress_apply(
    lambda logits: logits[0][1].item()
)

HBox(children=(FloatProgress(value=0.0, max=249069.0), HTML(value='')))




In [48]:
# Remove the model-input columns and the raw model output; the remaining
# columns are what gets ranked and written out.
output_df = output_df.drop(
    columns=[
        'input_text',
        'indexed_tokens',
        'segment_ids',
        'pooled_output',
    ]
)

In [49]:
# Dense rank of score_bert within each query, highest score first
# (rank 1 is the best-scoring passage); ranks are stored as integers.
output_df["bert_rank"] = (
    output_df.groupby("query_id")["score_bert"]
    .rank(method='dense', ascending=False)
    .astype(int)
)

In [50]:
# Write the ranked frame as tab-separated text, without a header row and
# without the index column.
output_df.to_csv(
    output_dir + output_df_filename,
    sep="\t",
    header=False,
    index=False,
)

In [51]:
# Rows that contain at least one missing value. The check is done row-wise
# with any(axis=1), which avoids transposing the whole frame.
nan_rows = output_df[output_df.isnull().any(axis=1)]

In [52]:
# Display the rows with missing values; an empty frame means there are none.
nan_rows

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,score_bert,bert_rank


In [53]:
# Show the rows that BERT ranks first within their query.
output_df.loc[output_df['bert_rank'].eq(1)]

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,score_bert,bert_rank
222,1095566,7578238,223,how much players in the nfl get drafted,that means that just nine in 10 000 or 09 pe...,2.850218,1
1078,1095571,7577917,79,how much pepto per lb for a dog,according to veterinarian dr mark papich the ...,2.970931,1
2042,1095631,6151902,43,how much money will americans spend for easter,according to the national retail federation a...,3.473790,1
3065,1095633,8462673,66,how much money was given to trump by nra,and trump is by far the largest beneficiary w...,3.335880,1
4209,1095641,5801505,210,how much money is made by donations,contributions of money can be made in cash by...,2.131101,1
5173,1095650,7560420,174,how much money is a bitcoin,bitcoin to usd 0 1 btc to usd 0 2 btc to usd ...,2.495796,1
6001,1095654,2642156,2,how much money does the nra contribute to the ...,according to a report from the washington post...,3.554845,1
7191,1095687,7539863,192,how much money do you get for winning gold medal,gold medal winning athletes from the united st...,2.866782,1
8016,1095699,7569503,17,how much money did the nra give to ted cruz of...,data maplight conducted an analysis of campai...,3.320624,1
9035,1095704,6001219,36,average salary of a team lead etrade,team leader salary team leader average salary...,3.091347,1
