<a href="https://colab.research.google.com/github/Prakhar314/COL764-Project/blob/main/evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers tensorflow-addons



In [2]:
# Mount Google Drive so the files under /content/gdrive/MyDrive/col764
# (checkpoint, queries, qrels, run files) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Model

In [9]:
import transformers
from transformers import BertTokenizer, BertConfig, TFBertModel
import tensorflow as tf
import tensorflow_addons as tfa

In [10]:
# Name of the pretrained checkpoint; used for the tokenizer here and read as a
# global by create_model_pointwise().
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

#### Pointwise

In [13]:
def create_inputs(num_nodes,name):
  """Create the three int32 Keras input layers fed to the BERT encoder.

  Args:
    num_nodes: Sequence length; every input has shape ``(num_nodes,)``.
    name: Not used by this function; accepted so existing callers keep working.

  Returns:
    A list of ``tf.keras.layers.Input`` tensors named, in this order,
    ``input_ids``, ``token_type_ids`` and ``attention_mask``.
  """
  input_names = ('input_ids', 'token_type_ids', 'attention_mask')
  return [
      tf.keras.layers.Input(shape=(num_nodes,), dtype=tf.int32, name=input_name)
      for input_name in input_names
  ]

def create_model_pointwise(output_bias=None, num_frozen_encoder_layers=8):
    """Build and compile the pointwise BERT relevance classifier.

    Loads the pretrained ``bert_model_name`` encoder, freezes its embeddings,
    its pooler and its leading encoder layers, and puts a single sigmoid unit
    on top of the first-token hidden state.

    Side effects: saves the pretrained encoder to ``./bert-model/`` and prints
    the Keras model summary.

    Args:
        output_bias: Optional constant used to initialise the bias of the
            output unit. When ``None`` the bias starts at zero.
        num_frozen_encoder_layers: Number of leading transformer encoder
            layers to freeze. Defaults to 8, the value that used to be
            hard-coded.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` (each of length 512) and returning one sigmoid
        score per example.
    """
    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        # Passing an explicit None to Dense would bypass its documented
        # default ("zeros"), so spell the default out instead.
        bias_initializer = 'zeros'

    config = BertConfig(hidden_dropout_prob=0.1)
    bert = TFBertModel.from_pretrained(bert_model_name, config=config)
    bert.save_pretrained('./bert-model/')

    for layer in bert.layers:
        if isinstance(layer, transformers.models.bert.modeling_tf_bert.TFBertMainLayer):
            layer.embeddings.trainable = False
            layer.pooler.trainable = False
            # A distinct name for the inner loop variable keeps the outer
            # `layer` from being shadowed.
            for idx, encoder_layer in enumerate(layer.encoder.layer):
                if idx < num_frozen_encoder_layers:
                    encoder_layer.trainable = False
        else:
            layer.trainable = False

    input_layer = create_inputs(512, 'pair')
    bert_out = bert(input_layer).last_hidden_state
    # Keep only the hidden state at sequence position 0.
    cls = tf.keras.layers.Lambda(lambda x: x[:, 0, :])(bert_out)
    output = tf.keras.layers.Dense(1, activation="sigmoid", bias_initializer=bias_initializer)(cls)
    model = tf.keras.models.Model(inputs=input_layer, outputs=[output])

    opt = tfa.optimizers.RectifiedAdam()
    model.compile(optimizer=opt,
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.BinaryAccuracy(),
                           tf.keras.metrics.AUC(curve="ROC")])
    model.summary()
    return model

#### Load Weights

In [14]:
# Build the compiled pointwise model with the default output bias; this also
# prints the loading messages and the model summary shown below.
model = create_model_pointwise()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'token_type_ids[0][0]',   

In [15]:
# Load saved weights from Drive into the model built above
# (the file name suggests this is the best pointwise checkpoint — TODO confirm).
model.load_weights("/content/gdrive/MyDrive/col764/pointwise_best_model_cw_50.h5")

### Reranking

In [16]:
# Unpack the archive from Drive into the current working directory
# (presumably the source of /content/output_copy read later — TODO confirm).
!tar -zxf /content/gdrive/MyDrive/col764/output_qrel_only.tar.gz

In [17]:
# List the run files available in the Drive results directory.
!ls /content/gdrive/MyDrive/col764/results

run.desc.test.txt   run.query.test.txt		 run.query.train.txt
run.desc.train.txt  run.query.test.txt.reranked


In [18]:
import os

In [19]:
import pandas as pd

In [20]:
# Tab-separated file of test topics (read below as: topic number, query text).
query_file = '/content/gdrive/MyDrive/col764/queries/podcasts_2020_topics_test_query.tsv'

In [21]:
# The file has no header row, so name the two columns explicitly.
queries = pd.read_csv(query_file,sep='\t',header=None)
queries.columns = ['qnum','query']

In [22]:
# Preview the loaded topics.
queries

Unnamed: 0,qnum,query
0,9,trump call ukrainian president
1,10,boeing 737 crash causes
2,11,how to cook turkey
3,12,imran khan career
4,13,drug addiction recovery
5,14,near death experiences
6,15,podcast about podcasts
7,16,causes and prevention of wildfires
8,17,time between meetings
9,18,women in stem


In [23]:
# First-stage run file to rerank; the reranked run is later written next to it
# with a `.reranked` suffix.
results_file = '/content/gdrive/MyDrive/col764/results/run.query.test.txt'

In [24]:
# Space-separated run file without a header row; name its six columns.
bm25_ranks = pd.read_csv(results_file,sep=' ',header=None)
bm25_ranks.columns = ['qnum','qid','epid','rank','score','runid']

In [25]:
# Preview the first-stage ranking.
bm25_ranks

Unnamed: 0,qnum,qid,epid,rank,score,runid
0,9,Q0,spotify:episode:6O8djf3RL94yNfaoWqvk3r_840.0,1,16.569401,Anserini
1,9,Q0,spotify:episode:3gJ0fBXYRryWAjgwxY4q6C_240.0,2,16.302999,Anserini
2,9,Q0,spotify:episode:6svv5L5kRn8groSsM8gkpc_3180.0,3,15.984800,Anserini
3,9,Q0,spotify:episode:4JqqVFhNjtlwoIMl5tvL8R_780.0,4,15.275800,Anserini
4,9,Q0,spotify:episode:3gJ0fBXYRryWAjgwxY4q6C_0.0,5,14.734600,Anserini
...,...,...,...,...,...,...
47753,58,Q0,spotify:episode:30j4F0dABykbXJuGevrfNB_1920.0,996,4.663695,Anserini
47754,58,Q0,spotify:episode:3ALoC0SU00c8RtxQHohesZ_1440.0,997,4.663694,Anserini
47755,58,Q0,spotify:episode:3ALoC0SU00c8RtxQHohesZ_420.0,998,4.663693,Anserini
47756,58,Q0,spotify:episode:3bieaciVail62osQ0YC8JE_1380.0,999,4.663692,Anserini


In [26]:
# Keep only the candidates whose first-stage rank is below 51 and renumber the rows.
top_k_mask = bm25_ranks['rank'] < 51
bm25_ranks = bm25_ranks.loc[top_k_mask].reset_index(drop=True)

In [27]:
# One entry per distinct candidate segment id; the value (its text) is filled in later.
episode_contents = dict.fromkeys(bm25_ranks.epid.unique())

In [28]:
# Check what is present in the Colab working directory.
!ls /content/

bert-model  gdrive  output_copy  sample_data  trec_eval


In [29]:
import json

# Walk the extracted dump and record the text of every segment that is a
# reranking candidate (i.e. whose id is a key of episode_contents).
for root, dirs, files in os.walk('/content/output_copy'):
  files.sort()
  for file in files:
    with open(os.path.join(root, file),'r') as f:
      episode_segments = json.load(f)
    for segment in episode_segments:
      if segment['id'] in episode_contents:
        episode_contents[segment['id']] = segment['contents']

# Fail here, with a clear message, rather than later inside the tokenizer when
# a candidate has no text.
missing_segments = [epid for epid, contents in episode_contents.items() if contents is None]
if missing_segments:
  raise ValueError(
      f"{len(missing_segments)} candidate segment(s) not found under /content/output_copy, "
      f"e.g. {missing_segments[:5]}")

In [30]:
# Number of distinct candidate segments.
len(episode_contents)

2471

In [31]:
# Attach each candidate's text by looking up its segment id in episode_contents.
bm25_ranks['segment'] = bm25_ranks['epid'].map(episode_contents)

In [32]:
# Left join on the topic number so every candidate row carries its query text.
bm25_ranks = bm25_ranks.merge(queries, how='left', on='qnum')

In [33]:
# Token budget for the concatenated (query, passage) model input.
max_seq_len = 512  # sequence length the model's input layers are created with
q_trunc = 64
# +1 because the first token of the passage encoding is dropped when the query
# and passage encodings are concatenated, so the total is exactly max_seq_len.
p_trunc = max_seq_len - q_trunc + 1

In [34]:
import gc
import numpy as np

In [35]:
# Encode queries and passages separately, each padded/truncated to a fixed
# width, then join them into one fixed-length input per candidate row.
query_enc = tokenizer(bm25_ranks['query'].values.tolist(), return_tensors="tf",padding="max_length",max_length=q_trunc,truncation=True)
passage_enc = tokenizer(bm25_ranks['segment'].values.tolist(), return_tensors="tf",padding="max_length",max_length=p_trunc,truncation=True)

# NOTE(review): query and passage are tokenized independently, so the passage
# part presumably keeps the token_type_ids of a first segment — confirm this
# matches how the training inputs were built.
# The key order must match the order of the model's input layers.
# `[:, 1:]` drops the first token of every passage encoding before concatenation.
X_test = [
    tf.concat([query_enc[key], passage_enc[key][:, 1:]], 1)
    for key in ['input_ids', 'token_type_ids', 'attention_mask']
]

# Release the intermediate encodings before inference.
del query_enc, passage_enc
gc.collect()

24990

In [36]:
# Score every (query, passage) row with the model; one sigmoid output per row.
scores = model.predict(X_test,batch_size=128,verbose=1)



In [37]:
# Sanity check: expect one score per candidate row.
print(scores.shape)

(2476, 1)


In [38]:
# Overwrite the first-stage score column with the model's predictions
# (rows are in the same order as the inputs in X_test).
bm25_ranks['score'] = scores

In [39]:
# Remove the helper text columns so only the six run-file columns remain.
bm25_ranks = bm25_ranks.drop(['segment','query'],axis=1)

In [40]:
# Order rows by topic number (ascending), then by model score (descending).
bm25_ranks = bm25_ranks.sort_values(['qnum','score'], ascending = [True, False]).reset_index(drop=True)

In [41]:
# Recompute the rank within each topic from the new scores.
# method='first' gives every candidate a distinct integer rank; the default
# ('average') assigns fractional ranks to tied scores, which the integer cast
# would truncate into duplicate ranks.
bm25_ranks['rank'] = bm25_ranks.groupby('qnum')['score'].rank(method='first', ascending=False).astype(np.int16)

In [42]:
# Preview the reranked run.
bm25_ranks

Unnamed: 0,qnum,qid,epid,rank,score,runid
0,9,Q0,spotify:episode:6svv5L5kRn8groSsM8gkpc_3120.0,1,0.550882,Anserini
1,9,Q0,spotify:episode:4JqqVFhNjtlwoIMl5tvL8R_780.0,2,0.482461,Anserini
2,9,Q0,spotify:episode:6O8djf3RL94yNfaoWqvk3r_840.0,3,0.465984,Anserini
3,9,Q0,spotify:episode:3gJ0fBXYRryWAjgwxY4q6C_240.0,4,0.327170,Anserini
4,9,Q0,spotify:episode:6svv5L5kRn8groSsM8gkpc_3180.0,5,0.318324,Anserini
...,...,...,...,...,...,...
2471,58,Q0,spotify:episode:0CwYbNjJlIYF49WAMyAg4u_240.0,46,0.022834,Anserini
2472,58,Q0,spotify:episode:5xDfrpC4pmkHBVf28JYKv1_120.0,47,0.022609,Anserini
2473,58,Q0,spotify:episode:157pxhEAYkEViaxyYY2Qqi_2460.0,48,0.020754,Anserini
2474,58,Q0,spotify:episode:0Ec1VOSw0IcIqyhKJXT4hi_2220.0,49,0.020148,Anserini


In [43]:
# Write the reranked run next to the original one. A TREC run file has no
# header row, so emit only the `qnum qid epid rank score runid` records.
bm25_ranks.to_csv(results_file+'.reranked', sep = '\t', header=False, index=False)

In [44]:
!git clone https://github.com/usnistgov/trec_eval

fatal: destination path 'trec_eval' already exists and is not an empty directory.


In [45]:
!cd trec_eval;make

make: 'trec_eval' is up to date.


In [46]:
# Evaluate the reranked run: per-query (-q) ndcg_cut.5 against the 2020 test qrels.
!./trec_eval/trec_eval  -q -c -M1000 -m ndcg_cut.5 /content/gdrive/MyDrive/col764/qrels/2020_test_qrels.list {results_file}.reranked

ndcg_cut_5            	10	0.0000
ndcg_cut_5            	11	0.3688
ndcg_cut_5            	12	0.0000
ndcg_cut_5            	13	0.0000
ndcg_cut_5            	14	0.6667
ndcg_cut_5            	15	0.0000
ndcg_cut_5            	16	0.1696
ndcg_cut_5            	17	0.0000
ndcg_cut_5            	18	0.1378
ndcg_cut_5            	19	0.7636
ndcg_cut_5            	20	0.1830
ndcg_cut_5            	21	0.3766
ndcg_cut_5            	22	0.0477
ndcg_cut_5            	23	0.2234
ndcg_cut_5            	24	0.6417
ndcg_cut_5            	25	0.7227
ndcg_cut_5            	26	0.4773
ndcg_cut_5            	27	0.7939
ndcg_cut_5            	28	0.4344
ndcg_cut_5            	29	0.3392
ndcg_cut_5            	30	0.2246
ndcg_cut_5            	31	0.0000
ndcg_cut_5            	32	0.0000
ndcg_cut_5            	33	0.3352
ndcg_cut_5            	34	0.2733
ndcg_cut_5            	35	0.3107
ndcg_cut_5            	36	0.3300
ndcg_cut_5            	37	0.4907
ndcg_cut_5            	38	0.8156
ndcg_cut_5            	39	0.3389
ndcg_cut_5

In [47]:
# Baseline for comparison: the same measure on the original first-stage run file.
# NOTE(review): unlike the reranked evaluation, this call does not pass -M1000.
!./trec_eval/trec_eval  -q -c -m ndcg_cut.5  /content/gdrive/MyDrive/col764/qrels/2020_test_qrels.list {results_file}

ndcg_cut_5            	10	0.6302
ndcg_cut_5            	11	0.3383
ndcg_cut_5            	12	0.2140
ndcg_cut_5            	13	0.0000
ndcg_cut_5            	14	0.7945
ndcg_cut_5            	15	0.0000
ndcg_cut_5            	16	0.1918
ndcg_cut_5            	17	0.0000
ndcg_cut_5            	18	0.0000
ndcg_cut_5            	19	0.7174
ndcg_cut_5            	20	0.6740
ndcg_cut_5            	21	0.4979
ndcg_cut_5            	22	0.1556
ndcg_cut_5            	23	0.0000
ndcg_cut_5            	24	0.2517
ndcg_cut_5            	25	0.6399
ndcg_cut_5            	26	0.7021
ndcg_cut_5            	27	0.4790
ndcg_cut_5            	28	0.6070
ndcg_cut_5            	29	0.7154
ndcg_cut_5            	30	0.1598
ndcg_cut_5            	31	0.0000
ndcg_cut_5            	32	0.1091
ndcg_cut_5            	33	0.3493
ndcg_cut_5            	34	0.2140
ndcg_cut_5            	35	0.3284
ndcg_cut_5            	36	0.5932
ndcg_cut_5            	37	0.2953
ndcg_cut_5            	38	0.9076
ndcg_cut_5            	39	0.4480
ndcg_cut_5

In [48]:
# Confirm the `.reranked` file is present in the Drive results directory.
!ls /content/gdrive/MyDrive/col764/results/

run.desc.test.txt   run.query.test.txt		 run.query.train.txt
run.desc.train.txt  run.query.test.txt.reranked
