In [1]:
import torch
import pandas as pd
import numpy as np


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [111]:
merged_df = pd.read_parquet('processed_df/merged_df.parquet')

In [112]:
merged_df = merged_df.reset_index()
merged_df.rename(columns={'index': 'claim_index'}, inplace=True)
merged_df.head()

Unnamed: 0,claim_index,claim_id,label,claim,wiki_title,wiki_index,wiki_text
0,0,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
1,1,58718,REFUTES,nikolaj coster waldau was not in a danish thri...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
2,2,134655,SUPPORTS,nikolaj coster waldau worked with peter dinklage,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
3,3,86306,REFUTES,nikolaj coster waldau refused to ever work wit...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
4,4,149361,SUPPORTS,nikolaj coster waldau was in a film,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...


In [113]:
wiki_df = merged_df[['claim_index', 'wiki_index', 'wiki_text']].drop_duplicates(subset = ['wiki_index', 'wiki_text'])

In [118]:
wiki_df.head()

Unnamed: 0,claim_index,wiki_index,wiki_text
0,0,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
33,33,1736417,the fox broadcasting company lrb often shorten...
52,52,2247952,the history of art is the history of any activ...
88,88,264220,adrienne eliza houghton lrb n e bailon born oc...
121,121,2251350,homeland is an american spy thriller televisio...


In [75]:
claims = list(merged_df['claim'])
wiki_text = list(set(list(merged_df['wiki_text'])))
# evidence = list(df['evidence_wiki_url'])


In [54]:
claims[:3]

['nikolaj coster waldau worked with the fox broadcasting company',
 'nikolaj coster waldau was not in a danish thriller film',
 'nikolaj coster waldau worked with peter dinklage']

In [55]:
len(claims)

102751

In [56]:
len(wiki_text)

9530

In [57]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import torch

question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-multiset-base').to(device)
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-multiset-base')

def encode_questions(questions, batch_size=32):
    question_encoder.eval()
    question_embeddings = []

    for start_idx in range(0, len(questions), batch_size):
        
        print(f"Encoding indices: {start_idx}:{start_idx+batch_size}")
        batch_questions = questions[start_idx:start_idx + batch_size]

        # Using batch_encode_plus for efficient tokenization
        inputs = question_tokenizer.batch_encode_plus(batch_questions, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Move tokenized inputs to GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through the encoder
        with torch.no_grad():
            batch_embeddings = question_encoder(**inputs).pooler_output
        question_embeddings.append(batch_embeddings.cpu())
        
        # Clearing memory
        del inputs, batch_embeddings
        torch.cuda.empty_cache()

    return torch.cat(question_embeddings, dim=0)

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.weight', 'question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
batch_size = 256
claims_embeddings = encode_questions(claims, batch_size=batch_size)

Encoding indices: 0:256
Encoding indices: 256:512
Encoding indices: 512:768
Encoding indices: 768:1024
Encoding indices: 1024:1280
Encoding indices: 1280:1536
Encoding indices: 1536:1792
Encoding indices: 1792:2048
Encoding indices: 2048:2304
Encoding indices: 2304:2560
Encoding indices: 2560:2816
Encoding indices: 2816:3072
Encoding indices: 3072:3328
Encoding indices: 3328:3584
Encoding indices: 3584:3840
Encoding indices: 3840:4096
Encoding indices: 4096:4352
Encoding indices: 4352:4608
Encoding indices: 4608:4864
Encoding indices: 4864:5120
Encoding indices: 5120:5376
Encoding indices: 5376:5632
Encoding indices: 5632:5888
Encoding indices: 5888:6144
Encoding indices: 6144:6400
Encoding indices: 6400:6656
Encoding indices: 6656:6912
Encoding indices: 6912:7168
Encoding indices: 7168:7424
Encoding indices: 7424:7680
Encoding indices: 7680:7936
Encoding indices: 7936:8192
Encoding indices: 8192:8448
Encoding indices: 8448:8704
Encoding indices: 8704:8960
Encoding indices: 8960:9216
E

Encoding indices: 71168:71424
Encoding indices: 71424:71680
Encoding indices: 71680:71936
Encoding indices: 71936:72192
Encoding indices: 72192:72448
Encoding indices: 72448:72704
Encoding indices: 72704:72960
Encoding indices: 72960:73216
Encoding indices: 73216:73472
Encoding indices: 73472:73728
Encoding indices: 73728:73984
Encoding indices: 73984:74240
Encoding indices: 74240:74496
Encoding indices: 74496:74752
Encoding indices: 74752:75008
Encoding indices: 75008:75264
Encoding indices: 75264:75520
Encoding indices: 75520:75776
Encoding indices: 75776:76032
Encoding indices: 76032:76288
Encoding indices: 76288:76544
Encoding indices: 76544:76800
Encoding indices: 76800:77056
Encoding indices: 77056:77312
Encoding indices: 77312:77568
Encoding indices: 77568:77824
Encoding indices: 77824:78080
Encoding indices: 78080:78336
Encoding indices: 78336:78592
Encoding indices: 78592:78848
Encoding indices: 78848:79104
Encoding indices: 79104:79360
Encoding indices: 79360:79616
Encoding i

In [59]:
claims_embeddings.shape

torch.Size([102751, 768])

In [60]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-multiset-base').to(device)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-multiset-base')

def encode_contexts(contexts, batch_size=32):
    context_encoder.eval()
    context_embeddings = []

    for start_idx in range(0, len(contexts), batch_size):
        
        print(f"Encoding indices: {start_idx}:{start_idx+batch_size}")
        batch_contexts = contexts[start_idx:start_idx + batch_size]

        # Using batch_encode_plus for efficient tokenization
        inputs = context_tokenizer.batch_encode_plus(batch_contexts, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Move tokenized inputs to GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through the encoder
        with torch.no_grad():
            batch_embeddings = context_encoder(**inputs).pooler_output
        context_embeddings.append(batch_embeddings.cpu())
        
        # Clearing memory
        del inputs, batch_embeddings
        torch.cuda.empty_cache()

    return torch.cat(context_embeddings, dim=0)

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenize

In [61]:
batch_size = 256
context_embeddings = encode_contexts(wiki_text, batch_size=batch_size)

Encoding indices: 0:256
Encoding indices: 256:512
Encoding indices: 512:768
Encoding indices: 768:1024
Encoding indices: 1024:1280
Encoding indices: 1280:1536
Encoding indices: 1536:1792
Encoding indices: 1792:2048
Encoding indices: 2048:2304
Encoding indices: 2304:2560
Encoding indices: 2560:2816
Encoding indices: 2816:3072
Encoding indices: 3072:3328
Encoding indices: 3328:3584
Encoding indices: 3584:3840
Encoding indices: 3840:4096
Encoding indices: 4096:4352
Encoding indices: 4352:4608
Encoding indices: 4608:4864
Encoding indices: 4864:5120
Encoding indices: 5120:5376
Encoding indices: 5376:5632
Encoding indices: 5632:5888
Encoding indices: 5888:6144
Encoding indices: 6144:6400
Encoding indices: 6400:6656
Encoding indices: 6656:6912
Encoding indices: 6912:7168
Encoding indices: 7168:7424
Encoding indices: 7424:7680
Encoding indices: 7680:7936
Encoding indices: 7936:8192
Encoding indices: 8192:8448
Encoding indices: 8448:8704
Encoding indices: 8704:8960
Encoding indices: 8960:9216
E

In [62]:
context_embeddings.shape

torch.Size([9530, 768])

In [63]:
claims_embeddings = claims_embeddings.to(device)
context_embeddings = context_embeddings.to(device)


In [64]:

def compute_similarity_in_batches(claims_embeddings, context_embeddings, batch_size=100):
    num_claims = claims_embeddings.size(0)
    num_contexts = context_embeddings.size(0)
    top_passages_indices = np.zeros(num_claims, dtype=int)
    
    for start_idx in range(0, num_claims, batch_size):
        end_idx = min(start_idx + batch_size, num_claims)
        print(f"Computing from {start_idx} : {end_idx}")
        batch_scores = torch.matmul(claims_embeddings[start_idx:end_idx], context_embeddings.T)

        # Compute top passages for the current batch and store the indices
        top_passages_batch = np.argmax(batch_scores.detach().cpu().numpy(), axis=1)
        
        top_passages_indices[start_idx:end_idx] = top_passages_batch
        
        # Clearing memory
        del batch_scores
        torch.cuda.empty_cache()

    return top_passages_indices

embeddings = compute_similarity_in_batches(claims_embeddings, context_embeddings, batch_size=100)

Computing from 0 : 100
Computing from 100 : 200
Computing from 200 : 300
Computing from 300 : 400
Computing from 400 : 500
Computing from 500 : 600
Computing from 600 : 700
Computing from 700 : 800
Computing from 800 : 900
Computing from 900 : 1000
Computing from 1000 : 1100
Computing from 1100 : 1200
Computing from 1200 : 1300
Computing from 1300 : 1400
Computing from 1400 : 1500
Computing from 1500 : 1600
Computing from 1600 : 1700
Computing from 1700 : 1800
Computing from 1800 : 1900
Computing from 1900 : 2000
Computing from 2000 : 2100
Computing from 2100 : 2200
Computing from 2200 : 2300
Computing from 2300 : 2400
Computing from 2400 : 2500
Computing from 2500 : 2600
Computing from 2600 : 2700
Computing from 2700 : 2800
Computing from 2800 : 2900
Computing from 2900 : 3000
Computing from 3000 : 3100
Computing from 3100 : 3200
Computing from 3200 : 3300
Computing from 3300 : 3400
Computing from 3400 : 3500
Computing from 3500 : 3600
Computing from 3600 : 3700
Computing from 3700 : 

Computing from 56000 : 56100
Computing from 56100 : 56200
Computing from 56200 : 56300
Computing from 56300 : 56400
Computing from 56400 : 56500
Computing from 56500 : 56600
Computing from 56600 : 56700
Computing from 56700 : 56800
Computing from 56800 : 56900
Computing from 56900 : 57000
Computing from 57000 : 57100
Computing from 57100 : 57200
Computing from 57200 : 57300
Computing from 57300 : 57400
Computing from 57400 : 57500
Computing from 57500 : 57600
Computing from 57600 : 57700
Computing from 57700 : 57800
Computing from 57800 : 57900
Computing from 57900 : 58000
Computing from 58000 : 58100
Computing from 58100 : 58200
Computing from 58200 : 58300
Computing from 58300 : 58400
Computing from 58400 : 58500
Computing from 58500 : 58600
Computing from 58600 : 58700
Computing from 58700 : 58800
Computing from 58800 : 58900
Computing from 58900 : 59000
Computing from 59000 : 59100
Computing from 59100 : 59200
Computing from 59200 : 59300
Computing from 59300 : 59400
Computing from

In [65]:
len(embeddings)

102751

In [66]:
print(min(embeddings), max(embeddings))

1 9529


In [33]:
merged_df.head()

Unnamed: 0,claim_id,label,claim,wiki_title,wiki_index,wiki_text
0,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
1,58718,REFUTES,nikolaj coster waldau was not in a danish thri...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
2,134655,SUPPORTS,nikolaj coster waldau worked with peter dinklage,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
3,86306,REFUTES,nikolaj coster waldau refused to ever work wit...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
4,149361,SUPPORTS,nikolaj coster waldau was in a film,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...


In [120]:
wiki_df.head()

Unnamed: 0,claim_index,wiki_index,wiki_text
0,0,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
33,33,1736417,the fox broadcasting company lrb often shorten...
52,52,2247952,the history of art is the history of any activ...
88,88,264220,adrienne eliza houghton lrb n e bailon born oc...
121,121,2251350,homeland is an american spy thriller televisio...


In [None]:
merged_df.iloc[i]['wiki_text']

In [121]:
for i, claim in enumerate(claims[:10]):
    similar_context_index = embeddings[i]
    wiki_passage = wiki_df.iloc[similar_context_index]['wiki_text']

    print(f"Claim: {claim}")
    print(f"Most similar context: {wiki_passage}")
    print(f"Actual: {merged_df.iloc[i]['wiki_text']}\n\n")


Claim: nikolaj coster waldau worked with the fox broadcasting company
Most similar context: an electric car is an automobile that is propelled by one or more electric motors using electrical energy stored in rechargeable batteries or another energy storage device electric motors give electric cars instant torque creating strong and smooth acceleration they are also around three times as efficient as cars with an internal combustion engine the first practical electric cars were produced in the 1880s electric cars were popular in the late 19th century and early 20th century until advances in internal combustion engines electric starters in particular and mass production of cheaper gasoline vehicles led to a decline in the use of electric drive vehicles since 2008 a renaissance in electric vehicle manufacturing has occurred due to advances in batteries and energy management concerns about increasing oil prices and the need to reduce greenhouse gas emissions several national and local gove

In [70]:
import h5py

# Assuming context_embeddings is your PyTorch tensor on the CUDA device
# First, move the tensor to the CPU, then convert it to a NumPy array
context_embeddings_cpu = context_embeddings.cpu().numpy()

# Now you can save it using h5py
with h5py.File('embeddings/merged_embeddings.h5', 'w') as file:
    file.create_dataset('merged_embeddings', data=context_embeddings_cpu)
