# Fact Verification and Evidence Retrieval

### Import libraries

In [142]:
import json
import os
import os.path as path
import heapq

### Define the paths

In [5]:
BASE_DIR = os.getcwd()

raw_training_set = path.join(BASE_DIR, "fever/train.jsonl")
training_doc_file = path.join(BASE_DIR,"fever/train.wiki7.jsonl")
db_path = path.join(BASE_DIR, "fever/fever.db")

print("Base dir:",BASE_DIR)
print("Raw Training set:",raw_training_set)
print("Training DocRetrieval Output File:",training_doc_file)
print("Database path:",db_path)


Base dir: /Users/debrup/PycharmProjects/ir_project/ir_project
Raw Training set: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/train.jsonl
Training DocRetrieval Output File: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/train.wiki7.jsonl
Database path: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/fever.db


### Open the train.jsonl file

In [6]:
class Reader:
    def __init__(self,encoding="utf-8"):
        self.enc = encoding

    def read(self,file):
        with open(file,"r",encoding = self.enc) as f:
            return self.process(f)

    def process(self,f):
        pass

In [7]:
class JSONLineReader(Reader):
    def process(self,fp):
        data = []
        for line in fp.readlines():
            data.append(json.loads(line.strip()))
        return data
    

### Read lines of train.json

In [8]:
processed = dict()
jlr = JSONLineReader()
lines = jlr.read(raw_training_set)

In [9]:
test_lines = lines[0:3]
print(type(test_lines))

<class 'list'>


### Get the claim lines

In [10]:
claim_test_lines = [ sub['claim'] for sub in test_lines]
print(claim_test_lines )

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'Roman Atwood is a content creator.', 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.']


### Install stanza

In [53]:
pip install stanza


Collecting stanza
  Using cached stanza-1.5.0-py3-none-any.whl (802 kB)
Collecting emoji
  Using cached emoji-2.2.0.tar.gz (240 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting protobuf
  Using cached protobuf-4.22.1-cp37-abi3-macosx_10_9_universal2.whl (397 kB)
Collecting torch>=1.3.0
  Downloading torch-2.0.0-cp311-none-macosx_11_0_arm64.whl (55.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 MB[0m [31m563.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hCollecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting filelock
  Downloading filelock-3.10.7-py3-none-any.whl (10 kB)
Collecting sympy
  Using cached sympy-1.11.1-py3-none-any.whl (6.5 MB)
Collecting networkx
  Using cached networkx-3.0-py3-none-any.whl (2.0 MB)
Collecting mpmath>=0.19
  Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel

### Import stanza and download english model

In [11]:
import stanza
stanza.download('en') # download the English model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 8.55MB/s]
2023-04-07 14:21:19 INFO: Downloading default packages for language: en (English) ...
2023-04-07 14:21:21 INFO: File exists: /Users/debrup/stanza_resources/en/default.zip
2023-04-07 14:21:29 INFO: Finished downloading models and saved to /Users/debrup/stanza_resources.


### Try NER on the claim

In [57]:
## Find the entities in the lines and the noun phrases

nlp = stanza.Pipeline('en', processors='tokenize,ner')

for text in claim_test_lines:
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.text, ent.type)


2023-04-02 11:52:53 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 4.80MB/s]
2023-04-02 11:52:54 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-04-02 11:52:54 INFO: Using device: cpu
2023-04-02 11:52:54 INFO: Loading: tokenize
2023-04-02 11:52:54 INFO: Loading: ner
2023-04-02 11:52:55 INFO: Done loading processors!


Nikolaj Coster-Waldau PERSON
the Fox Broadcasting Company ORG
Roman Atwood PERSON


### Install spacy

In [59]:
pip install spacy

Collecting spacy
  Downloading spacy-3.5.1-cp311-cp311-macosx_11_0_arm64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Using cached spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp311-cp311-macosx_11_0_arm64.whl (18 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.7-cp311-cp311-macosx_11_0_arm64.whl (30 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp311-cp311-macosx_11_0_arm64.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.9-cp311-cp311-macosx_11_0_arm64.whl (767 kB)
[2K     [90m

### Try NER using spacy

In [62]:
import spacy

nlp = spacy.load("en_core_web_sm")


docs = list(nlp.pipe(claim_test_lines))

for doc in docs:
    for ent in doc.ents:
        print(ent.text, ent.label_)

Nikolaj Coster-Waldau PERSON
the Fox Broadcasting Company ORG
Roman Atwood PERSON


#### As we can see the NER Models cannot get all the required entities. They can only derive very specific entity objects.

### Install nltk

In [83]:
pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2023.3.23-cp311-cp311-macosx_11_0_arm64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.8/288.8 kB[0m [31m227.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.3.23
Note: you may need to restart the kernel to use updated packages.


### Extract all the noun phrases of the claims using constituency tree

In [13]:
import stanza
import wikipedia

def preprocess(np):
    page = np.replace('( ', '-LRB-')
    page = page.replace(' )', '-RRB-')
    page = page.replace(' - ', '-')
    page = page.replace(' :', '-COLON-')
    page = page.replace(' ,', ',')
    page = page.replace(" 's", "'s")
    page = page.replace(' ', '_')
    return page
    

claim_train_lines = [ sub['claim'] for sub in lines]

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
entities=[]  # entities will be a list of size len(train_claims) having list of entities(noun_phrases) as each elem
wiki_pages=[]

test=["I am a happy person"]

for claim in claim_test_lines:
    doc = nlp(claim)
    for sentence in doc.sentences:
        tree = sentence.constituency
        np=[]
        tree.visit_preorder(internal = lambda x: np.append(x.leaf_labels()) if x.label=="NP" else None)
        noun_phrases = [' '.join(n) for n in np]
        entities.append(noun_phrases)
        
        predicted_wiki = [preprocess((wikipedia.search(n))[0]) for n in noun_phrases] 
        wiki_pages.append(predicted_wiki)
        
        ##print(noun_phrases)
        ##print(type(sentence.constituency))

        
    

2023-04-07 14:22:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 14.6MB/s]
2023-04-07 14:22:15 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-07 14:22:15 INFO: Using device: cpu
2023-04-07 14:22:15 INFO: Loading: tokenize
2023-04-07 14:22:15 INFO: Loading: pos
2023-04-07 14:22:16 INFO: Loading: constituency
2023-04-07 14:22:16 INFO: Done loading processors!


### Example constituency trees

In [15]:
print([i.constituency for i in doc.sentences])

[(ROOT (S (NP (NP (NN History)) (PP (IN of) (NP (NN art)))) (VP (VBZ includes) (NP (NP (NN architecture)) (, ,) (NP (NN dance)) (, ,) (NP (NN sculpture)) (, ,) (NP (NN music)) (, ,) (NP (NN painting)) (, ,) (NP (NN poetry) (NN literature)) (, ,) (NP (NN theatre)) (, ,) (NP (JJ narrative)) (, ,) (NP (NN film)) (, ,) (NP (NN photography)) (CC and) (NP (JJ graphic) (NNS arts)))) (. .)))]


In [14]:
tree

(ROOT (S (NP (NP (NN History)) (PP (IN of) (NP (NN art)))) (VP (VBZ includes) (NP (NP (NN architecture)) (, ,) (NP (NN dance)) (, ,) (NP (NN sculpture)) (, ,) (NP (NN music)) (, ,) (NP (NN painting)) (, ,) (NP (NN poetry) (NN literature)) (, ,) (NP (NN theatre)) (, ,) (NP (JJ narrative)) (, ,) (NP (NN film)) (, ,) (NP (NN photography)) (CC and) (NP (JJ graphic) (NNS arts)))) (. .)))

In [16]:
len(wiki_pages)

3

### Example noun phrases

In [17]:
print(entities)

[['Nikolaj Coster - Waldau', 'the Fox Broadcasting Company'], ['Roman Atwood', 'a content creator'], ['History of art', 'History', 'art', 'architecture , dance , sculpture , music , painting , poetry literature , theatre , narrative , film , photography and graphic arts', 'architecture', 'dance', 'sculpture', 'music', 'painting', 'poetry literature', 'theatre', 'narrative', 'film', 'photography', 'graphic arts']]


### Example predicted wiki pages

In [42]:
wiki_pages

[['Nikolaj_Coster-Waldau', 'Fox_Broadcasting_Company'],
 ['Roman_Atwood', 'Content_creation'],
 ['History_of_art',
  'History',
  'Art',
  'Arts_in_the_Philippines',
  'Architecture',
  'Dance',
  'Sculpture',
  'Music',
  'Painting',
  'Epic_poetry',
  'Theatre',
  'Narrative',
  'Film',
  'Photography',
  'Graphic_arts']]

### Example code for retrieving from fever.db

In [47]:
import sqlite3

# Open a connection to the database file
conn = sqlite3.connect(db_path)

# Create a cursor object to execute SQL commands
cursor = conn.cursor()

# Execute a SELECT query on the database
query = "SELECT lines FROM documents where id='Nikolaj_Coster-Waldau'"
cursor.execute(query)

# Retrieve the results of the query
results = cursor.fetchone()

# Print the results
for row in results:
    print(row)

# Close the connection to the database
conn.close()

0	Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter .
1	He graduated from Danish National School of Theatre in Copenhagen in 1993 .	Danish National School of Theatre	Danish National School of Theatre and Contemporary Dance	Copenhagen	Copenhagen
2	Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- .	Nightwatch	Nightwatch (1994 film)
3	Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .	Headhunters	Headhunters (film)	A Thousand Times Good Night	A Thousand Times Good Night
4	
5	
6	In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon .	Black Hawk Down	Black Hawk Down (film)	Gary Gordon	Gary Gordon
7	He then played Detective John

### Get the names of titles and the lines of pages in fever.db that matches with wiki_pages

In [77]:
import sqlite3

# Open a connection to the database file
conn = sqlite3.connect(db_path)

# Create a cursor object to execute SQL commands
cursor = conn.cursor()

candidate_lines = []  ## list of size (no_claims), each element of list should be lines of each matched page

count=0

for claim_pages in wiki_pages:
    
    
    for page in claim_pages:
        
        #print(page)
       # Execute a SELECT query on the database
        query = f"SELECT lines FROM documents where id='{page}'"
        cursor.execute(query)
        

       # Retrieve the results of the query
        results = cursor.fetchone()
        
        #print(results)
        
       # If results are empty continue
        if results is None or len(results) == 0:
            continue  
            
       # Split the string into a list of lines using the newline character
        lines_page = results[0].split('\n')   
        #print(lines_page)
        
        for line in lines_page:
            
         
           # remove tab characters
           line = line.replace('\t', ' ')
           
           while len(line) >= 1 and line[0].isdigit():
                try:
                    line = line[1:]
                    
                except:
                    line=''
                    break
                
           # remove digits
           #line = line.translate(str.maketrans('', '', '0123456789'))
            

           # remove all words after last . (the links to other pages)
           last_period_index = line.rfind(".")

           # Remove everything after the last period
           
           if last_period_index != -1:
               
                line = line[:last_period_index+1]
                   
           
        
           #line = line.split('.')[0]
    
           # remove extra spaces
           line = ' '.join(line.split())
            
           # Each of the elements of candidate lines is a dictionary with (title,lines_page) (K,V) pairs
           
         
           try:
            
              ((candidate_lines[count])[page]).append(line)
                    
           except:
              
              if len(candidate_lines)==count:
                 candidate_lines.append({})
                    
               
                 
            
              try:
                      ((candidate_lines[count])[page]).append(line)
              except:
                      ((candidate_lines[count])[page])=[]
                      ((candidate_lines[count])[page]).append(line)  
                    
        
        
        
    count=count+1


print(candidate_lines[0])

# Close the connection to the database
conn.close()


{'Nikolaj_Coster-Waldau': ['Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter .', 'He graduated from Danish National School of Theatre in Copenhagen in 1993 .', "Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- .", 'Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .', '', '', 'In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon .', 'He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .', 'He became widely known to a broad audience for his current role

In [129]:
print(candidate_lines[1])


{'Roman_Atwood': ['Roman Bernard Atwood -LRB- born May 28 , 1983 -RRB- is an American YouTube personality , comedian , vlogger and pranker .', 'He is best known for his vlogs , where he posts updates about his life on a daily basis .', "His vlogging channel , `` RomanAtwoodVlogs '' , has a total of 3.3 billion views and 11.9 million subscribers .", "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .", 'His prank videos have gained over 1.4 billion views and 10.3 million subscribers .', 'Both of these channels are in the top 100 most subscribed on YouTube , and he became the second YouTuber after Germán Garmendia to receive two Diamond Play Buttons for his two channels .', ''], 'Content_creation': ['Content Creation is the contribution of information to any media and most especially to digital media for an end-user/audience in specific contexts .', "Content is `` something that is to be expressed through some medium , as speech , writing or any of var

In [171]:
print(candidate_lines[1]['Roman_Atwood'][1])

He is best known for his vlogs , where he posts updates about his life on a daily basis .


In [40]:
pip install wikipedia

Collecting wikipedia
  Using cached wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=e813a16c5cf96f4a8c2b5da18702d4199cb06133177bb8cd4187340c8ec37144
  Stored in directory: /Users/debrup/Library/Caches/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [166]:
import wikipedia

# Query text
query = "a content creator"

# Search for matching page titles
results = wikipedia.search(query)

# Print the list of matching page titles
print("Matching page titles:")
for title in results:
    print(title)

# Choose a page to retrieve
page_title = results[0]

# Retrieve the content of the page
page = wikipedia.page(page_title)

# Print the page summary and content
print("Page summary:", page.summary)
##print("Page content:", page.content)


Matching page titles:
Content creation
Creator Clash
OnlyFans
YouTuber
One True King
Creator economy
Emiru
Patreon
QTCinderella
Content house
Page summary: Content creation is the act of producing and sharing information or media content for specific audiences, particularly in digital contexts. According to the Merriam-Webster dictionary, content refers to "something that is to be expressed through some medium, as speech, writing or any of various arts" for self-expression, distribution, marketing and/or publication. Content creation encompasses various activities including maintaining and updating web sites, blogging, article writing, photography, videography, online commentary, the maintenance of social media accounts, and editing and distribution of digital media. In a survey conducted by Pew, content creation was defined as "the material people contribute to the online world."


### Compute Similarity between the Claim and the Candidate Evidences found using TF-IDF

In [91]:
import sqlite3

# Open a connection to the database file
conn = sqlite3.connect(db_path)

# Create a cursor object to execute SQL commands
cursor = conn.cursor()

query = "SELECT lines FROM documents"
cursor.execute(query)

results = cursor.fetchall()


In [109]:
L = len(results)
documents = []
for i in range(0, 3):
    document = results[i][0]
    
    # remove tab characters
    document = document.replace('\t', ' ')
    
    # remove all words after last . (the links to other pages)
    last_period_index = document.rfind(".")

   # Remove everything after the last period

    if last_period_index != -1:
        document = document[:last_period_index+1]
    
#     while len(document) >= 1 and document[0].isdigit():
#         try:
#             document = document[1:]
                    
#         except:
#             document=''
#             break

    document = ' '.join(document.split())
    
    documents.append(document)
    
print((documents))

['0 A Diffusion limited enzyme is an enzyme which catalyses a reaction so efficiently that the rate limiting step is that of substrate diffusion into the active site , or product diffusion out . enzyme enzyme catalyses catalysis rate limiting step rate limiting step substrate enzyme substrate diffusion diffusion active site active site product product (chemistry) 1 This is also known as kinetic perfection or catalytic perfection . 2 Since the rate of catalysis of such enzymes is set by the diffusion-controlled reaction , it therefore represents an intrinsic , physical constraint on evolution -LRB- a maximum peak height in the fitness landscape -RRB- . diffusion diffusion diffusion-controlled reaction diffusion-controlled reaction fitness landscape fitness landscape fitness fitness (biology) 3 Diffusion limited perfect enzymes are very rare . 4 Most enzymes catalyse their reactions to a rate that is 1,000-10 ,000 times slower than this limit . 5 This is due to both the chemical limitati

### Term frequency and inverse document frequency


In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [113]:
print(claim_test_lines)

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'Roman Atwood is a content creator.', 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.']


### Extract all sentences with relevance scores  as tuples 

In [147]:

vectorizer = TfidfVectorizer()

# Vectorize the claim and evidences
vectors = vectorizer.fit_transform(documents)
# candidate_sentence = candidate_lines[0]

# Candidate_lines[i] -> the collection of candidate lines in an ith claim from all retrieved documents


claim_similarities = []
for i in range(len(claim_test_lines)):
    similarity_scores_list = []
    sim_score = {}
    claim = claim_test_lines[i]
    
    max_similarity_score = 0
    max_candidate_sentence = ""
    
    for key in candidate_lines[i].keys():
        similarity_all_sentences = []
        candidate_list_sentence = candidate_lines[i][key]
        
        for idx in range(len(candidate_list_sentence)) :
            
            candidate_sentence = candidate_list_sentence[idx]
           # Vectorize the new sentences
            new_vectors = vectorizer.transform([claim,candidate_sentence])

           # Calculate cosine similarity between the two new sentences
            similarity_scores = cosine_similarity(new_vectors)
            
            max_similarity_score = max(max_similarity_score, similarity_scores[0][1])
            if(similarity_scores[0][1] == max_similarity_score):
                max_candidate_sentence = candidate_sentence
            
            similarity_scores_list.append((key,idx, candidate_sentence, similarity_scores[0][1]))
        
    claim_similarities.append(similarity_scores_list)
    

In [148]:
claim_similarities

[[('Nikolaj_Coster-Waldau',
   0,
   'Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter .',
   0.0),
  ('Nikolaj_Coster-Waldau',
   1,
   'He graduated from Danish National School of Theatre in Copenhagen in 1993 .',
   0.0),
  ('Nikolaj_Coster-Waldau',
   2,
   "Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- .",
   0.33985062364609137),
  ('Nikolaj_Coster-Waldau',
   3,
   'Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .',
   0.0),
  ('Nikolaj_Coster-Waldau', 4, '', 0.0),
  ('Nikolaj_Coster-Waldau', 5, '', 0.0),
  ('Nikolaj_Coster-Waldau',
   6,
   'In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon .',
   0.54807

### Extract top K evidence sentences for every claim

In [158]:
# my_list = [(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 2), (13, 14, 15, 8), (17, 18, 19, 1)]

def topK(my_list, K):
    # Find the indices of the top K tuples in the list based on the 4th element
    top_indices = heapq.nlargest(K, range(len(my_list)), key=lambda i: my_list[i][3])

    # Find the tuples corresponding to the top K indices
    top_tuples = [my_list[i] for i in top_indices]

    # Print the top indices and tuples
    
    zipped_list = list(zip(top_indices, top_tuples))
    
    return zipped_list

In [159]:
K=5
topK_evidences_for_each_claim = []
for i in range(len(claim_similarities)):
    topK_evidences_for_each_claim.append(topK(claim_similarities[i], K))
    

In [161]:
topK_evidences_for_each_claim
# 
# Inner tuple:
# 

# 1st index: position in the claim_no

#     inner tuple:
#         1st element: retrieved document name 
#         2nd: index of the candidate sentence in the retrieved document
#         3rd: candidate sentence
#         4th: Similarity score


[[(17,
   ('Fox_Broadcasting_Company',
    6,
    'It was the highest-rated broadcast network in the 18 -- 49 demographic from 2004 to 2012 , and earned the position as the most-watched American television network in total viewership during the 2007 -- 08 season .',
    0.7662925917871872)),
  (12,
   ('Fox_Broadcasting_Company',
    1,
    'The network is headquartered at the 20th Century Fox studio lot on Pico Boulevard in the Century City section of Los Angeles , with additional major offices and production facilities at the Fox Television Center in nearby West Los Angeles and the Fox Broadcasting Center in the Yorkville neighborhood of Manhattan , New York City .',
    0.6962784459401956)),
  (24,
   ('Fox_Broadcasting_Company',
    13,
    "The network is named after sister company 20th Century Fox , and indirectly for producer William Fox , who founded one of the movie studio 's predecessors , Fox Film .",
    0.5812106375713183)),
  (6,
   ('Nikolaj_Coster-Waldau',
    6,
    'I

In [157]:
claim_similarities[0][17]

('Fox_Broadcasting_Company',
 6,
 'It was the highest-rated broadcast network in the 18 -- 49 demographic from 2004 to 2012 , and earned the position as the most-watched American television network in total viewership during the 2007 -- 08 season .',
 0.7662925917871872)

### Find relevance of each candidate evidence sentence using BERT-based models(training)

### Install transformers

In [166]:
pip install transformers

Collecting transformers
  Using cached transformers-4.27.4-py3-none-any.whl (6.8 MB)
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m46.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 transformers-4.27.4
Note: you may need to restart the kernel to use updated packages.


### Import required libraries

In [167]:
import torch
import random
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

# Set random seed for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

### Load pre-trained BERT model and tokenizer


In [172]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

### This is a sequence classification task with two labels (Relevant or not)
#model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [173]:
# Load your training data
train_claims = ['History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.','Roman Atwood is a content creator.','Roman Atwood is a content creator.'] # list of claim sentences
train_evidence = ['The subsequent expansion of the list of principal arts in the 20th century reached to nine : architecture , dance , sculpture , music , painting , poetry -LRB- described broadly as a form of literature with aesthetic purpose or function , which also includes the distinct genres of theatre and narrative -RRB- , film , photography and graphic arts .','He is best known for his vlogs , where he posts updates about his life on a daily basis .','Beavers are large, semiaquatic rodents of the Northern Hemisphere. ']
train_labels = [1,1,0] # list of labels (0 for not relevant, 1 for relevant)

# Tokenize input sequences
input_ids = []
attention_masks = []
for claim, evidence in zip(train_claims, train_evidence):
    encoded_dict = tokenizer.encode_plus(
                        claim,
                        evidence,
                        add_special_tokens = True,
                        max_length = 128,
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train_labels)

In [174]:
input_ids

tensor([[  101,  2381,  1997,  2396,  2950,  4294,  1010,  3153,  1010,  6743,
          1010,  2189,  1010,  4169,  1010,  4623,  3906,  1010,  3004,  1010,
          7984,  1010,  2143,  1010,  5855,  1998,  8425,  2840,  1012,   102,
          1996,  4745,  4935,  1997,  1996,  2862,  1997,  4054,  2840,  1999,
          1996,  3983,  2301,  2584,  2000,  3157,  1024,  4294,  1010,  3153,
          1010,  6743,  1010,  2189,  1010,  4169,  1010,  4623,  1011,  1048,
         15185,  1011,  2649, 13644,  2004,  1037,  2433,  1997,  3906,  2007,
         12465,  3800,  2030,  3853,  1010,  2029,  2036,  2950,  1996,  5664,
         11541,  1997,  3004,  1998,  7984,  1011, 25269,  2497,  1011,  1010,
          2143,  1010,  5855,  1998,  8425,  2840,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [177]:
input_ids.shape

torch.Size([3, 128])

In [175]:
attention_masks

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [178]:
attention_masks.shape

torch.Size([3, 128])

In [179]:
labels

tensor([1, 1, 0])

In [180]:
labels.shape

torch.Size([3])

### Construct dataset with (input_ids,attention_masks,labels)

In [181]:
# Combine input sequences with labels
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split data into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [183]:
# Create data loaders for batching

batch_size = 3
train_dataloader = DataLoader(train_dataset, sampler=torch.utils.data.RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=torch.utils.data.SequentialSampler(val_dataset), batch_size=batch_size)


In [None]:
## Load pretrained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2,from_tf=True)

In [None]:
#Set hyperparameters and optimizer
learning_rate = 2e-5
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
