# Fact Verification and Evidence Retrieval

### Import libraries

In [1]:
import json
import os
import os.path as path
import heapq
import torch
import random

### Define the paths

In [2]:
# Every data location is resolved against the directory the notebook was started from.
BASE_DIR = os.getcwd()

# FEVER training claims, the document-retrieval output file and the sqlite page dump.
raw_training_set, training_doc_file, db_path = (
    path.join(BASE_DIR, relative_location)
    for relative_location in (
        "fever/train.jsonl",
        "fever/train.wiki7.jsonl",
        "fever/fever.db",
    )
)

# Echo the resolved locations so a wrong working directory is spotted immediately.
for label, location in (
    ("Base dir:", BASE_DIR),
    ("Raw Training set:", raw_training_set),
    ("Training DocRetrieval Output File:", training_doc_file),
    ("Database path:", db_path),
):
    print(label, location)


Base dir: /Users/debrup/PycharmProjects/ir_project/ir_project
Raw Training set: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/train.jsonl
Training DocRetrieval Output File: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/train.wiki7.jsonl
Database path: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/fever.db


### Check if a GPU (Apple MPS backend) is available

In [3]:
print(torch.backends.mps.is_available())

True


In [4]:
print(torch.backends.mps.is_built())

True


In [5]:
# Pick the Apple Metal (MPS) backend only when it is actually usable on this machine.
# `torch.device("mps")` merely builds a device object and is always truthy, so it
# cannot act as the availability check; ask the backend instead and fall back to CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


### Open the train.jsonl file

In [6]:
class Reader:
    """Base file reader: opens a text file and delegates parsing to ``process``."""

    def __init__(self,encoding="utf-8"):
        # Text encoding applied to every file opened through ``read``.
        self.enc = encoding

    def read(self,file):
        """Open ``file`` in text mode with the configured encoding.

        Returns whatever ``process`` returns for the open handle; the file is
        closed before this method returns.
        """
        with open(file, mode="r", encoding=self.enc) as handle:
            parsed = self.process(handle)
        return parsed

    def process(self,f):
        """Parsing hook for subclasses; the base implementation returns ``None``."""
        return None

In [7]:
class JSONLineReader(Reader):
    """Reader for JSON Lines files (one JSON document per line)."""

    def process(self,fp):
        """Decode every non-blank line of ``fp`` as JSON.

        Returns a list with one decoded object per non-blank line, in file order.
        Blank lines (for example a trailing newline at the end of the file) are
        skipped rather than handed to ``json.loads``, which raises on an empty string.
        Malformed JSON still raises ``json.JSONDecodeError``.
        """
        data = []
        # Iterate the handle directly: readlines() would hold a second full copy
        # of the file in memory next to ``data``.
        for line in fp:
            line = line.strip()
            if line:
                data.append(json.loads(line))
        return data
    

### Read the lines of train.jsonl

In [8]:
# `processed` starts out empty; no cell in view fills it.
processed = {}
# Decode the whole training file: one JSON object per line of train.jsonl.
jlr = JSONLineReader()
lines = jlr.read(raw_training_set)

In [9]:
# Prototype on the first three training examples only.
test_lines = lines[:3]
print(type(test_lines))

<class 'list'>


### Get the claim lines

In [10]:
# Keep just the claim text of each sample example.
claim_test_lines = [example['claim'] for example in test_lines]
print(claim_test_lines)

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'Roman Atwood is a content creator.', 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.']


### Install stanza

In [11]:
pip install stanza


Note: you may need to restart the kernel to use updated packages.


### Import stanza and download english model

In [12]:
import stanza
# Fetches stanza's default English package; the log below shows it being stored
# under the user's stanza_resources directory and reused when already present.
stanza.download('en') # download the English model

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 31.9MB/s]
2023-04-09 15:38:05 INFO: Downloading default packages for language: en (English) ...
2023-04-09 15:38:08 INFO: File exists: /Users/debrup/stanza_resources/en/default.zip
2023-04-09 15:38:12 INFO: Finished downloading models and saved to /Users/debrup/stanza_resources.


### Try NER on the claim

In [13]:
## Run stanza's tokenizer + NER over the sample claims and print each detected
## entity with its type. (Noun phrases are not extracted in this cell.)

# NOTE: `nlp`, `doc`, `text` and `ent` are notebook-level names and are rebound by later cells.
nlp = stanza.Pipeline('en', processors='tokenize,ner')

for text in claim_test_lines:
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.text, ent.type)


2023-04-09 15:38:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 4.14MB/s]
2023-04-09 15:38:13 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-04-09 15:38:13 INFO: Using device: cpu
2023-04-09 15:38:13 INFO: Loading: tokenize
2023-04-09 15:38:13 INFO: Loading: ner
2023-04-09 15:38:14 INFO: Done loading processors!


Nikolaj Coster-Waldau PERSON
the Fox Broadcasting Company ORG
Roman Atwood PERSON


### Install spacy

In [14]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


### Try NER using spacy

In [15]:
import spacy

# NOTE(review): no cell in view downloads the "en_core_web_sm" model, so it must
# already be installed in the kernel's environment for this load to succeed.
# This rebinds `nlp`, which previously held the stanza pipeline.
nlp = spacy.load("en_core_web_sm")


# Process all sample claims in one batch.
docs = list(nlp.pipe(claim_test_lines))

# Print every named entity with its label, for comparison with the stanza output.
for doc in docs:
    for ent in doc.ents:
        print(ent.text, ent.label_)

Nikolaj Coster-Waldau PERSON
the Fox Broadcasting Company ORG
Roman Atwood PERSON


#### As we can see, the NER models do not find all the entities we need: they only detect specific entity types (e.g. PERSON, ORG) and miss generic noun phrases such as "a content creator" or "History of art".

### Install nltk

In [16]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


### Extract all the noun phrases of the claims using constituency tree

In [17]:
import stanza
import wikipedia

def preprocess(np):
    """Convert a noun phrase or Wikipedia title into the id format queried in fever.db.

    Parameters
    ----------
    np : str
        Either tokenised text (tokens separated by spaces, e.g. ``"Savages ( 2012 film )"``)
        or an untokenised Wikipedia title (e.g. ``"Rosebud (The Simpsons)"``).

    Returns
    -------
    str
        The text with brackets written as ``-LRB-`` / ``-RRB-``, colons as ``-COLON-``,
        tokenisation spaces around ``-``, ``,`` and ``'s`` removed, and every remaining
        space replaced by ``_``.
    """
    # Tokenised form: the bracket/colon is separated from its neighbour by a space,
    # and that space must disappear together with the symbol.
    page = np.replace('( ', '-LRB-')
    page = page.replace(' )', '-RRB-')
    page = page.replace(' - ', '-')
    page = page.replace(' :', '-COLON-')
    page = page.replace(' ,', ',')
    page = page.replace(" 's", "'s")
    # Untokenised form: titles returned by wikipedia.search have no space next to
    # brackets or colons, so the replacements above leave them untouched.
    # NOTE(review): assumes fever.db encodes every bracket/colon in ids this way — confirm.
    page = page.replace('(', '-LRB-').replace(')', '-RRB-').replace(':', '-COLON-')
    page = page.replace(' ', '_')
    return page
    

claim_train_lines = [ sub['claim'] for sub in lines]

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
entities=[]    # entities[i]   -> noun phrases found in claim i
wiki_pages=[]  # wiki_pages[i] -> fever.db-style page ids predicted for claim i

test=["I am a happy person"]  # not used by any cell in view


def _noun_phrases(constituency_tree):
    """Return the text of every NP node of a constituency tree, in pre-order."""
    np_leaves = []

    def _collect(node):
        if node.label == "NP":
            np_leaves.append(node.leaf_labels())

    constituency_tree.visit_preorder(internal=_collect)
    return [' '.join(words) for words in np_leaves]


def _first_wiki_hit(phrase):
    """Return the id of the top Wikipedia search hit for ``phrase``, or None if there is none."""
    hits = wikipedia.search(phrase)
    return preprocess(hits[0]) if hits else None


for claim in claim_test_lines:
    doc = nlp(claim)

    # Later cells index these lists by claim position, so collect the phrases of
    # ALL sentences of a claim into one entry instead of one entry per sentence.
    noun_phrases = []
    for sentence in doc.sentences:
        tree = sentence.constituency
        noun_phrases.extend(_noun_phrases(tree))
    entities.append(noun_phrases)

    # A search can legitimately return nothing; such phrases are skipped, so
    # wiki_pages[i] may be shorter than entities[i].
    predicted_wiki = []
    for phrase in noun_phrases:
        page_id = _first_wiki_hit(phrase)
        if page_id is not None:
            predicted_wiki.append(page_id)
    wiki_pages.append(predicted_wiki)

        
    

2023-04-09 15:38:20 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 2.81MB/s]
2023-04-09 15:38:21 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-09 15:38:21 INFO: Using device: cpu
2023-04-09 15:38:21 INFO: Loading: tokenize
2023-04-09 15:38:21 INFO: Loading: pos
2023-04-09 15:38:21 INFO: Loading: constituency
2023-04-09 15:38:22 INFO: Done loading processors!


### Example constituency trees

In [18]:
print([i.constituency for i in doc.sentences])

[(ROOT (S (NP (NP (NN History)) (PP (IN of) (NP (NN art)))) (VP (VBZ includes) (NP (NP (NN architecture)) (, ,) (NP (NN dance)) (, ,) (NP (NN sculpture)) (, ,) (NP (NN music)) (, ,) (NP (NN painting)) (, ,) (NP (NN poetry) (NN literature)) (, ,) (NP (NN theatre)) (, ,) (NP (JJ narrative)) (, ,) (NP (NN film)) (, ,) (NP (NN photography)) (CC and) (NP (JJ graphic) (NNS arts)))) (. .)))]


In [19]:
tree

(ROOT (S (NP (NP (NN History)) (PP (IN of) (NP (NN art)))) (VP (VBZ includes) (NP (NP (NN architecture)) (, ,) (NP (NN dance)) (, ,) (NP (NN sculpture)) (, ,) (NP (NN music)) (, ,) (NP (NN painting)) (, ,) (NP (NN poetry) (NN literature)) (, ,) (NP (NN theatre)) (, ,) (NP (JJ narrative)) (, ,) (NP (NN film)) (, ,) (NP (NN photography)) (CC and) (NP (JJ graphic) (NNS arts)))) (. .)))

In [20]:
len(wiki_pages)

3

### Example noun phrases

In [21]:
print(entities)

[['Nikolaj Coster - Waldau', 'the Fox Broadcasting Company'], ['Roman Atwood', 'a content creator'], ['History of art', 'History', 'art', 'architecture , dance , sculpture , music , painting , poetry literature , theatre , narrative , film , photography and graphic arts', 'architecture', 'dance', 'sculpture', 'music', 'painting', 'poetry literature', 'theatre', 'narrative', 'film', 'photography', 'graphic arts']]


### Example predicted wiki pages

In [22]:
wiki_pages

[['Nikolaj_Coster-Waldau', 'Fox_Broadcasting_Company'],
 ['Roman_Atwood', 'Content_creation'],
 ['History_of_art',
  'History',
  'Art',
  'Arts_in_the_Philippines',
  'Architecture',
  'Dance',
  'Sculpture',
  'Music',
  'Painting',
  'Epic_poetry',
  'Theatre',
  'Narrative',
  'Film',
  'Photography',
  'Graphic_arts']]

### Example code for retrieving from fever.db

In [74]:
P=preprocess('List_of_The_Simpsons_guest_stars_(seasons 21–present)')

In [76]:
import sqlite3

# Open a connection to the database file
conn = sqlite3.connect(db_path)
try:
    # Create a cursor object to execute SQL commands
    cursor = conn.cursor()

    # Look the page up by id; the value is bound as a parameter instead of being
    # pasted into the SQL text, so ids containing quotes work too.
    query = "SELECT lines FROM documents WHERE id = ?"
    cursor.execute(query, ("Game_of_Thrones",))

    # fetchone() returns None when no row matches
    results = cursor.fetchone()

    if results is None:
        print("No document found for this id")
    else:
        # Print the results
        for row in results:
            print(row)
finally:
    # Close the connection even if the query fails
    conn.close()

0	Game of Thrones is an American fantasy drama television series created by David Benioff and D. B. Weiss .	David Benioff	David Benioff	D. B. Weiss	D. B. Weiss	fantasy	fantasy	drama	Drama (film and television)
1	It is an adaptation of A Song of Ice and Fire , George R. R. Martin 's series of fantasy novels , the first of which is A Game of Thrones .	A Song of Ice and Fire	A Song of Ice and Fire	George R. R. Martin	George R. R. Martin	fantasy	fantasy	A Game of Thrones	A Game of Thrones
2	It is filmed at Titanic Studios in Belfast , on location in the United Kingdom , and in Canada , Croatia , Iceland , Malta , Morocco , Spain , and the United States .	Belfast	Belfast
3	The series premiered on HBO in the United States on April 17 , 2011 , and its sixth season ended on June 26 , 2016 .	HBO	HBO
4	The series was renewed for a seventh season , which is scheduled to premiere on July 16 , 2017 , and will conclude with its eighth season in 2018 .
5	
6	
7	Set on the fictional continents of Weste

### Get the titles and the lines of the pages in fever.db that match the predicted wiki_pages

In [24]:
import sqlite3

# candidate_lines[i] is a dict {page id: [cleaned line, ...]} for claim i.
# Every claim gets an entry (possibly an empty dict), so the list always stays
# aligned with wiki_pages / claim_test_lines.
candidate_lines = []

# Open a connection to the database file
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    for claim_pages in wiki_pages:
        lines_by_page = {}

        for page in claim_pages:
            # Bind the id as a parameter: page titles may contain quotes.
            cursor.execute("SELECT lines FROM documents WHERE id = ?", (page,))
            results = cursor.fetchone()

            # No such page in the database
            if results is None or len(results) == 0:
                continue

            # One entry per line of the page. Empty lines are kept on purpose so
            # that a line's position in the list matches its position on the page.
            for line in results[0].split('\n'):
                # remove tab characters
                line = line.replace('\t', ' ')

                # drop the leading digits (the line number at the start of each line)
                first_non_digit = 0
                while first_non_digit < len(line) and line[first_non_digit].isdigit():
                    first_non_digit += 1
                line = line[first_non_digit:]

                # Cut everything after the last period (the trailing link columns).
                # NOTE(review): if a link text itself contains a period (e.g.
                # "D. B. Weiss") part of the link columns survives this cut.
                last_period_index = line.rfind(".")
                if last_period_index != -1:
                    line = line[:last_period_index+1]

                # remove extra spaces
                line = ' '.join(line.split())

                lines_by_page.setdefault(page, []).append(line)

        candidate_lines.append(lines_by_page)
finally:
    # Close the connection to the database
    conn.close()

if candidate_lines:
    print(candidate_lines[0])


{'Nikolaj_Coster-Waldau': ['Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter .', 'He graduated from Danish National School of Theatre in Copenhagen in 1993 .', "Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- .", 'Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .', '', '', 'In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon .', 'He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .', 'He became widely known to a broad audience for his current role

In [25]:
print(candidate_lines[1])


{'Roman_Atwood': ['Roman Bernard Atwood -LRB- born May 28 , 1983 -RRB- is an American YouTube personality , comedian , vlogger and pranker .', 'He is best known for his vlogs , where he posts updates about his life on a daily basis .', "His vlogging channel , `` RomanAtwoodVlogs '' , has a total of 3.3 billion views and 11.9 million subscribers .", "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .", 'His prank videos have gained over 1.4 billion views and 10.3 million subscribers .', 'Both of these channels are in the top 100 most subscribed on YouTube , and he became the second YouTuber after Germán Garmendia to receive two Diamond Play Buttons for his two channels .', ''], 'Content_creation': ['Content Creation is the contribution of information to any media and most especially to digital media for an end-user/audience in specific contexts .', "Content is `` something that is to be expressed through some medium , as speech , writing or any of var

In [26]:
print(candidate_lines[1]['Roman_Atwood'][1])

He is best known for his vlogs , where he posts updates about his life on a daily basis .


In [27]:
pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


In [72]:
import wikipedia

# Query text
query = "List_of_The_Simpsons_guest_stars_(seasons 21–present)"

# Search for matching page titles
results = wikipedia.search(query)

# Print the list of matching page titles
print("Matching page titles:")
for title in results:
    print(title)

if results:
    # Choose the best-ranked page
    page_title = results[0]

    # The title comes straight from the search results, so it is already exact:
    # disable auto_suggest so the library does not swap it for a "suggested" title.
    page = wikipedia.page(page_title, auto_suggest=False)

    # Print the page summary
    print("Page summary:", page.summary)
else:
    print("No matching page titles found")


Matching page titles:
List of The Simpsons guest stars (seasons 21–present)
List of The Simpsons episodes (season 21–present)
List of The Simpsons episodes
The Simpsons (season 34)
The Simpsons
Rosebud (The Simpsons)
Mona Simpson (The Simpsons)
List of The Simpsons cast members
List of The Simpsons episodes (seasons 1–20)
Jimmi Simpson
Page summary: In addition to the show's regular cast of voice actors, celebrity guest stars have been a staple of The Simpsons, an American animated television sitcom created by Matt Groening for the Fox Broadcasting Company, since its first season. The Simpsons focuses on the eponymous family, which consists of Homer, Marge, Bart, Lisa and Maggie. The family was initially conceived by Groening for a series of animated shorts, which originally aired as a part of The Tracey Ullman Show between 1987 and 1989. The shorts were developed into a half-hour prime time series which began in December 1989. The series' 34th season began on September 25, 2022, and 7

### Compute Similarity between the Claim and the Candidate Evidences found using TF-IDF

In [29]:
import sqlite3

# Open a connection to the database file
conn = sqlite3.connect(db_path)
try:
    # Create a cursor object to execute SQL commands
    cursor = conn.cursor()

    # NOTE(review): this loads the `lines` column of every document into memory,
    # while the next cell only reads the first three rows.
    query = "SELECT lines FROM documents"
    cursor.execute(query)

    results = cursor.fetchall()
finally:
    # The rows are fully materialised in `results`, so the connection can be released.
    conn.close()


In [30]:
L = len(results)

# Clean the text of the first three fetched pages; these become the corpus the
# TF-IDF vectorizer is fitted on.
documents = []
for i in range(3):
    # tabs separate the columns of a page line; turn them into plain spaces
    document = results[i][0].replace('\t', ' ')

    # keep the text up to and including the last period of the whole page
    last_period_index = document.rfind(".")
    if last_period_index >= 0:
        document = document[:last_period_index + 1]

    # collapse whitespace runs (including the newlines between lines); the
    # leading line numbers are left in place
    document = ' '.join(document.split())
    documents.append(document)

print(documents)

['0 A Diffusion limited enzyme is an enzyme which catalyses a reaction so efficiently that the rate limiting step is that of substrate diffusion into the active site , or product diffusion out . enzyme enzyme catalyses catalysis rate limiting step rate limiting step substrate enzyme substrate diffusion diffusion active site active site product product (chemistry) 1 This is also known as kinetic perfection or catalytic perfection . 2 Since the rate of catalysis of such enzymes is set by the diffusion-controlled reaction , it therefore represents an intrinsic , physical constraint on evolution -LRB- a maximum peak height in the fitness landscape -RRB- . diffusion diffusion diffusion-controlled reaction diffusion-controlled reaction fitness landscape fitness landscape fitness fitness (biology) 3 Diffusion limited perfect enzymes are very rare . 4 Most enzymes catalyse their reactions to a rate that is 1,000-10 ,000 times slower than this limit . 5 This is due to both the chemical limitati

### Term frequency and inverse document frequency


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
print(claim_test_lines)

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'Roman Atwood is a content creator.', 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.']


### Extract all sentences with relevance scores  as tuples 

In [33]:

vectorizer = TfidfVectorizer()

# Fit the TF-IDF vocabulary / idf weights on the cleaned documents.
# NOTE(review): `documents` holds only a handful of pages, so claim words outside
# that small vocabulary are ignored when scoring — confirm this is intended.
vectors = vectorizer.fit_transform(documents)

# claim_similarities[i] is a list of
#   (page id, index of the sentence on that page, sentence, cosine similarity)
# covering every candidate sentence retrieved for claim i.
claim_similarities = []
for i in range(len(claim_test_lines)):
    claim = claim_test_lines[i]

    # The claim vector does not depend on the candidate: compute it once per claim.
    claim_vector = vectorizer.transform([claim])

    similarity_scores_list = []
    for key, candidate_list_sentence in candidate_lines[i].items():
        if not candidate_list_sentence:
            continue

        # Vectorize all sentences of the page together and compare them with the claim
        sentence_vectors = vectorizer.transform(candidate_list_sentence)
        similarity_scores = cosine_similarity(claim_vector, sentence_vectors)[0]

        for idx, candidate_sentence in enumerate(candidate_list_sentence):
            similarity_scores_list.append((key, idx, candidate_sentence, similarity_scores[idx]))

    claim_similarities.append(similarity_scores_list)
    

In [34]:
claim_similarities

[[('Nikolaj_Coster-Waldau',
   0,
   'Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter .',
   0.0),
  ('Nikolaj_Coster-Waldau',
   1,
   'He graduated from Danish National School of Theatre in Copenhagen in 1993 .',
   0.0),
  ('Nikolaj_Coster-Waldau',
   2,
   "Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- .",
   0.33985062364609137),
  ('Nikolaj_Coster-Waldau',
   3,
   'Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .',
   0.0),
  ('Nikolaj_Coster-Waldau', 4, '', 0.0),
  ('Nikolaj_Coster-Waldau', 5, '', 0.0),
  ('Nikolaj_Coster-Waldau',
   6,
   'In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon .',
   0.54807

### Extract top K evidence sentences for every claim

In [35]:
# my_list = [(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 2), (13, 14, 15, 8), (17, 18, 19, 1)]

def topK(my_list, K):
    """Return the K entries of ``my_list`` with the largest 4th element.

    The result is a list of ``(position in my_list, entry)`` pairs ordered from the
    highest to the lowest score; entries with equal scores keep their original order.
    """
    # Pair each entry with its position first, then rank the pairs by the entry's score.
    return heapq.nlargest(K, enumerate(my_list), key=lambda indexed: indexed[1][3])

In [36]:
# Number of evidence sentences kept per claim.
K = 5
# One top-K list per claim, in the same order as claim_similarities.
topK_evidences_for_each_claim = [
    topK(scored_sentences, K) for scored_sentences in claim_similarities
]
    

In [37]:
topK_evidences_for_each_claim
# Structure of the value shown above:
#
# topK_evidences_for_each_claim[c] is the list of the top K entries for claim c.
# Each entry is a pair (position, candidate):
#
#   position:  index of the candidate inside claim_similarities[c]
#
#   candidate: a 4-tuple
#       1st element: retrieved document name
#       2nd element: index of the candidate sentence in the retrieved document
#       3rd element: candidate sentence
#       4th element: similarity score


[[(17,
   ('Fox_Broadcasting_Company',
    6,
    'It was the highest-rated broadcast network in the 18 -- 49 demographic from 2004 to 2012 , and earned the position as the most-watched American television network in total viewership during the 2007 -- 08 season .',
    0.7662925917871872)),
  (12,
   ('Fox_Broadcasting_Company',
    1,
    'The network is headquartered at the 20th Century Fox studio lot on Pico Boulevard in the Century City section of Los Angeles , with additional major offices and production facilities at the Fox Television Center in nearby West Los Angeles and the Fox Broadcasting Center in the Yorkville neighborhood of Manhattan , New York City .',
    0.6962784459401956)),
  (24,
   ('Fox_Broadcasting_Company',
    13,
    "The network is named after sister company 20th Century Fox , and indirectly for producer William Fox , who founded one of the movie studio 's predecessors , Fox Film .",
    0.5812106375713183)),
  (6,
   ('Nikolaj_Coster-Waldau',
    6,
    'I

In [38]:
claim_similarities[0][17]

('Fox_Broadcasting_Company',
 6,
 'It was the highest-rated broadcast network in the 18 -- 49 demographic from 2004 to 2012 , and earned the position as the most-watched American television network in total viewership during the 2007 -- 08 season .',
 0.7662925917871872)

### Find the relevance of each candidate evidence sentence using BERT-based models (training)

### Install transformers

In [39]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


### Import required libraries

In [40]:
import torch
import random
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

# Set random seed for reproducibility: the same seed is given to Python's
# `random` module, NumPy and PyTorch (default generator and all CUDA devices).
# NOTE(review): results that depend on live Wikipedia search responses can still
# differ between runs.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

### Load pre-trained BERT model and tokenizer


In [41]:
# Name of the pre-trained checkpoint the tokenizer is loaded from.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

### This is a sequence classification task with two labels (Relevant or not)
# The classifier itself is not loaded in this cell; the line below is kept disabled.
#model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


### Get (claim,evidence,label) pairs

In [42]:
# Gold evidence annotations of the sample examples, in the same order as claim_test_lines.
evidence_test_lines = [example['evidence'] for example in test_lines]
print(evidence_test_lines)


[[[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]], [[[174271, 187498, 'Roman_Atwood', 1]], [[174271, 187499, 'Roman_Atwood', 3]]], [[[255136, 254645, 'History_of_art', 2]]]]


In [43]:
print(evidence_test_lines[0][0][0][2])

Nikolaj_Coster-Waldau


In [44]:
len(evidence_test_lines[0][0])

2

In [45]:
import sqlite3

# Positive (claim, evidence sentence, label=1) training pairs. The three lists are
# parallel and are only extended together, once the sentence was actually found.
claim_label_pos = []
candidate_evidence_pos = []
claim_pos = []

# Open a connection to the database file
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    for claim, evidence_sets in zip(claim_test_lines, evidence_test_lines):
        for evidence_set in evidence_sets:
            for evidence in evidence_set:
                # Each annotation ends with [..., page id, line number]
                page = evidence[2]
                line_no = evidence[3]

                # Annotations without a page/line cannot be resolved to a sentence.
                # NOTE(review): presumably the case for claims without evidence — confirm.
                if page is None or line_no is None:
                    continue

                # Bind the id as a parameter: page titles may contain quotes.
                cursor.execute("SELECT lines FROM documents WHERE id = ?", (page,))
                results = cursor.fetchone()
                if results is None:
                    continue

                lines_page = results[0].split('\n')
                if not 0 <= line_no < len(lines_page):
                    continue

                line = lines_page[line_no].replace('\t', ' ')

                # drop the leading digits (the line number at the start of the line)
                first_non_digit = 0
                while first_non_digit < len(line) and line[first_non_digit].isdigit():
                    first_non_digit += 1
                line = line[first_non_digit:]

                # remove all words after last . (the links to other pages)
                last_period_index = line.rfind(".")
                if last_period_index != -1:
                    line = line[:last_period_index+1]

                # remove extra spaces
                line = ' '.join(line.split())

                claim_pos.append(claim)
                candidate_evidence_pos.append(line)
                claim_label_pos.append(1)
finally:
    # Close the connection to the database
    conn.close()

In [46]:
claim_label_pos

[1, 1, 1, 1, 1]

In [47]:
candidate_evidence_pos

['He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .',
 'The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .',
 'He is best known for his vlogs , where he posts updates about his life on a daily basis .',
 "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .",
 'The subsequent expansion of the list of principal arts in the 20th century reached to nine : architecture , dance , sculpture , music , painting , poetry -LRB- described broadly as a form of literature with aesthetic purpose or function , which also includes the distinct genres of theatre and narrative -RRB- , film , photography and graphic a

In [48]:
claim_pos

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'Roman Atwood is a content creator.',
 'Roman Atwood is a content creator.',
 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.']

### Negative samples

In [86]:
no_of_neg = 3   # negative samples attempted per claim
k=10            # not used by this cell

# Negative (claim, sentence, label=0) training pairs; the three lists are parallel.
claim_neg = []
candidate_neg = []
label_neg =[]

# One connection for the whole sampling run, closed even if a lookup fails.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    for i in range(len(claim_test_lines)):
        claim = claim_test_lines[i]

        # Pages related to the claim; the search result does not change between
        # samples, so query Wikipedia once per claim.
        random_pages = wikipedia.search(claim)
        if not random_pages:
            continue

        # (page id, line number) of the gold evidence of this claim: these lines
        # must never be emitted as negatives.
        gold_lines = {
            (evidence[2], evidence[3])
            for evidence_set in evidence_test_lines[i]
            for evidence in evidence_set
        }

        for j in range(no_of_neg):
            # Pick a random page from the list
            page_name = preprocess(random.choice(random_pages))

            # Bind the id as a parameter: page titles may contain quotes.
            cursor.execute("SELECT lines FROM documents WHERE id = ?", (page_name,))
            results = cursor.fetchone()

            # Page not present in the database: this sample is skipped
            if results is None or len(results) == 0:
                continue

            # Clean every line of the page and keep the usable ones, so sampling
            # cannot loop forever on a page without any non-empty line.
            usable_lines = []
            for line_no, line in enumerate(results[0].split('\n')):
                if (page_name, line_no) in gold_lines:
                    continue

                line = line.replace('\t', ' ')

                # drop the leading digits (the line number at the start of the line)
                first_non_digit = 0
                while first_non_digit < len(line) and line[first_non_digit].isdigit():
                    first_non_digit += 1
                line = line[first_non_digit:]

                # remove all words after last . (the links to other pages)
                last_period_index = line.rfind(".")
                if last_period_index != -1:
                    line = line[:last_period_index+1]

                # remove extra spaces
                line = ' '.join(line.split())

                if len(line)!=0:
                    usable_lines.append(line)

            if not usable_lines:
                continue

            # sample random line
            line = random.choice(usable_lines)

            print((claim,line,0))
            claim_neg.append(claim)
            candidate_neg.append(line)
            label_neg.append(0)
finally:
    conn.close()
        

('Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'The series has received 110 Emmy Award nominations , including six consecutive Outstanding Drama Series nominations , with 38 wins .', 0)
('Roman Atwood is a content creator.', 'Much of the importance of Hermeticism arises from its connection with the development of science during the time from 1300 to 1600 AD .', 0)
('History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.', 'From the Middle Ages through the Renaissance painters worked for the church and a wealthy aristocracy .', 0)
('History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.', 'The Goryeo Dynasty -LRB- 918 -- 1392 -RRB- was one of the most prolific periods for a wide range of disciplines , especially pottery .', 0)
('History of art includes architecture, dance, sculpture,

In [89]:
# Show the first negative example: claim, randomly sampled candidate line, label
print(claim_neg[0], candidate_neg[0], label_neg[0], sep='\n')

Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
The series has received 110 Emmy Award nominations , including six consecutive Outstanding Drama Series nominations , with 38 wins .
0


In [93]:
# Show the first positive example: claim, its evidence sentence, label
print(claim_pos[0], candidate_evidence_pos[0], claim_label_pos[0], sep='\n')


Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .
1


### Define `train_claims`, `train_evidence` and `train_labels` from the positive and negative samples

In [94]:
# Positives first, then negatives; the three lists stay index-aligned
# (entry i of each list describes the same training example).
train_claims = [*claim_pos, *claim_neg]
train_evidence = [*candidate_evidence_pos, *candidate_neg]
train_labels = [*claim_label_pos, *label_neg]


### Tokenize the training data (claim–evidence pairs) into BERT input tensors

In [95]:
# Tokenize every (claim, evidence) pair as a two-segment BERT input of
# exactly 128 token ids, then stack the per-example tensors row-wise.
input_ids = []
attention_masks = []
for claim, evidence in zip(train_claims, train_evidence):
    encoded_dict = tokenizer.encode_plus(
                        claim,
                        evidence,
                        add_special_tokens = True,
                        max_length = 128,
                        padding = 'max_length',
                        # Padding alone only lengthens short inputs. Without
                        # truncation a pair longer than max_length yields a
                        # longer tensor and torch.cat below fails on the
                        # mismatched widths.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    # Each entry has shape (1, 128) because return_tensors='pt' adds a batch dim
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Concatenate along the batch dimension -> (num_examples, 128)
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train_labels)

In [96]:
# Inspect the stacked token-id tensor (one row per claim/evidence pair, zero-padded)
input_ids

tensor([[  101, 24794,  3501,  ...,     0,     0,     0],
        [  101, 24794,  3501,  ...,     0,     0,     0],
        [  101,  3142,  2012,  ...,     0,     0,     0],
        ...,
        [  101,  2381,  1997,  ...,     0,     0,     0],
        [  101,  2381,  1997,  ...,     0,     0,     0],
        [  101,  2381,  1997,  ...,     0,     0,     0]])

In [97]:
# Sanity check: (number of examples, max_length)
input_ids.shape

torch.Size([10, 128])

In [98]:
# Inspect the attention masks returned by the tokenizer alongside input_ids
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [99]:
# Sanity check: must match input_ids.shape
attention_masks.shape

torch.Size([10, 128])

In [100]:
# Inspect the label tensor built from train_labels (positives first, then negatives)
labels

tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [101]:
# Sanity check: one label per example
labels.shape

torch.Size([10])

### Construct dataset with (input_ids,attention_masks,labels)

In [102]:
# Combine input sequences with labels; TensorDataset requires all three
# tensors to have the same first dimension (one entry per example).
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split data into training (80%) and validation (remaining 20%) sets.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split explicitly so Restart & Run All reproduces the same
# train/validation partition (and therefore comparable validation metrics).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)


In [103]:
# Report the number of training and validation examples, one per line
print(len(train_dataset), len(val_dataset), sep='\n')

8
2


### Construct the dataloaders

In [104]:
# Wrap both splits in DataLoaders that yield mini-batches of
# (input_ids, attention_masks, labels).
batch_size = 2

# shuffle=True makes DataLoader build a RandomSampler over the training split;
# shuffle=False keeps the validation split in sequential order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


### Load the pretrained model

In [105]:
# Pretrained checkpoint to fine-tune
model_name = 'bert-base-uncased'
# num_labels=2: binary classification matching the 1 / 0 labels in train_labels.
# The classification head is newly initialized (see the warning below), so the
# model must be fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Finetune the pretrained BERT Model using training examples

In [108]:
# Fine-tune the classifier on train_dataloader, then score it once on
# val_dataloader. Produces avg_val_loss and avg_val_accuracy for later cells.
import torch.nn.functional as F

# Set up the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

n_epochs = 8

model = model.to(device)
model.train()

for epoch in range(n_epochs):
    print('Training epoch %d...' % epoch)
    total_loss = 0
    for batch in train_dataloader:
        # Batch-local names: do not overwrite the notebook-level input_ids /
        # attention_masks / labels tensors, otherwise re-running the dataset
        # cell after this one would silently use the last mini-batch.
        batch_input_ids, batch_attention_masks, batch_labels = (
            tensor.to(device) for tensor in batch
        )
        optimizer.zero_grad()
        # Labels are not passed to the model: the loss comes from `criterion`
        # only, instead of being computed a second time inside the forward pass.
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        loss = criterion(outputs.logits, batch_labels)
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    print('Average training loss: %.4f' % (total_loss / len(train_dataloader)))

# Evaluate the fine-tuned model on the held-out validation split.
# val_dataloader yields batches with the same structure as train_dataloader.
model.eval()
with torch.no_grad():
    val_loss = 0
    val_accuracy = 0  # running count of correct predictions
    val_steps = 0     # number of validation batches seen

    for batch in val_dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = (
            tensor.to(device) for tensor in batch
        )
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)

        # Same loss function and logits accessor as in training
        logits = outputs.logits
        val_loss += criterion(logits, batch_labels).item()

        # Convert logits to probabilities and get predicted labels
        probs = F.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)

        # Count correct predictions in this batch
        val_accuracy += torch.sum(preds == batch_labels).item()

        # Update the number of evaluation steps
        val_steps += 1


# Average loss is per batch; accuracy is per example
avg_val_loss = val_loss / val_steps
avg_val_accuracy = val_accuracy / len(val_dataset)
        


Training epoch 0...
Average training loss: 0.1866
Training epoch 1...
Average training loss: 0.1271
Training epoch 2...
Average training loss: 0.0949
Training epoch 3...
Average training loss: 0.0705
Training epoch 4...
Average training loss: 0.0515
Training epoch 5...
Average training loss: 0.0392
Training epoch 6...
Average training loss: 0.0269
Training epoch 7...
Average training loss: 0.0180


### Print Validation Accuracy

In [109]:
# Report the validation metrics computed by the fine-tuning cell
print(f"avg val accuracy {avg_val_accuracy}")
print(f"avg loss {avg_val_loss}")

avg val accuracy 1.0
avg loss 0.012902320362627506


In [None]:

def get_top_k_lines(page_title, k):
    """Return the first ``k`` sentences of a Wikipedia page summary.

    Sentences are delimited naively by the two-character sequence ``'. '``,
    so abbreviations followed by a space are treated as sentence ends.

    Args:
        page_title: Title passed to ``wikipedia.page``.
        k: Maximum number of leading sentences to keep.

    Returns:
        The first ``k`` sentences as one string. The whole summary is returned
        unchanged when it has ``k`` or fewer sentences; an empty string is
        returned when ``k`` is zero or negative.

    NOTE(review): relies on a ``wikipedia`` module being in scope; its import
    is not visible in this part of the notebook, so confirm it exists.
    """
    # A non-positive k asks for no sentences. A negative slice bound would
    # otherwise drop sentences from the end instead of returning nothing.
    if k <= 0:
        return ''

    # Get the Wikipedia page
    page = wikipedia.page(page_title)

    # Get the summary of the page and split it into sentences
    summary_sentences = page.summary.split('. ')

    # Nothing to cut: splitting and re-joining would reproduce the summary
    if k >= len(summary_sentences):
        return page.summary

    # Every piece except the last was followed by '. ' in the summary, so the
    # k-th piece lost its period in the split; put it back after joining.
    return '. '.join(summary_sentences[:k]) + '.'




