# Fact Verification and Evidence Retrieval

### Import libraries

In [1]:
import json
import os
import os.path as path
import heapq
import torch
import random

In [2]:
pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


### Define the paths

In [3]:
BASE_DIR = os.getcwd()

raw_training_set = path.join(BASE_DIR, "fever/train.jsonl")
training_doc_file = path.join(BASE_DIR,"fever/train.wiki7.jsonl")
db_path = path.join(BASE_DIR, "fever/fever.db")

print("Base dir:",BASE_DIR)
print("Raw Training set:",raw_training_set)
print("Training DocRetrieval Output File:",training_doc_file)
print("Database path:",db_path)


Base dir: /Users/debrup/PycharmProjects/ir_project/ir_project
Raw Training set: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/train.jsonl
Training DocRetrieval Output File: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/train.wiki7.jsonl
Database path: /Users/debrup/PycharmProjects/ir_project/ir_project/fever/fever.db


### Check if gpu is available

In [4]:
print(torch.backends.mps.is_available())

True


In [5]:
print(torch.backends.mps.is_built())

True


In [6]:
# Select the Apple-silicon (MPS) backend only when it is really usable.
# `torch.device("mps")` is just a descriptor object and is always truthy, so it
# cannot be used as the availability test; ask the backend instead.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


### Open the train.jsonl file

In [7]:
class Reader:
    """Base class for text-file readers.

    Subclasses override ``process`` to turn an open file handle into data;
    ``read`` takes care of opening the file with the configured encoding.
    """

    def __init__(self, encoding="utf-8"):
        # Encoding passed to open() by read().
        self.enc = encoding

    def read(self, file):
        """Open ``file`` for reading and return the result of ``process``."""
        with open(file, mode="r", encoding=self.enc) as handle:
            parsed = self.process(handle)
        return parsed

    def process(self, f):
        """Hook for subclasses; the base implementation returns ``None``."""
        return None

In [8]:
class JSONLineReader(Reader):
    """Reader for JSON Lines files (one JSON document per line)."""

    def process(self,fp):
        """Parse every non-blank line of ``fp`` as JSON.

        Args:
            fp: an open text file handle (any iterable of lines works).

        Returns:
            list: the decoded JSON values, in file order.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        data = []
        # Iterate the handle lazily instead of readlines() so the raw text of
        # the whole file is never held in memory next to the parsed objects.
        for line in fp:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at EOF) are not
                # records; json.loads("") would raise on them.
                continue
            data.append(json.loads(line))
        return data
    

### Read the lines of train.jsonl

In [9]:
processed = dict()
jlr = JSONLineReader()
lines = jlr.read(raw_training_set)

In [10]:
print(type(lines))

<class 'list'>


### Get the claim lines

In [11]:
# Pull the claim text and the annotated evidence out of every record, then keep
# only the first 50 of each as the working sample.
claim_test_lines = [record['claim'] for record in lines][:50]
evidence_test_lines = [record['evidence'] for record in lines][:50]

# Peek at the first three claims and their evidence annotations.
print(claim_test_lines[:3])
print(evidence_test_lines[:3])

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'Roman Atwood is a content creator.', 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.']
[[[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]], [[[174271, 187498, 'Roman_Atwood', 1]], [[174271, 187499, 'Roman_Atwood', 3]]], [[[255136, 254645, 'History_of_art', 2]]]]


### Install stanza

In [12]:
pip install stanza


Note: you may need to restart the kernel to use updated packages.


### Import stanza and download english model

In [13]:
import stanza
stanza.download('en') # download the English model

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:03, 59.7kB/s]
2023-04-12 21:57:26 INFO: Downloading default packages for language: en (English) ...
2023-04-12 21:57:28 INFO: File exists: /Users/debrup/stanza_resources/en/default.zip
2023-04-12 21:57:33 INFO: Finished downloading models and saved to /Users/debrup/stanza_resources.


### Try NER on the claim

### Install spacy

In [14]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


### Try NER using spacy

In [15]:
# import spacy

# nlp = spacy.load("en_core_web_sm")


# docs = list(nlp.pipe(claim_test_lines[0:3]))

# for doc in docs:
#     for ent in doc.ents:
#         print(ent.text, ent.label_)

#### As we can see, the NER models cannot extract all the required entities; they only recognise a few very specific entity types.

### Install nltk

In [16]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


### Extract all the noun phrases of the claims using constituency tree

In [None]:
import stanza
import wikipedia
from tqdm import tqdm

def preprocess(np):
    """Convert a page title or tokenised noun phrase into a FEVER page id.

    FEVER ids use underscores for spaces and the escapes ``-LRB-``, ``-RRB-``
    and ``-COLON-`` for ``(``, ``)`` and ``:``.

    Both input styles are handled:
      * tokenised text, where brackets/colons are space-separated
        (``"Savages ( 2012 film )"``), and
      * plain Wikipedia titles as returned by ``wikipedia.search``
        (``"Savages (2012 film)"``), which previously kept their raw
        brackets and therefore never matched an id in the database.

    Args:
        np: the title / noun phrase string. (The parameter name is kept for
            backward compatibility; it is unrelated to numpy.)

    Returns:
        str: the escaped, underscore-joined page id.
    """
    # Tokenised forms first, so the space next to the bracket is absorbed.
    page = np.replace('( ', '-LRB-')
    page = page.replace(' )', '-RRB-')
    # Any bracket still present comes from an untokenised title.
    page = page.replace('(', '-LRB-')
    page = page.replace(')', '-RRB-')
    page = page.replace(' - ', '-')
    # Same two-step treatment for colons.
    page = page.replace(' :', '-COLON-')
    page = page.replace(':', '-COLON-')
    page = page.replace(' ,', ',')
    page = page.replace(" 's", "'s")
    page = page.replace(' ', '_')
    return page
    

claim_train_lines = [ sub['claim'] for sub in lines]

# Constituency parser; claims are passed as whitespace-tokenised text.
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency',tokenize_pretokenized=True)
entities=[]    # entities[i]   -> noun phrases found in claim i
wiki_pages=[]  # wiki_pages[i] -> FEVER-style page ids predicted for claim i

for claim in tqdm(claim_test_lines):
    doc = nlp(claim)
    sentence = doc.sentences[0]
    tree = sentence.constituency

    # Collect the leaves under every NP / NML node. The accumulator is not
    # called `np`, so it can no longer clobber a numpy alias if this cell is
    # re-run after numpy has been imported.
    phrase_leaves = []
    tree.visit_preorder(internal = lambda x: phrase_leaves.append(x.leaf_labels()) if (x.label=="NP" or x.label=="NML") else None)
    noun_phrases = [' '.join(n) for n in phrase_leaves]
    entities.append(noun_phrases)

    predicted_wiki = []
    for n in noun_phrases:
        # One network round trip per phrase (the search used to be issued
        # twice: once for the emptiness test and once for the result).
        search_hits = wikipedia.search(n)
        if search_hits:
            predicted_wiki.append(preprocess(search_hits[0]))
    wiki_pages.append(predicted_wiki)
        
        ##print(noun_phrases)
        ##print(type(sentence.constituency))

        

2023-04-12 21:57:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:04, 44.0kB/s]
2023-04-12 21:57:44 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-12 21:57:44 INFO: Using device: cpu
2023-04-12 21:57:44 INFO: Loading: tokenize
2023-04-12 21:57:44 INFO: Loading: pos
2023-04-12 21:57:44 INFO: Loading: constituency
2023-04-12 21:57:44 INFO: Done loading processors!
 64%|███████████████████████████▌               | 32/50 [03:37<01:42,  5.68s/it]

In [None]:
entities

In [None]:
len(entities)

### DEBUG TO FIND EMPTY SUBLIST

In [None]:
# Find the first claim with no noun phrases. Afterwards `c` is its index, or
# len(entities) when every claim has at least one.
c = len(entities)
for position, sublist in enumerate(entities):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
# Find the first claim with no predicted wiki page. Afterwards `c` is its
# index, or len(wiki_pages) when every claim has at least one.
c = len(wiki_pages)
for position, sublist in enumerate(wiki_pages):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
# Find the first empty claim string. Afterwards `c` is its index, or
# len(claim_test_lines) when none is empty.
c = len(claim_test_lines)
for position, sublist in enumerate(claim_test_lines):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
entities[44]

In [None]:
wiki_pages[44]

In [None]:
len(wiki_pages)

In [None]:
# for claim in tqdm(claim_test_lines[859:1000]):
    
#         doc = nlp(claim)
#         sentence=doc.sentences[0]
#         tree = sentence.constituency
#         np=[]
#         tree.visit_preorder(internal = lambda x: np.append(x.leaf_labels()) if (x.label=="NP" or x.label=="NML") else None)
#         noun_phrases = [' '.join(n) for n in np]
#         entities.append(noun_phrases)
#         #www = wikipedia.search(n)
#         #print("LLLL:",len(noun_phrases))
#         #print(tree)
            
#         #predicted_wiki = [preprocess((wikipedia.search(n))[0]) for n in noun_phrases] 
#         predicted_wiki = []
#         for n in noun_phrases:
#             if not len(wikipedia.search(n)) == 0:
#                 predicted_wiki.append(preprocess((wikipedia.search(n))[0]))
#         wiki_pages.append(predicted_wiki)
        
#         ##print(noun_phrases)
#         ##print(type(sentence.constituency))

In [None]:
len(wiki_pages)

### Example noun phrases

In [None]:
print(entities)

### Example predicted wiki pages

In [None]:
wiki_pages

### Example code for retrieving from fever.db

In [None]:
P=preprocess('List_of_The_Simpsons_guest_stars_(seasons 21–present)')

In [None]:
import sqlite3

# Example lookup: fetch the stored lines of one known page from fever.db.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Bound parameter instead of a literal spliced into the SQL text, matching
    # the other lookups in this notebook.
    query = "SELECT lines FROM documents WHERE id=?"
    cursor.execute(query, ('Game_of_Thrones',))

    # fetchone() returns None when no row matches.
    results = cursor.fetchone()
finally:
    # Release the database handle even if the query fails.
    conn.close()

if results is None:
    print("No document found for id:", 'Game_of_Thrones')
else:
    for row in results:
        print(row)

### Get the names of titles and the lines of pages in fever.db that matches with wiki_pages

In [None]:
import sqlite3


def _clean_wiki_line(raw_line):
    """Strip the leading line number and trailing link list from one stored line."""
    text = raw_line.replace('\t', ' ')

    # Drop the leading digits (the sentence number).
    start = 0
    while start < len(text) and text[start].isdigit():
        start += 1
    text = text[start:]

    # Everything after the last period is the list of linked pages.
    last_period_index = text.rfind(".")
    if last_period_index != -1:
        text = text[:last_period_index + 1]

    # Collapse runs of whitespace.
    return ' '.join(text.split())


# candidate_lines[i] is a dict {page_id: [cleaned line, ...]} for claim i.
# Exactly one dict is appended per claim, even when none of its predicted
# pages exists in the database, so the list stays index-aligned with
# wiki_pages / claim_test_lines. (Previously such a claim produced no entry,
# which shifted later claims and then raised an IndexError.)
candidate_lines = []

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    query = "SELECT lines FROM documents WHERE id=?"

    for claim_pages in wiki_pages:
        claim_candidates = {}

        for page in claim_pages:
            # Several noun phrases often resolve to the same page; loading it
            # again would duplicate its lines and push list positions past the
            # page's real line numbers.
            if page in claim_candidates:
                continue

            cursor.execute(query, (page,))
            results = cursor.fetchone()

            # Page is not in the FEVER dump.
            if results is None or len(results) == 0:
                continue

            # Empty lines are kept on purpose: the position in the list is
            # used later as the sentence index within the page.
            claim_candidates[page] = [
                _clean_wiki_line(line) for line in results[0].split('\n')
            ]

        candidate_lines.append(claim_candidates)
finally:
    # Close the connection to the database even if a lookup fails.
    conn.close()

if candidate_lines:
    print(candidate_lines[0])


In [None]:
print(len(candidate_lines))

### DEBUG

In [None]:
# Find the first claim with no candidate pages. Afterwards `c` is its index,
# or len(candidate_lines) when every claim has at least one.
c = len(candidate_lines)
for position, sublist in enumerate(candidate_lines):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
import sqlite3

# Look up one page whose id contains an apostrophe, which is why the id is
# passed as a bound parameter rather than spliced into the SQL text.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # SQL query with a placeholder for the id parameter
    query = "SELECT lines FROM documents WHERE id=?"

    # Named page_id (not `id`) so the builtin id() is not shadowed.
    page_id = "Robert_J._O'Neill"
    cursor.execute(query, (page_id,))

    # fetchone() returns None when the page is not in the database.
    results = cursor.fetchone()
finally:
    # This connection was previously left open.
    conn.close()

if results is None:
    print("No document found for id:", page_id)
else:
    print(results[0])

In [None]:
print(candidate_lines[1])


In [None]:
## Retrieved documents for each claim: the page ids that were found in the
## database, i.e. the keys of that claim's candidate-line dict.

retrieved_docs = [list(page_lines) for page_lines in candidate_lines]
        

In [None]:
len(retrieved_docs)

In [None]:
print(retrieved_docs)

In [None]:
### Find list of ground truth documents
evidence_test_lines[0]

# ground_truth_docs[i] holds the page id (element 2 of every annotation) from
# all evidence sets of claim i, flattened in annotation order.
ground_truth_docs = [
    [annotation[2] for evidence_set in claim_evidence for annotation in evidence_set]
    for claim_evidence in evidence_test_lines
]
                
                
    

In [None]:
len(ground_truth_docs)

In [None]:
# Find document retrieval accuracy (document level):
# the share of retrieved documents that appear in the claim's ground truth.
# The retrieved list is what gets iterated; iterating the ground truth and
# testing membership in that same list always scored 100%.

total_docs=0
correct_docs=0
for i in range(len(ground_truth_docs)):

    claim_docs=retrieved_docs[i]
    for doc in claim_docs:
        total_docs=total_docs+1
        print(doc)
        if doc in ground_truth_docs[i]:
            correct_docs=correct_docs+1
    

In [None]:
# Find document retrieval accuracy (claim level):
# a claim counts as correct when every ground-truth document was retrieved.

total_docs=0
correct_docs=0
for i in range(len(ground_truth_docs)):

    ground_truth_doc=ground_truth_docs[i]
    total_docs=total_docs+1

    # "Not enough info" claims carry None as the page of every annotation.
    # They may have more than one annotation set ([None, None, ...]), so test
    # all entries instead of comparing with the single-element [None].
    if all(doc is None for doc in ground_truth_doc):
        correct_docs=correct_docs+1
        continue

    # Correct only if no ground-truth document is missing from the retrieval.
    if all(doc in retrieved_docs[i] for doc in ground_truth_doc):
        correct_docs=correct_docs+1
    
   
       
            
            


In [None]:
ground_truth_docs

In [None]:
total_docs=0
correct_docs=0

# Accuracy of document retrieval for each claim
# Correct if the entire ground truth for a claim is a subset of the retrieved document list for a claim
for i in range(len(ground_truth_docs)):
    doc = ground_truth_docs[i]
    total_docs=total_docs+1

    # "Not enough info" claims have None as the page of every annotation and
    # may carry several annotation sets, so [None, None] must count as well.
    is_nei = all(page is None for page in doc)
    is_hit = is_nei or set(doc).issubset(set(retrieved_docs[i]))

    if is_hit:
        correct_docs=correct_docs+1

    # Same report as before: verdict, retrieved list, ground truth, blank line.
    print("YES" if is_hit else "NO")
    print("a", retrieved_docs[i])
    print("b", doc)
    print()

In [None]:
total_docs

In [None]:
correct_docs

In [None]:
accuracy = correct_docs/total_docs
print("Accuracy of document retrieval",accuracy*100,"%")

In [None]:
ground_truth_docs[0]

In [None]:
print(len(ground_truth_docs))

In [None]:
evidence_test_lines[0][0]

In [None]:
print(candidate_lines[1]['Roman_Atwood'][1])

In [None]:
import wikipedia

# Query text
query = "List_of_The_Simpsons_guest_stars_(seasons 21–present)"

# Search for matching page titles
results = wikipedia.search(query)

# Print the list of matching page titles
print("Matching page titles:")
for title in results:
    print(title)

if results:
    # Choose a page to retrieve
    page_title = results[0]

    # The title comes verbatim from the search results, so auto-suggestion is
    # turned off to stop the library from rewriting it before the lookup.
    page = wikipedia.page(page_title, auto_suggest=False)

    # Print the page summary
    print("Page summary:", page.summary)
else:
    # An empty result list would otherwise raise IndexError on results[0].
    print("No Wikipedia page matched the query:", query)
##print("Page content:", page.content)


### Compute Similarity between the Claim and the Candidate Evidences found using TF-IDF

In [None]:
import sqlite3

# Read the `lines` column of every document in fever.db.
# NOTE(review): fetchall() materialises the whole table in memory; if only a
# small sample is needed downstream, consider fetching fewer rows.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    query = "SELECT lines FROM documents"
    cursor.execute(query)

    results = cursor.fetchall()
finally:
    # The rows are already in `results`; release the database handle, which
    # was previously left open.
    conn.close()


In [None]:
L = len(results)
documents = []

# Clean the first 300 documents (the corpus the TF-IDF vectorizer is fitted
# on). Slicing copes with tables of fewer than 300 rows, where indexing
# results[i] over range(0, 300) raised IndexError.
for row in results[:300]:
    document = row[0]

    # remove tab characters
    document = document.replace('\t', ' ')

    # remove everything after the last "." of the document (trailing page links)
    last_period_index = document.rfind(".")
    if last_period_index != -1:
        document = document[:last_period_index+1]

    # collapse whitespace (including the newlines between lines)
    document = ' '.join(document.split())

    documents.append(document)

print((documents))

In [None]:
print(len(documents))

In [None]:
print(documents[0])

### Term frequency and inverse document frequency


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
print(claim_test_lines)

In [None]:
print(len(claim_test_lines))

### Extract all candidate sentences with their relevance scores as tuples

In [None]:

vectorizer = TfidfVectorizer()

# Fit the TF-IDF vocabulary / idf weights on the cleaned documents.
vectors = vectorizer.fit_transform(documents)

# candidate_lines[i] -> {page_id: [sentence, ...]} for the i-th claim.
# claim_similarities[i] -> list of (page_id, sentence_index, sentence, score).
claim_similarities = []
for i in range(len(claim_test_lines)):
    claim = claim_test_lines[i]
    similarity_scores_list = []

    # Vectorise the claim once per claim rather than once per candidate.
    claim_vector = vectorizer.transform([claim])

    for key, candidate_list_sentence in candidate_lines[i].items():
        if not candidate_list_sentence:
            continue

        # Vectorise all sentences of the page in one call and score them
        # against the claim together; row 0 holds claim-vs-sentence cosines.
        candidate_vectors = vectorizer.transform(candidate_list_sentence)
        page_scores = cosine_similarity(claim_vector, candidate_vectors)[0]

        for idx, candidate_sentence in enumerate(candidate_list_sentence):
            similarity_scores_list.append((key, idx, candidate_sentence, page_scores[idx]))

    claim_similarities.append(similarity_scores_list)
    

In [None]:
claim_similarities

In [None]:
print(len(claim_similarities))

In [None]:
print(claim_similarities[0])

### Extract top K evidence sentences for every claim

In [None]:
# my_list = [(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 2), (13, 14, 15, 8), (17, 18, 19, 1)]

def topK(my_list, K):
    """Return the K entries of ``my_list`` with the largest 4th element.

    Args:
        my_list: sequence of tuples whose element at index 3 is the score.
        K: number of entries to keep.

    Returns:
        list of ``(original_index, entry)`` pairs, best score first; ties keep
        their original relative order.
    """
    # Rank (index, entry) pairs directly by the entry's score.
    indexed_entries = enumerate(my_list)
    return heapq.nlargest(K, indexed_entries, key=lambda pair: pair[1][3])

In [None]:
# Keep the K best-scoring candidate sentences of every claim.
K = 5
topK_evidences_for_each_claim = [
    topK(scored_sentences, K) for scored_sentences in claim_similarities
]
    

In [None]:
topK_evidences_for_each_claim
# 
# Inner tuple:
# 

# 1st index: position in the claim_no

#     inner tuple:
#         1st element: retrieved document name 
#         2nd: index of the candidate sentence in the retrieved document
#         3rd: candidate sentence
#         4th: Similarity score


In [None]:
print(len(topK_evidences_for_each_claim))

In [None]:
print((topK_evidences_for_each_claim)[0][0][1][0])  #i j 1 0
print((topK_evidences_for_each_claim)[0][0][1][1])  #i j 1 1

In [None]:
print((topK_evidences_for_each_claim)[0][0])  #i j 1 0

In [None]:
# For every claim, reduce each top-K entry (rank_index, (page, sentence_idx,
# sentence, score)) to the pair [page, sentence_idx].
retrieved_evidences_tfidf = [
    [[scored[0], scored[1]] for rank_index, scored in ranked]
    for ranked in topK_evidences_for_each_claim
]
        
        
        

In [None]:
len(retrieved_evidences_tfidf)

In [None]:
retrieved_evidences_tfidf[0]

In [None]:
evidence_test_lines[0]

In [None]:
# ## Evidence retrieval accuracy

# for i in range(len(evidence_test_lines)):
#     evidence_sets = evidence_test_lines[i]
#     print("rrr",retrieved_evidences_tfidf[i])
#     for j in range(len(evidence_sets)):
#         evidence_set = evidence_sets[j]
#         print("eee",evidence_set) 
#         for k in range(len(evidence_set)):
#             evidence_set[k]=(evidence_set[k])[2:]
            
#         if set(evidence_set).issubset(set(retrieved_evidences_tfidf[i])):
#             print("True")
          
    

In [None]:
claim_test_lines[0]

In [None]:
evidence_test_lines[0]

In [None]:
claim_similarities[0][11]

### Find relevance of each candidate evidence sentence using BERT-based models(training)

### Install transformers

In [None]:
pip install transformers

### Import required libraries

In [None]:
import torch
import random
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

# Set random seed for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

### Load pre-trained BERT model and tokenizer


In [None]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

### This is a sequence classification task with two labels (Relevant or not)
#model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


### Get (claim,evidence,label) pairs

In [None]:
print(len(evidence_test_lines))

In [None]:
print(evidence_test_lines[0])

In [None]:
print(evidence_test_lines)

In [None]:
import sqlite3

# Build the positive (claim, evidence sentence, label=1) training examples
# from the first evidence set of each of the first 50 claims.
claims_to_train_on = [ sub['claim'] for sub in lines]
claims_to_train_on =claims_to_train_on[0:50]        # We take 50 claims to get examples

claim_label_pos = []
candidate_evidence_pos = []
claim_pos = []

# Open a connection to the database file
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    query = "SELECT lines FROM documents WHERE id=?"

    for i in range(len(claims_to_train_on)):    # For all the claims

        for annotation in evidence_test_lines[i][0]:

            page = annotation[2]

            # "Not enough info" claims have no evidence page.
            if page is None:
                continue

            line_no = annotation[3]

            cursor.execute(query, (page,))
            results = cursor.fetchone()

            # The annotated page may be absent from the database; skip it
            # instead of failing on results[0].
            if results is None:
                continue

            lines_page = results[0].split('\n')

            # Skip annotations that point outside the stored page.
            if not 0 <= line_no < len(lines_page):
                continue

            line = lines_page[line_no]
            line = line.replace('\t', ' ')

            # Drop the leading digits (the sentence number).
            start = 0
            while start < len(line) and line[start].isdigit():
                start += 1
            line = line[start:]

            # remove all words after last . (the links to other pages)
            last_period_index = line.rfind(".")
            if last_period_index != -1:
                line = line[:last_period_index+1]

            # remove extra spaces
            line = ' '.join(line.split())

            # An empty sentence carries no evidence; the negative sampler
            # skips empty lines too.
            if not line:
                continue

            claim_pos.append(claims_to_train_on[i])
            candidate_evidence_pos.append(line)
            claim_label_pos.append(1)
finally:
    # Close the connection to the database even if a lookup fails.
    conn.close()

In [None]:
claim_label_pos

In [None]:
candidate_evidence_pos

In [None]:
claim_pos

In [None]:
len(claim_pos)

In [None]:
len(claim_label_pos)

In [None]:
len(candidate_evidence_pos)

In [None]:
# Find the first empty positive claim. Afterwards `c` is its index, or
# len(claim_pos) when none is empty.
c = len(claim_pos)
for position, sublist in enumerate(claim_pos):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
# Find the first falsy positive label. Afterwards `c` is its index, or
# len(claim_label_pos) when every label is truthy.
c = len(claim_label_pos)
for position, sublist in enumerate(claim_label_pos):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
# Find the first empty positive evidence sentence. Afterwards `c` is its
# index, or len(candidate_evidence_pos) when none is empty.
c = len(candidate_evidence_pos)
for position, sublist in enumerate(candidate_evidence_pos):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

### Negative samples

In [None]:
print(len(claims_to_train_on))

In [None]:
no_of_neg = 1
k=10

claim_neg = []
candidate_neg = []
label_neg =[]

import re

def check_digit(s):
    """Return True when ``s``, ignoring surrounding whitespace, is a 1- or 2-digit number."""
    trimmed = s.strip()
    # One or two digits and nothing else.
    return re.match(r'^\d{1,2}$', trimmed) is not None



# Build negative (claim, sentence, label=0) examples: for each claim, sample a
# random line from a random page returned by a Wikipedia search on the claim.
# A single connection serves the whole pass and is always closed (previously a
# new connection was opened for every sample and never closed).
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    query = "SELECT lines FROM documents WHERE id=?"

    for i in range(len(claims_to_train_on)):
        claim = claims_to_train_on[i]

        # Look at pages related to claim
        random_pages = wikipedia.search(claim)
        n = len(random_pages)

        # No search result for the claim
        if n == 0:
            continue

        # (page, line number) pairs annotated as evidence for this claim.
        # Search results frequently include the evidence page itself, so the
        # sampled line must not be one of the gold sentences.
        gold_evidence = {
            (annotation[2], annotation[3])
            for evidence_set in evidence_test_lines[i]
            for annotation in evidence_set
        }

        for j in range(no_of_neg):

            # Pick a random page from the list and convert it to a FEVER id
            page_name = preprocess(random_pages[random.randint(0, n-1)])

            cursor.execute(query, (page_name,))
            results = cursor.fetchone()

            # Page is not in the FEVER dump
            if results is None or len(results) == 0:
                continue

            # Split the page into its stored lines and sample one of them
            lines_page = results[0].split('\n')
            line_index = random.randrange(len(lines_page))

            # Never label a gold evidence sentence as a negative
            if (page_name, line_index) in gold_evidence:
                continue

            line = lines_page[line_index]

            # preprocess the line
            line = line.replace('\t', ' ')

            # Drop the leading digits (the sentence number)
            start = 0
            while start < len(line) and line[start].isdigit():
                start += 1
            line = line[start:]

            # remove all words after last . (the links to other pages)
            last_period_index = line.rfind(".")
            if last_period_index != -1:
                line = line[:last_period_index+1]

            # remove extra spaces
            line = ' '.join(line.split())

            if len(line)==0:
                continue

            claim_neg.append(claim)
            candidate_neg.append(line)
            label_neg.append(0)
finally:
    conn.close()

In [None]:
print(len(claim_neg))

In [None]:
print(len(candidate_neg))

In [None]:
print(len(label_neg))

In [None]:
print(claim_neg[1])
print(candidate_neg[1])
print(label_neg[1])

In [None]:
# Find the first empty negative claim. Afterwards `c` is its index, or
# len(claim_neg) when none is empty.
c = len(claim_neg)
for position, sublist in enumerate(claim_neg):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
# Find the first empty negative sentence. Afterwards `c` is its index, or
# len(candidate_neg) when none is empty.
c = len(candidate_neg)
for position, sublist in enumerate(candidate_neg):
    if not sublist:
        print("Empty sublist found!")
        print()
        c = position
        break

In [None]:
print(claim_pos[0]) 
print(candidate_evidence_pos[0])
print(claim_label_pos[0])


### Define train_claims,train_evidence and train_labels using positive and negative samples

In [None]:
train_claims = claim_pos + claim_neg
train_evidence = candidate_evidence_pos + candidate_neg
train_labels = claim_label_pos + label_neg


In [None]:
print(train_claims)

In [None]:
print(len(train_claims))

In [None]:
print(len(train_evidence))

In [None]:
print(len(train_labels))

### Tokenize the training data

In [None]:
# Tokenize input sequences: each (claim, evidence) pair becomes one BERT input.
input_ids = []
attention_masks = []
for claim, evidence in zip(train_claims, train_evidence):
    encoded_dict = tokenizer.encode_plus(
                        claim,
                        evidence,
                        add_special_tokens = True,
                        max_length = 256,
                        padding = 'max_length',
                        # Padding alone only lengthens short pairs. Without
                        # truncation a long pair yields more than 256 tokens,
                        # and tensors of different widths cannot be
                        # concatenated into one batch tensor afterwards.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

    


In [None]:
print(len(input_ids))

In [None]:
print(input_ids)

In [None]:
import numpy

In [None]:
print(len(attention_masks))

In [None]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train_labels)

In [None]:
input_ids

In [None]:
input_ids.shape

In [None]:
attention_masks

In [None]:
attention_masks.shape

In [None]:
labels

In [None]:
labels.shape

### Construct dataset with (input_ids,attention_masks,labels)

In [None]:
# Combine input sequences with labels
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split data into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [None]:
print(len(train_dataset))
print(len(val_dataset))

### Construct the dataloaders

In [None]:
# Create data loaders for batching

batch_size = 4
train_dataloader = DataLoader(train_dataset, sampler=torch.utils.data.RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=torch.utils.data.SequentialSampler(val_dataset), batch_size=batch_size)


### Load the pretrained model

In [None]:
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

### Finetune the pretrained BERT Model using training examples

In [None]:
# Fine-tune the classifier: Adam optimizer, cross-entropy loss, n_epochs passes
# over train_dataloader.
# `F` is imported here and is also used by later cells of the notebook.
import torch.nn.functional as F

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

n_epochs = 5

model = model.to(device)

for epoch in range(n_epochs):
    print('Training epoch %d...' % epoch)
    # Entering training mode at the start of every epoch keeps the loop
    # correct even if the model was switched to eval mode in between.
    model.train()
    total_loss = 0
    # The batch tensors get their own names: binding them to `input_ids`,
    # `attention_masks` and `labels` would overwrite the full-dataset tensors
    # of the same name with the last mini-batch.
    for batch_input_ids, batch_attention_masks, batch_labels in train_dataloader:
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_masks = batch_attention_masks.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # No labels are passed to the model, so the loss is computed exactly
        # once (by `criterion`) instead of once inside the model and again here.
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        loss = criterion(outputs.logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        # Cap the total gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    print('Average training loss: %.4f' % (total_loss / len(train_dataloader)))

# Evaluate the fine-tuned model on val_dataloader (the held-out split).
# Produces avg_val_loss (mean of the per-batch losses) and avg_val_accuracy
# (correct predictions divided by the number of validation examples).
model.eval()

val_loss = 0
val_accuracy = 0  # running count of correct predictions, normalised below
val_steps = 0

with torch.no_grad():
    # Batch tensors use their own names so that the notebook-level
    # `input_ids`, `attention_masks` and `labels` are not overwritten.
    for val_input_ids, val_attention_masks, val_labels in val_dataloader:
        val_input_ids = val_input_ids.to(device)
        val_attention_masks = val_attention_masks.to(device)
        val_labels = val_labels.to(device)

        outputs = model(val_input_ids, attention_mask=val_attention_masks, labels=val_labels)

        # Extract loss and logits from the model output
        val_loss += outputs.loss.item()
        logits = outputs.logits

        # Convert logits to probabilities and get predicted labels
        # (`probs` is kept because a later cell inspects its shape)
        probs = F.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)

        # Count the correct predictions of this batch
        val_accuracy += torch.sum(preds == val_labels).item()

        # One more evaluated batch
        val_steps += 1

# Calculate average evaluation loss and accuracy
avg_val_loss = val_loss / val_steps
avg_val_accuracy = val_accuracy / len(val_dataset)
        


In [None]:
probs.shape

### Print Validation Accuracy

In [None]:
# Report the validation metrics computed by the evaluation loop
print(f"avg val accuracy {avg_val_accuracy}")
print(f"avg loss {avg_val_loss}")

In [None]:
## Get the sentences with the top relevance scores for every claim

In [None]:
print(len(candidate_lines))

In [None]:
print(len(claim_test_lines))

In [None]:
# NOTE(review): `claim_similarities_bert` is assigned by the scoring loop in a later cell;
# unless a cell above this section also defines it, this line raises NameError on a
# fresh kernel — TODO confirm and move below the scoring loop.
print(len(claim_similarities_bert))

In [None]:
claim_test_lines

In [None]:
# Score every candidate sentence of every claim with the fine-tuned classifier.
#
# Result: claim_similarities_bert[i] is a list with one
# (document key, sentence index, sentence text, score) tuple per candidate
# sentence of claim i, where score is the softmax probability of class 1.
claim_similarities_bert=[]

# Put the model in inference mode explicitly so this cell does not depend on
# the evaluation cell having been the last one to touch the model.
model.eval()

for i in tqdm(range(len(claim_test_lines))):

    similarity_scores_list_bert = []
    claim = claim_test_lines[i]

    # Best score / sentence seen so far for the current claim
    max_similarity_score_bert = 0
    max_candidate_sentence_bert = ""

    for key in candidate_lines[i].keys():
        candidate_list_sentence = candidate_lines[i][key]

        for idx in range(len(candidate_list_sentence)):

            candidate_sentence = candidate_list_sentence[idx]

            ## Tokenizer acts on (claim,sentence)
            encoded_dict = tokenizer.encode_plus(
                claim,
                candidate_sentence,
                add_special_tokens = True,
                max_length = 256,
                padding = 'max_length',
                # Padding never shortens an input; request truncation so an
                # over-long pair is cut to max_length instead of being fed to
                # the model at its full length.
                truncation = True,
                return_attention_mask = True,
                return_tensors = 'pt',
            )
            # Pair-local names: the notebook-level `input_ids` tensor is left alone
            pair_input_ids = encoded_dict['input_ids'].to(device)
            pair_attention_mask = encoded_dict['attention_mask'].to(device)

            ## Get the output corresponding to this example
            with torch.no_grad():
                outputs = model(pair_input_ids, attention_mask=pair_attention_mask)

            probs = F.softmax(outputs.logits, dim=1)
            # .item() turns the 0-dim device tensor into a plain Python float,
            # so the stored scores do not keep device memory alive and compare
            # like ordinary numbers downstream.
            probs_relevance = probs[0,1].item()

            # `>=` keeps the original tie rule: the latest sentence with the
            # maximal score wins.
            if probs_relevance >= max_similarity_score_bert:
                max_similarity_score_bert = probs_relevance
                max_candidate_sentence_bert = candidate_sentence

            similarity_scores_list_bert.append((key,idx, candidate_sentence,probs_relevance))

    claim_similarities_bert.append(similarity_scores_list_bert)

In [None]:
claim_similarities_bert[0]

In [None]:
claim_similarities_bert[1]

In [None]:
K

In [None]:
## Apply topK (with the notebook's K) to the scored sentences of each claim (same as tf-idf)

topK_evidences_for_each_claim_bert = [
    topK(scored_sentences, K)
    for scored_sentences in claim_similarities_bert
]
    

In [None]:
topK_evidences_for_each_claim_bert[0]

In [None]:
## Save the BERT model used for evidence retrieval, together with its tokenizer

# Write the fine-tuned model to the relative directory 'models/'
model.save_pretrained('models/')

# Write the tokenizer to the same directory so both can be reloaded from one path
tokenizer.save_pretrained('models/')

In [None]:
# Reload the saved model from the 'models/' directory
# NOTE(review): the reloaded model is not moved to `device` in this cell.
model_load = BertForSequenceClassification.from_pretrained('models/')

# Reload the saved tokenizer from the same directory
tokenizer_load = BertTokenizer.from_pretrained('models/')


In [None]:
model_load

In [None]:
tokenizer_load

In [None]:
# Collect the 'label' field of every record in `test_lines`.
# NOTE(review): `test_lines` is not defined in the visible part of the notebook and the
# result is named `label_train_lines` — presumably both refer to the records the claims
# were read from; TODO confirm.
label_train_lines = [ sub['label'] for sub in test_lines]

In [None]:
# Keep only the first 50 labels.
# NOTE(review): later cells index this list with positions of `claim_test_lines`, so
# `claim_test_lines` presumably holds the same 50 examples — TODO confirm.
label_train_lines = label_train_lines[0:50]

In [None]:
print(label_train_lines)

In [None]:
# Build one record per claim with the keys 'claim', 'evidence' and 'label'

retrieved_evidence=[]

for i, claim_text in enumerate(claim_test_lines):
    ret_evidence = topK_evidences_for_each_claim_bert[i]
    retrieved_evidence.append({
        'claim': claim_text,
        # entry[1][2]: presumably the sentence text of a retrieved candidate
        # (third field of the scored tuple) — verify against topK's output
        'evidence': [entry[1][2] for entry in ret_evidence],
        'label': label_train_lines[i],
    })

print(retrieved_evidence)
    
    



In [None]:
# Build one record per claim with the keys 'claim', 'evidence', 'label' and
# 'string_evidence' (the evidence sentences joined with '|')

retrieved_evidence_new=[]
separator = "|"

for i, claim_text in enumerate(claim_test_lines):
    ret_evidence = topK_evidences_for_each_claim_bert[i]
    # entry[1][2]: presumably the sentence text of a retrieved candidate
    # (third field of the scored tuple) — verify against topK's output
    evidence_sentences = [entry[1][2] for entry in ret_evidence]
    retrieved_evidence_new.append({
        'claim': claim_text,
        'evidence': evidence_sentences,
        'label': label_train_lines[i],
        'string_evidence': separator.join(evidence_sentences),
    })

print(retrieved_evidence_new)
    

In [None]:
print(len(retrieved_evidence))  

In [None]:
retrieved_evidence[0]

In [None]:
retrieved_evidence_new[0]

In [None]:
## Make CSV file with columns claim, evidence, label, string_evidence

import csv

# One CSV row per record of retrieved_evidence_new.
# newline='' lets the csv module control line endings; the explicit utf-8
# encoding keeps non-ASCII text in the claims/evidence from depending on the
# platform's default encoding (the input files are read as utf-8 as well).
with open('retrieved_evidence_output1.csv', 'w', newline='', encoding='utf-8') as csvfile:

    # Column order of the output file
    fieldnames = ['claim', 'evidence', 'label','string_evidence']

    # Create a writer object and write the header row
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Write all records; the 'evidence' list is stored as its str() form
    writer.writerows(retrieved_evidence_new)


In [None]:
# For every claim, reduce each retrieved entry to the pair [entry[1][0], entry[1][1]]
# (presumably document key and sentence index — verify against topK's output)
retrieved_evidences_bert = [
    [[hit[1][0], hit[1][1]] for hit in claim_hits]
    for claim_hits in topK_evidences_for_each_claim_bert
]
        
        

In [None]:
retrieved_evidences_bert[0]