## Connecting to Google Drive, where "fever.db" and "train.jsonl" are stored
Note: "fever.db" is a large SQLite database containing Wikipedia page titles and text, so it is stored in Google Drive.

Public link to fever.db: https://drive.google.com/file/d/1qoEqOdqcPHOrX1JGKxdxyBomUjxvh8fw/view?usp=sharing

Public link to train.jsonl: https://drive.google.com/file/d/1awN6S3ejR-Jkn81Qg1K8j__cvOF3nfuw/view?usp=sharing

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive

Mounted at /content/gdrive/
/content/gdrive/MyDrive


## Installing libraries



In [None]:
!pip install stanza
!pip install wikipedia

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=9215e9fad36510a4d7b1c3dc44e1053b8f88ca639a8e24f902cb3a242c1a4174
  Stored in directory: /root/.cache/pip/wheels/9a/b8/0f/f580817231cbf59f6ade9fd132ff60ada1de9f7dc85521f857
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stan

## Finding the nearest page based on keywords present in the claim

In [None]:
import stanza
import wikipedia

def preprocess(np):
    """Convert a page title into an escaped, underscore-separated page id.

    Brackets become ``-LRB-`` / ``-RRB-``, colons become ``-COLON-`` and
    spaces become underscores. The result is used as the ``id`` key when
    querying the ``documents`` table (presumably the FEVER page-id
    convention -- confirm against the ids stored in "fever.db").

    Both tokenised titles (``"Foo ( bar )"``, ``"Foo : Bar"``) and ordinary
    titles (``"Foo (bar)"``, ``"Foo: Bar"``) are handled.

    Args:
        np: Title text (the name is historical; it is a plain string).

    Returns:
        The escaped page id as a string.
    """
    # Tokenised forms first: these also swallow the padding space that a
    # tokenizer inserts around punctuation.
    page = np.replace('( ', '-LRB-')
    page = page.replace(' )', '-RRB-')
    page = page.replace(' - ', '-')
    page = page.replace(' :', '-COLON-')
    page = page.replace(' ,', ',')
    page = page.replace(" 's", "'s")
    # Untokenised leftovers: titles such as "Savages (2012 film)" have no
    # padding spaces, so the rules above never match them.
    page = page.replace('(', '-LRB-')
    page = page.replace(')', '-RRB-')
    page = page.replace(':', '-COLON-')
    page = page.replace(' ', '_')
    return page

# English stanza pipeline shared by the cells below; the constituency
# processor provides the parse trees that noun phrases are read from.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,constituency',
)

def nearestP(claim):
    """Map the noun phrases of a claim to candidate Wikipedia page ids.

    The claim is parsed with the module-level stanza pipeline ``nlp``. Every
    ``NP`` node of each sentence's constituency tree becomes a query for
    ``wikipedia.search``; the top hit (if any) is converted with
    ``preprocess``.

    Args:
        claim: Claim text to analyse.

    Returns:
        A list of page ids, one per noun phrase that produced at least one
        search hit, in the order the noun phrases were visited. The list is
        empty when nothing is found; duplicates are not removed.
    """
    predicted_wiki = []
    doc = nlp(claim)
    for sentence in doc.sentences:
        phrase_tokens = []  # one list of leaf tokens per NP node

        def collect_np(node):
            if node.label == "NP":
                phrase_tokens.append(node.leaf_labels())

        sentence.constituency.visit_preorder(internal=collect_np)

        for tokens in phrase_tokens:
            hits = wikipedia.search(' '.join(tokens))
            # A query can legitimately return no results; skip it.
            if len(hits) > 0:
                predicted_wiki.append(preprocess(hits[0]))
    # Return only after every sentence has been processed, so multi-sentence
    # claims are fully covered and a claim with no sentences yields [] rather
    # than None.
    return predicted_wiki

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/pos/combined.pt:   0%|         …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/constituency/wsj.pt:   0%|     …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/backward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/pretrain/combined.pt:   0%|    …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: constituency
INFO:stanza:Done loading processors!


Helper function for extracting a particular sentence from a page in "fever.db".

In [None]:
def extract_sentence(string, target_id):
    """Return the remainder of the line in ``string`` that starts with ``target_id`` and a tab.

    ``string`` is split on newlines; the first line beginning with
    ``target_id + "\\t"`` is returned with that prefix removed. An empty
    string is returned when no line matches.

    Args:
        string: Newline-separated text whose lines start with an id and a tab.
        target_id: The id to look for, as a string.

    Returns:
        The text following the id and tab on the matching line, or "".
    """
    prefix = target_id + "\t"
    remainders = (
        row[len(prefix):]
        for row in string.split("\n")
        if row.startswith(prefix)
    )
    return next(remainders, "")

## Building the CSV file
This code parses the training samples in "train.jsonl" one by one and writes a CSV file containing each claim, its corresponding evidence, and its label.

In [None]:
import sqlite3
from tqdm import tqdm
import json
import csv

file_path = "train.jsonl"
OUTPUT_CSV = 'output.csv'
FIELDNAMES = ['claim', 'evidence', 'label']
EVIDENCE_SEPARATOR = "|"   # joins several evidence sentences in one CSV field
MAX_SAMPLES = 2000         # number of training samples to process


def _fetch_sentence(cursor, page_id, sentence_id):
    """Return sentence ``sentence_id`` of page ``page_id``, or None if the page is not in the database.

    The page's ``lines`` column is searched with ``extract_sentence``; only
    the text before the first tab of the matching line is kept and stripped.
    A page that exists but has no such sentence yields "".
    """
    cursor.execute("SELECT lines FROM documents where id=?", (page_id,))
    row = cursor.fetchone()
    if row is None:
        return None
    evidence = extract_sentence(row[0], str(sentence_id))
    return evidence.split("\t")[0].strip()


# Explicit UTF-8 so claims with non-ASCII characters load the same way on
# every platform; blank lines are skipped instead of failing in json.loads.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

conn = sqlite3.connect('fever.db')
cursor = conn.cursor()
try:
    # The file is opened once for the header and all rows, instead of being
    # reopened in append mode for every sample.
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writeheader()

        for line in tqdm(data[:MAX_SAMPLES]):
            claim = line["claim"]
            label = line["label"]
            if label != "NOT ENOUGH INFO":
                # Annotated sample: use the (page, sentence) pairs of the
                # first evidence set.
                targets = [(evid[2], evid[3]) for evid in line["evidence"][0]]
            else:
                # No annotated evidence: take sentence 0 of each page that
                # nearestP suggests for the claim.
                targets = [(wiki_page, 0) for wiki_page in (nearestP(claim) or [])]

            evidence_list = []
            for page_id, sentence_id in targets:
                sentence = _fetch_sentence(cursor, page_id, sentence_id)
                # Pages missing from the database are skipped in both branches.
                if sentence is not None:
                    evidence_list.append(sentence)

            writer.writerow({
                'claim': claim,
                'evidence': EVIDENCE_SEPARATOR.join(evidence_list),
                'label': label,
            })
finally:
    # Release the database handle even if a sample fails part-way through.
    cursor.close()
    conn.close()

# Reload "output.csv" into a list of records, splitting the "|"-joined
# evidence column back into a list of sentences.
separator = "|"
training_data = []
with open('output.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    training_data.extend(
        {
            'claim': row['claim'],
            'evidence': row['evidence'].split(separator),
            'label': row['label'],
        }
        for row in reader
    )

100%|██████████| 2000/2000 [15:35<00:00,  2.14it/s]
