# COMP90042 Project 2019: Automatic Fact Verification

## Step1: build index for doc content query

In [1]:
import os

root_path = '../'
wiki_pages_path = root_path + 'materials/wiki-pages-text/'
# Keep only the wiki text files: a stray entry in the directory (e.g. '.DS_Store')
# would fail to decode as UTF-8 and would shift the positions in doc_line_indexes,
# which are later addressed through the number in the page file name.
pages_list = sorted(name for name in os.listdir(wiki_pages_path) if name.endswith('.txt'))

doc_line_indexes = []  # one dictionary per page file, in pages_list order, mapping
                       # a doc name to the 0-based line index where the doc starts
## [{'1986_NBA_Finals': 0,
##   '1789_Dobrovolsky': 227,
##   '1596_in_Scotland': 618, ...},
##   {...}, ...]

head_pages_index = {}  # first 4 characters of a doc name -> page files containing such docs
## {'1986':['../materials/wiki-pages-text/wiki-001.txt', '../materials/wiki-pages-text/wiki-002.txt', '../materials/wiki-pages-text/wiki-003.txt'],
##  '1789':['../materials/wiki-pages-text/wiki-001.txt', '../materials/wiki-pages-text/wiki-002.txt'],
##  '1596':['../materials/wiki-pages-text/wiki-001.txt', '../materials/wiki-pages-text/wiki-002.txt']}

for page_name in pages_list:  # 'wiki-001.txt'
    page_path = wiki_pages_path + page_name  # '../materials/wiki-pages-text/wiki-001.txt'
    doc_line_index = {}  # doc name -> line index of the first line of its run in this page
    pre = ''  # doc name seen on the previous non-blank line
    with open(page_path, 'r', encoding='utf-8') as f:
        # iterate the file lazily instead of loading every line into a list first
        for line_id, line in enumerate(f):
            tokens = line.split()  # ['1986_NBA_Finals', '2', 'The', 'Celtics', 'defeated', ...]
            if not tokens:
                # blank line: there is no doc name to index (tokens[0] would raise)
                continue
            doc_name = tokens[0]  # '1986_NBA_Finals'
            if doc_name != pre:  # a new run of lines for this doc starts here
                doc_line_index[doc_name] = line_id  # record its line id
                head = doc_name[:4]  # '1986_NBA_Finals' => '1986'
                pages_for_head = head_pages_index.setdefault(head, [])
                if page_path not in pages_for_head:
                    pages_for_head.append(page_path)
                pre = doc_name
    doc_line_indexes.append(doc_line_index)
    

In [None]:
# Spot-check the head index: list the page files recorded for the prefix 'Bawb'.
sample_head = 'Bawb'
print(head_pages_index[sample_head])

## Step2: get content and save as a new file

In [20]:
import json
import unicodedata
import time

start = time.time()

def get_content(doc_name, sent_ids):
    """Return the raw wiki lines that hold the requested sentences of one document.

    Example: get_content("Party_of_Hellenism", [3]); a single id such as
    get_content("Party_of_Hellenism", 3) is accepted as well.

    Relies on two notebook-level indexes: head_pages_index (first four
    characters of a doc name -> page file paths) and doc_line_indexes (one
    {doc_name: start line index} dictionary per page file, selected through
    the three-digit number at the end of the page file name).

    Args:
        doc_name: document name as it appears at the start of a wiki line.
        sent_ids: iterable of sentence ids to fetch, or a single int id.

    Returns:
        The matching lines (trailing newline kept), page by page in file
        order. A line is repeated when its id is listed more than once.

    Raises:
        KeyError: if the first four characters of doc_name are not in
            head_pages_index.
    """
    if isinstance(sent_ids, int):
        # the documented call style passes one id; wrap it so it can be iterated
        sent_ids = [sent_ids]
    # The trailing space keeps id 3 from also matching ids 30, 31, ...
    prefixes = [(doc_name + " " + str(sent_id) + " ", sent_id) for sent_id in sent_ids]

    contents = []
    head = unicodedata.normalize('NFC', doc_name)[:4]
    for evidence_page in head_pages_index[head]:
        # '.../wiki-001.txt' -> 0: position of this page's dictionary in doc_line_indexes
        wiki_index = int(evidence_page[-7:-4]) - 1
        first_line = doc_line_indexes[wiki_index].get(doc_name)
        if first_line is None:
            # this page has docs sharing the 4-character head, but not this doc
            continue
        with open(evidence_page, 'r', encoding='utf-8') as f:
            # stream the file and skip ahead instead of loading and slicing it
            for line_no, line in enumerate(f):
                if line_no < first_line:
                    continue
                for prefix, sent_id in prefixes:
                    if line.startswith(prefix):
                        contents.append(line)
                        # sanity check: report a line whose first two tokens
                        # differ from the requested doc name / sentence id
                        doc, _id = line.split()[:2]
                        if doc != doc_name or _id != str(sent_id):
                            print(wiki_index+1, doc, doc_name, _id, sent_id)
    return contents

## Replace every evidence reference of the train set by the matching wiki line
## and write the result to a new json file.
train_path = root_path + 'materials/160train.json'
save_path = root_path + 'materials/160train_pro.json'
new_dic = {}
with open(train_path, 'r', encoding='utf-8') as f:
    train_dic = json.load(f)

for _id, record in train_dic.items():
    claim = unicodedata.normalize('NFC', record['claim'])
    label = record['label']

    # group the sentence ids per document so each document is looked up once
    doc_sentids = {}
    for evid in record['evidence']:
        doc_sentids.setdefault(evid[0], []).append(evid[1])

    evidence_contents = []
    for doc_name, wanted_ids in doc_sentids.items():
        try:
            found = get_content(unicodedata.normalize('NFC', doc_name), wanted_ids)
            evidence_contents.extend(found)
        except Exception as e:
            # best effort: report the failing document and keep going
            print(e, doc_name, wanted_ids)

    new_dic[_id] = {'claim': claim, 'evidence': evidence_contents, 'label': label}

with open(save_path, 'w', encoding='utf-8') as f:
    json.dump(new_dic, f, indent=4)

print('time used: ', time.time() - start)

time used:  131.25512504577637


## build entity index

In [None]:
# from allennlp.predictors.predictor import Predictor
# predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

import os

# Location of the Stanford NER distribution. Set the STANFORD_NER_HOME environment
# variable to override this machine-specific default.
stanford_ner_home = os.environ.get('STANFORD_NER_HOME',
                                   '/Users/liurongxiao/pythonAPI/stanford-ner-2018-10-16')
st = StanfordNERTagger(stanford_ner_home + '/classifiers/english.muc.7class.distsim.crf.ser.gz',
                       stanford_ner_home + '/stanford-ner.jar',
                       encoding='utf-8')

import unicodedata
import json
import time
start = time.time()

root_path = '../'
entities = []  # distinct tokens tagged as part of an entity, in first-seen order
seen_entities = set()  # same tokens as `entities`, kept for constant-time membership tests
train_path = root_path + 'materials/160train.json'
with open(train_path,'r',encoding='utf-8') as f:
    dic = json.load(f)
    num = 0
    for _id in dic:
        if num > 10:  # only the first 11 claims are tagged (quick trial run)
            break
        num += 1

        claim = unicodedata.normalize('NFC', dic[_id]['claim'])

        try:
            # tag() expects a list of tokens; a raw string would be split into
            # single characters and every character tagged on its own
            classified_text = st.tag(word_tokenize(claim))
            for token, tag in classified_text:
                # 'O' marks tokens outside any entity
                if tag != 'O' and token not in seen_entities:
                    seen_entities.add(token)
                    entities.append(token)
            print(num)
        except Exception as e:
            # report the claim that failed (no `line` variable exists in this cell)
            print(e, claim)
print('time used: ', time.time() - start)
#     all_entities.append(entities)
# with open('./entities.json','w',encoding='utf-8') as f:
# #     json.dump(entities,f,indent=4)
#     f.write(entities)

# print('time used: ', time.time() - start)

# Compare the processed train set with the original train set

In [6]:
import json
import unicodedata
# Check that each processed item kept exactly the "<doc name> <sentence id>" pairs
# listed as evidence in the original file; print both lists when they differ.
with open("../materials/train_pro.json", 'r', encoding='utf-8') as f:
    dic1 = json.load(f)
with open("../materials/train.json", 'r', encoding='utf-8') as f:
    dic2 = json.load(f)

for _id, processed in dic1.items():
    # processed evidence is a full wiki line: keep its first two tokens
    list1 = [' '.join(sent.split()[:2]) for sent in processed["evidence"]]
    list1.sort()
    # original evidence is a [doc name, sentence id] pair
    list2 = [unicodedata.normalize('NFC', evid[0]) + " " + str(evid[1])
             for evid in dic2[_id]["evidence"]]
    list2.sort()
    if list1 != list2:
        print(list1)
        print(list2)

In [None]:
# For every wiki page file that contains the phrase, print the character offset
# of its first occurrence in that file.
for page_name in pages_list:  # 'wiki-001.txt'
    page_path = wiki_pages_path + page_name  # '../materials/wiki-pages-text/wiki-001.txt'
    with open(page_path, 'r', encoding='utf-8') as f:
        f_content = f.read()
    offset = f_content.find("Adrienne Bailon")
    if offset != -1:
        print(offset)

## Evaluation: label dev-set claims with a pretrained entailment model

In [2]:
import os
import json
import unicodedata
import time
import re
from collections import Counter

from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/decomposable-attention-elmo-2018.02.19.tar.gz")

pronoun = ["he", "she", "they", "it"]
possessive = ["his", "her", "their", "its"]
# bracket placeholder tokens found in the evidence text and the characters they stand for
bracket_tokens = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']'}
max_examples = 100  # stop after this many claims to keep the run short


def _evidence_to_sentence(line):
    """Turn one evidence line ("<doc name> <sentence id> <text ...>") into plain text.

    Bracket placeholders are restored, the doc name and sentence id are
    dropped, and stand-alone pronouns / possessives are replaced by the
    document topic (the doc name with underscores turned into spaces and any
    parenthesised part removed).

    Args:
        line: one evidence line as stored in the processed json file.

    Returns:
        The rewritten sentence text; an empty string for a blank line.
    """
    for token, char in bracket_tokens.items():
        line = line.replace(token, char)
    tokens = line.strip("\n ").split()
    if not tokens:
        # blank evidence line: nothing to extract (tokens[0] would raise)
        return ""
    # 'Foo_(film)' -> 'Foo'; re-joining on whitespace drops the space that is
    # left behind when the parenthesised part is removed
    topic = " ".join(re.sub(r"\(.*?\)", "", " ".join(tokens[0].split("_"))).split())

    evidence = tokens[2:]
    for i, word in enumerate(evidence):
        lowered = word.lower()
        if lowered in pronoun:
            evidence[i] = topic
        elif lowered in possessive:
            evidence[i] = topic + "'s"
    return " ".join(evidence)


start = time.time()

total = 0
correct = 0

with open("../materials/devset_pro_NFC.json", 'r', encoding='utf-8') as f:
    dic = json.load(f)
print(len(dic))

for _id in dic:
    label = dic[_id]["label"]
    claim = dic[_id]["claim"]
    evidences = dic[_id]["evidence"]
    if not evidences:
        # without any evidence there is nothing to feed the entailment model
        final_judge = "NOT ENOUGH INFO"
    else:
        premise = " ".join(_evidence_to_sentence(line) for line in evidences)
        pred = predictor.predict(premise=premise, hypothesis=claim)
        probs = pred["label_probs"]
        # Class 0 is mapped to SUPPORTS and every other class to REFUTES
        # (presumably index 0 is the model's entailment class; TODO confirm)
        if probs.index(max(probs)) == 0:
            final_judge = "SUPPORTS"
        else:
            final_judge = "REFUTES"

    if final_judge == label:
        correct += 1
    else:
        print(_id, label, final_judge)
    total += 1
    if total == max_examples:
        break


print('time used: ', time.time() - start)
print(correct,total)
# correct / total is the share of claims labelled correctly, i.e. accuracy
if total:
    print("Accuracy:", str(correct/total))
else:
    print("Accuracy: n/a (no examples evaluated)")

# s0 = 'It is a GIRL, his (phone)'
# s1 = re.sub(r'\(.*?\)', '', s0)
# print(s1)


Did not use initialization regex that was passed: .*token_embedder_tokens\._projection.*weight


5001
137334 SUPPORTS REFUTES
111897 REFUTES SUPPORTS
54168 REFUTES SUPPORTS
204443 REFUTES SUPPORTS
192714 SUPPORTS REFUTES
107786 SUPPORTS REFUTES
197381 REFUTES SUPPORTS
142454 REFUTES SUPPORTS
104386 REFUTES SUPPORTS
128123 REFUTES SUPPORTS
41665 REFUTES SUPPORTS
21775 REFUTES SUPPORTS
66638 REFUTES SUPPORTS
227130 REFUTES SUPPORTS
114567 REFUTES SUPPORTS
163980 REFUTES SUPPORTS
34412 REFUTES SUPPORTS
79538 REFUTES SUPPORTS
172478 REFUTES SUPPORTS
60977 REFUTES SUPPORTS
32820 REFUTES SUPPORTS
186996 SUPPORTS REFUTES
68084 SUPPORTS REFUTES
4713 REFUTES SUPPORTS
time used:  81.8645761013031
76 100
Precision: 0.76
