# Fact checking

In [66]:
import os
import requests
import zipfile
import re
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
plt.rcParams['figure.figsize'] = [8, 6]
plt.rcParams['figure.dpi'] = 100
plt.rcParams['axes.xmargin'] = .05
plt.rcParams['axes.ymargin'] = .05
plt.style.use('ggplot')

## Dataset pre-processing

In [7]:
def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:
                f.write(chunk)


def download_data(data_path):
    toy_data_path = os.path.join(data_path, "fever_data.zip")
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(
                toy_url, params={"id": toy_data_url_id}, stream=True
            )
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

In [9]:
dataset_folder = os.path.join("datasets", "fever")
download_data(dataset_folder)

## Dataset conversion

In [28]:
dfs = []
for split in ["train", "test", "val"]:
    split_file = os.path.join(dataset_folder, f"{split}_pairs.csv")
    if os.path.isfile(split_file):
        split_df = pd.read_csv(split_file, index_col=0)
        split_df["split"] = pd.Series([split] * len(split_df), index=split_df.index)
        dfs.append(split_df)

df = pd.concat(dfs)
df.columns = map(str.lower, df.columns)
df.head()

Unnamed: 0,claim,evidence,id,label,split
0,Chris Hemsworth appeared in A Perfect Getaway.,2\tHemsworth has also appeared in the science ...,3,SUPPORTS,train
1,Roald Dahl is a writer.,0\tRoald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈ...,7,SUPPORTS,train
2,Roald Dahl is a governor.,0\tRoald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈ...,8,REFUTES,train
3,Ireland has relatively low-lying mountains.,10\tThe island 's geography comprises relative...,9,SUPPORTS,train
4,Ireland does not have relatively low-lying mou...,10\tThe island 's geography comprises relative...,10,REFUTES,train


In [29]:
df["label"].unique()

array(['SUPPORTS', 'REFUTES'], dtype=object)

In [105]:
BEFORE_TAB = re.compile(r"(.*)(?:\t)")


def preprocess_claim(text):
    if text[-1] in string.punctuation:
        text = text[:-1]
    return text.strip().lower()


def preprocess_evidence(text):
    tab_match = re.match(REMOVE_BEFORE_TAB, text)
    if tab_match is not None:
        text = text[tab_match.end() :]
    last_parens = text.rfind("(")
    if last_parens != -1:
        text = text[:last_parens]
    return text.strip().lower()

In [107]:
df["claim"] = df["claim"].apply(preprocess_claim)
df["evidence"] = df["evidence"].apply(preprocess_evidence)
df.head()

Unnamed: 0,claim,evidence,id,label,split
0,chris hemsworth appeared in a perfect getaway,rush,3,SUPPORTS,train
1,roald dahl is a writer,fighter pilot,7,SUPPORTS,train
2,roald dahl is a governor,fighter pilot,8,REFUTES,train
3,ireland has relatively low-lying mountains,rivers of ireland,9,SUPPORTS,train
4,ireland does not have relatively low-lying mou...,rivers of ireland,10,REFUTES,train


## Model definition

## Training

## Evaluation

## Comments/summary