# Fact-Checking Engine

## Test data path

This constant holds the path to a test data file. Change this path accordingly. We recommend placing the test data file in the data folder within the unzipped directory and using a relative path.

In [1]:
# Path of the N-Triples file holding the test statements; it is parsed into
# `test_graph` in the "Load Data" cell. Prefer a path relative to the notebook.
TEST_DATA_PATH = "data/fokg-sw-test-2024.nt"

## Import libraries

Import the necessary libraries and modules.

In [2]:
import rdflib
from rdflib import Graph, URIRef, RDF, Namespace
from rdflib.plugins.sparql import prepareQuery

import pandas as pd
import numpy as np
import os
import torch
import pickle

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

from pykeen.triples import TriplesFactory
from pykeen.models import TransE
from pykeen.training import SLCWATrainingLoop
from pykeen.losses import MarginRankingLoss

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

Load the reference, training, and test knowledge graphs.

In [3]:
def _parse_nt(path):
    """Parse the N-Triples file at `path` into a fresh rdflib Graph and return it."""
    graph = Graph()
    graph.parse(path, format="nt")
    return graph


# Reference knowledge graph (used later to train the embedding model)
reference_kg = _parse_nt("data/reference-kg.nt")
print("Reference Knowledge Graph length.", len(reference_kg))

# Training statements
train_graph = _parse_nt("data/fokg-sw-train-2024.nt")
print("Training data length.", len(train_graph))

# Test statements (location is configured through TEST_DATA_PATH)
test_graph = _parse_nt(TEST_DATA_PATH)
print("Test data length.", len(test_graph))


Reference Knowledge Graph length. 675859
Training data length. 5000
Test data length. 2000


## Training TransE model with reference_kg

Prepare the reference triples from the reference knowledge graph. Train the TransE model to learn embeddings for entities and relations in the reference knowledge graph, which can later be used for downstream tasks like link prediction or fact-checking.

In [4]:
# Keep only triples whose subject AND object are IRIs (URIRef); every other
# triple in the reference graph is skipped. Terms are stored as plain strings.
reference_triples = []
for s_term, p_term, o_term in reference_kg:
    if isinstance(s_term, rdflib.URIRef) and isinstance(o_term, rdflib.URIRef):
        reference_triples.append((str(s_term), str(p_term), str(o_term)))
print("Number of triples in reference_kg: ", len(reference_triples))

# PyKEEN builds its entity/relation label -> id mappings from this label array.
labeled_reference_triples = np.array(reference_triples, dtype=object)
reference_factory = TriplesFactory.from_labeled_triples(labeled_reference_triples)

def train_model(
    embedding_dim=200,
    margin=1.0,
    num_epochs=10,
    batch_size=256,
    learning_rate=1e-3,
    num_negs_per_pos=10,
    random_seed=42,
):
    """
    Train the TransE model on the reference knowledge graph.

    The triples are taken from the module-level `reference_factory`. All
    hyperparameters default to the values that were previously hard-coded, so
    calling `train_model()` without arguments behaves as before.

    Parameters
    ----------
    embedding_dim : int
        Dimension of the entity and relation embeddings.
    margin : float
        Margin of the margin ranking loss.
    num_epochs : int
        Number of training epochs (can be reduced to 5 for faster computation).
    batch_size : int
        Training batch size.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    num_negs_per_pos : int
        Number of negative samples generated per positive triple.
    random_seed : int
        Seed handed to the TransE model for reproducible initialisation.

    Returns
    -------
    The trained TransE model.
    """
    # TransE Model
    model = TransE(
        triples_factory=reference_factory,
        embedding_dim=embedding_dim,
        scoring_fct_norm=1,  # L1 distance
        loss=MarginRankingLoss(margin=margin),  # margin-based ranking
        random_seed=random_seed,
    )

    # sLCWA training: positives are contrasted with sampled negatives.
    training_kg_loop = SLCWATrainingLoop(
        model=model,
        triples_factory=reference_factory,
        optimizer="adam",
        optimizer_kwargs={"lr": learning_rate},
        negative_sampler="basic",
        negative_sampler_kwargs={"num_negs_per_pos": num_negs_per_pos},
    )

    print(f"TransE training parameters epochs={num_epochs}, batch_size={batch_size}")
    # The per-epoch losses returned by train() are not needed here.
    _ = training_kg_loop.train(
        triples_factory=reference_factory,
        num_epochs=num_epochs,
        batch_size=batch_size,
        use_tqdm=True,
    )

    print("TransE model trained with reference_kg.")
    return model


Number of triples in reference_kg:  660000


## Use a Pretrained Model or Train a New One

1. Faster Computation with Pretrained Model:
If you want faster computation, the script will use the pretrained model (transE_model.pkl) if it exists in the same directory.

2. Train the Model Yourself:
If you want to train the model from scratch, delete the "transE_model.pkl" file before running the next cell; the model is then trained and saved under that name.

In [5]:
# Location of the cached (pickled) TransE model
model_path = "transE_model.pkl"

if not os.path.exists(model_path):
    # No cached model available: train one and persist it with pickle.
    model = train_model()
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    print(f"Model saved to {model_path}")
else:
    # A cached model is present: reuse it instead of training again.
    print(f"File '{model_path}' already exists. Skipping save.")
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # model file that comes from a trusted source.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

File 'transE_model.pkl' already exists. Skipping save.


## Encoding entities and relations

In [6]:
# First (index 0) entity and relation representation modules of the model.
entity_representation, relation_representation = (
    model.entity_representations[0],
    model.relation_representations[0],
)

# Label -> integer id lookups created by the reference triples factory.
entity_to_id, relation_to_id = (
    reference_factory.entity_to_id,
    reference_factory.relation_to_id,
)

## Preparing Train and Test data

Extracts training and test data from RDF graphs using SPARQL queries. 
Training data includes triples and their truth values for supervised learning, while test data includes triples and their corresponding statement IRIs for evaluation.


In [7]:
# Train data: reified statements together with their truth value
query_train = prepareQuery("""
    SELECT ?stmt ?subject ?predicate ?object ?truthValue
    WHERE {
        ?stmt a <http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement> .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#subject> ?subject .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate> ?predicate .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#object> ?object .
        ?stmt <http://swc2017.aksw.org/hasTruthValue> ?truthValue .
    }
""")

# Materialise the result rows once so triples and labels stay aligned.
train_rows = list(train_graph.query(query_train))
train_triples = [
    (s_term.toPython(), p_term.toPython(), o_term.toPython())
    for _stmt, s_term, p_term, o_term, _truth in train_rows
]
train_truthValue = [truth.toPython() for *_terms, truth in train_rows]

# Test data: reified statements without a truth value; the statement IRI is
# kept so that each prediction can be attributed to its statement.
query_test = prepareQuery("""
    SELECT ?stmt ?subject ?predicate ?object
    WHERE {
        ?stmt a <http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement> .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#subject> ?subject .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate> ?predicate .
        ?stmt <http://www.w3.org/1999/02/22-rdf-syntax-ns#object> ?object .
    }
""")

# Materialise the result rows once so triples and IRIs stay aligned.
test_rows = list(test_graph.query(query_test))
test_triples = [
    (s_term.toPython(), p_term.toPython(), o_term.toPython())
    for _stmt, s_term, p_term, o_term in test_rows
]
test_fact_iri = [stmt.toPython() for stmt, *_terms in test_rows]


## Encode facts with the embedding model

The function `get_embeddings_for_fact` retrieves the embeddings for a given fact (subject, predicate, object) from the trained TransE model and concatenates them into one feature vector; facts containing an entity or relation unknown to the model are mapped to an all-zero vector.
The cell then generates these feature vectors for all training and test triples. The training embeddings (X_train) are paired with their corresponding truth values (y_train), while the test embeddings (X_test) are prepared for prediction.

In [8]:
def get_embeddings_for_fact(subj, pred, obj):
    """
    Build the feature vector of a fact from the trained TransE model.

    Reads the module-level `model`, `entity_to_id` and `relation_to_id`.

    Parameters
    ----------
    subj, pred, obj
        Labels of the subject, predicate and object of the fact, as used as
        keys in `entity_to_id` / `relation_to_id`.

    Returns
    -------
    numpy.ndarray
        1-D array: subject, predicate and object embedding concatenated in
        that order. If any of the three labels is unknown to the model, an
        all-zero vector of length 3 * embedding dimension is returned instead.
    """
    entity_rep = model.entity_representations[0]
    relation_rep = model.relation_representations[0]

    if subj not in entity_to_id or obj not in entity_to_id or pred not in relation_to_id:
        # Unknown entity/relation: fall back to a zero vector of matching size.
        emb_dim = entity_rep._embeddings.weight.shape[-1]
        return np.zeros(3 * emb_dim)

    # Index tensors must live on the same device as the model weights;
    # otherwise the lookup fails when the model sits on a GPU.
    device = next(model.parameters()).device

    s_idx = torch.tensor([entity_to_id[subj]], device=device)
    p_idx = torch.tensor([relation_to_id[pred]], device=device)
    o_idx = torch.tensor([entity_to_id[obj]], device=device)

    # Pure inference: no gradient tracking is needed for the lookups.
    with torch.no_grad():
        s_emb = entity_rep(indices=s_idx)  # shape [1, dim]
        p_emb = relation_rep(indices=p_idx)
        o_emb = entity_rep(indices=o_idx)
        cat = torch.cat([s_emb[0], p_emb[0], o_emb[0]], dim=0)

    return cat.detach().cpu().numpy()


# Feature matrix and labels for the training statements
X_train = np.array([get_embeddings_for_fact(*fact) for fact in train_triples])
y_train = np.array(train_truthValue)

# Feature matrix for the test statements
X_test = np.array([get_embeddings_for_fact(*fact) for fact in test_triples])

## Create and Train a MLPClassifier Model

This code initializes and trains a Multi-Layer Perceptron (MLP) classifier with three hidden layers (256, 256, and 128 units), the ReLU activation function, and the Adam solver. It trains the model on the fact embeddings (X_train) and their corresponding truth values (y_train).
The AUC (Area Under the ROC Curve) score is computed on the training data; because it is measured on the same data the model was fitted on, it is an optimistic estimate and not a measure of generalization.
Finally, the trained model predicts the truth probabilities for the test data (X_test).

In [9]:
# Build and fit the classifier in one step (fit returns the estimator itself).
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 256, 128),
    activation="relu",
    solver="adam",
    max_iter=50,
    random_state=42,  # seed is fixed for reproducibility of result
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class.
train_probs = mlp.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)
print(f"Train AUC: {train_auc:.4f}")

# Scores for the test statements, written to the result file later on.
test_probs = mlp.predict_proba(X_test)[:, 1]

Train AUC: 1.0000


## Write it to result file

Writes the test data results into a file (result.ttl).
The lines have the following form:

<http://dice-research.org/data/fb15k-237.ttl#3> <http://swc2017.aksw.org/hasTruthValue> "5.11332036694511"^^<http://www.w3.org/2001/XMLSchema#double> .

In [10]:
# One N-Triples line per test statement: <statement IRI> hasTruthValue "score"^^xsd:double
with open("result.ttl", "w") as result_file:
    result_file.writelines(
        f'<{fact_iri}> <http://swc2017.aksw.org/hasTruthValue> "{score}"^^<http://www.w3.org/2001/XMLSchema#double> .\n'
        for fact_iri, score in zip(test_fact_iri, test_probs)
    )