In [1]:
from rdflib import Graph, Literal, RDFS, RDF, URIRef, XSD, Namespace
import pandas
from SPARQLWrapper import SPARQLWrapper, JSON
import os
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from joblib import Memory, Parallel, delayed
from collections import Counter
import networkx as nx
import matplotlib.pyplot as plt
import sys
import xgboost as xgb
from sklearn.svm import SVC

In [2]:
%cd C:\Users\Harshit Purohit\OneDrive\Desktop\Project Atlas\University\Foundations of Knowledge Graphs\Mini_Project\FOKG_mini_project

C:\Users\Harshit Purohit\OneDrive\Desktop\Project Atlas\University\Foundations of Knowledge Graphs\Mini_Project\FOKG_mini_project


In [3]:
def conv2list_train():
    """
    Loads the training data and returns nested list of training data.
    """
    target_file = "fokg-sw-train-2024.nt"

    train = Graph()
    train.parse(target_file, format="nt")

    NS1 = Namespace("http://swc2017.aksw.org/")
    train.bind("ns1", NS1)

    query = """
    SELECT ?st ?tv ?s ?p ?o
    WHERE{
        ?st a rdf:Statement .
        ?st ns1:hasTruthValue ?tv .
        ?st rdf:subject ?s .
        ?st rdf:predicate ?p .
        ?st rdf:object ?o .
    }
    """
    train_fc = []
    result = train.query(query)
    for row in result:
        train_fc.append([ row['st'], float(row['tv']), str(row['s']), str(row['p']), str(row['o'])])
    return train_fc

def conv2list_test():
    """
    Loads and returns the test data.
    """
    target_file = "fokg-sw-test-2024.nt"

    test = Graph()
    test.parse(target_file, format="nt")

    query = """
    SELECT ?st ?s ?p ?o
    WHERE{
        ?st a rdf:Statement .
        ?st rdf:subject ?s .
        ?st rdf:predicate ?p .
        ?st rdf:object ?o .
    }
    """

    test_fc = []
    result = test.query(query)
    for row in result:
        test_fc.append([ row['st'], str(row['s']), str(row['p']), str(row['o'])])

    return test_fc

In [4]:
def prune(graph):
    """
    Removes all literal values from an RDF graph, keeping only URI relationships.
    """
    filtered_graph = Graph()
    for s , p, o in graph:
        if not isinstance(o, Literal):
            filtered_graph.add((s,p,o))
    return filtered_graph

In [5]:
def generate_query_template(j):
    """
    Generates SPARQL query templates for finding paths between nodes.
    """
    query_templates = []

    for i in range(j):
        select = "SELECT"
        triples = ""
        for k in range(i+1):
            select += f" ?p{k+1}"
            triples += f"\t?v{k} ?p{k+1} ?v{k+1} .\n\t"
        query = f"""
        {select}
        WHERE {{
            {triples}
        }}
        """
        query_templates.append(query)

    return query_templates

In [6]:
def prune_result(results):
    """
    Filters out basic RDF/RDFS predicates from retrieved paths.
    
    Removes terminology predicates like subClassOf, range, domain and type
    from the paths found between entities.
    """
    predicates = []
    terminology_predicates = [RDFS.subClassOf, RDFS.range, RDFS.domain, RDF.type]
    for result in results:
        if not any(i in terminology_predicates for i in result ):
            result = [str(i) for i in result]
            predicates.append(result)


    return predicates

In [7]:
def path_finding(graph,s,o):
    """
    Finds all paths between two nodes in an RDF graph up to a maximum length.
    """
    k = 3
    for j in range(1,k+1):
        query_templates = generate_query_template(j)
        for query_template in query_templates:
            query = query_template.replace("?v0",f"{s}")
            query = query.replace(f"?v{j}", f"{o}")
            results = graph.query(query)
            paths = prune_result(results)
    return paths

In [8]:
graph = Graph()
graph.parse("reference-kg.nt", format="nt")
print(f"Length Before Pruning: {len(graph)}")
graph = prune(graph)
print(f"Length After Pruning: {len(graph)}")

Length Before Pruning: 675859
Length After Pruning: 660000


In [9]:
def train():
    """
    Extracts path-based features from training data using the reference graph.
    """
    train = conv2list_train()
    X = []
    for row in train:
        s = f"<{row[2]}>"
        o = f"<{row[4]}>"
        paths = path_finding(graph,s,o)
        path_counter = dict(Counter(map(tuple,paths)))
        X.append(path_counter)
    return X

In [10]:
X_train = train()
y_train = [i[1] for i in conv2list_train()]
print(len(y_train), len(X_train))

1000 1000


In [11]:
def advanced_hyperparameter_tuning(xgb_clf, X_train_xgb, y_train_xgb):
    """
    Run grid search or randomized search to find optimal hyperparameters
    for XGBoost.
    """
    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
    }
    
    grid_search = GridSearchCV(xgb_clf, param_grid, cv=3, scoring='accuracy')
    
    grid_search.fit(X_train_xgb, y_train_xgb)
    print("Best parameters found:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    
    return grid_search.best_estimator_

In [12]:
def cross_validation_example(model, X, y, cv=5):
    """
    Runs K-Fold cross-validation on a given model with existing training data.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"{cv}-Fold CV Scores:", scores)
    print("Average:", scores.mean(), "Std Dev:", scores.std())

In [13]:
class Ensemble:
    """
    An ensemble combining:
    1) XGBoost
    2) Support Vector Machine
    3) Random Forest
    with three different feature extraction methods.
    """

    def fit(self, X, y):
        self.vec_xgb = DictVectorizer()
        X_train_xgb = self.vec_xgb.fit_transform(X)
        self.xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        self.xgb_clf = advanced_hyperparameter_tuning(self.xgb_clf, X_train_xgb, y)
        cross_validation_example(self.xgb_clf, X_train_xgb, y)
        self.xgb_clf.fit(X_train_xgb, y)

        flattened_data_svm = [list(d.keys()) for d in X]
        self.encoder_svm = MultiLabelBinarizer()
        X_train_svm = self.encoder_svm.fit_transform(flattened_data_svm)
        self.svm_clf = SVC(probability=True)
        self.svm_clf.fit(X_train_svm, y)

        flattened_data_rf = [" ".join(["_".join(key) for key in d.keys()]) for d in X]
        self.vectorizer_rf = CountVectorizer()
        X_train_rf = self.vectorizer_rf.fit_transform(flattened_data_rf)
        self.rf_clf = RandomForestClassifier()
        self.rf_clf.fit(X_train_rf, y)

    def voting(self):
        """
        Obtains the final predictions from majority voting.
        """
        final_preds = []
        for i in range(len(self.predictions[0])):
            votes = [self.predictions[0][i], self.predictions[1][i], self.predictions[2][i]]
            if votes.count(1) > votes.count(0):
                final_preds.append(1)
            else:
                final_preds.append(0)
        return final_preds

    def predict(self, X):
        """
        Makes predictions by transforming X for each classifier,
        collecting predictions, then calling voting().
        """
        X_test_xgb = self.vec_xgb.transform(X)
        pred_xgb = self.xgb_clf.predict(X_test_xgb)

        flattened_data_svm = [list(d.keys()) for d in X]
        X_test_svm = self.encoder_svm.transform(flattened_data_svm)
        pred_svm = self.svm_clf.predict(X_test_svm)

        flattened_data_rf = [" ".join(["_".join(key) for key in d.keys()]) for d in X]
        X_test_rf = self.vectorizer_rf.transform(flattened_data_rf)
        pred_rf = self.rf_clf.predict(X_test_rf)

        self.predictions = [pred_xgb, pred_svm, pred_rf]
        return self.voting()

In [14]:
model = Ensemble()
model.fit(X_train,y_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters found: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}
Best score: 0.6829524134913356


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



5-Fold CV Scores: [0.75  0.685 0.645 0.69  0.65 ]
Average: 0.6839999999999999 Std Dev: 0.03760319135392632


Parameters: { "use_label_encoder" } are not used.



In [15]:
def test():
    """
    Extracts test data using the reference graph.
    """
    test = conv2list_test()
    X = []
    for row in test:
        s = f"<{row[1]}>"
        o = f"<{row[3]}>"
        paths = path_finding(graph,s,o)
        path_counter = dict(Counter(map(tuple,paths)))
        X.append(path_counter)
    statements = [i[0] for i in test]
    return X, statements

In [16]:
def save_result(pred, statements):
    """
    Creates an RDF graph with prediction results and saves it to a Turtle file.
    """
    print(len(pred), len(statements))
    test_graph = Graph()
    predicate = URIRef("http://swc2017.aksw.org/hasTruthValue")
    for i in range(len(statements)):
        test_graph.add((statements[i], predicate,Literal(pred[i], datatype=XSD.double)))

    test_graph.serialize("test_result_final.ttl",format='ttl')

In [17]:
X_test, statements = test()
pred = model.predict(X_test)



In [18]:
save_result(pred, statements)

500 500
