## Installs and Imports

In [2]:
!pip install wikitextparser
!pip install rdflib
!pip install nltk
!pip install wittgenstein
!pip install -U imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Default

In [3]:
pip install -U xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import xgboost as xgb

In [5]:
import logging
from typing import Iterator, Optional, Union
from enum import Enum
import wikitextparser as wtp
import bz2
from lxml import etree
import re
import requests
from rdflib import Graph, URIRef, RDF, RDFS
import pickle
from collections import defaultdict
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import wittgenstein as lw
import numpy as np
import pandas as pd
import concurrent.futures
import time
import sys
import traceback
import multiprocessing
#import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler
# Ensure you have NLTK sentence tokenizer downloaded
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/ma/ma_ma/ma_nfuerhau/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## WikiPageParser

In [1]:
# LOGGING
def get_logger():
    return logging.getLogger('impl')

In [7]:
class WikiPageParser:
    """Parse WikiText as stream and return content based on page markers (only for simple article pages)."""
    def __init__(self):
        self.processed_pages = 0
        self.page_markup = {}
        self.title = None
        self.namespace = None
        self.tag_content = ''

    def start(self, tag, _):
        if tag.endswith('}page'):
            self.title = None
            self.namespace = None
            self.processed_pages += 1
            if self.processed_pages % 100000 == 0:
                get_logger().debug(f'Parsed markup of {self.processed_pages} pages.')

    def end(self, tag):
        if tag.endswith('}title'):
            self.title = self.tag_content.strip()
        elif tag.endswith('}ns'):
            self.namespace = self.tag_content.strip()
        elif tag.endswith('}text') and self._valid_page():
            self.page_markup[self.title] = self.tag_content.strip()
        self.tag_content = ''

    def data(self, chars):
        self.tag_content += chars

    def close(self) -> dict:
        return self.page_markup

    def _valid_page(self) -> bool:
        return self.namespace in ['0', '10', '14']  # 0 = Page, 10 = Template, 14 = Category

In [8]:
class Namespace(Enum):
    OWL = 'http://www.w3.org/2002/07/owl#'
    WIKIPEDIA = 'http://en.wikipedia.org/wiki/'

    PREFIX_TEMPLATE = 'Template:'
    PREFIX_CATEGORY = 'Category:'
    PREFIX_FILE = 'File:'
    PREFIX_IMAGE = 'Image:'
    PREFIX_LIST = 'List_of_'
    PREFIX_LISTS = 'Lists_of_'
    PREFIX_LISTCATEGORY = PREFIX_CATEGORY + PREFIX_LISTS

    DBP_ONTOLOGY = 'http://dbpedia.org/ontology/'
    DBP_RESOURCE = 'http://dbpedia.org/resource/'
    DBP_TEMPLATE = DBP_RESOURCE + PREFIX_TEMPLATE
    DBP_CATEGORY = DBP_RESOURCE + PREFIX_CATEGORY
    DBP_FILE = DBP_RESOURCE + PREFIX_FILE
    DBP_IMAGE = DBP_RESOURCE + PREFIX_IMAGE
    DBP_LIST = DBP_RESOURCE + PREFIX_LIST

    CLG_ONTOLOGY = 'http://caligraph.org/ontology/'
    CLG_RESOURCE = 'http://caligraph.org/resource/'

class RdfClass(Enum):
    OWL_THING = 'http://www.w3.org/2002/07/owl#Thing'
    OWL_CLASS = 'http://www.w3.org/2002/07/owl#Class'
    OWL_NAMED_INDIVIDUAL = 'http://www.w3.org/2002/07/owl#NamedIndividual'
    OWL_OBJECT_PROPERTY = 'http://www.w3.org/2002/07/owl#ObjectProperty'
    OWL_DATATYPE_PROPERTY = 'http://www.w3.org/2002/07/owl#DatatypeProperty'

In [9]:
def is_entity_name(name: str) -> bool:
    invalid_prefixes = (Namespace.PREFIX_LIST.value, Namespace.PREFIX_FILE.value, Namespace.PREFIX_IMAGE.value,
                        Namespace.PREFIX_CATEGORY.value, Namespace.PREFIX_TEMPLATE.value)
    return name and not name.startswith(invalid_prefixes)

In [10]:
def name2iri(name: str, prefix: Union[str, Enum]) -> str:
    if name == 'Thing':
        return RdfClass.OWL_THING.value
    prefix = prefix.value if isinstance(prefix, Enum) else prefix
    return prefix + name

def name2resource_iri(name: str) -> str:
    return name2iri(name, Namespace.DBP_RESOURCE)

def label2name(label: str) -> str:
    return label.replace(' ', '_')

In [15]:
def capitalize(text: str) -> str:
    return text[0].upper() + text[1:] if len(text) > 1 else text.upper()

In [16]:
def _remove_language_tag(link_target: str) -> str:
    if not link_target or link_target[0] != ':':
        return link_target
    if len(link_target) < 4 or link_target[3] != ':':
        return link_target[1:]
    return link_target[4:]

In [17]:
def get_resource_name_for_wikilink(wikilink: wtp.WikiLink) -> Optional[str]:
    return label2name(capitalize(_remove_language_tag(wikilink.target.strip())))

## Extract Abstract and Calculate Features

In [11]:
# Own implementation

def extract_abstract(wikipage: wtp.WikiText) -> str:
  parsed_page = wtp.parse(wikipage)
  try:
    result = parsed_page.plain_text(replace_bolds_and_italics=True, replace_wikilinks=False).strip(" '\t\n")
    result = re.sub(r'\n+', '\n', result)
    result = re.sub(r' +', ' ', result)
    result = result.split('==', 1)[0]
    if result.split('\n')[0].startswith('[[File'):
      result = result.split('\n')[1:][0]
    return result
  except(IndexError):

    result = parsed_page.get_sections(level=0)[0].pformat()
    result = result.split('\n\n')[1:]
    result = "\n".join(result)
    return result

In [12]:
# Own implementation

def extract_wikilinks(wikipage: wtp.WikiText) -> [str]:
    abstract = extract_abstract(wikipage)
    parsed_abstract = wtp.parse(abstract)
    links = []
    for link in parsed_abstract.wikilinks:
        links.append(name2resource_iri(get_resource_name_for_wikilink(link)))
    return links

In [13]:
# Example feature extraction function
def extract_features(candidate_link, entities, candidates, sentence_idx: int) -> dict:
    features = {}
    
    all_entities = []
    for sentence in entities:
        for l in sentence:
            all_entities.append(l)
            
    all_candidates = []
    for candidate in candidates:
        all_candidates.append(candidate[0])
        
    candidates_in_sentence = [candidate[0] for candidate in candidates if candidate[1] == sentence_idx]

    # Feature F00: Total number of entities in the abstract (all sentences)
    features["F00"] = len(candidates)
    
    # Feature F01: Number of candidates in the candidate's sentence
    features["F01"] = len(candidates_in_sentence)
    
    # Feature F02: Position of candidate w.r.t. all other candidates in the abstract
    features["F02"] = all_candidates.index(candidate_link)
    
    # Feature F03: Position of candidate w.r.t candidates in the sentence
    features["F03"] = candidates_in_sentence.index(candidate_link)
    
    # Feature F04: Number of entities in the sentence
    features["F04"] = len(entities[sentence_idx])
    
    # Feature F05: Position of candidate w.r.t all other entities in the abstract
    features["F05"] = all_entities.index(candidate_link)
    
    # Feature F06: Position of candidate w.r.t entities in sentence
    features["F06"] = entities[sentence_idx].index(candidate_link)
    
    # Feature F07: Position of candidates sentence in the abstract
    features["F07"] = sentence_idx #TODO: do I have to put + 1?
    
    features_scaled = {key:(1/(pow(2, value))) for (key, value) in features.items()}

    return features_scaled

In [14]:
# Own implementation

def get_domain_range_relation(dbpedia_ontology: Graph, relationURL: str) -> (str, str):
  relation_uri = URIRef(relationURL)
  domain = None
  range = None

  for s, p, o in dbpedia_ontology.triples((relation_uri, RDF.type, None)):
      for s2, p2, o2 in dbpedia_ontology.triples((s, RDFS.domain, None)):
        if o2:
          domain = o2
        else:
          domain = None
      for s2, p2, o2 in dbpedia_ontology.triples((s, RDFS.range, None)):
        if o2:
          range = o2
        else:
          range = None

  return domain, range

## Data Retrieval

### Get abstracts

In [18]:
urls = [
    'enwiki-20230901-pages-articles-multistream1.xml-p1p41242.bz2',
    'enwiki-20230901-pages-articles-multistream2.xml-p41243p151573.bz2',
    'enwiki-20230901-pages-articles-multistream3.xml-p151574p311329.bz2',
    'enwiki-20230901-pages-articles-multistream4.xml-p311330p558391.bz2',
    'enwiki-20230901-pages-articles-multistream5.xml-p558392p958045.bz2',
    'enwiki-20230901-pages-articles-multistream6.xml-p958046p1483661.bz2',
    'enwiki-20230901-pages-articles-multistream7.xml-p1483662p2134111.bz2',
    'enwiki-20230901-pages-articles-multistream8.xml-p2134112p2936260.bz2',
    'enwiki-20230901-pages-articles-multistream9.xml-p2936261p4045402.bz2',
    'enwiki-20230901-pages-articles-multistream10.xml-p4045403p5399366.bz2',
    'enwiki-20230901-pages-articles-multistream11.xml-p5399367p6899366.bz2',
    'enwiki-20230901-pages-articles-multistream11.xml-p6899367p7054859.bz2',
    'enwiki-20230901-pages-articles-multistream12.xml-p7054860p8554859.bz2',
    'enwiki-20230901-pages-articles-multistream12.xml-p8554860p9172788.bz2',
    'enwiki-20230901-pages-articles-multistream13.xml-p9172789p10672788.bz2',
    'enwiki-20230901-pages-articles-multistream13.xml-p10672789p11659682.bz2',
    'enwiki-20230901-pages-articles-multistream14.xml-p11659683p13159682.bz2',
    'enwiki-20230901-pages-articles-multistream14.xml-p13159683p14324602.bz2',
    'enwiki-20230901-pages-articles-multistream15.xml-p14324603p15824602.bz2',
    'enwiki-20230901-pages-articles-multistream15.xml-p15824603p17324602.bz2',
    'enwiki-20230901-pages-articles-multistream15.xml-p17324603p17460152.bz2',
    'enwiki-20230901-pages-articles-multistream16.xml-p17460153p18960152.bz2',
    'enwiki-20230901-pages-articles-multistream16.xml-p18960153p20460152.bz2',
    'enwiki-20230901-pages-articles-multistream16.xml-p20460153p20570392.bz2',
    'enwiki-20230901-pages-articles-multistream17.xml-p20570393p22070392.bz2',
    'enwiki-20230901-pages-articles-multistream17.xml-p22070393p23570392.bz2',
    'enwiki-20230901-pages-articles-multistream17.xml-p23570393p23716197.bz2',
    'enwiki-20230901-pages-articles-multistream18.xml-p23716198p25216197.bz2',
    'enwiki-20230901-pages-articles-multistream18.xml-p25216198p26716197.bz2',
    'enwiki-20230901-pages-articles-multistream18.xml-p26716198p27121850.bz2',
    'enwiki-20230901-pages-articles-multistream19.xml-p27121851p28621850.bz2',
    'enwiki-20230901-pages-articles-multistream19.xml-p28621851p30121850.bz2',
    'enwiki-20230901-pages-articles-multistream19.xml-p30121851p31308442.bz2',
    'enwiki-20230901-pages-articles-multistream20.xml-p31308443p32808442.bz2',
    'enwiki-20230901-pages-articles-multistream20.xml-p32808443p34308442.bz2',
    'enwiki-20230901-pages-articles-multistream20.xml-p34308443p35522432.bz2',
    'enwiki-20230901-pages-articles-multistream21.xml-p35522433p37022432.bz2',
    'enwiki-20230901-pages-articles-multistream21.xml-p37022433p38522432.bz2',
    'enwiki-20230901-pages-articles-multistream21.xml-p38522433p39996245.bz2',
    'enwiki-20230901-pages-articles-multistream22.xml-p39996246p41496245.bz2',
    'enwiki-20230901-pages-articles-multistream22.xml-p41496246p42996245.bz2',
    'enwiki-20230901-pages-articles-multistream22.xml-p42996246p44496245.bz2',
    'enwiki-20230901-pages-articles-multistream22.xml-p44496246p44788941.bz2',
    'enwiki-20230901-pages-articles-multistream23.xml-p44788942p46288941.bz2',
    'enwiki-20230901-pages-articles-multistream23.xml-p46288942p47788941.bz2',
    'enwiki-20230901-pages-articles-multistream23.xml-p47788942p49288941.bz2',
    'enwiki-20230901-pages-articles-multistream23.xml-p49288942p50564553.bz2',
    'enwiki-20230901-pages-articles-multistream24.xml-p50564554p52064553.bz2',
    'enwiki-20230901-pages-articles-multistream24.xml-p52064554p53564553.bz2',
    'enwiki-20230901-pages-articles-multistream24.xml-p53564554p55064553.bz2',
    'enwiki-20230901-pages-articles-multistream24.xml-p55064554p56564553.bz2',
    'enwiki-20230901-pages-articles-multistream24.xml-p56564554p57025655.bz2',
    'enwiki-20230901-pages-articles-multistream25.xml-p57025656p58525655.bz2',
    'enwiki-20230901-pages-articles-multistream25.xml-p58525656p60025655.bz2',
    'enwiki-20230901-pages-articles-multistream25.xml-p60025656p61525655.bz2',
    'enwiki-20230901-pages-articles-multistream25.xml-p61525656p62585850.bz2',
    'enwiki-20230901-pages-articles-multistream26.xml-p62585851p63975909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p63975910p65475909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p65475910p66975909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p66975910p68475909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p68475910p69975909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p69975910p71475909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p71475910p72975909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p72975910p74475909.bz2',
    'enwiki-20230901-pages-articles-multistream27.xml-p74475910p74725399.bz2'
]

In [16]:
data = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/dataPickle.p', 'rb'))

In [24]:
data = pickle.load(open('abstracts.p', 'rb'))

In [15]:
# Get data directly from website

data = {}

parser = etree.XMLParser(target=WikiPageParser())
for url in urls[:5]:
    req = requests.get(url, stream=True)
    with bz2.open(req.raw, 'rb') as dbp_pages_file:
        try:
            print('Test1', url)
            page_markup = etree.parse(dbp_pages_file, parser)
            print('Test2', url)
        except(EOFError):
            print("Problem with file", url)
            continue
        for page_title, markup in tqdm(page_markup.items()):
            if markup[:9] != '#REDIRECT':
                data.update({name2resource_iri(label2name(page_title)): markup})
                pickle.dump(data, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/dataPickle.p', 'wb'))
        print('Test3', url)

Test1 https://dumps.wikimedia.org/enwiki/20230901/enwiki-20230901-pages-articles-multistream1.xml-p1p41242.bz2
Test2 https://dumps.wikimedia.org/enwiki/20230901/enwiki-20230901-pages-articles-multistream1.xml-p1p41242.bz2


 63%|██████▎   | 17198/27179 [1:16:56<44:39,  3.73it/s]  


KeyboardInterrupt: 

In [19]:
# Get data from workspace

data = {}

parser = etree.XMLParser(target=WikiPageParser())

prefix = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/files/'

for url in tqdm(urls[60:]):
    complete_url = prefix + url
    with bz2.open(complete_url, 'rb') as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        for page_title, markup in page_markup.items():
            if markup[:9] != '#REDIRECT':
                data.update({name2resource_iri(label2name(page_title)): markup})
        print('Erledigt:', url)
        pickle.dump(data, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/dataPickle.p', 'wb'))

  0%|          | 0/5 [00:00<?, ?it/s]

Erledigt: enwiki-20230901-pages-articles-multistream27.xml-p68475910p69975909.bz2


 20%|██        | 1/5 [12:43<50:52, 763.08s/it]

Erledigt: enwiki-20230901-pages-articles-multistream27.xml-p69975910p71475909.bz2


 40%|████      | 2/5 [22:59<33:50, 676.75s/it]

Erledigt: enwiki-20230901-pages-articles-multistream27.xml-p71475910p72975909.bz2


 60%|██████    | 3/5 [31:41<20:12, 606.35s/it]

Erledigt: enwiki-20230901-pages-articles-multistream27.xml-p72975910p74475909.bz2


 80%|████████  | 4/5 [39:25<09:09, 549.97s/it]

Erledigt: enwiki-20230901-pages-articles-multistream27.xml-p74475910p74725399.bz2


100%|██████████| 5/5 [41:57<00:00, 503.58s/it]


In [28]:
abstract_links = pickle.load(open('abstractLinksComplete.p', 'rb'))

In [19]:
# Extract sentence structure

def get_abstract_links(key, value):
    try:
        # Extract the abstract
        abstract = extract_abstract(value)

        # Split the abstract into sentences
        sentences = sent_tokenize(abstract)

        # Extract wikilinks for each sentence
        sentence_links = [[(name2resource_iri(get_resource_name_for_wikilink(link))) for link in wtp.parse(sentence).wikilinks] for sentence in sentences]

        # Store the sentence-level links in the abstract_links dictionary
        return sentence_links
    except IndexError:
        print(key)

In [20]:
# Prepare method for parallel processing

prefix = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/files/'
folder = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstract_links/'

parser = etree.XMLParser(target=WikiPageParser())

def process_url(url):
    abstract_links = {}
    filename = folder + url + '.p'
    try:
        complete_url = prefix + url
        with bz2.open(complete_url, 'rb') as dbp_pages_file:
            page_markup = etree.parse(dbp_pages_file, parser)
            for page_title, markup in page_markup.items():
                if not markup.startswith('#REDIRECT'):
                    key = name2resource_iri(label2name(page_title))
                    try:
                        # Extract the abstract
                        abstract = extract_abstract(markup)
                        sentences = sent_tokenize(abstract)
                        sentence_links = [[name2resource_iri(get_resource_name_for_wikilink(link)) for link in wtp.parse(sentence).wikilinks] for sentence in sentences]
                        abstract_links[key] = sentence_links
                    except IndexError:
                        print('Has IndexError:', key)
                        continue
            pickle.dump(abstract_links, open(filename, 'wb'))
            print('Nach dump', url)
    except Exception as e:
        print(e)

In [21]:
urls_to_process = urls

with concurrent.futures.ProcessPoolExecutor(40) as executor:
    executor.map(process_url, urls_to_process)

Nach dump enwiki-20230901-pages-articles-multistream18.xml-p26716198p27121850.bz2
Nach dump enwiki-20230901-pages-articles-multistream13.xml-p10672789p11659682.bz2
'NoneType' object has no attribute 'start'
Nach dump enwiki-20230901-pages-articles-multistream20.xml-p34308443p35522432.bz2
'NoneType' object has no attribute 'end'
Nach dump enwiki-20230901-pages-articles-multistream23.xml-p44788942p46288941.bz2
Nach dump enwiki-20230901-pages-articles-multistream19.xml-p28621851p30121850.bz2
Nach dump
 enwiki-20230901-pages-articles-multistream19.xml-p30121851p31308442.bz2Nach dumpenwiki-20230901-pages-articles-multistream1.xml-p1p41242.bz2 
Nach dump enwiki-20230901-pages-articles-multistream15.xml-p15824603p17324602.bz2
Nach dump enwiki-20230901-pages-articles-multistream14.xml-p13159683p14324602.bz2
Nach dump enwiki-20230901-pages-articles-multistream13.xml-p9172789p10672788.bz2
Nach dump enwiki-20230901-pages-articles-multistream16.xml-p17460153p18960152.bz2
Nach dump enwiki-20230901-

In [5]:
data = pickle.load(open('dataPickle.p', 'rb'))

In [None]:
abstract_links_test = {}
        
parser = etree.XMLParser(target=WikiPageParser())

prefix = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/files/'
#prefix = 'https://dumps.wikimedia.org/enwiki/20230901/'

for url in urls[9:]:
    data = {}
    complete_url = prefix + url
    #complete_url = requests.get(complete_url, stream=True).raw
    print('Vor parsing:', url)
    with bz2.open(complete_url, 'rb') as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        print('Nach parsing:', url)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            for key, value in tqdm(page_markup.items()):
                futures.append(executor.submit(get_abstract_links, key, value))
                abstract_links_test[key] = futures
    print('Vor dump:', url)
    #pickle.dump(data, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstractLinks.p', 'wb'))
    pickle.dump(abstract_links, open('abstractLinksComplete.p', 'wb'))
    print('Nach dump:', url)

Vor parsing: enwiki-20230901-pages-articles-multistream10.xml-p4045403p5399366.bz2


In [None]:
#abstract_links = {}

parser = etree.XMLParser(target=WikiPageParser())

prefix = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/files/'
#prefix = 'https://dumps.wikimedia.org/enwiki/20230901/'

for url in urls[8:]:
    data = {}
    complete_url = prefix + url
    #complete_url = requests.get(complete_url, stream=True).raw
    print('Vor parsing:', url)
    with bz2.open(complete_url, 'rb') as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        print('Nach parsing:', url)
        for page_title, markup in tqdm(page_markup.items()):
            if not markup.startswith('#REDIRECT'):
                key = name2resource_iri(label2name(page_title))
                try:
                    # Extract the abstract
                    abstract = extract_abstract(markup)
                    sentences = sent_tokenize(abstract)
                    sentence_links = [[(name2resource_iri(get_resource_name_for_wikilink(link))) for link in wtp.parse(sentence).wikilinks] for sentence in sentences]
                    abstract_links[key] = sentence_links
                except (IndexError):
                    print('Has IndexError:', key)
                    continue

    print('Vor dump:', url)
    #pickle.dump(data, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstractLinks.p', 'wb'))
    pickle.dump(abstract_links, open('abstractLinksComplete.p', 'wb'))
    print('Nach dump:', url)

Vor parsing: enwiki-20230901-pages-articles-multistream9.xml-p2936261p4045402.bz2
Nach parsing: enwiki-20230901-pages-articles-multistream9.xml-p2936261p4045402.bz2


100%|██████████| 373042/373042 [22:28<00:00, 276.65it/s] 


Vor dump: enwiki-20230901-pages-articles-multistream9.xml-p2936261p4045402.bz2
Nach dump: enwiki-20230901-pages-articles-multistream9.xml-p2936261p4045402.bz2
Vor parsing: enwiki-20230901-pages-articles-multistream10.xml-p4045403p5399366.bz2
Nach parsing: enwiki-20230901-pages-articles-multistream10.xml-p4045403p5399366.bz2


 20%|█▉        | 154098/778031 [09:23<35:23, 293.88it/s]  

In [None]:
# Clean data

data_cleaned = {}

for k, v in pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/dataPickle.p', 'rb')).items():
  if v[:9] != '#REDIRECT':
    data_cleaned.update({k: v})

In [None]:
pickle.dump(data_cleaned, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/dataPickle.p', 'wb'))

### Get transitive types

In [18]:
url = 'https://downloads.dbpedia.org/repo/dbpedia/mappings/instance-types/2022.12.01/instance-types_lang%3Den_transitive.ttl.bz2'
req = requests.get(url, stream=True)
encoded_types = []
with bz2.open(req.raw, 'rb') as types:
  pickle.dump(list(types), open('encodedTypes.p', 'wb'))

In [19]:
types = pickle.load(open('encodedTypes.p', 'rb'))

In [18]:
types = defaultdict(list)
pattern = r'<([^>]+)>'
for type in pickle.load(open('encodedTypes.p', 'rb')):
  results = re.findall(pattern, type.decode('utf-8'))
  if results[2].startswith('http://dbpedia.org/ontology/'):
    types[results[0]].append(results[2])

In [23]:
pickle.dump(types, open('decodedTypes.p', 'wb'))

In [15]:
types = pickle.load(open('decodedTypes.p', 'rb'))
types.get('http://dbpedia.org/resource/Abraham_Lincoln')

['http://dbpedia.org/ontology/Animal',
 'http://dbpedia.org/ontology/Eukaryote',
 'http://dbpedia.org/ontology/Person',
 'http://dbpedia.org/ontology/Species']

### Get wikilinks

In [5]:
url = 'https://downloads.dbpedia.org/repo/dbpedia/generic/wikilinks/2022.12.01/wikilinks_lang%3Den.ttl.bz2'
req = requests.get(url, stream=True)
with bz2.open(req.raw, 'rb') as wikilinks:
    pickle.dump(list(wikilinks), open('encodedWikilinks.p', 'wb'))

In [6]:
encoded_wikilinks = pickle.load(open('encodedWikilinks.p', 'rb'))

In [8]:
decoded_wikilinks = []
pattern = r'<([^>]+)>'
for triple in encoded_wikilinks:
  results = re.findall(pattern, triple.decode('utf-8'))
  decoded_wikilinks.append([results[0], results[2]])

In [10]:
pickle.dump(decoded_wikilinks, open('decodedWikilinks.p', 'wb'))

In [None]:
wikilinks = pickle.load( open('decodedWikilinks.p', 'rb'))

### Get mappingbased objects

In [None]:
url = 'https://downloads.dbpedia.org/repo/dbpedia/mappings/mappingbased-objects/2020.02.01/mappingbased-objects_lang%3Den.ttl.bz2'
req = requests.get(url, stream=True)
encoded_relations = []
with bz2.open(req.raw, 'rb') as mappingbased_objects:
    pickle.dump(list(mappingbased_objects), open('encodedRelations.p', 'wb'))

In [17]:
relations = set()
pattern = r'<([^>]+)>'
for relation in pickle.load(open('encodedRelations.p', 'rb')):
  results = re.findall(pattern, relation.decode('utf-8'))
  relations.add(results[1])

In [27]:
pickle.dump(relations, open('decodedRelations.p', 'wb'))

In [18]:
triples = []
pattern = r'<([^>]+)>'
for triple in pickle.load(open('encodedRelations.p', 'rb')):
  results = re.findall(pattern, triple.decode('utf-8'))
  triples.append([results[0], results[1], results[2]])

In [30]:
pickle.dump(triples, open('triples.p', 'wb'))

In [None]:
print(re.findall(pattern, encoded_relations[0].decode('utf-8')))

In [11]:
triples = pickle.load(open('triples.p', 'rb'))

In [16]:
relation_domain_range = {}
no_domain_range = []

relations = pickle.load(open('decodedRelations.p', 'rb'))

for relation in relations:
    domain_range = get_domain_range_relation(dbpedia_ontology, relation)
    if domain_range[0] or domain_range[1]:
        relation_domain_range.update({relation: domain_range})
    else:
        no_domain_range.append(relation)

In [17]:
pickle.dump(relation_domain_range, open('relationDomainRange.p', 'wb'))
pickle.dump(no_domain_range, open('noDomainRange.p', 'wb'))

In [42]:
def get_first_wikilink_resource(text: str) -> Optional[str]:
    try:
        for wl in wtp.parse(text).wikilinks:
            res = get_resource_name_for_wikilink(wl)
            if res is None:
                continue
            return res
        return None
    except (AttributeError, IndexError):
        return None

In [71]:
abstract_links = {}
for key, value in data_cleaned.items():
    try:
        abstract_links.update({key: extract_wikilinks(value)})
    except(IndexError):
        print(key)

In [16]:
data = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/dataPickle.p', 'rb'))

In [58]:
def return_abstract_links(key, value):
    try:
        # Extract the abstract
        abstract = extract_abstract(value)

        # Split the abstract into sentences
        sentences = sent_tokenize(abstract)

        # Extract wikilinks for each sentence
        sentence_links = [[(name2resource_iri(get_resource_name_for_wikilink(link))) for link in wtp.parse(sentence).wikilinks] for sentence in sentences]

        # Store the sentence-level links in the abstract_links dictionary
        #return key, sentence_links
        abstract_links[key] = sentence_links
    except IndexError:
        print(key)

In [None]:
abstract_links = {}

with concurrent.futures.ThreadPoolExecutor() as executor:  
    for key, value in data.items():
        executor.map(return_abstract_links, key, value)

In [None]:
pickle.dump(abstract_links, open('abstractLinksComplete.p', 'wb'))

In [None]:
# Function to process each item in data for parallel processing
def process_item(key, value):
    try:
        # Extract the abstract
        abstract = extract_abstract(value)

        # Split the abstract into sentences
        sentences = sent_tokenize(abstract)

        # Extract wikilinks for each sentence
        sentence_links = [[(name2resource_iri(get_resource_name_for_wikilink(link))) for link in wtp.parse(sentence).wikilinks] for sentence in sentences]

        return key, sentence_links
    except IndexError:
        print(key)
        return key, []

abstract_links_complete2 = {}

num_processes = multiprocessing.cpu_count()

# ThreadPoolExecutor with the specified number of processes
with concurrent.futures.ThreadPoolExecutor(max_workers=num_processes) as executor:
    results = list(tqdm(executor.map(lambda item: process_item(*item), data.items()), total=len(data)))

    # Collect the results
    for key, sentence_links in results:
        abstract_links_complete2[key] = sentence_links

In [None]:
# Initialize the abstract_links dictionary with sentence-level information
abstract_links_complete2 = {}

for key, value in tqdm(data.items()):
    try:
        # Extract the abstract
        abstract = extract_abstract(value)

        # Split the abstract into sentences
        sentences = sent_tokenize(abstract)

        # Extract wikilinks for each sentence
        sentence_links = [[(name2resource_iri(get_resource_name_for_wikilink(link))) for link in wtp.parse(sentence).wikilinks] for sentence in sentences]

        # Store the sentence-level links in the abstract_links dictionary
        abstract_links_complete2[key] = sentence_links
    except IndexError:
        print(key)

In [98]:
pickle.dump(abstract_links, open('abstractLinksComplete2.p', 'wb'))

In [24]:
abstract_links_test = {}

filepath = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstract_links/'

for url in urls:
    try: 
        path = filepath + url + '.p'
        file = pickle.load(open(path, 'rb'))
        abstract_links_test.update(file)
    except FileNotFoundError:
        print(url)
        continue

enwiki-20230901-pages-articles-multistream19.xml-p27121851p28621850.bz2
enwiki-20230901-pages-articles-multistream22.xml-p41496246p42996245.bz2


In [25]:
pickle.dump(abstract_links_test, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstractLinksAll.p', 'wb'))

## Generate Candidates and Calculate Features

In [28]:
abstract_links = pickle.load(open('abstractLinks.p', 'rb'))

In [None]:
candidates = {} # initialize dict for candidates

for page, links in abstract_links.items(): # loop over all abstracts and their links
    
    page_type = types.get(page) # get the type of current page
    
    page_candidates = [] # initialize set for the candidates for current page
    
    for link in links: # loop over the links in an abstract
        
        link_type = types.get(link) # get the type of the page of the link
        
        if link_type: # if it has a type
            
            for relation in domain_range.items(): # loop over all relations and their domains/ranges
                
                if relation[1][0] == page_type or relation[1][1] == link_type: # if relation has corresponding domain or range
                    
                    page_candidates.append([page_type, relation[0], link_type]) # append the candidate with the relation
                    
    candidates.update({page: page_candidates}) # update the candidates dictionary

In [13]:
types = pickle.load(open('decodedTypes.p', 'rb'))
abstract_links = pickle.load(open('abstractLinks.p', 'rb'))
domain_range = pickle.load(open('relationDomainRange.p', 'rb'))
triples = pickle.load(open('triples.p', 'rb'))

In [5]:
types = pickle.load(open('decodedTypes.p', 'rb'))

In [6]:
type_set = set()
for object in types.values():
    for type in object:
        type_set.add(type)

In [16]:
abstract_links = pickle.load(open('abstractLinksComplete.p', 'rb'))

In [25]:
abstract_links = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstractLinks.p', 'rb'))

In [30]:
pickle.dump(abstract_links_test, open('abstractLinks150k.p', 'wb'))

In [14]:
relations_counts = {}

for triple in tqdm(triples):
    relation = triple[1]
    
    if relation in list(relations_counts.keys()):
        relations_counts[relation] += 1
    else:
        relations_counts[relation] = 0

100%|██████████| 19808136/19808136 [01:38<00:00, 201415.56it/s]


In [15]:
relation_counts_sorted = dict(sorted(relations_counts.items(), key=lambda x:x[1], reverse=True))

In [47]:
pickle.dump(relation_counts_sorted, open('relationCountsSorted.p', 'wb'))

In [5]:
nicos_relations = [
    'http://dbpedia.org/ontology/birthPlace',
    'http://dbpedia.org/ontology/family',
    'http://dbpedia.org/ontology/deathPlace',
    'http://dbpedia.org/ontology/producer',
    'http://dbpedia.org/ontology/writer',
    'http://dbpedia.org/ontology/subsequentWork',
    'http://dbpedia.org/ontology/previousWork',
    'http://dbpedia.org/ontology/artist',
    'http://dbpedia.org/ontology/formerTeam'  
]

In [6]:
abstract_links = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/abstractLinksAll.p', 'rb'))
types = pickle.load(open('decodedTypes.p', 'rb'))
domain_range = pickle.load(open('relationDomainRange.p', 'rb'))
triples = pickle.load(open('triples.p', 'rb'))

In [5]:
triples_with_nicos_relations = []
for triple in tqdm(triples):
    if triple[1] in nicos_relations:
        triples_with_nicos_relations.append(triple)

100%|██████████| 19808136/19808136 [00:06<00:00, 3247749.69it/s]


In [20]:
candidates_with_features = []  # Initialize dict for candidates with features
X = []
y = []

for page, sentence_links in tqdm(abstract_links.items()):

    page_types = types.get(page)
    candidates = []
    candidate_links = []

    for sentence_idx, links in enumerate(sentence_links):
        for entity in links:
            entity_types = types.get(entity)

            if entity_types:
                for k, v in domain_range.items():

                    if not page_types or not v[1]:
                        continue

                    elif str(v[0]) in page_types and str(v[1]) in entity_types and k in nicos_relations:
                        candidate = [page, k, entity, sentence_idx]

                        candidates.append(candidate)
                        candidate_links.append([entity, sentence_idx])

    interim = candidate_links
    candidate_links = []

    [candidate_links.append(link) for link in interim if link not in candidate_links] 

    for candidate in candidates:
        candidate_features = extract_features(candidate[2], sentence_links, candidate_links, candidate[3])

        X.append(list(candidate_features.values()))
        candidates_with_features.append([candidate[0], candidate[1], candidate[2]])

100%|██████████| 10067593/10067593 [36:21<00:00, 4614.02it/s] 


In [21]:
pickle.dump(X, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/X2.p', 'wb'))
pickle.dump(candidates_with_features, open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/candidatesWithFeatures2.p', 'wb'))

In [7]:
X = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/X2.p', 'rb'))
candidates_with_features = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/candidatesWithFeatures2.p', 'rb'))

In [8]:
pickle.dump(candidates_with_features, open('candidates.p', 'wb'))

In [26]:
types = pickle.load(open('decodedTypes.p', 'rb'))

In [5]:
df = pickle.load(open('dataFrameAll.p', 'rb'))

In [20]:
type_set = pickle.load(open('typeSet.p', 'rb'))

In [8]:
decoded_wikilinks = pickle.load(open('decodedWikilinks.p', 'rb'))

In [None]:
decoded_wikilinks_tuples = {tuple(list) for list in decoded_wikilinks}

In [17]:
wikilink_set = pickle.load(open('/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/wikilinkSet.p', 'rb'))

In [None]:
wikilink_set = pickle.load(open('wikilinkSet.p', 'rb'))

In [None]:
# Initialize tqdm for monitoring the progress
pbar = tqdm(total=len(df_previouswork))

# Iterate through the DataFrame and populate F08
for index, row in df_bp2.iterrows():
    if (row['range'], row['domain']) in decoded_wikilinks:
        df_bp2.at[index, 'F08'] = 1
    pbar.update(1)  # Update progress bar

In [12]:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm

In [77]:
# Initialize tqdm for monitoring the progress
pbar = tqdm(total=len(df_formerteam))

def check_and_update(row, wikilink_set, pbar):
    if (row.range, row.domain) in wikilink_set:
        result = 1
    else:
        result = 0

    pbar.update(1)
    return result

# Parallelize the lookup and update
results = []
with ThreadPoolExecutor() as executor:
    for result in executor.map(partial(check_and_update, wikilink_set=wikilink_set, pbar=pbar), df_formerteam.itertuples(index=False)):
        results.append(result)

df_formerteam['F08'] = results

pbar.close()

 35%|███▌      | 17718/50000 [00:00<00:02, 15654.81it/s]IOStream.flush timed out
  df_formerteam['F08'] = results
100%|██████████| 50000/50000 [00:34<00:00, 1439.35it/s] 


In [79]:
# Initialize tqdm for monitoring the progress
pbar = tqdm(total=len(df_formerteam))

# Iterate through the DataFrame and populate the binary columns
for index, row in df_formerteam.iterrows():
    domain = row['domain']
    for type in types.get(domain, []):
        t = type.removeprefix('http://dbpedia.org/ontology/')
        t = 'T:' + t
        df_formerteam.at[index, t] = 1
    pbar.update(1)  # Update progress bar
    
pbar.close()

100%|██████████| 50000/50000 [00:05<00:00, 8634.62it/s]


In [80]:
pickle.dump(df_formerteam, open('data/dfFormerTeam50k', 'wb'))

In [None]:
pickle.dump(df2, open('df2.p', 'wb'))

## Create Dataframes

In [16]:
import pandas as pd

columns = ['domain', 'relation', 'range', 'F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07']

df1 = pd.DataFrame(candidates_with_features, columns=['domain', 'relation', 'range'])
df2 = pd.DataFrame(X, columns=['F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07'])

In [17]:
df = pd.concat([df1, df2], axis=1, join='inner')
df.head()

Unnamed: 0,domain,relation,range,F00,F01,F02,F03,F04,F05,F06,F07
0,http://dbpedia.org/resource/Abraham_Lincoln,http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/resource/Union_(American_Ci...,0.000977,0.5,1.0,1.0,0.015625,0.25,1.0,0.5
1,http://dbpedia.org/resource/Abraham_Lincoln,http://dbpedia.org/ontology/deathPlace,http://dbpedia.org/resource/Union_(American_Ci...,0.000977,0.5,1.0,1.0,0.015625,0.25,1.0,0.5
2,http://dbpedia.org/resource/Abraham_Lincoln,http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/resource/Kentucky,0.000977,0.25,0.5,1.0,0.0625,0.001953,0.5,0.25
3,http://dbpedia.org/resource/Abraham_Lincoln,http://dbpedia.org/ontology/deathPlace,http://dbpedia.org/resource/Kentucky,0.000977,0.25,0.5,1.0,0.0625,0.001953,0.5,0.25
4,http://dbpedia.org/resource/Abraham_Lincoln,http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/resource/Indiana,0.000977,0.25,0.25,0.5,0.0625,0.000488,0.125,0.25


In [18]:
#y_full = pickle.load(open('y_full.p', 'rb'))
result_df = pd.DataFrame(y_full, columns=['label'])

In [19]:
df = pd.concat([df, result_df], axis=1, join='inner')

In [5]:
df = pickle.load(open('dataFrameAll.p', 'rb'))

In [6]:
# sample dataframes of size 50,000 with 10,000 1 labels

def sample_dataframe(df, n, ratio):
  # Filter rows with label '1' and '0'
  label_1_rows = df[df['label'] == 1]
  label_0_rows = df[df['label'] == 0]

  if len(label_1_rows) >= (n * ratio):

    # Select 10,000 rows with label '1' and 40,000 rows with label '0'
    selected_rows_label_1 = label_1_rows.sample(n=int((n*ratio)), random_state=42)
    selected_rows_label_0 = label_0_rows.sample(n=int((n*(1-ratio))), random_state=42)

  else:
    selected_rows_label_1 = label_1_rows
    selected_rows_label_0 = label_0_rows.sample(n=(50000-len(label_1_rows)), random_state=42)

  # Concatenate the selected rows
  selected_rows = pd.concat([selected_rows_label_1, selected_rows_label_0])

  # Shuffle the selected rows
  return selected_rows.sample(frac=1, random_state=42)

In [7]:
df_birthplace = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/birthPlace'], 50000, 0.2)
df_family = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/family'], 50000, 0.2)
df_deathplace = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/deathPlace'], 50000, 0.2)
df_producer = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/producer'], 50000, 0.2)
df_writer = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/writer'], 50000, 0.2)
df_subsequentwork = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/subsequentWork'], 50000, 0.2)
df_previouswork = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/previousWork'], 50000, 0.2)
df_artist = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/artist'], 50000, 0.2)
df_formerteam = sample_dataframe(df[df.relation == 'http://dbpedia.org/ontology/formerTeam'], 50000, 0.2)

In [81]:
relations = [df_birthplace, df_family, df_deathplace, df_producer, df_writer, df_subsequentwork, 
             df_previouswork, df_artist, df_formerteam]

In [None]:
df_relations = []

for relation in nicos_relations:
    if not df[df.relation == relation].empty:
        df_relations.append(df[df.relation == relation])
    else:
        print(relation)

In [None]:
candidates = [
    candidates_with_features[:10000],
    candidates_with_features[10000:20000],
    candidates_with_features[20000:30000],
    candidates_with_features[30000:40000],
    candidates_with_features[40000:50000],
]

y_full = []

for i, candidate_set in enumerate(candidates):
    y = []
    for candidate in tqdm(candidate_set):
        if candidate in triples_with_top_relations:
            y.append(1)
        else:
            y.append(0)
    y_full.append(i, y)

In [8]:
candidates = [candidates_with_features[i:i + 3000] for i in range(0, len(candidates_with_features), 3000)]

In [10]:
def process_candidate_set(candidate_set):
    try:
        y = []
        path = '/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/'
        name = path + str(candidates.index(candidate_set)) + '.p'
        print('Processing', name)
        for candidate in candidate_set:
            if candidate in triples_with_nicos_relations:
                y.append(1)
            else:
                y.append(0)
        pickle.dump(y, open(name, 'wb'))
        print('After dump', name)
    except Exception as e:
        print(e)

In [None]:
# Create a ThreadPoolExecutor
with concurrent.futures.ProcessPoolExecutor(20) as executor:
    executor.map(process_candidate_set, candidates[1480:])

Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1480.p
Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1481.p
ProcessingProcessing  /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1483.p/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1482.p

Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1484.p
Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1485.p
Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1486.p
Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1487.p
Processing /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1488.p
ProcessingProcessingProcessingProcessingProcessingProcessingProcessing       /pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1495.p/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1494.p/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/1493.p/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/14

In [12]:
y_full = []
path = "/pfs/work7/workspace/scratch/ma_nfuerhau-masterthesis/y2/{index}.p"

for i in range(1524):
    y_full += pickle.load(open(path.format(index=i), 'rb'))

In [15]:
pickle.dump(y_full, open('y_full2.p', 'wb'))

In [14]:
pickle.dump(X, open('X_full2.p', 'wb'))

In [None]:
# Function to process a single candidate and return the label (1 or 0)
def process_candidate(candidate, triples_with_top_relations):
    if candidate in triples_with_top_relations:
        return 1
    else:
        return 0

y = []

# Use concurrent.futures to parallelize the processing
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = []

    for candidate in tqdm(candidates_with_features[:10000]):
        future = executor.submit(process_candidate, candidate, triples_with_top_relations)
        results.append(future)

    for result in results:
        label = result.result()
        y.append(label)

In [50]:
y = pickle.load(open('y.p', 'rb'))
y2 = pickle.load(open('y2.p', 'rb'))
y3 = pickle.load(open('y3.p', 'rb'))
y4 = pickle.load(open('y4.p', 'rb'))
y5 = pickle.load(open('y5.p', 'rb'))

In [3]:
X_full = pickle.load(open('X_full.p', 'rb'))
y_full = pickle.load(open('y_full.p', 'rb'))

In [4]:
# Convert X_full and y_full into a DataFrame
data = pd.DataFrame(data=X_full, columns=['F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07'])
data['target'] = y_full

## Start Training

### Create train test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['domain', 'relation', 'range', 'label'], axis=1),
    df['label'],
    test_size=0.2, 
    random_state=42
)

# Check the class distribution in the training data before balancing
class_distribution_before = y_train.value_counts(normalize=True)
print("Class distribution before balancing:")
print(class_distribution_before)

# Balance the training data using RandomOverSampler
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)

# Check the class distribution in the training data after balancing
class_distribution_after = y_train_balanced.value_counts(normalize=True)
print("\nClass distribution after balancing:")
print(class_distribution_after)

Class distribution before balancing:
label
0    0.925525
1    0.074475
Name: proportion, dtype: float64

Class distribution after balancing:
label
0    0.5
1    0.5
Name: proportion, dtype: float64


In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    df_birthplace.drop(['domain', 'relation', 'range', 'label'], axis=1),
    df_birthplace['label'],
    test_size=0.2,
    shuffle=True,
    random_state=42
)

In [6]:
paths = [
    'data/dfBirthPlace50k.p',
    'data/dfFamily50k.p',
    'data/dfDeathPlace50k.p',
    'data/dfProducer50k.p',
    'data/dfWriter50k.p',
    'data/dfSubsequentWork50k.p',
    'data/dfPreviousWork50k.p',
    'data/dfArtist50k.p',
    'data/dfFormerTeam50k.p'
]

In [7]:
relations = []

for path in paths:
    relations.append(pickle.load(open(path, 'rb')))

### Create Classifiers, train and test
* Random Forest
* Naive Bayes
* RIPPER
* SVM

In [12]:
# Initialize a dictionary to store the optimal thresholds for each relation
#optimal_thresholds = {}

rf_classifier = RandomForestClassifier(random_state=42)

# Initialize the Naive Bayes classifier
nb_classifier = GaussianNB()

# Initialize the RIPPER classifier
ripper_classifier = lw.RIPPER()

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', probability=True)

classifiers = [(nb_classifier, "nb"), (rf_classifier, "rf"), (svm_classifier, "svm"), (ripper_classifier, "ripper")]

for classifier, name in classifiers:

    print(name)
    print()

    for index, relation in enumerate(relations):
      relation_name = nicos_relations[index]

      if name == "rf":
        classifier = RandomForestClassifier(random_state=42, 
                                            max_depth=best_hp_rf[index][0],
                                            n_estimators=best_hp_rf[index][1])
    
      X_train, X_test, y_train, y_test = train_test_split(
        relation.drop(['domain', 'relation', 'range', 'label'], axis=1),
        relation['label'],
        test_size=0.2, 
        random_state=42 
      )
    
      # Balance the training data using RandomOverSampler
      oversampler = RandomOverSampler(sampling_strategy=0.4, random_state=42)
      #X_train, y_train = oversampler.fit_resample(X_train, y_train)
    
      classifier.fit(X_train, y_train)

      y_pred = classifier.predict_proba(X_test)[:, 1]
    
      best_precision = 0
      best_threshold = 0
    
      for threshold in range(1, 1000):
        threshold /= 1000
    
        y_pred_binary = (y_pred >= threshold).astype(int)
    
        precision = precision_score(y_test, y_pred_binary, zero_division=0)
    
        # Track the best precision and threshold
        if precision >= best_precision and precision <= 0.95:
          best_precision = precision
          best_threshold = threshold
    
      # Store the optimal threshold for this relation
      #optimal_thresholds[relation_name] = best_threshold
    
      # Apply the best threshold to the entire test set for this relation
      relation_y_pred_binary = (y_pred >= best_threshold).astype(int)
    
      # Calculate overall precision and recall for this relation
      overall_recall = recall_score(y_test, relation_y_pred_binary)
    
      f1 = f1_score(y_test, relation_y_pred_binary)
    
      # Print the results for this relation
      print(f"Relation: {relation_name}")
      print(f"Best Threshold: {best_threshold}")
      print(f"Best Precision: {best_precision}")
      print(f"Recall for best Threshold: {overall_recall}")
      print(f"F1 for best Threshold: {f1}")
      print()

    print('-------------------------')

nb

Relation: http://dbpedia.org/ontology/birthPlace
Best Threshold: 0.999
Best Precision: 0.20404636030906873
Recall for best Threshold: 0.9950421417947447
F1 for best Threshold: 0.33864844343204253

Relation: http://dbpedia.org/ontology/family
Best Threshold: 0.999
Best Precision: 0.16324626865671643
Recall for best Threshold: 1.0
F1 for best Threshold: 0.2806736166800321

Relation: http://dbpedia.org/ontology/deathPlace
Best Threshold: 0.999
Best Precision: 0.20549203756635362
Recall for best Threshold: 0.9980168567178979
F1 for best Threshold: 0.340810970964192

Relation: http://dbpedia.org/ontology/producer
Best Threshold: 0.999
Best Precision: 0.12099772154934645
Recall for best Threshold: 0.9970355731225297
F1 for best Threshold: 0.21580579617153248

Relation: http://dbpedia.org/ontology/writer
Best Threshold: 0.546
Best Precision: 0.6168831168831169
Recall for best Threshold: 0.04709965294992563
F1 for best Threshold: 0.08751727314601566

Relation: http://dbpedia.org/ontology/s

### XGBoost

In [45]:
for index, relation in enumerate(relations):
  relation_name = nicos_relations[index]

  X_train, X_test, y_train, y_test = train_test_split(
    relation.drop(['domain', 'relation', 'range', 'label'], axis=1),
    relation['label'],
    test_size=0.2,  
    random_state=42  
    )

  # Balance the training data using RandomOverSampler
  oversampler = RandomOverSampler(sampling_strategy=0.4, random_state=42)
  #X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)

  # Create a DMatrix for XGBoost
  dtrain = xgb.DMatrix(X_train, label=y_train)
  dtest = xgb.DMatrix(X_test)

  # Set hyperparameters
  num_round = 200
    
  params = {
      'objective': 'binary:logistic',  # For binary classification
      'eval_metric': 'logloss',  # Logarithmic loss
      'max_depth': 7,
      'eta': 0.2,  # Learning rate
      'subsample': 0.7,
      'colsample_bytree': 0.7
  }  

  # Train the model
  model = xgb.train(params, dtrain, num_round)

  y_pred = model.predict(dtest)

  best_precision = 0
  best_threshold = 0

  for threshold in range(1, 1000):
    threshold /= 1000

    y_pred_binary = (y_pred >= threshold).astype(int)

    precision = precision_score(y_test, y_pred_binary, zero_division=0)

    # Track the best precision and threshold
    if precision >= best_precision and precision <= 0.95:
      best_precision = precision
      best_threshold = threshold

  # Store the optimal threshold for this relation
  #optimal_thresholds[relation_name] = best_threshold

  # Apply the best threshold to the entire test set for this relation
  relation_y_pred_binary = (y_pred >= best_threshold).astype(int)

  # Calculate overall precision and recall for this relation
  overall_recall = recall_score(y_test, relation_y_pred_binary)

  f1 = f1_score(y_test, relation_y_pred_binary)

  # Print the results for this relation
  print(f"Relation: {relation_name}")
  print(f"Best Threshold: {best_threshold}")
  print(f"Best Precision: {best_precision}")
  print(f"Recall for best Threshold: {overall_recall}")
  print(f"F1 for best Threshold: {f1}")
  print()

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/birthPlace
Best Threshold: 0.951
Best Precision: 0.8333333333333334
Recall for best Threshold: 0.00495785820525533
F1 for best Threshold: 0.009857072449482505



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/family
Best Threshold: 0.716
Best Precision: 0.9481481481481482
Recall for best Threshold: 0.7314285714285714
F1 for best Threshold: 0.8258064516129032



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/deathPlace
Best Threshold: 0.954
Best Precision: 0.9
Recall for best Threshold: 0.004462072384729797
F1 for best Threshold: 0.008880118401578686



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/producer
Best Threshold: 0.845
Best Precision: 0.8571428571428571
Recall for best Threshold: 0.005928853754940711
F1 for best Threshold: 0.011776251226692836



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/writer
Best Threshold: 0.916
Best Precision: 0.9411764705882353
Recall for best Threshold: 0.007932573128408527
F1 for best Threshold: 0.01573254670599803



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/subsequentWork
Best Threshold: 0.881
Best Precision: 0.875
Recall for best Threshold: 0.012567324955116697
F1 for best Threshold: 0.024778761061946902



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/previousWork
Best Threshold: 0.818
Best Precision: 0.8780487804878049
Recall for best Threshold: 0.037037037037037035
F1 for best Threshold: 0.07107601184600196



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/artist
Best Threshold: 0.971
Best Precision: 0.9393939393939394
Recall for best Threshold: 0.015369360436291522
F1 for best Threshold: 0.03024390243902439



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Relation: http://dbpedia.org/ontology/formerTeam
Best Threshold: 0.982
Best Precision: 0.9210526315789473
Recall for best Threshold: 0.017352503718393655
F1 for best Threshold: 0.03406326034063261



### Neural Network

In [24]:
import tensorflow as tf

2023-10-22 00:44:44.832769: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-22 00:44:45.566929: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-22 00:44:45.566976: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-22 00:44:45.567006: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-22 00:44:45.761962: I tensorflow/core/platform/cpu_feature_g

In [46]:
for index, relation in enumerate(relations):
  relation_name = nicos_relations[index]

  X_train, X_test, y_train, y_test = train_test_split(
    relation.drop(['domain', 'relation', 'range', 'label'], axis=1),
    relation['label'],
    test_size=0.2,  
    random_state=42  
    )

  # Balance the training data using RandomOverSampler
  oversampler = RandomOverSampler(sampling_strategy=0.4, random_state=42)
  #X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)

  model = tf.keras.Sequential([
      tf.keras.layers.Dense(1024, activation='relu', input_shape=(len(X_train.columns),)),
      tf.keras.layers.Dense(512, activation='relu'),
      tf.keras.layers.Dense(256, activation='relu'),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(32, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

  # Train the model
  model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2)

  # Convert data to NumPy arrays
  X_test_np = np.array(X_test)

  y_pred = model.predict(X_test_np)

  best_precision = 0
  best_threshold = 0

  for threshold in range(1, 1000):
    threshold /= 1000

    y_pred_binary = (y_pred >= threshold).astype(int)

    precision = precision_score(y_test, y_pred_binary, zero_division=0)

    # Track the best precision and threshold
    if precision >= best_precision and precision <= 0.95:
      best_precision = precision
      best_threshold = threshold
  # Store the optimal threshold for this relation
  #optimal_thresholds[relation_name] = best_threshold

  # Apply the best threshold to the entire test set for this relation
  relation_y_pred_binary = (y_pred >= best_threshold).astype(int)

  # Calculate overall precision and recall for this relation
  overall_recall = recall_score(y_test, relation_y_pred_binary)

  f1 = f1_score(y_test, relation_y_pred_binary)

  # Print the results for this relation
  print(f"Relation: {relation_name}")
  print(f"Best Threshold: {best_threshold}")
  print(f"Best Precision: {best_precision}")
  print(f"Recall for best Threshold: {overall_recall}")
  print(f"F1 for best Threshold: {f1}")
  print()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Relation: http://dbpedia.org/ontology/birthPlace
Best Threshold: 0.875
Best Precision: 0.8
Recall for best Threshold: 0.0039662865642042635
F1 for best Threshold: 0.007893438579181055

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Relation: http://dbpedia.org/ontology/family
Best Threshold: 0.728
Best Precision: 0.9483695652173914
Recall for best Threshold: 0.6647619047619048
F1 for best Threshold: 0.7816349384098544

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Relation: http://dbpedia.org/ontology/deathPlace
Best Threshold: 0.758
Best Precision: 0.9090909090909091
Recall for best Threshold: 0.00495785820525533
F1 for best Threshold: 0.009861932938856016

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch

In [8]:
for relation in relations:
    columns_to_drop = relation.columns[(relation == 0).all()]
    relation.drop(columns=columns_to_drop, inplace=True)

### Random Forest Hyperparameter Tuning

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

for index, relation in enumerate(relations):
    relation_name = nicos_relations[index]

    print(relation_name)
    
    X_train, X_test, y_train, y_test = train_test_split(
        relation.drop(['domain', 'relation', 'range', 'label'], axis=1),
        relation['label'],
        test_size=0.2,  
        random_state=42  
        )
    
    param_dist = {'n_estimators': randint(50,500),
                  'max_depth': randint(1,20)}

    # Create a random forest classifier
    rf = RandomForestClassifier()
    
    #Random search to find the best hyperparameters
    rand_search = RandomizedSearchCV(rf, 
                                     param_distributions = param_dist, 
                                     n_iter=2, 
                                     cv=5)
    
    # Fit the random search object to the data
    rand_search.fit(X_train, y_train)
    
    # Create a variable for the best model
    best_rf = rand_search.best_estimator_
    
    # Print the best hyperparameters
    print('Best hyperparameters:',  rand_search.best_params_)

http://dbpedia.org/ontology/birthPlace
Best hyperparameters: {'max_depth': 14, 'n_estimators': 347}
http://dbpedia.org/ontology/family
Best hyperparameters: {'max_depth': 19, 'n_estimators': 95}
http://dbpedia.org/ontology/deathPlace
Best hyperparameters: {'max_depth': 7, 'n_estimators': 248}
http://dbpedia.org/ontology/producer
Best hyperparameters: {'max_depth': 15, 'n_estimators': 248}
http://dbpedia.org/ontology/writer
Best hyperparameters: {'max_depth': 10, 'n_estimators': 395}
http://dbpedia.org/ontology/subsequentWork
Best hyperparameters: {'max_depth': 4, 'n_estimators': 57}
http://dbpedia.org/ontology/previousWork
Best hyperparameters: {'max_depth': 12, 'n_estimators': 99}
http://dbpedia.org/ontology/artist
Best hyperparameters: {'max_depth': 12, 'n_estimators': 430}
http://dbpedia.org/ontology/formerTeam
Best hyperparameters: {'max_depth': 11, 'n_estimators': 250}


In [10]:
best_hp_rf = [(14, 347), (19, 95), (7, 248), (15, 248), (10, 395), (4, 57), (12, 99), (12, 430), (11, 250)]