In [360]:
import random
import time

import pandas as pd
import rdflib as rdf

import time
import numpy as np

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance

In [361]:
# Dataset locations. Every input lives under DATA_DIR, so relocating the data
# only requires changing this one constant.
DATA_DIR = 'data'

graph_file = f'{DATA_DIR}/aifbfixed_complete.n3'                  # RDF graph (N3 syntax)
task_file = f'{DATA_DIR}/Entities/aifb/raw/completeDataset.tsv'   # complete labelled dataset
train_file = f'{DATA_DIR}/trainingSet.tsv'                        # training split (TSV)
test_file = f'{DATA_DIR}/testSet.tsv'                             # test split (TSV)

In [362]:
### STEP 1: Load and Parse RDF File ###
# Reuse the path declared in the configuration cell (`graph_file`) instead of
# repeating the literal, so the graph location is defined in exactly one place.
# `graph_path` is kept as an alias for anything that still refers to it.
graph_path = graph_file
graph = rdf.Graph()
graph.parse(graph_path, format="n3")
print("Triples Loaded:", len(graph))

Triples Loaded: 29226


In [363]:
# Collapse the triples into a nested mapping:
#   {subject URI (str): {predicate: object (str)}}
# NOTE(review): when a subject has several objects for the same predicate, each
# assignment replaces the previous one, so only the last value seen is kept.
data = {}
for subj, pred, obj in graph:
    data.setdefault(str(subj), {})[pred] = str(obj)

# One row per subject, one column per predicate; missing cells become ''.
df = pd.DataFrame.from_dict(data, orient='index').fillna('')
df.index = df.index.astype(str)

# Number of distinct subjects
len(df)

2829

In [365]:
# Shorten column labels and cell values to the text after the last '#' and then
# after the last '/'. Row labels deliberately keep the full URI: shortening them
# would give different rows the same name.
def _local_name(text):
    """Return what follows the last '#' in ``text``, then what follows the last '/'."""
    return text.split("#")[-1].split("/")[-1]


df.columns = df.columns.str.split("#").str[-1].str.split("/").str[-1]
df = df.apply(lambda col: col.map(_local_name))

In [366]:
# Preview the first rows of the frame after labels and values were shortened.
df.head()

Unnamed: 0,author,booktitle,isAbout,address,title,hasProject,type,month,year,pages,...,number,journal,subClassOf,onProperty,allValuesFrom,finances,chapter,edition,inverseOf,range
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id885instance,id2065instance,6th IEEE International Workshop on Policies fo...,id114instance,"Stockholm, Sweden",Approximating Service Utility from Policies an...,id38instance,Publication,June,2005,159-168,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id203instance,id40instance,Proc. of 13th European Conference on Knowledge...,,"Siquenca, Spain",MAFRA - A Mapping Framework for Distributed On...,,Publication,,2002,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id215instance,id40instance,,,,KAON SERVER Prototype,id28instance,Misc,,2003,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id906instance,id448instance,Proceedings of the IJCAI-05 Workshop on Neural...,id131instance,,Extracting Reduced Logic Programs from Artific...,id50instance,Publication,August,2005,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id70instance,id32instance,"International Conference on Ontologies, Databa...",id102instance,,Usage-oriented Evolution of Ontology-based Kno...,id32instance,Publication,,2002,230-242,...,,,,,,,,,,


In [367]:
# List the shortened column labels.
# Note: shortening can produce duplicate labels (e.g. several 'type' columns)
# when different predicate URIs end in the same local name.
df.columns

Index(['author', 'booktitle', 'isAbout', 'address', 'title', 'hasProject',
       'type', 'month', 'year', 'pages', 'publishes', 'member', 'carriesOut',
       'head', 'employs', 'name', 'homepage', 'abstract', 'howpublished',
       'note', 'publication', 'projectInfo', 'carriedOutBy', 'financedBy',
       'affiliation', 'fax', 'photo', 'phone', 'isWorkedOnBy', 'type',
       'dealtWithIn', 'worksAtProject', 'series', 'type', 'isbn', 'volume',
       'editor', 'number', 'journal', 'subClassOf', 'onProperty',
       'allValuesFrom', 'finances', 'chapter', 'edition', 'inverseOf',
       'range'],
      dtype='object')

In [368]:
import pandas as pd

# Load the label files from the paths declared in the configuration cell
# (`train_file` / `test_file`) rather than repeating the literals here.
train_labels_df = pd.read_csv(train_file, sep="\t")
test_labels_df = pd.read_csv(test_file, sep="\t")

# Ensure all person URIs are strings and strip surrounding whitespace
train_labels_df['person'] = train_labels_df['person'].astype(str).str.strip()
test_labels_df['person'] = test_labels_df['person'].astype(str).str.strip()

# Main RDF-derived DataFrame: normalise the index the same way so URIs compare equal
df.index = df.index.astype(str).str.strip()

# One-hot encode (get dummies) for model input
df_encoded = pd.get_dummies(df)

# Persons that appear both in the graph-derived frame and in each label file
train_matches = df_encoded.index.intersection(train_labels_df['person'])
test_matches = df_encoded.index.intersection(test_labels_df['person'])

print(f"Matched training persons: {len(train_matches)}")
print(f"Matched testing persons: {len(test_matches)}")

# Split into train/test using .loc with the matched persons
X_train = df_encoded.loc[train_matches]
X_test = df_encoded.loc[test_matches]

# Labels, selected in the same row order as the feature matrices.
# A single .loc[rows, column] lookup replaces the chained .loc[rows][column].
y_train = train_labels_df.set_index('person').loc[train_matches, 'label_affiliation']
y_test = test_labels_df.set_index('person').loc[test_matches, 'label_affiliation']

# Fail early if features and labels do not line up row by row (for example when
# a person is listed more than once in a label file).
assert X_train.index.equals(y_train.index), "training features and labels are misaligned"
assert X_test.index.equals(y_test.index), "test features and labels are misaligned"

# Show shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Matched training persons: 140
Matched testing persons: 36
X_train shape: (140, 6899)
X_test shape: (36, 6899)
y_train shape: (140,)
y_test shape: (36,)


In [370]:
from sklearn.feature_selection import VarianceThreshold

# Step 1: Apply Variance Threshold (fitted on the training rows only)
selector = VarianceThreshold(threshold=0.001)
X_train_selected = selector.fit_transform(X_train)
X_test_selected = selector.transform(X_test)

# Step 2: Names of the features that survived the threshold, in column order
initial_feature_names = df_encoded.columns[selector.get_support(indices=True)]

# Step 3: Features to exclude manually (one-hot columns of the 'affiliation' value)
excluded_features = {
    'affiliation_id1instance',
    'affiliation_id2instance',
    'affiliation_id3instance',
    'affiliation_id4instance'
}

# Step 4/5: Keep every selected column whose name is not excluded.
# Positions are taken directly from the enumeration instead of looking each name
# up with list.index(): that lookup is quadratic and, when two columns share a
# name, always returns the first occurrence and would pick the wrong column.
filtered_indices = [
    position
    for position, feature in enumerate(initial_feature_names)
    if feature not in excluded_features
]
filtered_feature_names = [initial_feature_names[position] for position in filtered_indices]

# Step 6: Apply filtered indices to training and test sets
X_train_filtered = X_train_selected[:, filtered_indices]
X_test_filtered = X_test_selected[:, filtered_indices]

# Step 7: Convert to float numpy arrays for model input
X_train_np = X_train_filtered.astype(float)
X_test_np = X_test_filtered.astype(float)

# Optional: print info
print("X_train shape before selection:", X_train.shape)
print("X_train shape after selection:", X_train_selected.shape)
print("X_train shape after manual filtering:", X_train_np.shape)
print("Number of selected features after filtering:", len(filtered_feature_names))


X_train shape before selection: (140, 6899)
X_train shape after selection: (140, 492)
X_train shape after manual filtering: (140, 488)
Number of selected features after filtering: 488


In [371]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then encode
# both splits with that same mapping.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)


In [372]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Gradient-boosted tree classifier trained on the integer-encoded labels.
# `use_label_encoder` is no longer passed: the installed xgboost ignores it and
# only emits a "Parameters ... are not used" warning.
model = XGBClassifier(
    eval_metric='mlogloss',
    random_state=42
)

model.fit(X_train_np, y_train_enc)


Parameters: { "use_label_encoder" } are not used.



In [373]:
# Evaluate on the held-out test split
y_pred = model.predict(X_test_np)
accuracy = accuracy_score(y_test_enc, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

print("Classification Report:")
# Pass the full list of encoded class ids explicitly. Without `labels`, the
# report derives the classes from y_test_enc/y_pred and raises when a class is
# absent there, because `target_names` would no longer match in length.
class_ids = list(range(len(encoder.classes_)))
print(classification_report(y_test_enc, y_pred, labels=class_ids, target_names=encoder.classes_))


Test Accuracy: 0.7778
Classification Report:
                                                                                        precision    recall  f1-score   support

http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance       1.00      0.87      0.93        15
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance       1.00      0.33      0.50         6
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance       0.67      1.00      0.80        12
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance       0.33      0.33      0.33         3

                                                                              accuracy                           0.78        36
                                                                             macro avg       0.75      0.63      0.64        36
                                                         

In [374]:
# Compare accuracy on the data the model was fitted on with accuracy on the
# held-out split.
train_preds = model.predict(X_train_np)
test_preds = model.predict(X_test_np)

train_acc = accuracy_score(y_train_enc, train_preds)
test_acc = accuracy_score(y_test_enc, test_preds)

print(f"Training accuracy: {train_acc:.4f}")
print(f"Testing accuracy: {test_acc:.4f}")


Training accuracy: 0.8714
Testing accuracy: 0.7778
