In [1]:
from rdflib import Graph
import pandas as pd

rdf_file = "data/aifbfixed_complete.n3"
graph = Graph()
graph.parse(rdf_file, format='n3')
print("Number of triples in graph:", len(graph))

Number of triples in graph: 29226


In [4]:
for i, triple in enumerate(graph):
    if i < 10:
        print(triple)

(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#publishes'), rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id724instance'))
(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id28instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#title'), rdflib.term.Literal('On-To-Knowledge Methodology', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id739instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#abstract'), rdflib.term.Literal('Effektiver Zugriff auf Wissensinhalte benÃ¶tigt eine gemeinsame Sprache zur Beschreibung der Wissensinhalte,\r\nsowie Methoden und Werkzeuge, die in der Lage sind, diese Beschreibung zu nutzen. In der Forschungsgruppe\r\nWissensmanagement

In [5]:
train_df = pd.read_csv("data/trainingSet.tsv", sep='\t', header=None, names=['person', 'id', 'label_affiliation'])
test_df = pd.read_csv("data/testSet.tsv", sep='\t', header=None, names=['person', 'id', 'label_affiliation'])
full_df = pd.read_csv("data/completeDataset.tsv", sep='\t', header=None, names=['person', 'id', 'label_affiliation'])

In [6]:
full_df.head(10)

Unnamed: 0,person,id,label_affiliation
0,id,person,label_affiliation
1,1,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
2,2,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
3,3,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
4,4,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
5,5,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
6,6,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
7,7,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
8,8,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...
9,9,http://www.aifb.uni-karlsruhe.de/Personen/view...,http://www.aifb.uni-karlsruhe.de/Forschungsgru...


In [8]:
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Train label distribution:")
print(train_df['label_affiliation'].value_counts(normalize=True))
print("Test label distribution:")
print(test_df['label_affiliation'].value_counts(normalize=True))

Train shape: (141, 3)
Test shape: (37, 3)
Train label distribution:
label_affiliation
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.411348
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.340426
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance    0.156028
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.085106
label_affiliation                                                                         0.007092
Name: proportion, dtype: float64
Test label distribution:
label_affiliation
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.405405
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.324324
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance    0.162162
http://www.aifb.uni-karlsruhe.de/Forschungsgru

In [None]:
#RDf to entity feature table 
from collections import defaultdict

entity_features = defaultdict(dict)

for s, p, o in graph:
    key = f"{p}={o}"
    entity_features[str(s)][key] = 1

features_df = pd.DataFrame.from_dict(entity_features, orient='index').fillna(0).astype(int)
print("Feature matrix shape:", features_df.shape)

Feature matrix shape: (2829, 10450)


In [10]:
train_df = train_df[train_df['person'].isin(features_df.index)]
test_df = test_df[test_df['person'].isin(features_df.index)]

X_train = features_df.loc[train_df['person']]
X_test = features_df.loc[test_df['person']]

# Align y labels
y_train = train_df.set_index('person').loc[X_train.index]['label_affiliation']
y_test = test_df.set_index('person').loc[X_test.index]['label_affiliation']


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [12]:
# 7. Sanity Checks
# =========================================
print("X_train index matches train_df 'person' index:", all(X_train.index == train_df['person']))
print("X_test index matches test_df 'person' index:", all(X_test.index == test_df['person']))
print("\nUnseen labels in test set:", set(y_test) - set(y_train))

X_train index matches train_df 'person' index: True
X_test index matches test_df 'person' index: True

Unseen labels in test set: set()


In [13]:
# Print shapes
print("X_train shape:", X_train.shape, ", y_train length:", len(y_train_enc))
print("X_test shape:", X_test.shape, ", y_test length:", len(y_test_enc))

X_train shape: (140, 10450) , y_train length: 140
X_test shape: (36, 10450) , y_test length: 36


In [14]:
 #8. Check for Feature Leakage (correlation)
# =========================================
correlations = X_train.corrwith(pd.Series(y_train_enc, index=X_train.index)).abs()
leak_features = correlations[correlations > 0.5].sort_values(ascending=False)
print("Top correlated features with label:")
print(leak_features.head(10))

  c /= stddev[:, None]
  c /= stddev[None, :]


Top correlated features with label:
http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.886108
http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.622667
http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.557219
dtype: float64


In [16]:
leaky_cols = [col for col in correlations.index if 'affiliation=' in col and any(lbl in col for lbl in y_train.unique())]
X_train_filtered = X_train.drop(columns=leaky_cols)
X_test_filtered = X_test.drop(columns=leaky_cols)

print(f"Features leaking label info: {leaky_cols}")
print(f"Original feature count: {X_train.shape[1]}, after removal: {X_train_filtered.shape[1]}")

Features leaking label info: ['http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance']
Original feature count: 10450, after removal: 10446


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_filtered, y_train_enc)
y_pred = clf.predict(X_test_filtered)

In [18]:
# 10. Evaluate Model
# =========================================
acc = accuracy_score(y_test_enc, y_pred)
print(f"Test Accuracy (filtered features): {acc:.4f}\n")
print("Classification Report (filtered features):")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_, zero_division=0))


Test Accuracy (filtered features): 0.7222

Classification Report (filtered features):
                                                                                        precision    recall  f1-score   support

http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance       0.65      1.00      0.79        15
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance       1.00      0.17      0.29         6
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance       0.82      0.75      0.78        12
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance       1.00      0.33      0.50         3

                                                                              accuracy                           0.72        36
                                                                             macro avg       0.87      0.56      0.59        36
                