In [51]:
#Installing and checking requiered packages

# !pip show rdflib pandas numpy scikit-learn shap lime matplotlib ipykernel
# !pip install rdflib shap lime
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
#imports
import gzip
from rdflib import Graph
import pandas as pd
import os
from tqdm import tqdm
from collections import defaultdict


In [39]:
import rdflib

# Load the graph
g = rdflib.Graph()
g.parse("data/aifbfixed_complete.n3", format="n3")

print(f"Number of triples in graph: {len(g)}")

# List some triples to see data structure
for i, triple in enumerate(g):
    if i > 10:
        break
    print(triple)


Number of triples in graph: 29226
(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id701instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#hasProject'), rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Projekte/viewProjektOWL/id28instance'))
(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1908instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#name'), rdflib.term.Literal('Hongbo Xu', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Forschungsgebiete/viewForschungsgebietOWL/id69instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#dealtWithIn'), rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Projekte/viewProjektOWL/id54instance'))
(rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Publikationen/viewExternerAutorOWL/id135instance'), rdflib.term.URIRef('http://swrc.ontoware.org/ontology#name

In [40]:
import pandas as pd

train_df = pd.read_csv("data/trainingSet.tsv", sep="\t")
test_df = pd.read_csv("data/testSet.tsv", sep="\t")
complete_df = pd.read_csv("data/completeDataset.tsv", sep="\t")

print(train_df.head())
print(test_df.head())
print(complete_df.head())


                                              person   id  \
0  http://www.aifb.uni-karlsruhe.de/Personen/view...  2.0   
1  http://www.aifb.uni-karlsruhe.de/Personen/view...  3.0   
2  http://www.aifb.uni-karlsruhe.de/Personen/view...  4.0   
3  http://www.aifb.uni-karlsruhe.de/Personen/view...  5.0   
4  http://www.aifb.uni-karlsruhe.de/Personen/view...  6.0   

                                   label_affiliation  
0  http://www.aifb.uni-karlsruhe.de/Forschungsgru...  
1  http://www.aifb.uni-karlsruhe.de/Forschungsgru...  
2  http://www.aifb.uni-karlsruhe.de/Forschungsgru...  
3  http://www.aifb.uni-karlsruhe.de/Forschungsgru...  
4  http://www.aifb.uni-karlsruhe.de/Forschungsgru...  
                                              person    id  \
0  http://www.aifb.uni-karlsruhe.de/Personen/view...   1.0   
1  http://www.aifb.uni-karlsruhe.de/Personen/view...  12.0   
2  http://www.aifb.uni-karlsruhe.de/Personen/view...  14.0   
3  http://www.aifb.uni-karlsruhe.de/Personen/view...  1

In [43]:
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

print("Train label distribution:")
print(train_df['label_affiliation'].value_counts(normalize=True))

print("Test label distribution:")
print(test_df['label_affiliation'].value_counts(normalize=True))


Train shape: (140, 3)
Test shape: (36, 3)
Train label distribution:
label_affiliation
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.414286
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.342857
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance    0.157143
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.085714
Name: proportion, dtype: float64
Test label distribution:
label_affiliation
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.416667
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.333333
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance    0.166667
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.083333
Name: proportion, dtype: float64


In [48]:
complete_df.columns

Index(['id', 'person', 'label_affiliation'], dtype='object')

In [49]:
persons_train = train_df['person'].unique()
persons_test = test_df['person'].unique()


In [50]:
from collections import defaultdict

def extract_features(graph, entities):
    entity_features = {}
    
    for entity in entities:
        features = defaultdict(int)
        
        # For every triple where entity is subject
        for _, predicate, obj in graph.triples((rdflib.URIRef(entity), None, None)):
            pred_str = str(predicate)
            if isinstance(obj, rdflib.URIRef):
                obj_str = str(obj)
                feature_name = f"{pred_str}={obj_str}"
            else:
                # For literals, you can choose to ignore or keep simplified
                obj_str = str(obj)
                feature_name = f"{pred_str}={obj_str}"
            
            # Mark feature presence as 1
            features[feature_name] = 1
        
        entity_features[entity] = features
    
    return entity_features

# Example usage:
entity_features_train = extract_features(g, persons_train)
entity_features_test = extract_features(g, persons_test)


In [52]:
import pandas as pd

def build_feature_df(entity_features, feature_cols=None):
    # Convert dict of dicts to DataFrame
    df = pd.DataFrame.from_dict(entity_features, orient='index').fillna(0)
    
    # Ensure columns are consistent (important for train/test)
    if feature_cols is not None:
        # Add missing columns
        for col in feature_cols:
            if col not in df.columns:
                df[col] = 0
        # Remove extra columns not in feature_cols
        df = df[feature_cols]
    else:
        feature_cols = df.columns.tolist()
    
    return df, feature_cols

# Build train features DataFrame and get columns
X_train, feature_cols = build_feature_df(entity_features_train)

# Build test features DataFrame using same columns
X_test, _ = build_feature_df(entity_features_test, feature_cols=feature_cols)


  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[c

In [58]:
# Align labels with feature rows by matching the 'person' index
y_train = train_df.set_index('person').loc[X_train.index]['label_affiliation']
y_test = test_df.set_index('person').loc[X_test.index]['label_affiliation']

# Encode labels to integers for training and testing
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Fit LabelEncoder on training labels only
y_train_enc = le.fit_transform(y_train)

# Transform test labels with the same encoder (do NOT fit again)
y_test_enc = le.transform(y_test)


In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100, max_depth=10)
# rf.fit(X_train, y_train_enc)
# y_pred = rf.predict(X_test)

# print("Test Accuracy:", accuracy_score(y_test_enc, y_pred))
# print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

# Train your model with encoded labels
rf.fit(X_train, y_train_enc)

# Predict encoded labels
y_pred_enc = rf.predict(X_test)

# Evaluate using encoded test labels and label encoder classes
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test_enc, y_pred_enc))
print(classification_report(y_test_enc, y_pred_enc, target_names=le.classes_, zero_division=0))



Accuracy: 1.0
                                                                                        precision    recall  f1-score   support

http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance       1.00      1.00      1.00        15
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance       1.00      1.00      1.00         6
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance       1.00      1.00      1.00        12
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance       1.00      1.00      1.00         3

                                                                              accuracy                           1.00        36
                                                                             macro avg       1.00      1.00      1.00        36
                                                                          weighted avg  

In [63]:
print("X_train index matches train_df 'person' index:", all(X_train.index == train_df['person']))
print("X_test index matches test_df 'person' index:", all(X_test.index == test_df['person']))


X_train index matches train_df 'person' index: True
X_test index matches test_df 'person' index: True


In [64]:
train_df_indexed = train_df.set_index('person')
test_df_indexed = test_df.set_index('person')

y_train = train_df_indexed.loc[X_train.index, 'label_affiliation']
y_test = test_df_indexed.loc[X_test.index, 'label_affiliation']


In [65]:
test_labels_set = set(y_test.unique())
train_labels_set = set(y_train.unique())

unseen_labels = test_labels_set - train_labels_set
print("Unseen labels in test set:", unseen_labels)


Unseen labels in test set: set()


In [66]:
valid_test_mask = y_test.isin(le.classes_)
y_test = y_test[valid_test_mask]
X_test = X_test.loc[valid_test_mask]

y_test_enc = le.transform(y_test)


In [67]:
print("Training label counts:")
print(y_train.value_counts(normalize=True))

print("Test label counts:")
print(y_test.value_counts(normalize=True))


Training label counts:
label_affiliation
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.414286
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.342857
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance    0.157143
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.085714
Name: proportion, dtype: float64
Test label counts:
label_affiliation
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.416667
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.333333
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance    0.166667
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.083333
Name: proportion, dtype: float64


In [68]:
print(f"X_train shape: {X_train.shape}, y_train length: {len(y_train)}")
print(f"X_test shape: {X_test.shape}, y_test length: {len(y_test)}")


X_train shape: (140, 1551), y_train length: 140
X_test shape: (36, 1551), y_test length: 36


In [69]:
from sklearn.preprocessing import LabelEncoder

# Assuming X_train, X_test are DataFrames with indices matching train_df and test_df respectively

# Align y_train, y_test by indexing train/test df on 'person'
train_df_indexed = train_df.set_index('person')
test_df_indexed = test_df.set_index('person')

y_train = train_df_indexed.loc[X_train.index, 'label_affiliation']
y_test = test_df_indexed.loc[X_test.index, 'label_affiliation']

# Check unseen labels (should be empty)
unseen_labels = set(y_test.unique()) - set(y_train.unique())
assert len(unseen_labels) == 0, f"Unseen test labels: {unseen_labels}"

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Sanity checks
print("Train label distribution:")
for idx, count in zip(*np.unique(y_train_enc, return_counts=True)):
    print(f" - Class {le.inverse_transform([idx])[0]}: {count / len(y_train_enc):.3f}")

print("\nTest label distribution:")
for idx, count in zip(*np.unique(y_test_enc, return_counts=True)):
    print(f" - Class {le.inverse_transform([idx])[0]}: {count / len(y_test_enc):.3f}")

print(f"\nShapes: X_train={X_train.shape}, y_train={y_train_enc.shape}")
print(f"Shapes: X_test={X_test.shape}, y_test={y_test_enc.shape}")


Train label distribution:
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance: 0.414
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance: 0.157
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance: 0.343
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance: 0.086

Test label distribution:
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance: 0.417
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance: 0.167
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance: 0.333
 - Class http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance: 0.083

Shapes: X_train=(140, 1551), y_train=(140,)
Shapes: X_test=(36, 1551), y_test=(36,)


In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

model = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100, max_depth=10)
model.fit(X_train, y_train_enc)

y_pred = model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test_enc, y_pred))
print(classification_report(y_test_enc, y_pred, target_names=le.classes_, zero_division=0))


Test Accuracy: 1.0
                                                                                        precision    recall  f1-score   support

http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance       1.00      1.00      1.00        15
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance       1.00      1.00      1.00         6
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance       1.00      1.00      1.00        12
http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance       1.00      1.00      1.00         3

                                                                              accuracy                           1.00        36
                                                                             macro avg       1.00      1.00      1.00        36
                                                                          weighted 

In [71]:
common_indices = set(X_train.index).intersection(set(X_test.index))
print(f"Common indices between train and test: {len(common_indices)}")  # Should be 0

import numpy as np
import pandas as pd

# Calculate correlation or mutual info if numeric features
correlations = X_train.apply(lambda col: np.corrcoef(col, y_train_enc)[0,1] if np.issubdtype(col.dtype, np.number) else 0)
print("Top correlated features with label:")
print(correlations.abs().sort_values(ascending=False).head(10))


Common indices between train and test: 0
Top correlated features with label:
http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance    0.886108
http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance    0.622667
http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance    0.557219
http://swrc.ontoware.org/ontology#homepage=None                                                                                         0.391698
http://swrc.ontoware.org/ontology#worksAtProject=http://www.aifb.uni-karlsruhe.de/Projekte/viewProjektOWL/id2instance                   0.350234
http://swrc.ontoware.org/ontology#phone=                                                                                                0.317219
http://swrc.ontoware.org/ontology#worksAtProject=http

  c /= stddev[:, None]
  c /= stddev[None, :]


In [72]:
# Identify leak features (example: those containing 'affiliation=')
leak_features = [col for col in X_train.columns if 'affiliation=' in col]

print("Features leaking label info:", leak_features)

# Remove these features
X_train_filtered = X_train.drop(columns=leak_features)
X_test_filtered = X_test.drop(columns=leak_features)

print(f"Original feature count: {X_train.shape[1]}, after removal: {X_train_filtered.shape[1]}")


Features leaking label info: ['http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance']
Original feature count: 1551, after removal: 1547


In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Identify label-leaking features (containing 'affiliation=')
leak_features = [col for col in X_train.columns if 'affiliation=' in col]

print(f"Features leaking label info: {leak_features}")

# 2. Remove these leak features from train and test
X_train_filtered = X_train.drop(columns=leak_features)
X_test_filtered = X_test.drop(columns=leak_features)

print(f"Original feature count: {X_train.shape[1]}, after removal: {X_train_filtered.shape[1]}")

# 3. Retrain classifier on filtered features
rf_filtered = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
rf_filtered.fit(X_train_filtered, y_train_enc)

# 4. Predict on filtered test set
y_pred_filtered = rf_filtered.predict(X_test_filtered)

# 5. Evaluate
acc_filtered = accuracy_score(y_test_enc, y_pred_filtered)
print(f"Test Accuracy (filtered features): {acc_filtered:.4f}")

print("\nClassification Report (filtered features):")
print(classification_report(y_test_enc, y_pred_filtered, target_names=le.classes_, zero_division=0))


Features leaking label info: ['http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance', 'http://swrc.ontoware.org/ontology#affiliation=http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance']
Original feature count: 1551, after removal: 1547
Test Accuracy (filtered features): 0.6111

Classification Report (filtered features):
                                                                                        precision    recall  f1-score   support

http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance       0.52      1.00      0.68        15
http://www.aifb.uni-karlsruhe.de/