# Strategy 2: Tabular Explainers
## Example: AIFB with Boosting and Gini/Permutation importance

In [1]:
import gzip
import numpy as np
import os
import os.path as osp
import random
import time

import pandas as pd
import rdflib as rdf

import time
import numpy as np

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance

In [2]:
# Seed both global random number generators (Python's `random` and NumPy's)
# with the same fixed value so repeated runs start from the same state.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(0)

In [3]:
# Root folder of the input data; the trailing slash is part of the value.
PATH_DATA = "data/"

In [4]:
# Locations of the AIFB graph dump and its label files. They are derived from
# PATH_DATA so that the data root only has to be changed in one place.
aifb_raw_dir = osp.join(PATH_DATA, 'Entities', 'aifb', 'raw')

graph_file = osp.join(aifb_raw_dir, 'aifb_stripped.nt.gz')  # gzip-compressed N-Triples graph
task_file = osp.join(aifb_raw_dir, 'completeDataset.tsv')
train_file = osp.join(aifb_raw_dir, 'trainingSet.tsv')
test_file = osp.join(aifb_raw_dir, 'testSet.tsv')

### Read graph

In [5]:
# Load the gzip-compressed N-Triples file into a fresh rdflib graph.
g = rdf.Graph()
with gzip.open(graph_file, mode='rb') as nt_stream:
    g.parse(file=nt_stream, format='nt')  # type: ignore

## Convert graph to table

In [6]:
# Collect the triples into a nested mapping: subject -> {predicate: object}.
# Each (subject, predicate) pair is written with a plain assignment, so if a
# subject has several objects for the same predicate only the last one
# iterated is kept.
data = {}
for subj, pred, obj in g:
    data.setdefault(str(subj), {})[pred] = str(obj)

# One row per subject, one column per predicate; cells without a value
# become empty strings.
df = pd.DataFrame.from_dict(data, orient='index').fillna('')
df.index = df.index.astype(str)

# Number of rows (distinct subjects) in the table
len(df)

2829

There are many ways to convert a graph to a table.

1. An alternative way is described in the RGCN Paper (https://arxiv.org/pdf/1703.06103) which references an earlier paper: https://dl.acm.org/doi/pdf/10.1145/2254129.2254168
2. Yet another code example: https://github.com/dice-group/AutoCL/blob/53691bdd72cce7b546aed6417a9d839c3c9b9cf2/AutoCL/examples/feature%20selection%20approach/table-based%20feature%20selection/CELOE/fs_with_sklearn_on_celoe.py#L184

In [7]:
# Show at most 50 rows when a DataFrame is displayed.
pd.options.display.max_rows = 50

In [8]:
# Shorten every column name and every cell value to the text after the last
# "#" and then after the last "/".
# The row index is intentionally left untouched: shortening it the same way
# does not work because different rows would then receive the same name.
for sep in ("#", "/"):
    df.columns = df.columns.str.split(sep).str[-1]

for sep in ("#", "/"):
    df = df.apply(lambda col: col.map(lambda x: x.split(sep)[-1]))

In [9]:
# Row count after shortening the names
len(df.index)

2829

In [10]:
# Show at most 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [11]:
# Preview the first five rows of the table.
df.head(n=5)

Unnamed: 0,isAbout,type,abstract,title,booktitle,author,hasProject,year,pages,publication,worksAtProject,phone,name,photo,fax,homepage,note,series,number,month,volume,journal,address,dealtWithIn,isWorkedOnBy,type.1,projectInfo,member,financedBy,carriedOutBy,editor,inverseOf,howpublished,type.2,publishes,carriesOut,head,edition,allValuesFrom,onProperty,isbn,subClassOf,chapter,finances,range
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id25instance,id79instance,InProceedings,This article gives an impression how existing ...,On Visualizing the Semantic Web in MS Office,6th International Conference on Information Vi...,id20instance,id3instance,2002,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1317instance,id137instance,Publication,,Boosting for Text Classification with Semantic...,"Advances in Web Mining and Web Usage Analysis,...",id2079instance,id44instance,2006,149-166,,,,,,,,REPRINT,Lecture Notes in Computer Science,3932,,,,,,,,,,,,,,,,,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1269instance,id100instance,InProceedings,,Semantic Wikipedia,In Proceedings of the 2006 International Sympo...,id2097instance,id67instance,2006,137-138,,,,,,,,,,,August,,,,,,,,,,,,,,,,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1089instance,id30instance,Article,,Anticipation and flexibility in dynamic schedu...,,id15instance,,2005,3103-3129,,,,,,,,,,15,August,43.0,International Journal of Production Research,,,,,,,,,,,,,,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id805instance,id130instance,Publication,Part of the theory of logic programming and no...,A uniform approach to logic programming semantics,,id2084instance,,2005,123-159,,,,,,,,,,1-2,,5.0,Theory and Practice of Logic Programming,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# Column names of the label files: the node (person) column and the class
# label column.
nodes_header, label_header = 'person', 'label_affiliation'

In [13]:
# Read the tab-separated training and test label files.
train_labels_df, test_labels_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (train_file, test_file)
)

In [14]:
# TODO: Further preprocessing, feature selection, and feature engineering to improve the evaluation results of the model?

## Split dataset into training and testing sets

In [15]:
# One-hot encode the table with pandas' default get_dummies settings, then
# make sure the row labels are plain strings.
df = pd.get_dummies(data=df)
df.index = df.index.astype(str)

In [16]:
# Keep only the feature rows of the labelled persons. The label files list
# the person identifiers in the column named by `nodes_header`; these
# identifiers are the row labels of `df`.
X_train = df.loc[train_labels_df[nodes_header]]
X_test = df.loc[test_labels_df[nodes_header]]

In [17]:

# Fit the variance filter on the training rows only, then apply the same
# column selection to both splits.
selector = VarianceThreshold(threshold=0.001).fit(X_train)
X_train, X_test = selector.transform(X_train), selector.transform(X_test)

# Names of the columns that survived the filter, in matrix column order
kept_columns = selector.get_support(indices=True)
feature_names = df.columns[kept_columns]

X_train.shape

(140, 486)

In [18]:
# Map the class labels to integer codes. The encoder is fitted on the
# training labels and reused for the test labels so both splits share one
# mapping. `label_header` names the label column of the label files.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_labels_df[label_header])
y_test = encoder.transform(test_labels_df[label_header])

## Train Model

In [19]:
# Gradient boosting over depth-1 trees.
# Alternative models that can be swapped in here instead:
#   RandomForestClassifier(n_estimators=10000, random_state=0)
#   DecisionTreeClassifier(max_depth=20, random_state=0)
model = GradientBoostingClassifier(
    n_estimators=10000,
    max_depth=1,
    random_state=0,
)
model.fit(X_train, y_train)

## Evaluate Model

In [20]:
# Score on the same rows the model was fitted on.
accuracy = model.score(X_train, y_train)
print(f"Model accuracy Training: {accuracy}")

# Score on the test rows; `accuracy` is reused and holds this value afterwards.
accuracy = model.score(X_test, y_test)
print(f"Model accuracy Test: {accuracy}")

Model accuracy Training: 1.0
Model accuracy Test: 0.8611111111111112


## Explain Model

### Gini importance

In [21]:
# Importance per feature as reported by the fitted ensemble
importances_mean = model.feature_importances_

# Spread of the importances across all individual trees of the ensemble
# (every tree of every boosting iteration, in iteration order).
per_tree_importances = [
    tree.feature_importances_
    for iteration in model.estimators_
    for tree in iteration
]
importances_std = np.std(per_tree_importances, axis=0)

# Positions of the top_n features with the largest importance, best first
top_n = 10
indices = np.argsort(importances_mean)[::-1][:top_n]

top_importances_mean = importances_mean[indices]
top_importances_std = importances_std[indices]
top_feature_names = np.array(feature_names)[indices]

In [22]:
# Table of the top-ranked features: one row per feature name, with the mean
# importance and its standard deviation as columns.
pd.DataFrame(
    {
        'top_importances_mean': top_importances_mean,
        'top_importances_std': top_importances_std,
    },
    index=top_feature_names,
)

Unnamed: 0,top_importances_mean,top_importances_std
homepage_None,0.06636,0.024992
fax_+49 (721) 608 4548,0.066345,0.078966
worksAtProject_id2instance,0.060828,0.084255
phone_,0.051146,0.110332
phone_-,0.03663,0.072266
fax_+49 (721) 608 6580,0.033762,0.071405
publication_,0.026447,0.033148
phone_+49 (721) 608 7362,0.023723,0.069828
worksAtProject_id8instance,0.020833,0.077546
fax_+49 (721) 693717,0.018487,0.014141


### Permutation feature importance

In [None]:
# Permutation importance of every feature, measured on the test split with
# 5 shuffles per feature and a fixed random state.
r = permutation_importance(model, X_test, y_test,
                           n_repeats=5,
                           random_state=0)

importances_mean = r.importances_mean
importances_std = r.importances_std

# Positions of the top_n features with the largest mean importance, best first
top_n = 10
indices = np.argsort(importances_mean)[::-1][:top_n]

top_importances_mean = importances_mean[indices]
top_importances_std = importances_std[indices]
# Look the names up again for THIS ranking. Without this line the table that
# displays these values would still be labelled with the feature names
# selected by the Gini ranking.
top_feature_names = np.array(feature_names)[indices]

In [None]:
# Table of the selected features: mean importance and standard deviation as
# columns, indexed by the current value of `top_feature_names`.
pd.DataFrame(
    {
        'top_importances_mean': top_importances_mean,
        'top_importances_std': top_importances_std,
    },
    index=top_feature_names,
)