In [1]:
import random
import time

import pandas as pd
import rdflib as rdf

import time
import numpy as np

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance

In [2]:
random.seed(0)
np.random.seed(0)

In [3]:
graph_file = 'data/aifbfixed_complete.n3'
task_file = 'data/completeDataset.tsv'
train_file = 'data/trainingSet.tsv'
test_file = 'data/testSet.tsv'

In [4]:
d=pd.read_csv(task_file,sep='\t')

In [5]:
# warnings.filterwarnings("ignore")

### STEP 1: Load and Parse RDF File ###
graph_path = "data/aifbfixed_complete.n3"
graph = rdf.Graph()
graph.parse(graph_path, format="n3")
print("Triples Loaded:", len(graph))

Triples Loaded: 29226


In [6]:

#-----------------------------------------------
data = {}

# Populate the dictionary with the RDF triples
for s, p, o in graph:
    if str(s) not in data:
        data[str(s)] = {}
    data[str(s)][p] = str(o)

# Convert the dictionary to a DataFrame
dm = pd.DataFrame.from_dict(data, orient='index')
df=dm.copy()

# Fill NaN values with an empty string for better presentation
df = df.fillna('')
df.index = df.index.astype(str)
len(df)



2829

In [7]:
pd.set_option('display.max_rows', 50)

In [8]:
# Simplify columns and cells (simplifying rows does not work because then different rows receive the same name)
df.columns = df.columns.str.split("/").str[-1]
df.columns = df.columns.str.split("#").str[-1]

#df.index = df.index.str.split("#").str[-1]
#df.index = df.index.str.split("/").str[-1]
df = df.apply(lambda col: col.map(lambda x: x.split("/")[-1]))
df = df.apply(lambda col: col.map(lambda x: x.split("#")[-1]))


In [10]:
len(df)

2829

In [9]:
df.head()

Unnamed: 0,author,address,isAbout,type,year,hasProject,booktitle,abstract,title,month,...,financedBy,carriedOutBy,edition,editor,type.1,isbn,finances,inverseOf,chapter,range
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1320instance,id2139instance,"Hyderabad, India",id58instance,Publication,2007,id70instance,Proceedings of the Third International Worksho...,Motivated by basic ideas from formal concept a...,Encoding Closure Operators into Neural Networks,January,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id447instance,id6instance,"New York, USA",id123instance,Publication,2004,id30instance,Proceedings of the 13th International World Wi...,,REMINDIN': Semantic Query Routing in Peer-to-P...,May,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id294instance,id175instance,,,TechnicalReport,1997,,,,Assumptions of Problem-Solving Methods and the...,May,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id389instance,id41instance,,,Article,2003,,,,Where are the rules?,October,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id880instance,id6instance,,id134instance,Misc,2005,id42instance,,,Bootstrapping Ontology Alignment Methods with ...,May,...,,,,,,,,,,


In [10]:
train_labels_df = pd.read_csv("data/trainingSet.tsv", sep="\t")
test_labels_df = pd.read_csv("data/testSet.tsv", sep="\t")

# Extract person-specific rows first
X_train_raw = df.loc[train_labels_df['person']]
X_test_raw = df.loc[test_labels_df['person']]


In [11]:
a = set(X_train_raw.index)
a

{'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id11instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id12instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id14instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id15instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id16instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1834instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1842instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1852instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1854instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1858instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1859instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1861instance',
 'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1862instance',
 'http://www.aifb.uni-karlsruhe.de/

In [12]:
b = df.index
b

Index(['http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1320instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id447instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id294instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id389instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id880instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id912instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id339instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1078instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id967instance',
       'http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id876instance',
       ...
       'ub1bL486C18', 'http://swrc.ontoware.org/ontology#publication',
       'file:///

In [13]:
c =  a.intersection(b)

In [14]:
df.columns

Index(['author', 'address', 'isAbout', 'type', 'year', 'hasProject',
       'booktitle', 'abstract', 'title', 'month', 'subClassOf', 'publishes',
       'carriesOut', 'member', 'employs', 'homepage', 'head', 'name', 'pages',
       'publication', 'worksAtProject', 'phone', 'fax', 'photo', 'affiliation',
       'series', 'journal', 'volume', 'number', 'howpublished', 'isWorkedOnBy',
       'dealtWithIn', 'type', 'note', 'allValuesFrom', 'onProperty',
       'projectInfo', 'financedBy', 'carriedOutBy', 'edition', 'editor',
       'type', 'isbn', 'finances', 'inverseOf', 'chapter', 'range'],
      dtype='object')

In [15]:

filtered_df = df[df.index.isin(c)]
filtered_df

Unnamed: 0,author,address,isAbout,type,year,hasProject,booktitle,abstract,title,month,...,financedBy,carriedOutBy,edition,editor,type.1,isbn,finances,inverseOf,chapter,range
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id67instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id6instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance,,,,AssistantProfessor,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id12instance,,,,PhDStudent,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2058instance,,,,Person,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1992instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1968instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id24instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1861instance,,,,Person,,,,,,,...,,,,,,,,,,


In [16]:
filtered_df.columns

Index(['author', 'address', 'isAbout', 'type', 'year', 'hasProject',
       'booktitle', 'abstract', 'title', 'month', 'subClassOf', 'publishes',
       'carriesOut', 'member', 'employs', 'homepage', 'head', 'name', 'pages',
       'publication', 'worksAtProject', 'phone', 'fax', 'photo', 'affiliation',
       'series', 'journal', 'volume', 'number', 'howpublished', 'isWorkedOnBy',
       'dealtWithIn', 'type', 'note', 'allValuesFrom', 'onProperty',
       'projectInfo', 'financedBy', 'carriedOutBy', 'edition', 'editor',
       'type', 'isbn', 'finances', 'inverseOf', 'chapter', 'range'],
      dtype='object')

In [17]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id67instance to http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1862instance
Data columns (total 47 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          140 non-null    object
 1   address         140 non-null    object
 2   isAbout         140 non-null    object
 3   type            140 non-null    object
 4   year            140 non-null    object
 5   hasProject      140 non-null    object
 6   booktitle       140 non-null    object
 7   abstract        140 non-null    object
 8   title           140 non-null    object
 9   month           140 non-null    object
 10  subClassOf      140 non-null    object
 11  publishes       140 non-null    object
 12  carriesOut      140 non-null    object
 13  member          140 non-null    object
 14  employs         140 non-null    object
 15  homepage     

In [61]:
filtered_df.describe()

Unnamed: 0,author,address,isAbout,type,year,hasProject,booktitle,abstract,title,month,...,financedBy,carriedOutBy,edition,editor,type.1,isbn,finances,inverseOf,chapter,range
count,140.0,140.0,140.0,140,140.0,140.0,140.0,140.0,140.0,140.0,...,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0
unique,1.0,1.0,1.0,5,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
top,,,,Person,,,,,,,...,,,,,,,,,,
freq,140.0,140.0,140.0,102,140.0,140.0,140.0,140.0,140.0,140.0,...,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0


In [73]:
main_df=filtered_df.copy()

In [74]:
main_df.columns

Index(['author', 'address', 'isAbout', 'type', 'year', 'hasProject',
       'booktitle', 'abstract', 'title', 'month', 'subClassOf', 'publishes',
       'carriesOut', 'member', 'employs', 'homepage', 'head', 'name', 'pages',
       'publication', 'worksAtProject', 'phone', 'fax', 'photo', 'affiliation',
       'series', 'journal', 'volume', 'number', 'howpublished', 'isWorkedOnBy',
       'dealtWithIn', 'type', 'note', 'allValuesFrom', 'onProperty',
       'projectInfo', 'financedBy', 'carriedOutBy', 'edition', 'editor',
       'type', 'isbn', 'finances', 'inverseOf', 'chapter', 'range'],
      dtype='object')

In [75]:
main_df['affiliation'].unique()

array(['id3instance', 'id1instance', 'id2instance', 'id4instance'],
      dtype=object)

In [76]:
def explore_dataframe(main_df):
    print("Shape of DataFrame:", main_df.shape)
    print("\nIndex info:", main_df.index)
    print("\nIs index unique?:", main_df.index.is_unique)
    print("\nColumns:\n", main_df.columns)
    print("\nData types:\n", main_df.dtypes)
    print("\nFirst 5 rows:\n", main_df.head())
    print("\nSample 5 rows:\n", main_df.sample(5))
    print("\nSummary statistics (numeric columns):\n", main_df.describe())
    print("\nValue counts of 'affiliation' column:\n", main_df['affiliation'].value_counts(dropna=False))
    print("\nMissing values per column:\n", main_df.isnull().sum())

# Run this on your DataFrame:
explore_dataframe(main_df)


Shape of DataFrame: (140, 47)

Index info: Index(['http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id67instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id6instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id12instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2058instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2042instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2097instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2038instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id83instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id36instance',
       ...
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2040instance',
       'http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id21

In [77]:
main_df.columns

Index(['author', 'address', 'isAbout', 'type', 'year', 'hasProject',
       'booktitle', 'abstract', 'title', 'month', 'subClassOf', 'publishes',
       'carriesOut', 'member', 'employs', 'homepage', 'head', 'name', 'pages',
       'publication', 'worksAtProject', 'phone', 'fax', 'photo', 'affiliation',
       'series', 'journal', 'volume', 'number', 'howpublished', 'isWorkedOnBy',
       'dealtWithIn', 'type', 'note', 'allValuesFrom', 'onProperty',
       'projectInfo', 'financedBy', 'carriedOutBy', 'edition', 'editor',
       'type', 'isbn', 'finances', 'inverseOf', 'chapter', 'range'],
      dtype='object')

In [68]:
from collections import Counter

new_columns = []
type_counter = Counter()

for col in main_df.columns:
    if col == 'type':
        type_counter[col] += 1
        new_columns.append(f'{col}_{type_counter[col]}')
    else:
        new_columns.append(col)

main_df.columns = new_columns

In [78]:
main_df.columns

Index(['author', 'address', 'isAbout', 'type', 'year', 'hasProject',
       'booktitle', 'abstract', 'title', 'month', 'subClassOf', 'publishes',
       'carriesOut', 'member', 'employs', 'homepage', 'head', 'name', 'pages',
       'publication', 'worksAtProject', 'phone', 'fax', 'photo', 'affiliation',
       'series', 'journal', 'volume', 'number', 'howpublished', 'isWorkedOnBy',
       'dealtWithIn', 'type', 'note', 'allValuesFrom', 'onProperty',
       'projectInfo', 'financedBy', 'carriedOutBy', 'edition', 'editor',
       'type', 'isbn', 'finances', 'inverseOf', 'chapter', 'range'],
      dtype='object')

In [91]:
cvd = pd.read_csv('clean.csv', index_col=0)

In [92]:
cvd

Unnamed: 0,isAbout,journal,pages,type,author,title,volume,month,year,number,...,editor,projectInfo,financedBy,chapter,onProperty,allValuesFrom,isbn,finances,inverseOf,range
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id15instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2053instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2058instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2065instance,,,,Person,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2051instance,,,,PhDStudent,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2075instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1885instance,,,,Person,,,,,,,...,,,,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1908instance,,,,Person,,,,,,,...,,,,,,,,,,


In [100]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


df.fillna('missing', inplace=True)
label_encoders = {}
for col in ['type', 'phone', 'fax','worksAtProject','publication','homepage']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode target variable
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df['affiliation'])

# Feature selection
X = df[['type', 'phone', 'fax','worksAtProject','publication','homepage' ]]  # Key features

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost
model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


ValueError: y should be a 1d array, got an array of shape (2829, 3) instead.

In [98]:
main_df.columns

Index(['author', 'address', 'isAbout', 'type', 'year', 'hasProject',
       'booktitle', 'abstract', 'title', 'month', 'subClassOf', 'publishes',
       'carriesOut', 'member', 'employs', 'homepage', 'head', 'name', 'pages',
       'publication', 'worksAtProject', 'phone', 'fax', 'photo', 'affiliation',
       'series', 'journal', 'volume', 'number', 'howpublished', 'isWorkedOnBy',
       'dealtWithIn', 'type', 'note', 'allValuesFrom', 'onProperty',
       'projectInfo', 'financedBy', 'carriedOutBy', 'edition', 'editor',
       'type', 'isbn', 'finances', 'inverseOf', 'chapter', 'range'],
      dtype='object')

In [99]:
main_df[['type', 'phone', 'fax','worksAtProject','publication','homepage' ]]

Unnamed: 0,type,type.1,type.2,phone,fax,worksAtProject,publication,homepage
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id67instance,Person,,,-,,,id312instance,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id6instance,Person,,,+49 (0) 721 608 4751,+49 (0) 721 608 6580,id35instance,id473instance,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance,AssistantProfessor,,,+49 (721) 608 6592,+49 (721) 608 6580,id42instance,id935instance,ysu
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id12instance,PhDStudent,,,+49 (721) 608 3509,+49 (721) 151422433,id34instance,id871instance,GuS
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2058instance,Person,,,+49 (721) 608 4536,+49 (721) 608 4548,id63instance,id912instance,mitarbeiter.php?id=485
...,...,...,...,...,...,...,...,...
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1992instance,Person,,,,,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1968instance,Person,,,,,,id306instance,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id24instance,Person,,,+49 (721) 608 3679,+49 (721) 693717,,,
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1861instance,Person,,,,,,,
