# Hierarchical classification on Wikivitals

In [1]:
import numpy as np
from scipy import sparse

In [2]:
from sknetwork.data import load_netset

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

## Data

In [4]:
dataset = load_netset('wikivitals')

Parsing files...
Done.


In [5]:
# links
adjacency = dataset.adjacency
# words
biadjacency = dataset.biadjacency

In [6]:
labels_hierarchy = dataset.labels_hierarchy
names_labels_hierarchy = dataset.names_labels_hierarchy

In [7]:
def get_labels(labels_hierarchy, names_labels_hierarchy, depth=1, sep='|||'):
    names_depth = [sep.join(name.split(sep)[:depth]) for name in names_labels_hierarchy]
    names_depth_index = {name: i for i, name in enumerate(np.unique(names_depth))}
    index = np.array([names_depth_index[name] for name in names_depth])
    labels = index[labels_hierarchy]
    names_labels = np.array(list(names_depth_index))
    return labels, names_labels

In [8]:
labels, names_labels = get_labels(labels_hierarchy, names_labels_hierarchy)

In [9]:
print(names_labels)

['Arts' 'Biological and health sciences' 'Everyday life' 'Geography'
 'History' 'Mathematics' 'People' 'Philosophy and religion'
 'Physical sciences' 'Society and social sciences' 'Technology']


In [10]:
len(np.unique(labels))

11

In [11]:
labels, names_labels = get_labels(labels_hierarchy, names_labels_hierarchy, depth=2)

In [12]:
print(names_labels[:10])

['Arts|||Architecture' 'Arts|||Cultural venues'
 'Arts|||Fictional characters' 'Arts|||General' 'Arts|||Literature'
 'Arts|||Modern visual arts' 'Arts|||Music' 'Arts|||Performing arts'
 'Arts|||Visual arts'
 'Biological and health sciences|||Anatomy and morphology']


In [13]:
len(np.unique(labels))

109

## Training

In [14]:
train = np.random.random(len(labels)) < 0.8
X_train = biadjacency[train]
X_test = biadjacency[~train]

In [15]:
print('Depth Accuracy Macro-F1')
for depth in [1, 2]:
    labels, names_labels = get_labels(labels_hierarchy, names_labels_hierarchy, depth)
    y_train = labels[train]
    y_test = labels[~train]
    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(depth, np.mean(y_test==y_pred), f1_score(y_test, y_pred, average='macro'))

Depth Accuracy Macro-F1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1 0.8937747035573123 0.8685146767870037
2 0.775197628458498 0.6125804088265839


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
