# Quickstart

Here we analyse a dataset from the text domain (dexter) for hubness,
reduce that hubness, and compare nearest-neighbor classification
with and without hubness reduction.

In [3]:
# Enable IPython's autoreload extension; mode 2 picks up edits to
# imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
# Load the example dataset 'dexter' as a data matrix X and a label vector y
from hubness.data import load_dexter
X, y = load_dexter()

# dexter is embedded in a high-dimensional space
# (the recorded output below shows 20000 columns for only 300 rows),
# and could, thus, be prone to hubness.
# The trailing expression displays both shapes as the cell output.
X.shape, y.shape

((300, 20000), (300,))

In [21]:
# Assess the actual degree of hubness in dexter (k=5, cosine metric)
from hubness import Hubness
hub = Hubness(k=5, metric='cosine')
# The return value of fit_transform is not used here; the `hub` object
# itself is read again by later cells for comparison.
hub.fit_transform(X)
# Skewness attribute of the fitted object, shown as the cell output
hub.k_skewness_

4.222131665788378

In [22]:
# Additional hubness indices are available on the `hub` object,
# for example the three below (each printed with three decimals):
print(f'Robin hood index: {hub.robinhood_index_:.3f}')
print(f'Antihub occurrence: {hub.antihub_occurrence_:.3f}')
print(f'Hub occurrence: {hub.hub_occurrence_:.3f}')

Robin hood index: 0.543
Antihub occurrence: 0.267
Hub occurrence: 0.634


In [28]:
# dexter shows considerable hubness. Check whether reducing it
# helps kNN classification, using 5-fold cross-validation.
from sklearn.model_selection import cross_val_score
from hubness.neighbors import KNeighborsClassifier

# settings shared by both classifiers
knn_params = dict(n_neighbors=5, metric='cosine')

# baseline: plain kNN without hubness reduction
knn_standard = KNeighborsClassifier(**knn_params)
# same settings, plus mutual proximity as hubness reduction
knn_mp = KNeighborsClassifier(hubness='mutual_proximity', **knn_params)

# one score per fold for each classifier
acc_standard = cross_val_score(knn_standard, X, y, cv=5)
acc_mp = cross_val_score(knn_mp, X, y, cv=5)

print(f'Accuracy (vanilla kNN): {acc_standard.mean():.3f}')
print(f'Accuracy (kNN with hubness reduction): {acc_mp.mean():.3f}')

Accuracy (vanilla kNN): 0.793
Accuracy (kNN with hubness reduction): 0.893


In [37]:
# Accuracy was considerably improved by mutual proximity.
# Did it actually reduce hubness?
# Fit knn_mp on the complete data, then obtain its neighbor graph.
knn_mp.fit(X, y)
# NOTE(review): kneighbors_graph() is called with its defaults — confirm the
# returned graph is in the form Hubness(metric='precomputed') expects.
neighbor_graph = knn_mp.kneighbors_graph()

# Measure hubness again, this time from the precomputed neighbor graph
hub_mp = Hubness(k=5, metric='precomputed').estimate(neighbor_graph)
# "reduction" = value before (hub) minus value after (hub_mp)
print(f'Skewness: {hub_mp.k_skewness_:.3f} (reduction of {hub.k_skewness_ - hub_mp.k_skewness_:.3f})')
print(f'Robin hood: {hub_mp.robinhood_index_:.3f} (reduction of {hub.robinhood_index_ - hub_mp.robinhood_index_:.3f})')

Skewness: 0.933 (reduction of 3.289)
Robin hood: 0.288 (reduction of 0.255)


In [None]:
# The neighbor graph can also be created directly,
# with or without hubness reduction
# (this rebinds `neighbor_graph` from the previous cell).
from hubness.neighbors import kneighbors_graph
# NOTE(review): no metric is passed here, unlike the metric='cosine'
# classifiers above — confirm the library default is intended.
neighbor_graph = kneighbors_graph(X, n_neighbors=5, hubness='mutual_proximity')


In [None]:
# Complete quickstart workflow written against the `skhubness` package.
# X and y are not created here; they must already exist from an earlier cell.

# assess the actual degree of hubness in dexter
# (note: this rebinds `hub`, now with k=10)
from skhubness import Hubness
hub = Hubness(k=10, metric='cosine')
hub.fit(X)
k_skew = hub.score()
print(f'Skewness = {k_skew:.3f}')

# additional hubness indices are available, for example:
print(f'Robin hood index: {hub.robinhood_index:.3f}')
print(f'Antihub occurrence: {hub.antihub_occurrence:.3f}')
print(f'Hub occurrence: {hub.hub_occurrence:.3f}')

# There is considerable hubness in dexter.
# Let's see, whether hubness reduction can improve
# kNN classification performance
from sklearn.model_selection import cross_val_score
from skhubness.neighbors import KNeighborsClassifier

# vanilla kNN
knn_standard = KNeighborsClassifier(n_neighbors=5,
                                    metric='cosine')
acc_standard = cross_val_score(knn_standard, X, y, cv=5)

# kNN with hubness reduction (mutual proximity)
knn_mp = KNeighborsClassifier(n_neighbors=5,
                              metric='cosine',
                              hubness='mutual_proximity')
acc_mp = cross_val_score(knn_mp, X, y, cv=5)

print(f'Accuracy (vanilla kNN): {acc_standard.mean():.3f}')
print(f'Accuracy (kNN with hubness reduction): {acc_mp.mean():.3f}')

# Accuracy was considerably improved by mutual proximity.
# Did it actually reduce hubness?
hub_mp = Hubness(k=10, metric='cosine',
                 hubness='mutual_proximity')
hub_mp.fit(X)
k_skew_mp = hub_mp.score()
# Report the skewness measured *after* mutual proximity (k_skew_mp);
# printing k_skew here would merely repeat the pre-reduction value.
print(f'Skewness: {k_skew_mp:.3f} '
      f'(reduction of {k_skew - k_skew_mp:.3f})')
print(f'Robin hood: {hub_mp.robinhood_index:.3f} '
      f'(reduction of {hub.robinhood_index - hub_mp.robinhood_index:.3f})')

# The neighbor graph can also be created directly,
# with or without hubness reduction
from skhubness.neighbors import kneighbors_graph
neighbor_graph = kneighbors_graph(X, n_neighbors=5, hubness='mutual_proximity')

In [1]:
from sklearn.datasets import make_sparse_spd_matrix

In [3]:
# Generate a matrix with make_sparse_spd_matrix (50 x 50 per the output below).
# NOTE(review): no random_state is passed, so the result presumably
# differs between runs — confirm whether a fixed seed is wanted.
sparse = make_sparse_spd_matrix(50)

In [10]:
# Dimensions of the generated matrix
sparse.shape

(50, 50)

In [9]:
# Boolean-mask selection of the entries equal to zero; the length of the
# resulting 1-D array is the number of zero entries in the matrix
sparse[sparse == 0].shape

(2254,)

In [12]:
# Check which container type holds the generated matrix
type(sparse)

numpy.ndarray