In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from TGA.utils import Dataset
from sklearn.preprocessing import LabelEncoder
from TGA.utils import preprocessor
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Open the dataset directory and take the first split yielded by
# get_fold_instances (presumably 10 is the number of folds — TODO confirm in TGA.utils).
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
dataset = Dataset('/home/Documentos/datasets/classification/datasets/20ng/')
fold = next(dataset.get_fold_instances(10, with_val=True))
# Show the split's field names and the length of the training part.
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val'), 15062)

In [4]:
# Encode class labels as integer ids. The encoder is fitted on the training
# labels only and then reused, unchanged, for the validation labels.
le = LabelEncoder().fit(fold.y_train)
y_train = le.transform(fold.y_train)
y_val = le.transform(fold.y_val)

In [5]:
# Fit the TF-IDF vectorizer on the training texts only, then map the
# validation texts into the same feature space.
tfidf = TfidfVectorizer(preprocessor=preprocessor)
X_train_vec = tfidf.fit_transform(fold.X_train)
X_val_vec = tfidf.transform(fold.X_val)
# Show both matrix shapes; the column counts must match.
X_train_vec.shape, X_val_vec.shape

((15062, 99009), (1892, 99009))

In [6]:
# n_neighbors   : neighbours used by the classifier for each query.
# n_neighbors_2 : neighbours fetched per training point for the second hop.
n_neighbors, n_neighbors_2 = 5, 2

In [7]:
# Cosine-metric k-NN with inverse-distance vote weighting, fitted on the
# TF-IDF training matrix. `fit` returns the estimator itself.
knn = KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights='distance',
    metric='cosine',
    n_jobs=4,
).fit(X_train_vec, y_train)
knn

KNeighborsClassifier(metric='cosine', n_jobs=4, weights='distance')

In [8]:
%%time
# Baseline: plain k-NN predictions (encoded class ids) for the validation set.
y_pred = knn.predict(X_val_vec)

CPU times: user 3.62 s, sys: 321 ms, total: 3.94 s
Wall time: 1.3 s


In [9]:
# Baseline validation scores as a (macro-F1, micro-F1) tuple.
tuple(f1_score(y_val, y_pred, average=avg) for avg in ('macro', 'micro'))

(0.7928907345893542, 0.7996828752642706)

# Transitivity

In [10]:
# Calling kneighbors without X queries the fitted (training) points themselves:
# one row per training document, n_neighbors_2 columns. Per scikit-learn's
# documentation a point is then not counted as its own neighbour.
global_neigh_dists, global_neigh_ind = knn.kneighbors(n_neighbors=n_neighbors_2)
# Shapes of the distance and index tables for the training set.
global_neigh_dists.shape, global_neigh_ind.shape

((15062, 2), (15062, 2))

In [11]:
%%time
# First hop: distances to, and training-set indices of, the nearest neighbours
# of every validation document (the classifier's own n_neighbors is used).
neigh_dists, neigh_ind = knn.kneighbors(X_val_vec)

CPU times: user 3.61 s, sys: 320 ms, total: 3.93 s
Wall time: 1.32 s


In [12]:
def distance(dist):
    """Turn a distance into an inverse-distance weight.

    Parameters
    ----------
    dist : float or array-like of float
        One distance, or an array of distances.

    Returns
    -------
    float or numpy.ndarray
        ``1 / dist``, with every infinite result (i.e. ``dist == 0``) replaced
        by ``0.0``. A scalar input yields a Python float; an array input
        yields an array of the same shape.

    Notes
    -----
    A zero distance therefore contributes *no* weight. The input is coerced
    through NumPy so that a plain Python ``0.0`` no longer raises
    ``ZeroDivisionError`` (``np.errstate`` only guards NumPy division) and so
    that arrays are handled element-wise.
    """
    values = np.asarray(dist, dtype=float)
    with np.errstate(divide='ignore'):
        inverse = 1. / values
    # Division by zero yields +/-inf; map those entries to a zero weight.
    inverse = np.where(np.isinf(inverse), 0., inverse)
    if inverse.ndim == 0:
        return float(inverse)
    return inverse

In [15]:
from sklearn.utils.validation import _num_samples
from sklearn.neighbors._base import _get_weights
from sklearn.utils import check_array
from sklearn.utils.extmath import weighted_mode


In [16]:
%%time
# Transitive (two-hop) vote. For every validation document, each direct
# neighbour votes for its own class, and each of that neighbour's own
# training-set neighbours adds a further vote. `y_probs` holds unnormalised
# per-class scores (rows: validation documents, columns: encoded classes).
y_probs = np.zeros((len(neigh_dists), len(le.classes_)))
for docid, (doc_n_dists, doc_n_inds) in enumerate(zip(neigh_dists, neigh_ind)):
    for neigh_id, dist in zip(doc_n_inds, doc_n_dists):
        # First hop: inverse-distance weighted vote of the direct neighbour.
        y_probs[docid, y_train[neigh_id]] += distance(dist)

        # Second hop: the direct neighbour's precomputed training-set neighbours.
        onehop_dists, onehop_ind = global_neigh_dists[neigh_id], global_neigh_ind[neigh_id]

        # A distinct loop name keeps the outer `neigh_id` from being clobbered.
        for hop_id, dist2 in zip(onehop_ind, onehop_dists):
            # NOTE(review): this multiplies the raw first-hop distance by the
            # inverse second-hop distance, so farther direct neighbours boost
            # their neighbours more. Confirm this is intended (as opposed to
            # e.g. distance(dist) * distance(dist2) or distance(dist + dist2)).
            y_probs[docid, y_train[hop_id]] += dist * distance(dist2)
        

CPU times: user 259 ms, sys: 0 ns, total: 259 ms
Wall time: 257 ms


In [17]:
# Predicted class id = column with the highest accumulated score in each row.
y_pred = np.argmax(y_probs, axis=1)

In [18]:
# Validation scores of the transitive vote as a (macro-F1, micro-F1) tuple.
tuple(f1_score(y_val, y_pred, average=avg) for avg in ('macro', 'micro'))

(0.7138756017522335, 0.7193446088794925)

In [None]:
# Debug: labels of the training-set neighbours of `neigh_id`.
# NOTE(review): `neigh_id` is whatever value the last executed loop left bound
# to that name, so this cell depends on hidden kernel state.
y_train[global_neigh_ind[neigh_id]]

In [None]:
def predict_print(self, X):
    """Predict the class labels for the provided data.

    Module-level function that takes the fitted neighbours classifier
    explicitly as ``self`` so the prediction steps can be inspected.

    Parameters
    ----------
    self : fitted neighbours classifier
        Must expose ``kneighbors``, ``classes_``, ``_y``, ``outputs_2d_``
        and ``weights``.
    X : array-like of shape (n_queries, n_features), or
        (n_queries, n_indexed) if metric == 'precomputed'
        Test samples.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
        Class labels for each data sample.
    """
    # `stats` was never imported anywhere in the notebook, so the unweighted
    # branch below raised NameError; import it where it is needed.
    from scipy import stats

    X = check_array(X, accept_sparse='csr')

    neigh_dist, neigh_ind = self.kneighbors(X)
    classes_ = self.classes_
    _y = self._y
    if not self.outputs_2d_:
        # Single-output case: normalise to the 2-D layout used below.
        _y = self._y.reshape((-1, 1))
        classes_ = [self.classes_]

    n_outputs = len(classes_)
    n_queries = _num_samples(X)
    weights = _get_weights(neigh_dist, self.weights)

    y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
    for k, classes_k in enumerate(classes_):
        if weights is None:
            # Plain majority vote among the neighbours.
            mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
        else:
            # Vote weighted by the per-neighbour weights.
            mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

        # The mode is an index into this output's class list.
        mode = np.asarray(mode.ravel(), dtype=np.intp)
        y_pred[:, k] = classes_k.take(mode)

    if not self.outputs_2d_:
        y_pred = y_pred.ravel()

    return y_pred

In [None]:
# Run the standalone prediction routine on the fitted classifier and display
# the predicted labels for the validation set.
predict_print(knn, X_val_vec)

In [None]:
# Column-vector view of the encoded training labels (n_train, 1).
_y_train = y_train[:, np.newaxis]

# Labels of the nearest training neighbours of every validation document.
_y_train[neigh_ind, 0]

In [None]:
# Per-neighbour vote weights for the 'distance' scheme.
# NOTE(review): _get_weights is a private scikit-learn helper and may change between releases.
weights = _get_weights(neigh_dists, 'distance')

In [None]:
# Display the raw neighbour distances next to the weights derived from them.
neigh_dists, weights

In [None]:
# Weighted vote over each validation document's neighbour labels.
# The result is indexed rather than unpacked into `_`: at notebook top level,
# assigning to `_` overrides IPython's "last output" variable.
mode = weighted_mode(_y_train[neigh_ind, 0], weights, axis=1)[0]
mode = np.asarray(mode.ravel(), dtype=np.intp)
mode