In [1]:
import numpy as np
%load_ext autoreload
%autoreload 2

# Clustering

## 1. Data loading

We load the data as in the previous analysis: documents are sorted by their length (term count) and terms by their frequency.

In [1]:
from model.dataset import RCV1Loader

# Instantiate the project's RCV1 loader with no arguments; the bare
# expression on the last line makes the notebook display the loader's repr
# (which reports the data file it reads from).
loader = RCV1Loader()
loader

RCV1Loader [File: /home/sebaq/Documents/GitHub/IR_project/dataset/data.npz]

In [2]:
# Load a reduced collection, requesting 5000 documents and 2000 terms, with
# documents and terms both sorted. For the full collection, drop the size
# arguments: loader.load(sort_docs=True, sort_terms=True).
data = loader.load(docs=5000, terms=2000, sort_docs=True, sort_terms=True)

INFO: Loading matrix. 
I/O: Loading /home/sebaq/Documents/GitHub/IR_project/dataset/data.npz. 
INFO: Removing non informative terms. 
INFO: Sorting documents by terms count. 
INFO: Sorting terms by their frequency. 


In [3]:
# Display the collection summary (document, term and non-zero counts).
# NOTE(review): the recorded output lists fewer terms than the 2000 requested;
# presumably the loader's removal of non-informative terms accounts for the
# difference — confirm.
data

DocumentsCollection[Docs: 5000; Terms: 900; Nonzero: 14563]

## 2. Dimensionality reduction

We rely on the [Johnson-Lindenstrauss lemma](https://scikit-learn.org/stable/modules/random_projection.html) to perform dimensionality reduction and dramatically shrink the vector space in which clustering is applied.

In [4]:
# Embed the collection into a lower-dimensional space and display the result.
# NOTE(review): eps is presumably the distortion tolerance of the
# Johnson-Lindenstrauss bound (smaller eps -> more dimensions kept) — confirm
# against the embed() implementation.
# NOTE(review): no random seed is set in the visible cells; if the projection
# is randomized, the embedding will differ between runs — confirm.
embedding = data.embed(eps=0.35)
embedding

array([[ 0.        ,  0.00698501, -0.00381764, ..., -0.00896949,
        -0.00166962, -0.01276779],
       [-0.00278406, -0.00337828, -0.00408574, ...,  0.0012755 ,
        -0.0061644 , -0.01288296],
       [ 0.        ,  0.        , -0.00629529, ..., -0.00526008,
         0.00351907,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## 3. Clustering

We perform clustering in the embedded space. The number of clusters is set to the square root of the number of items, rounded down.

In [5]:
from math import sqrt

from model.clustering import KMeansClustering

# Rule of thumb used here: k is the integer part of sqrt(number of items).
n_items = len(embedding)
kmeans = KMeansClustering(mat=embedding, k=int(sqrt(n_items)))
kmeans

KMeansClustering[Items: 5000; k: 70;  Fitted: False]

In [6]:
%%time
# Fit the K-Means model; the %%time cell magic (which must stay on the first
# line of the cell) reports how long the whole cell takes.
kmeans.fit()

INFO: Fitting K-Means model. 
CPU times: user 3.96 s, sys: 920 ms, total: 4.88 s
Wall time: 1.74 s


In [7]:
# Keep a handle on the clusters exposed by the model after fitting.
clusters = kmeans.clusters

In [8]:
# Display the split summary (item count, cluster count, mean items per cluster).
clusters

ClusterDataSplit [Data: 5000, Clusters: 70, Mean-per-Cluster: 71.429]

In [9]:
# Inspect the first cluster.
# NOTE(review): indexing by cluster id presumably returns the embedded vectors
# assigned to that cluster — confirm against the split's __getitem__.
clusters[0]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.01125858,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.01007577,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.01007577,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])