In [1]:
from gsdmm import MovieGroupProcessArray, MovieGroupProcess
import numpy as np
from pprint import pprint

## Dummy data

In [2]:
# Fix NumPy's global RNG seed before any model is built or fitted
# (presumably the gsdmm samplers draw from it -- confirm against the library).
np.random.seed(1)

# Toy corpus, part 1: five short sentences about p-values / hypothesis testing.
pvalue_docs = [
    'A p-value is a measure of the probability that an observed difference could have occurred just by random chance',
    'In null hypothesis significance testing, the p-value is the probability of obtaining test results at least as extreme as the results actually observed',
    'A p-value, or probability value, is a number describing how likely it is that your data would have occurred by random chance',
    'A p-value is used in hypothesis testing to help you support or reject the null hypothesis',
    'The P-value, or calculated probability, is the probability of finding the observed, or more extreme, results when the null hypothesis',
]

# Toy corpus, part 2: five short sentences about neural networks.
neural_net_docs = [
    'A neural network is a network or circuit of neurons, or in a modern sense, an artificial neural network, composed of artificial neurons or nodes',
    'An artificial neural network is an interconnected group of nodes, inspired by a simplification of neurons in a brain',
    'Neural networks, also known as artificial neural networks (ANNs) or simulated neural networks (SNNs), are a subset of machine learning ',
    'Modeled loosely on the human brain, a neural net consists of thousands or even millions of simple processing nodes that are densely',
    'Neural networks are a set of algorithms, modeled loosely after the human brain, that are designed to recognize patterns',
]

# Full corpus in the original order: p-value sentences first, then neural-network ones.
docs = pvalue_docs + neural_net_docs

In [3]:
# Words dropped from every document before clustering.
stopwords = ['this', 'is', 'a', 'the', 'of', 'an', 'that', 'or']

# One token list per document: lower-case the text, delete commas and periods,
# split on whitespace, then keep only the tokens that are not stopwords.
docs_toks = [
    [
        tok
        for tok in text.lower().replace(',', '').replace('.', '').split()
        if tok not in stopwords
    ]
    for text in docs
]

## Init model and train

In [4]:
# Both implementations are built with exactly the same hyperparameters.
hyperparams = dict(K=10, alpha=0.1, beta=0.1, n_iters=22)
mgp_ar = MovieGroupProcessArray(**hyperparams)
mgp = MovieGroupProcess(**hyperparams)

# Only the array-based model is fitted in this cell; `mgp` is fitted in the next one.
y = mgp_ar.fit(docs_toks)

In stage 0: transferred 6 clusters with 4 clusters populated
In stage 1: transferred 1 clusters with 5 clusters populated
In stage 2: transferred 1 clusters with 5 clusters populated
In stage 3: transferred 1 clusters with 5 clusters populated
In stage 4: transferred 1 clusters with 5 clusters populated
In stage 5: transferred 1 clusters with 5 clusters populated
In stage 6: transferred 1 clusters with 5 clusters populated
In stage 7: transferred 1 clusters with 5 clusters populated
In stage 8: transferred 1 clusters with 5 clusters populated
In stage 9: transferred 1 clusters with 5 clusters populated
In stage 10: transferred 1 clusters with 5 clusters populated
In stage 11: transferred 1 clusters with 4 clusters populated
In stage 12: transferred 1 clusters with 5 clusters populated
In stage 13: transferred 0 clusters with 5 clusters populated
In stage 14: transferred 1 clusters with 5 clusters populated
In stage 15: transferred 1 clusters with 5 clusters populated
In stage 16: trans

In [5]:
# The second argument passed to this fit() is the number of distinct tokens
# across all tokenised documents.
vocab_size = len({tok for doc_toks in docs_toks for tok in doc_toks})
y_old = mgp.fit(docs_toks, vocab_size)

In stage 0: transferred 7 clusters with 4 clusters populated
In stage 1: transferred 1 clusters with 4 clusters populated
In stage 2: transferred 2 clusters with 5 clusters populated
In stage 3: transferred 3 clusters with 5 clusters populated
In stage 4: transferred 1 clusters with 4 clusters populated
In stage 5: transferred 0 clusters with 4 clusters populated
In stage 6: transferred 2 clusters with 5 clusters populated
In stage 7: transferred 1 clusters with 5 clusters populated
In stage 8: transferred 3 clusters with 5 clusters populated
In stage 9: transferred 1 clusters with 4 clusters populated
In stage 10: transferred 1 clusters with 5 clusters populated
In stage 11: transferred 1 clusters with 5 clusters populated
In stage 12: transferred 3 clusters with 5 clusters populated
In stage 13: transferred 3 clusters with 5 clusters populated
In stage 14: transferred 3 clusters with 5 clusters populated
In stage 15: transferred 3 clusters with 5 clusters populated
In stage 16: trans

## See topics

In [6]:
# Show the top words per cluster for the array-based model.
# The array version leaves out clusters that ended up with no documents,
# so only populated cluster ids appear as keys in the printed dict.
pprint(mgp_ar.top_words())

{0: ' hypothesis p-value results probability null',
 2: ' network artificial neural neurons in',
 6: ' by chance have occurred p-value',
 8: ' neural are networks human brain'}


In [7]:
# Show the top words per cluster for the non-array model. In the saved output
# every cluster id 0-9 is listed, with an empty string for clusters without words.
pprint(mgp.top_words())

{0: 'network artificial neural neurons nodes',
 1: '',
 2: '',
 3: '',
 4: 'p-value probability have occurred by',
 5: '',
 6: '',
 7: 'hypothesis p-value probability results null',
 8: '',
 9: 'neural are networks modeled loosely'}


In [11]:
# Ask the array-based model for the best cluster for an unseen sentence.
# The saved output is a pair: an integer (presumably the cluster id) and a float
# (presumably that cluster's score/probability -- confirm against the library).
# NOTE(review): the query is only whitespace-split here; unlike the training
# documents it is not stopword-filtered ('is', 'a', 'of', 'the' stay in) --
# confirm this mismatch is intended.
mgp_ar.choose_best_label('p-value is a measure of the probability'.split())

(6, 0.8709794263411225)

In [13]:
# Same query against the non-array model, for comparison with the array version.
# The saved output is a pair: an integer (presumably the cluster id) and a float
# (presumably that cluster's score/probability -- confirm against the library).
# NOTE(review): the query is only whitespace-split here; unlike the training
# documents it is not stopword-filtered ('is', 'a', 'of', 'the' stay in) --
# confirm this mismatch is intended.
mgp.choose_best_label('p-value is a measure of the probability'.split())

(4, 0.7213762163883213)