In [13]:
import re
import string
import spacy
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.base import TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from gsdmm import MovieGroupProcessArray, MovieGroupProcess

## Dummy data

In [2]:
# Seed NumPy's global RNG before the models below are fitted.
# NOTE(review): presumably this makes the clustering runs repeatable — confirm
# that gsdmm draws its random numbers from NumPy's global generator.
np.random.seed(1)

# Ten short example texts used as toy data: five about p-values followed by
# five about neural networks.
docs = [
    # p-value texts
    'A p-value is a measure of the probability that an observed difference could have occurred just by random chance',
    'In null hypothesis significance testing, the p-value is the probability of obtaining test results at least as extreme as the results actually observed',
    'A p-value, or probability value, is a number describing how likely it is that your data would have occurred by random chance',
    'A p-value is used in hypothesis testing to help you support or reject the null hypothesis',
    'The P-value, or calculated probability, is the probability of finding the observed, or more extreme, results when the null hypothesis',
    # neural-network texts
    'A neural network is a network or circuit of neurons, or in a modern sense, an artificial neural network, composed of artificial neurons or nodes',
    'An artificial neural network is an interconnected group of nodes, inspired by a simplification of neurons in a brain',
    'Neural networks, also known as artificial neural networks (ANNs) or simulated neural networks (SNNs), are a subset of machine learning ',
    'Modeled loosely on the human brain, a neural net consists of thousands or even millions of simple processing nodes that are densely',
    'Neural networks are a set of algorithms, modeled loosely after the human brain, that are designed to recognize patterns']

In [3]:
# Words that are dropped from every document before clustering.
stopwords = ['this', 'is', 'a', 'the', 'of', 'an', 'that', 'or']

# Lower-case each text, delete commas and full stops, split on whitespace and
# keep only the tokens that are not stop words.
docs_toks = [
    [
        token
        for token in text.lower().replace(',', '').replace('.', '').split()
        if token not in stopwords
    ]
    for text in docs
]

The input should be a list of documents, where each document is itself a list of tokens. The model itself doesn't do any preprocessing, only indexing of tokens.

## Init model and train

In [4]:
# Both implementations are built from one set of hyper-parameters so that
# they are configured identically.
dummy_model_params = dict(K=10, alpha=0.1, beta=0.1, n_iters=22)
mgp_ar = MovieGroupProcessArray(**dummy_model_params)
mgp = MovieGroupProcess(**dummy_model_params)

# Fit the array version on the tokenised toy documents and keep what `fit`
# returns.
y = mgp_ar.fit(docs_toks)

In stage 0: transferred 6 clusters with 4 clusters populated
In stage 1: transferred 1 clusters with 5 clusters populated
In stage 2: transferred 1 clusters with 5 clusters populated
In stage 3: transferred 1 clusters with 5 clusters populated
In stage 4: transferred 1 clusters with 5 clusters populated
In stage 5: transferred 1 clusters with 5 clusters populated
In stage 6: transferred 1 clusters with 5 clusters populated
In stage 7: transferred 1 clusters with 5 clusters populated
In stage 8: transferred 1 clusters with 5 clusters populated
In stage 9: transferred 1 clusters with 5 clusters populated
In stage 10: transferred 1 clusters with 5 clusters populated
In stage 11: transferred 1 clusters with 4 clusters populated
In stage 12: transferred 1 clusters with 5 clusters populated
In stage 13: transferred 0 clusters with 5 clusters populated
In stage 14: transferred 1 clusters with 5 clusters populated
In stage 15: transferred 1 clusters with 5 clusters populated
In stage 16: trans

In [5]:
# `MovieGroupProcess.fit` is called with the number of distinct tokens in the
# corpus as its second argument.
y_old = mgp.fit(
    docs_toks,
    len({token for tokens in docs_toks for token in tokens}),
)

In stage 0: transferred 7 clusters with 4 clusters populated
In stage 1: transferred 1 clusters with 4 clusters populated
In stage 2: transferred 2 clusters with 5 clusters populated
In stage 3: transferred 3 clusters with 5 clusters populated
In stage 4: transferred 1 clusters with 4 clusters populated
In stage 5: transferred 0 clusters with 4 clusters populated
In stage 6: transferred 2 clusters with 5 clusters populated
In stage 7: transferred 1 clusters with 5 clusters populated
In stage 8: transferred 3 clusters with 5 clusters populated
In stage 9: transferred 1 clusters with 4 clusters populated
In stage 10: transferred 1 clusters with 5 clusters populated
In stage 11: transferred 1 clusters with 5 clusters populated
In stage 12: transferred 3 clusters with 5 clusters populated
In stage 13: transferred 3 clusters with 5 clusters populated
In stage 14: transferred 3 clusters with 5 clusters populated
In stage 15: transferred 3 clusters with 5 clusters populated
In stage 16: trans

## See topics

In [6]:
# The array version skips topics that have no documents assigned, so only
# the populated cluster indices appear as keys in the printed dict.
pprint(mgp_ar.top_words())

{0: ' hypothesis p-value results probability null',
 2: ' network artificial neural neurons in',
 6: ' by chance have occurred p-value',
 8: ' neural are networks human brain'}


In [7]:
# Same call on the non-array model. In the recorded output it reports all ten
# cluster indices, with an empty string for some of them.
pprint(mgp.top_words())

{0: 'network artificial neural neurons nodes',
 1: '',
 2: '',
 3: '',
 4: 'p-value probability have occurred by',
 5: '',
 6: '',
 7: 'hypothesis p-value probability results null',
 8: '',
 9: 'neural are networks modeled loosely'}


In [8]:
# Label an unseen sentence with the array model; the recorded output is a
# pair — presumably (cluster index, score), to be confirmed against gsdmm.
# NOTE(review): the query is only whitespace-split, so it still contains the
# stop words ('is', 'a', 'of', 'the') that were removed from the training
# documents — confirm how the model treats tokens it was not fitted on.
mgp_ar.choose_best_label('p-value is a measure of the probability'.split())

(6, 0.8709794263411225)

In [9]:
# Same query against the non-array model, for comparison.
# NOTE(review): the query is only whitespace-split and keeps stop words that
# were removed from the training documents — confirm this is intended.
mgp.choose_best_label('p-value is a measure of the probability'.split())

(4, 0.7213762163883213)

## Speed comparison - 20NewsGroups 

The topics themselves are not really of interest here; the data would probably need more cleaning.

In [11]:
# Restrict the 20 Newsgroups corpus to five newsgroups.
categories = ['alt.atheism', 'comp.graphics',
              'rec.sport.hockey', 'sci.crypt', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
# Target labels of the fetched posts; not referenced again in the cells shown
# in this section.
y_true = newsgroups.target

#### preprocess data - this takes some time

In [14]:
class TextPreprocessor(TransformerMixin):
    """Convert one text column of a DataFrame into lists of lemmas.

    Every value of the configured column goes through three steps: symbol,
    attribution-line and e-mail removal (``_clean``), reduction to runs of
    ASCII letters (``_leave_letters_only``) and lemmatization
    (``_lemmatize``). Lemmatization looks up a module-level callable named
    ``nlp``, which must exist by the time ``transform`` is called.
    """

    def __init__(self, text_attribute):
        """Remember the name of the column that holds the raw text."""
        self.text_attribute = text_attribute

    def fit(self, *_):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, X, *_):
        """Return a copy of ``X`` whose text column holds the preprocessed values."""
        column = self.text_attribute
        result = X.copy()
        result[column] = result[column].apply(self._preprocess_text)
        return result

    def _preprocess_text(self, text):
        """Run the full clean -> letters-only -> lemmatize chain on one text."""
        cleaned = self._clean(text)
        letters_only = self._leave_letters_only(cleaned)
        return self._lemmatize(letters_only)

    def _clean(self, text):
        """Remove selected symbols, attribution lines, e-mail addresses and header labels."""
        bad_symbols = '!"#%&\'*+,-<=>?[\\]^_`{|}~'
        stripped = text.translate(str.maketrans('', '', bad_symbols))

        # Drop lines that start with "from:" or end with "writes:" (compared
        # case-insensitively); every surviving line is terminated with '\n'.
        kept_lines = []
        for line in stripped.split('\n'):
            lowered = line.lower()
            if lowered.startswith('from:') or lowered.endswith('writes:'):
                continue
            kept_lines.append(line + '\n')
        clean_text = ''.join(kept_lines)

        # The substitutions are applied one after another, in this order.
        email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
        for pattern in (email_regex, r'Subject:', r'Re:'):
            clean_text = re.sub(pattern, '', clean_text)

        return clean_text

    def _leave_letters_only(self, text):
        """Delete punctuation, then join the remaining runs of ASCII letters with spaces."""
        punctuation_table = str.maketrans('', '', string.punctuation)
        unpunctuated = text.translate(punctuation_table)
        letter_runs = re.findall("[a-zA-Z]+", unpunctuated)
        return ' '.join(letter_runs)

    def _lemmatize(self, text):
        """Return the lemmas of the tokens that are neither stop words nor PUNCT/PART/X.

        The text is parsed by calling the module-level ``nlp`` object.
        """
        skipped_pos = ('PUNCT', 'PART', 'X')
        lemmas = []
        for token in nlp(text):
            if token.is_stop or token.pos_ in skipped_pos:
                continue
            lemmas.append(token.lemma_)
        return lemmas

# Pipeline that TextPreprocessor._lemmatize looks up as the global `nlp`.
nlp = spacy.load("en_core_web_sm")

# One row per fetched post, with the raw text in the 'text' column.
df = pd.DataFrame({'text': newsgroups['data']})

text_preprocessor = TextPreprocessor(text_attribute='text')
df_preprocessed = text_preprocessor.transform(df)

# Note: this rebinds `docs`, which earlier held the toy example texts.
docs = df_preprocessed['text'].tolist()

In [16]:
# Peek at the first ten tokens of the first preprocessed document.
docs[0][:10]

['facinate',
 'fact',
 'bit',
 'serial',
 'number',
 'possibly',
 'fix',
 'S',
 'S',
 'Organization']

#### train models

In [17]:
# Both implementations are built from one set of hyper-parameters; K=5
# matches the number of newsgroup categories selected above.
news_model_params = dict(K=5, alpha=0.1, beta=0.1, n_iters=22)
mgp_20news = MovieGroupProcess(**news_model_params)
mgp_20news_ar = MovieGroupProcessArray(**news_model_params)

In [18]:
# Time the non-array model on the preprocessed posts. `docs` was built from
# the same `df_preprocessed.text` column, so the second argument is the number
# of distinct tokens in the corpus being fitted. The timed statement includes
# the `tolist()` call and the vocabulary count as well as `fit` itself.
%time y = mgp_20news.fit(df_preprocessed.text.tolist(), len(set([item for sublist in docs for item in sublist])))

In stage 0: transferred 1992 clusters with 5 clusters populated
In stage 1: transferred 476 clusters with 5 clusters populated
In stage 2: transferred 145 clusters with 5 clusters populated
In stage 3: transferred 84 clusters with 5 clusters populated
In stage 4: transferred 85 clusters with 5 clusters populated
In stage 5: transferred 72 clusters with 5 clusters populated
In stage 6: transferred 80 clusters with 5 clusters populated
In stage 7: transferred 52 clusters with 5 clusters populated
In stage 8: transferred 36 clusters with 5 clusters populated
In stage 9: transferred 29 clusters with 5 clusters populated
In stage 10: transferred 20 clusters with 5 clusters populated
In stage 11: transferred 29 clusters with 5 clusters populated
In stage 12: transferred 26 clusters with 5 clusters populated
In stage 13: transferred 16 clusters with 5 clusters populated
In stage 14: transferred 14 clusters with 5 clusters populated
In stage 15: transferred 24 clusters with 5 clusters populate

In [19]:
# Time the array version on the same preprocessed posts.
# NOTE(review): this rebinds `y`, so the value returned by the earlier
# `mgp_20news.fit(...)` call is no longer reachable under that name.
%time y = mgp_20news_ar.fit(df_preprocessed.text.tolist())

In stage 0: transferred 1964 clusters with 5 clusters populated
In stage 1: transferred 691 clusters with 5 clusters populated
In stage 2: transferred 248 clusters with 5 clusters populated
In stage 3: transferred 119 clusters with 5 clusters populated
In stage 4: transferred 70 clusters with 5 clusters populated
In stage 5: transferred 70 clusters with 5 clusters populated
In stage 6: transferred 58 clusters with 5 clusters populated
In stage 7: transferred 40 clusters with 5 clusters populated
In stage 8: transferred 35 clusters with 5 clusters populated
In stage 9: transferred 33 clusters with 5 clusters populated
In stage 10: transferred 38 clusters with 5 clusters populated
In stage 11: transferred 31 clusters with 5 clusters populated
In stage 12: transferred 27 clusters with 5 clusters populated
In stage 13: transferred 30 clusters with 5 clusters populated
In stage 14: transferred 29 clusters with 5 clusters populated
In stage 15: transferred 29 clusters with 5 clusters populat

#### Compare topics

In [20]:
# Top words per cluster for the non-array model. In the recorded output
# 'Organization' appears in every topic and 'Lines' in most — presumably
# header labels that the preprocessing leaves in.
pprint(mgp_20news.top_words())

{0: 'Organization Lines know say people',
 1: 'Organization Lines University file know',
 2: 'Organization tempest Lines group good',
 3: 'Organization Lines game University team',
 4: 'Organization key people think know'}


In [21]:
# Top words per cluster for the array version. In the recorded output each
# string starts with a space, and 'Organization' and 'Lines' appear in every
# topic — presumably header labels that the preprocessing leaves in.
pprint(mgp_20news_ar.top_words())

{0: ' Organization Lines x file University',
 1: ' Organization Lines University file d',
 2: ' order Lines Organization know OTO',
 3: ' Organization game Lines University team',
 4: ' Organization people key think Lines'}
