**This notebook applies the semi-supervised (or constrained) LBM `HLBM` to high-dimensional text data.**

# Imports

In [None]:
import os
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib

from dcblockmodels.models.hlbm import HLBM
from dcblockmodels import metrics, plot, data
from dcblockmodels.models.utils import similarity_matrices, general, init

import os
import sys
# Send everything written to sys.stderr to the null device for the rest of
# the session.
# NOTE(review): the file handle is never closed and the original stderr is
# not kept, so genuine error output is hidden too -- confirm this is intended.
sys.stderr = open(os.devnull, "w")

# Data

In [None]:
import re
import nltk         
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
nltk.download('stopwords')

def handle_spe_char(s):
    """
    Removes some special characters considered uninformative and
    normalises numbers.

    Steps, in order:
      1. every character of `spe_char` is deleted (not replaced by a
         space, so "a.b" becomes "ab");
      2. newlines are replaced by a space;
      3. every run of digits is replaced by " =number " (the '=' sign
         is stripped in step 1, so '=number' can only come from step 3).

    Parameters
    ----------
    s : str
        raw text of a document

    Returns
    -------
    str
        the cleaned text
    """
    # Raw string literals: the previous non-raw literals relied on the
    # invalid escape sequences "\_" and "\d", which trigger a SyntaxWarning
    # on recent Python versions. The set of stripped characters is unchanged
    # (it contains both the backslash and the underscore).
    spe_char = r".,;()'-/:=[]`*+\_^|"  # curly quotes ‘’ are not stripped
    table = str.maketrans(dict.fromkeys(spe_char))
    s = s.translate(table)
    # No separate handling of "''" is needed: the apostrophe belongs to
    # `spe_char`, so none is left after the translation above.
    s = s.replace('\n', ' ')
    s = re.sub(r'\d+', ' =number ', s)
    return s

class StemTokenizer:
    """
    Callable tokenizer: splits a document with `nltk.word_tokenize`,
    stems every token with the Porter stemmer and keeps the stems that
    are at least `min_word_length` characters long and are not in the
    English stop-word list.

    Parameters
    ----------
    min_word_length : int
        minimum length (in characters) of a stem for it to be kept
    """
    # A frozenset gives constant-time membership tests; the plain list
    # returned by `stopwords.words` was scanned linearly for every token.
    stop_words = frozenset(stopwords.words('english'))

    def __init__(self, min_word_length):
        self.stemmer = nltk.stem.PorterStemmer()
        self.tokenizer = nltk.word_tokenize
        self.min_wl = min_word_length

    def __call__(self, doc):
        """Returns the list of kept stems of `doc`, in document order."""
        res_doc = []
        for token in self.tokenizer(doc):
            token_ = self.stemmer.stem(token)
            # NOTE(review): the filter is applied to the *stem*, while the
            # stop-word list presumably holds unstemmed words, so a stop
            # word whose stem differs from it may get through -- confirm
            # whether the raw token should be tested as well.
            if len(token_) >= self.min_wl and token_ not in self.stop_words:
                res_doc.append(token_)
        return res_doc
    

# Bag-of-words vectorizer shared by all the datasets below: documents are
# cleaned with `handle_spe_char`, then tokenized and stemmed by `StemTokenizer`.
# NOTE(review): per the scikit-learn documentation a custom `preprocessor`
# overrides `lowercase`/`strip_accents` and a custom `tokenizer` overrides
# `token_pattern` -- confirm these arguments are deliberately left as is.
vectorizer = CountVectorizer(
    # reading of the raw documents
    input='content',
    encoding='utf-8',
    decode_error='strict',
    # text normalisation
    strip_accents=None,
    lowercase=True,
    preprocessor=handle_spe_char,
    # tokenization
    tokenizer=StemTokenizer(min_word_length=3),
    stop_words=None,
    token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1),
    analyzer='word',
    # vocabulary
    max_df=1.0,
    min_df=1,
    max_features=None,
    vocabulary=None,
    # output
    binary=False,
    dtype=np.int64,
)


## 20 Newsgroup

### Build

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the whole 20 Newsgroups corpus (all subsets, all categories, nothing
# removed from the posts), shuffled with a fixed seed.
corpus = fetch_20newsgroups(
    data_home=None,
    subset='all',
    categories=None,
    shuffle=True,
    random_state=42,
    remove=(),
    download_if_missing=True,
)

corpus.target, corpus.target_names, corpus

In [None]:
# Fetch the NLTK 'punkt' resource -- presumably required by the
# `nltk.word_tokenize` call made in `StemTokenizer`; verify for your NLTK version.
nltk.download('punkt')

In [None]:
# Document-term count matrix of the corpus and the matching labels
X0 = vectorizer.fit_transform(corpus.data)
y_ = corpus.target
# column sums of X0: one total per vocabulary term
wf = np.squeeze(np.asarray(X0.sum(0)))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `get_feature_names_out()` is its replacement.
X0.shape, vectorizer.get_feature_names_out()

In [None]:
# Number of terms whose total frequency reaches each threshold k = 0..19
plt.plot([np.count_nonzero(wf >= k) for k in range(20)])
# constant reference level (hard-coded)
plt.plot([19949] * 20)  # 26214
plt.xlabel('word frequency threshold')
plt.ylabel('X.shape[1]')

In [None]:
n_docs = 10000 # None
min_word_frequency = 9

# `get_feature_names()` was removed in scikit-learn 1.2
features = np.asarray(vectorizer.get_feature_names_out())

# Boolean mask of the terms that are frequent enough. The complement must be
# taken on this mask: applying `~` to the integer indices returned by
# `np.where` is a bitwise NOT (i -> -i - 1), which picks features counted
# from the end of the array instead of the unselected ones.
is_selected = wf >= min_word_frequency
selected_features_ind = np.flatnonzero(is_selected)
selected_features = features[is_selected]
unselected_features = features[~is_selected]

X = X0[:, selected_features_ind]

if n_docs is not None:
    # random subsample of the documents (without replacement)
    # NOTE(review): the global NumPy RNG is not seeded here, so the subsample
    # differs from one run to the next.
    docs = np.random.choice(X.shape[0], size=n_docs, replace=False)
    X = X[docs]
    y_ = y_[docs]
    del docs

# free the large intermediate objects
del X0, features, wf, corpus

X.shape, list(unselected_features)

### Save/load

In [None]:
# Persist the 20 Newsgroups document-term matrix and its labels
sp.sparse.save_npz(file='data_ng20', matrix=X)
np.save(file='labels_ng20', arr=y_)

In [None]:
# Reload the 20 Newsgroups document-term matrix and its labels
X = sp.sparse.load_npz(file='data_ng20.npz')
y_ = np.load(file='labels_ng20.npy')

## Reuters NLTK

### Build

In [None]:
import nltk
# Fetch the NLTK 'reuters' corpus resource
nltk.download('reuters')

In [None]:
from nltk.corpus import reuters

fileids = reuters.fileids()

# Keep only the documents that carry exactly one category
corpus, y = [], []
for fileid in fileids:
    labels = reuters.categories(fileid)
    if len(labels) != 1:
        continue
    corpus.append(reuters.raw(fileid))
    y.append(labels[0])

In [None]:
import pandas as pd
# Names of the `n_largest_classes` categories with the most documents
n_largest_classes = 10
classes, counts = np.unique(np.asarray(y), return_counts=True)
class_table = pd.DataFrame({'classes': classes, 'counts': counts})
df_classes = class_table.sort_values(by='counts', ascending=False)
categories = df_classes['classes'].values[:n_largest_classes]
categories

In [None]:
fileids = reuters.fileids(categories=categories)

# Rebuild corpus and labels from the files of the retained categories,
# again keeping only the documents that carry exactly one category
corpus, y = [], []
for fileid in fileids:
    labels = reuters.categories(fileid)
    if len(labels) != 1:
        continue
    corpus.append(reuters.raw(fileid))
    y.append(labels[0])

In [None]:
# Document-term count matrix of the Reuters corpus
X0 = vectorizer.fit_transform(corpus)

y = np.array(y)
# Integer-encode the labels: `classes` holds the sorted distinct labels and
# y_[i] is the position of y[i] in `classes`. A single `np.unique` call
# replaces the per-element `np.vectorize`d lambda (same result, and it also
# works on an empty label array).
classes, y_ = np.unique(y, return_inverse=True)
# column sums of X0: one total per vocabulary term
wf = np.squeeze(np.asarray(X0.sum(0)))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `get_feature_names_out()` is its replacement.
X0.shape, vectorizer.get_feature_names_out()

In [None]:
# Number of terms whose total frequency reaches each threshold k = 0..19
plt.plot([np.count_nonzero(wf >= k) for k in range(20)])
# constant reference level (hard-coded)
plt.plot([18900] * 20)

In [None]:
min_word_frequency = 5

# `get_feature_names()` was removed in scikit-learn 1.2
features = np.asarray(vectorizer.get_feature_names_out())

# Boolean mask of the terms that are frequent enough. The complement must be
# taken on this mask: applying `~` to the integer indices returned by
# `np.where` is a bitwise NOT (i -> -i - 1), which picks features counted
# from the end of the array instead of the unselected ones.
is_selected = wf >= min_word_frequency
selected_features_ind = np.flatnonzero(is_selected)
selected_features = features[is_selected]
unselected_features = features[~is_selected]

X = X0[:, selected_features_ind]

X.shape, list(unselected_features)

### Save/load

In [None]:
# Persist the Reuters document-term matrix and its labels
sp.sparse.save_npz(file='data_reuters', matrix=X)
np.save(file='labels_reuters', arr=y_)

In [None]:
# Reload the Reuters document-term matrix and its labels
X = sp.sparse.load_npz(file='data_reuters.npz')
y_ = np.load(file='labels_reuters.npy')

## Classic

### Build

In [None]:
import zipfile

dataset_dir = '../datasets/'

# Unpack the archive only when no 'classic' entry exists in the directory yet
archive_path = dataset_dir + 'classic.zip'
already_extracted = 'classic' in os.listdir(dataset_dir)
if not already_extracted:
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(dataset_dir)
os.listdir(dataset_dir)

In [None]:
# Directory of the extracted Classic documents; the trailing '/' matters
# because `get_X_classic` concatenates it directly with the file names.
path = dataset_dir + 'classic/'

def get_X_classic(path, vectorizer):
    """
    Vectorizes the documents stored as individual files in `path`.

    The label of a document is the part of its file name that precedes
    the first '.'. `path` is concatenated as is with the file names, so
    it must end with a path separator.

    Parameters
    ----------
    path : str
        directory containing one file per document
    vectorizer : object with a `fit_transform` method
        fitted on, and applied to, the list of document texts

    Returns
    -------
    X0 : the output of `vectorizer.fit_transform`
    y : list of str, the labels in the same order as the documents
    """
    corpus, y = [], []
    for file_name in os.listdir(path):
        # cluster in file name
        label, _, _ = file_name.partition('.')
        y.append(label)
        with open(path + file_name) as handle:
            corpus.append(handle.read())

    return vectorizer.fit_transform(corpus), y

X0, y = get_X_classic(path, vectorizer)

y = np.array(y)
# Integer-encode the labels: `classes` holds the sorted distinct labels and
# y_[i] is the position of y[i] in `classes`. A single `np.unique` call
# replaces the per-element `np.vectorize`d lambda (same result, and it also
# works on an empty label array).
classes, y_ = np.unique(y, return_inverse=True)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `get_feature_names_out()` is its replacement.
X0.shape, vectorizer.get_feature_names_out()

In [None]:
min_word_frequency = 4 #4
# column sums of X0: one total per vocabulary term
wf = np.squeeze(np.asarray(X0.sum(0)))

# `get_feature_names()` was removed in scikit-learn 1.2
features = np.asarray(vectorizer.get_feature_names_out())

# Boolean mask of the terms that are frequent enough. The complement must be
# taken on this mask: applying `~` to the integer indices returned by
# `np.where` is a bitwise NOT (i -> -i - 1), which picks features counted
# from the end of the array instead of the unselected ones.
is_selected = wf >= min_word_frequency
selected_features_ind = np.flatnonzero(is_selected)
selected_features = features[is_selected]
unselected_features = features[~is_selected]

X = X0[:, selected_features_ind]

X.shape, list(unselected_features)

### Save/load

In [None]:
# Persist the Classic document-term matrix and its labels
sp.sparse.save_npz(file='data_classic', matrix=X)
np.save(file='labels_classic', arr=y_)

In [None]:
# Reload the Classic document-term matrix and its labels
X = sp.sparse.load_npz(file='data_classic.npz')
y_ = np.load(file='labels_classic.npy')

# Model

## Model params

In [None]:
# Shape of the data matrix and number of distinct labels
X.shape, np.unique(y_).shape[0]

In [None]:
# Distinct values among the stored entries of X and how often each occurs
np.unique(X.data, return_counts=True)

In [None]:
# Sparsity pattern of the data matrix
fig, ax = plt.subplots(figsize=(10, 10))
ax.spy(X, precision=1, markersize=.1)

In [None]:
# Hyper-parameters used by the "Similarity" and "Fitting the model" cells.
# The meaning of most of them is defined by `HLBM`; remarks marked
# "presumably" are inferred from the names only and should be checked
# against the HLBM documentation.

# Kz: number of distinct true labels; Kw: presumably the number of column clusters
Kz = np.unique(y_).shape[0]
Kw = 10

# stopping criteria passed to HLBM
max_iter = 100
tol_iter = 1e-5

# frac_r is passed to `similarity_matrices.build_S_sparse` for the rows;
# frac_c and frac_noise are not used by the cells visible in this notebook
frac_r, frac_c = .01, None
frac_noise = 0.

n_init = 10
model_type = 'with_margins' # 'with_margins', 'without_margins'
estimated_margins = False # True, False
init_type = 'kmeans' #'skmeans' # 'skmeans', 'kmeans'
# only the rows are regularized: S_r is built only if regularize_row is True
regularize_row, regularize_col = True, False
regularization_mode = 'all' # 'all' 'mixture'
em_type = 'VEM' # 'VEM', 'CEM'
compute_regularization = True

# lambda_r / lambda_c are passed to `model.fit` together with S_r / S_c
lambda_r = 1.
lambda_c = None
# no damping with CEM, .7 otherwise
damping_factor = None if em_type == 'CEM' else .7

multiplicative_init_rows, multiplicative_init_cols = False, False # True, False
power_multiplicative_init = 1
# passed to `model.fit`; None here
given_Z, given_W = None, None

# lower bounds passed to HLBM (presumably numerical safeguards)
min_float = 1e-15
min_proba_Z, min_proba_W = .005, .005
min_proba_mixture_proportions = .1 * (1 / Kz)  # to avoid empty clusters
min_margin = 1e-12
min_gamma = 1e-12
threshold_absent_nodes = 0
dtype = 'float32'
# path passed to HLBM as `debug_output`
debug_output = pathlib.Path(r'../dcblockmodels/model_debug_output')

n_init_clustering = 7 * 1
node_perturbation_rate = .2

## Similarity

In [None]:
# Row similarity matrix built from the true labels `y_` with the fraction
# `frac_r` (only when the rows are regularized); no column similarity matrix.
S_r = (
    similarity_matrices.build_S_sparse(y_, frac_r, stratified=False)
    if regularize_row
    else None
)
S_c = None

S_r, S_c

## Fitting the model

In [None]:
# Every hyper-parameter comes from the "Model params" cell. The keyword
# arguments are grouped by apparent theme for readability only (keyword
# order has no effect on the call).
model = HLBM(
    # numbers of clusters
    Kz=Kz,
    Kw=Kw,
    # model variant and inference
    model_type=model_type,
    estimated_margins=estimated_margins,
    em_type=em_type,
    damping_factor=damping_factor,
    dtype=dtype,
    # regularization
    regularization_mode=regularization_mode,
    regularize_row=regularize_row,
    regularize_col=regularize_col,
    compute_regularization=compute_regularization,
    # initialization
    init_type=init_type,
    n_init=n_init,
    n_init_clustering=n_init_clustering,
    node_perturbation_rate=node_perturbation_rate,
    multiplicative_init_rows=multiplicative_init_rows,
    multiplicative_init_cols=multiplicative_init_cols,
    power_multiplicative_init=power_multiplicative_init,
    # stopping criteria
    max_iter=max_iter,
    tol_iter=tol_iter,
    # lower bounds
    min_float=min_float,
    min_proba_Z=min_proba_Z,
    min_proba_W=min_proba_W,
    min_proba_mixture_proportions=min_proba_mixture_proportions,
    min_margin=min_margin,
    min_gamma=min_gamma,
    threshold_absent_nodes=threshold_absent_nodes,
    # bookkeeping, parallelism and debugging
    model_id=1,
    blockmodel_params=None,
    random_state=None,  # np.random.RandomState(42)
    n_jobs=-1,
    verbose=1,
    debug_list=[],  # 'Z', 'W'
    debug_output=debug_output,
)
model.fit(
    X,
    given_Z=given_Z,
    given_W=given_W,
    S_r=S_r,
    lambda_r=lambda_r,
    S_c=S_c,
    lambda_c=lambda_c,
)

In [None]:
# Plot the criterions of the fitted model (see `plot.plot_criterions`)
plot.plot_criterions(model, i_start=0, i_end=-1, thr_decrease=1000, legend=True)

In [None]:
# First partition returned by `best_partition` in 'likelihood' mode
best_partitions = model.best_partition(mode='likelihood', n_first=1)
Z_model, W_model = best_partitions[0]

# Compare the row partition with the true labels (no reference is given for
# the columns)
metrics.print_metrics(
    Z_model, W_model, y_, None, absent_nodes=None, print_each_timestep=False
)

In [None]:
from sklearn.metrics import confusion_matrix

Z_model, W_model = model.best_partition(mode='likelihood', n_first=1)[0]
# NOTE(review): `confusion_matrix` is called with the predicted partition
# first, so its rows follow Z_model and its columns follow y_, while the plot
# labels rows 'true' and columns 'predicted' -- confirm whether
# `metrics.cmat_clustering` makes these labels correct.
raw_cmat = confusion_matrix(Z_model, y_)
cmat = metrics.cmat_clustering(raw_cmat)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    cmat,
    annot=True,
    fmt='.0f',
    ax=ax,
    square=True,
    cmap=sns.light_palette("red"),
)
ax.set_title('confusion matrix')
ax.set_xlabel('predicted')
ax.set_ylabel('true');

In [None]:
lw_cluster = 2.

# Reorder rows and columns so that members of a same cluster are contiguous
row_order = np.argsort(Z_model)
col_order = np.argsort(W_model)
X_reorg = X.toarray()[np.ix_(row_order, col_order)]

f, ax = plt.subplots(figsize=(10, 10))
ax.spy(X_reorg, markersize=.1, precision=1)

# Draw the lines separating the blocks: horizontal lines between row
# clusters, vertical lines between column clusters. A boundary is the
# cumulative size of the clusters before it (the last one is skipped).
for cluster_labels, draw_line in ((Z_model, ax.axhline), (W_model, ax.axvline)):
    _, cluster_sizes = np.unique(cluster_labels, return_counts=True)
    for boundary in np.cumsum(cluster_sizes)[:-1]:
        draw_line(boundary, linewidth=lw_cluster)